From ef70f4863fe4768f267619556c7e41a128c7206c Mon Sep 17 00:00:00 2001
From: jiepan
Date: Thu, 22 Aug 2024 09:36:59 +0800
Subject: [PATCH] Add 256-bit AVX support (#21684)

Since WebAssembly only supports a fixed 128-bit vector length, each
256-bit AVX intrinsic is emulated with two 128-bit intrinsics.
---
 site/source/docs/porting/simd.rst |    6 +-
 system/include/compat/avxintrin.h | 1913 ++++++++++++++++++++++++++---
 test/sse/test_avx.cpp             |  354 +++++-
 test/sse/test_sse.h               | 1221 +++++++++++++++++-
 4 files changed, 3264 insertions(+), 230 deletions(-)

diff --git a/site/source/docs/porting/simd.rst b/site/source/docs/porting/simd.rst
index 7c67059f8ff8e..f5c12ff597509 100644
--- a/site/source/docs/porting/simd.rst
+++ b/site/source/docs/porting/simd.rst
@@ -12,7 +12,7 @@ Emscripten supports the `WebAssembly SIMD
 1. Enable LLVM/Clang SIMD autovectorizer to automatically target WebAssembly SIMD, without requiring changes to C/C++ source code.
 2. Write SIMD code using the GCC/Clang SIMD Vector Extensions (``__attribute__((vector_size(16)))``)
 3. Write SIMD code using the WebAssembly SIMD intrinsics (``#include <wasm_simd128.h>``)
-4. Compile existing SIMD code that uses the x86 SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 or 128-bit subset of the AVX intrinsics (``#include <*mmintrin.h>``)
+4. Compile existing SIMD code that uses the x86 SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 or AVX intrinsics (``#include <*mmintrin.h>``)
 5. Compile existing SIMD code that uses the ARM NEON intrinsics (``#include <arm_neon.h>``)

 These techniques can be freely combined in a single program.
@@ -153,7 +153,7 @@ Emscripten supports compiling existing codebases that use x86 SSE instructions b
 * **SSE4.2**: pass ``-msse4.2`` and ``#include <nmmintrin.h>``. Use ``#ifdef __SSE4_2__`` to gate code.
 * **AVX**: pass ``-mavx`` and ``#include <immintrin.h>``. Use ``#ifdef __AVX__`` to gate code.

-Currently only the SSE1, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, and 128-bit AVX instruction sets are supported. Each of these instruction sets add on top of the previous ones, so e.g. when targeting SSE3, the instruction sets SSE1 and SSE2 are also available.
+Currently only the SSE1, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, and AVX instruction sets are supported. Each of these instruction sets adds on top of the previous ones, so e.g. when targeting SSE3, the instruction sets SSE1 and SSE2 are also available.

 The following tables highlight the availability and expected performance of different SSE* intrinsics. This can be useful for understanding the performance limitations that the Wasm SIMD specification has when running on x86 hardware.
@@ -1136,7 +1136,7 @@ The following table highlights the availability and expected performance of diff
   * - _mm_testz_ps
     - 💣 emulated with complex SIMD+scalar sequence

-Only the 128-bit wide instructions from AVX instruction set are available. 256-bit wide AVX instructions are not provided.
+Only the 128-bit wide instructions from the AVX instruction set are listed. Each 256-bit wide AVX instruction is emulated with two 128-bit wide instructions.
====================================================== diff --git a/system/include/compat/avxintrin.h b/system/include/compat/avxintrin.h index 50e2e7d130abb..dfc21469902b9 100644 --- a/system/include/compat/avxintrin.h +++ b/system/include/compat/avxintrin.h @@ -11,14 +11,505 @@ #error "AVX instruction set not enabled" #endif +#include +#include #include +#include +#include +#include +#include -static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) -_mm_broadcast_ss(const float *__mem_addr) -{ - return (__m128)wasm_v32x4_load_splat(__mem_addr); +typedef struct { + __m128d v0; + __m128d v1; +} __m256d; + +typedef struct { + __m128 v0; + __m128 v1; +} __m256; + +typedef struct { + __m128i v0; + __m128i v1; +} __m256i; + +typedef long long __m128i_u; +typedef struct { + __m128i_u v0; + __m128i_u v1; +} __m256i_u; + +union m256_data { + __m256i int_view; + __m256d double_view; + __m256 float_view; + __m128i_u int_u_view; +}; + +#define UNIMPLEMENTED(name) \ + emscripten_err("warning: unsupported avx intrinsic: " #name "\n") + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_add_pd(__m256d __a, + __m256d __b) { + __m256d ret; + ret.v0 = _mm_add_pd(__a.v0, __b.v0); + ret.v1 = _mm_add_pd(__a.v1, __b.v1); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_add_ps(__m256 __a, + __m256 __b) { + __m256 ret; + ret.v0 = _mm_add_ps(__a.v0, __b.v0); + ret.v1 = _mm_add_ps(__a.v1, __b.v1); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sub_pd(__m256d __a, + __m256d __b) { + __m256d ret; + ret.v0 = _mm_sub_pd(__a.v0, __b.v0); + ret.v1 = _mm_sub_pd(__a.v1, __b.v1); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sub_ps(__m256 __a, + __m256 __b) { + __m256 ret; + ret.v0 = _mm_sub_ps(__a.v0, __b.v0); + ret.v1 = _mm_sub_ps(__a.v1, __b.v1); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_addsub_pd(__m256d __a, + __m256d __b) { + __m256d ret; + ret.v0 = _mm_addsub_pd(__a.v0, __b.v0); + ret.v1 = _mm_addsub_pd(__a.v1, __b.v1); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_addsub_ps(__m256 __a, + __m256 __b) { + __m256 ret; + ret.v0 = _mm_addsub_ps(__a.v0, __b.v0); + ret.v1 = _mm_addsub_ps(__a.v1, __b.v1); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_div_pd(__m256d __a, + __m256d __b) { + __m256d ret; + ret.v0 = _mm_div_pd(__a.v0, __b.v0); + ret.v1 = _mm_div_pd(__a.v1, __b.v1); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_div_ps(__m256 __a, + __m256 __b) { + __m256 ret; + ret.v0 = _mm_div_ps(__a.v0, __b.v0); + ret.v1 = _mm_div_ps(__a.v1, __b.v1); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_max_pd(__m256d __a, + __m256d __b) { + __m256d ret; + ret.v0 = _mm_max_pd(__a.v0, __b.v0); + ret.v1 = _mm_max_pd(__a.v1, __b.v1); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_max_ps(__m256 __a, + __m256 __b) { + __m256 ret; + ret.v0 = _mm_max_ps(__a.v0, __b.v0); + ret.v1 = _mm_max_ps(__a.v1, __b.v1); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_min_pd(__m256d __a, + __m256d __b) { + __m256d ret; + ret.v0 = _mm_min_pd(__a.v0, __b.v0); + ret.v1 = _mm_min_pd(__a.v1, __b.v1); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_min_ps(__m256 __a, + __m256 __b) { + __m256 ret; + ret.v0 = _mm_min_ps(__a.v0, __b.v0); + ret.v1 = 
_mm_min_ps(__a.v1, __b.v1); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_mul_pd(__m256d __a, + __m256d __b) { + __m256d ret; + ret.v0 = _mm_mul_pd(__a.v0, __b.v0); + ret.v1 = _mm_mul_pd(__a.v1, __b.v1); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_mul_ps(__m256 __a, + __m256 __b) { + __m256 ret; + ret.v0 = _mm_mul_ps(__a.v0, __b.v0); + ret.v1 = _mm_mul_ps(__a.v1, __b.v1); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a) { + __m256d ret; + ret.v0 = _mm_sqrt_pd(__a.v0); + ret.v1 = _mm_sqrt_pd(__a.v1); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a) { + __m256 ret; + ret.v0 = _mm_sqrt_ps(__a.v0); + ret.v1 = _mm_sqrt_ps(__a.v1); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rsqrt_ps(__m256 __a) { + __m256 ret; + ret.v0 = _mm_rsqrt_ps(__a.v0); + ret.v1 = _mm_rsqrt_ps(__a.v1); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rcp_ps(__m256 __a) { + __m256 ret; + ret.v0 = _mm_rcp_ps(__a.v0); + ret.v1 = _mm_rcp_ps(__a.v1); + return ret; +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_round_pd(__m256d __a, + int __rounding) { + __m256d ret; + ret.v0 = _mm_round_pd(__a.v0, __rounding); + ret.v1 = _mm_round_pd(__a.v1, __rounding); + return ret; +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_round_ps(__m256 __a, + int __rounding) { + __m256 ret; + ret.v0 = _mm_round_ps(__a.v0, __rounding); + ret.v1 = _mm_round_ps(__a.v1, __rounding); + return ret; +} + +#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL) +#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR) +#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) +#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_and_pd(__m256d __a, + __m256d __b) { + __m256d ret; + ret.v0 = _mm_and_pd(__a.v0, __b.v0); + ret.v1 = _mm_and_pd(__a.v1, __b.v1); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_and_ps(__m256 __a, + __m256 __b) { + __m256 ret; + ret.v0 = _mm_and_ps(__a.v0, __b.v0); + ret.v1 = _mm_and_ps(__a.v1, __b.v1); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_andnot_pd(__m256d __a, + __m256d __b) { + __m256d ret; + ret.v0 = _mm_andnot_pd(__a.v0, __b.v0); + ret.v1 = _mm_andnot_pd(__a.v1, __b.v1); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_andnot_ps(__m256 __a, + __m256 __b) { + __m256 ret; + ret.v0 = _mm_andnot_ps(__a.v0, __b.v0); + ret.v1 = _mm_andnot_ps(__a.v1, __b.v1); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_or_pd(__m256d __a, + __m256d __b) { + __m256d ret; + ret.v0 = _mm_or_pd(__a.v0, __b.v0); + ret.v1 = _mm_or_pd(__a.v1, __b.v1); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_or_ps(__m256 __a, __m256 __b) { + __m256 ret; + ret.v0 = _mm_or_ps(__a.v0, __b.v0); + ret.v1 = _mm_or_ps(__a.v1, __b.v1); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_xor_pd(__m256d __a, + __m256d __b) { + __m256d ret; + ret.v0 = _mm_xor_pd(__a.v0, __b.v0); + ret.v1 = _mm_xor_pd(__a.v1, __b.v1); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_xor_ps(__m256 __a, + __m256 __b) { + __m256 ret; + ret.v0 = _mm_xor_ps(__a.v0, __b.v0); + ret.v1 = _mm_xor_ps(__a.v1, __b.v1); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_hadd_pd(__m256d __a, + __m256d __b) { + __m256d ret; + ret.v0 = _mm_hadd_pd(__a.v0, 
__b.v0); + ret.v1 = _mm_hadd_pd(__a.v1, __b.v1); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_hadd_ps(__m256 __a, + __m256 __b) { + __m256 ret; + ret.v0 = _mm_hadd_ps(__a.v0, __b.v0); + ret.v1 = _mm_hadd_ps(__a.v1, __b.v1); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_hsub_pd(__m256d __a, + __m256d __b) { + __m256d ret; + ret.v0 = _mm_hsub_pd(__a.v0, __b.v0); + ret.v1 = _mm_hsub_pd(__a.v1, __b.v1); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_hsub_ps(__m256 __a, + __m256 __b) { + __m256 ret; + ret.v0 = _mm_hsub_ps(__a.v0, __b.v0); + ret.v1 = _mm_hsub_ps(__a.v1, __b.v1); + return ret; +} + +static __inline __m128d __DEFAULT_FN_ATTRS _mm_permutevar_pd(__m128d __a, + __m128i __c) { + return (__m128d)wasm_f64x2_make( + ((__f64x2)__a)[(wasm_i64x2_extract_lane(__c, 0) >> 1) & 1], + ((__f64x2)__a)[(wasm_i64x2_extract_lane(__c, 1) >> 1) & 1]); +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_permutevar_pd(__m256d __a, + __m256i __c) { + __m256d ret; + ret.v0 = _mm_permutevar_pd(__a.v0, __c.v0); + ret.v1 = _mm_permutevar_pd(__a.v1, __c.v1); + return ret; +} + +static __inline __m128 __DEFAULT_FN_ATTRS _mm_permutevar_ps(__m128 __a, + __m128i __c) { + return (__m128)wasm_f32x4_make( + ((__f32x4)__a)[wasm_i32x4_extract_lane(__c, 0) & 3], + ((__f32x4)__a)[wasm_i32x4_extract_lane(__c, 1) & 3], + ((__f32x4)__a)[wasm_i32x4_extract_lane(__c, 2) & 3], + ((__f32x4)__a)[wasm_i32x4_extract_lane(__c, 3) & 3]); +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_permutevar_ps(__m256 __a, + __m256i __c) { + __m256 ret; + ret.v0 = _mm_permutevar_ps(__a.v0, __c.v0); + ret.v1 = _mm_permutevar_ps(__a.v1, __c.v1); + return ret; +} + +#define _mm_permute_pd(__a, __imm) \ + __extension__({ \ + (__m128d) wasm_i64x2_shuffle( \ + (__m128d)(__a), (__m128d)(__a), ((__imm) & 1), (((__imm) >> 1) & 1)); \ + }) + +#define _mm256_permute_pd(__a, __imm) \ + __extension__({ \ + _mm256_set_m128d(_mm_permute_pd((__a).v1, (__imm) >> 2), \ + _mm_permute_pd((__a).v0, (__imm))); \ + }) + +#define _mm_permute_ps(__a, __imm) \ + __extension__({ \ + (__m128) wasm_i32x4_shuffle((__m128)(__a), \ + (__m128)(__a), \ + ((__imm) & 3), \ + (((__imm) >> 2) & 3), \ + (((__imm) >> 4) & 3), \ + (((__imm) >> 6) & 3)); \ + }) + +#define _mm256_permute_ps(__a, __imm) \ + __extension__({ \ + _mm256_set_m128(_mm_permute_ps((__a).v1, (__imm)), \ + _mm_permute_ps((__a).v0, (__imm))); \ + }) + +static inline __m128d select4d(__m256d __a, __m256d __b, const int imm8) { + const int index = imm8 & 0x3; + __m128d tmp; + switch (index) { + case 0: + tmp = __a.v0; + break; + case 1: + tmp = __a.v1; + break; + case 2: + tmp = __b.v0; + break; + case 3: + tmp = __b.v1; + break; + } + if (imm8 & 0x8) { + tmp = (__m128d)wasm_i64x2_const_splat(0); + } + return tmp; +} + +static inline __m128 select4(__m256 __a, __m256 __b, const int imm8) { + const int index = imm8 & 0x3; + __m128 tmp; + switch (index) { + case 0: + tmp = __a.v0; + break; + case 1: + tmp = __a.v1; + break; + case 2: + tmp = __b.v0; + break; + case 3: + tmp = __b.v1; + break; + } + if (imm8 & 0x8) { + tmp = (__m128)wasm_i64x2_const_splat(0); + } + return tmp; +} + +static inline __m128i select4i(__m256i __a, __m256i __b, const int imm8) { + const int index = imm8 & 0x3; + __m128i tmp; + switch (index) { + case 0: + tmp = __a.v0; + break; + case 1: + tmp = __a.v1; + break; + case 2: + tmp = __b.v0; + break; + case 3: + tmp = __b.v1; + break; + } + if (imm8 & 0x8) { + tmp = (__m128i)wasm_i64x2_const_splat(0); + } + return 
tmp; +} + +static inline __m256d +_mm256_permute2f128_pd(__m256d __a, __m256d __b, const int imm8) { + __m256d ret; + ret.v0 = select4d(__a, __b, imm8); + ret.v1 = select4d(__a, __b, imm8 >> 4); + return ret; +} + +static inline __m256 +_mm256_permute2f128_ps(__m256 __a, __m256 __b, const int imm8) { + __m256 ret; + ret.v0 = select4(__a, __b, imm8); + ret.v1 = select4(__a, __b, imm8 >> 4); + return ret; +} + +static inline __m256i +_mm256_permute2f128_si256(__m256i __a, __m256i __b, const int imm8) { + __m256i ret; + ret.v0 = select4i(__a, __b, imm8); + ret.v1 = select4i(__a, __b, imm8 >> 4); + return ret; +} + +#define _mm256_blend_pd(__a, __b, imm8) \ + __extension__({ \ + _mm256_set_m128d(_mm_blend_pd((__a).v1, (__b).v1, (imm8) >> 2), \ + _mm_blend_pd((__a).v0, (__b).v0, (imm8))); \ + }) + +#define _mm256_blend_ps(__a, __b, imm) \ + __extension__({ \ + _mm256_set_m128(_mm_blend_ps((__a).v1, (__b).v1, (imm) >> 4), \ + _mm_blend_ps((__a).v0, (__b).v0, (imm))); \ + }) + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_blendv_pd(__m256d __a, + __m256d __b, + __m256d __c) { + __m256d ret; + ret.v0 = _mm_blendv_pd(__a.v0, __b.v0, __c.v0); + ret.v1 = _mm_blendv_pd(__a.v1, __b.v1, __c.v1); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_blendv_ps(__m256 __a, + __m256 __b, + __m256 __c) { + __m256 ret; + ret.v0 = _mm_blendv_ps(__a.v0, __b.v0, __c.v0); + ret.v1 = _mm_blendv_ps(__a.v1, __b.v1, __c.v1); + return ret; } +#define _mm256_dp_ps(__a, __b, imm) \ + __extension__({ \ + _mm256_set_m128(_mm_dp_ps((__a).v1, (__b).v1, (imm)), \ + _mm_dp_ps((__a).v0, (__b).v0, (imm))); \ + }) + +#define _mm256_shuffle_ps(__a, __b, mask) \ + __extension__({ \ + _mm256_set_m128(_mm_shuffle_ps((__a).v1, (__b).v1, (mask)), \ + _mm_shuffle_ps((__a).v0, (__b).v0, (mask))); \ + }) + +#define _mm256_shuffle_pd(__a, __b, mask) \ + __extension__({ \ + _mm256_set_m128d(_mm_shuffle_pd((__a).v1, (__b).v1, (mask) >> 2), \ + _mm_shuffle_pd((__a).v0, (__b).v0, (mask))); \ + }) + #define _CMP_EQ_OQ 0 #define _CMP_LT_OS 1 #define _CMP_LE_OS 2 @@ -44,215 +535,1271 @@ _mm_broadcast_ss(const float *__mem_addr) #define _CMP_NLE_UQ 22 #define _CMP_ORD_S 23 #define _CMP_EQ_US 24 -#define _CMP_NGE_UQ 25 -#define _CMP_NGT_UQ 26 -#define _CMP_FALSE_OS 27 -#define _CMP_NEQ_OS 28 +#define _CMP_NGE_UQ 25 +#define _CMP_NGT_UQ 26 +#define _CMP_FALSE_OS 27 +#define _CMP_NEQ_OS 28 #define _CMP_GE_OQ 29 #define _CMP_GT_OQ 30 #define _CMP_TRUE_US 31 -#define _mm_cmp_pd(__a, __b, __imm) __extension__ ({ \ - __m128d __ret; \ - if ((__imm) == _CMP_EQ_OQ || (__imm) == _CMP_EQ_OS) __ret = _mm_cmpeq_pd((__a), (__b)); \ - if ((__imm) == _CMP_EQ_UQ || (__imm) == _CMP_EQ_US) __ret = _mm_or_pd(_mm_cmpeq_pd((__a), (__b)), _mm_cmpunord_pd((__a), (__b))); \ - if ((__imm) == _CMP_LT_OS || (__imm) == _CMP_LT_OQ) __ret = _mm_cmplt_pd((__a), (__b)); \ - if ((__imm) == _CMP_LE_OS || (__imm) == _CMP_LE_OQ) __ret = _mm_cmple_pd((__a), (__b)); \ - if ((__imm) == _CMP_UNORD_Q || (__imm) == _CMP_UNORD_S) __ret = _mm_cmpunord_pd((__a), (__b)); \ - if ((__imm) == _CMP_NEQ_UQ || (__imm) == _CMP_NEQ_US) __ret = _mm_cmpneq_pd((__a), (__b)); \ - if ((__imm) == _CMP_NEQ_OQ || (__imm) == _CMP_NEQ_OS) __ret = _mm_andnot_pd(_mm_cmpunord_pd((__a), (__b)), _mm_cmpneq_pd((__a), (__b))); \ - if ((__imm) == _CMP_NLT_US || (__imm) == _CMP_NLT_UQ) __ret = _mm_cmpnlt_pd((__a), (__b)); \ - if ((__imm) == _CMP_ORD_Q || (__imm) == _CMP_ORD_S) __ret = _mm_cmpord_pd((__a), (__b)); \ - if ((__imm) == _CMP_NGE_US || (__imm) == _CMP_NGE_UQ) __ret = _mm_cmpnge_pd((__a), 
(__b)); \ - if ((__imm) == _CMP_NGT_US || (__imm) == _CMP_NGT_UQ) __ret = _mm_cmpngt_pd((__a), (__b)); \ - if ((__imm) == _CMP_FALSE_OQ || (__imm) == _CMP_FALSE_OS) __ret = _mm_setzero_pd(); \ - if ((__imm) == _CMP_GE_OS || (__imm) == _CMP_GE_OQ) __ret = _mm_cmpge_pd((__a), (__b)); \ - if ((__imm) == _CMP_GT_OS || (__imm) == _CMP_GT_OQ) __ret = _mm_cmpgt_pd((__a), (__b)); \ - if ((__imm) == _CMP_TRUE_UQ || (__imm) == _CMP_TRUE_US) __ret = (__m128d)wasm_i8x16_splat(0xFF); \ - if ((__imm) == _CMP_NLE_US || (__imm) == _CMP_NLE_UQ) __ret = _mm_cmpnle_pd((__a), (__b)); \ - __ret; }) - -#define _mm_cmp_ps(__a, __b, __imm) __extension__ ({ \ - __m128 __ret; \ - if ((__imm) == _CMP_EQ_OQ || (__imm) == _CMP_EQ_OS) __ret = _mm_cmpeq_ps((__a), (__b)); \ - if ((__imm) == _CMP_EQ_UQ || (__imm) == _CMP_EQ_US) __ret = _mm_or_ps(_mm_cmpeq_ps((__a), (__b)), _mm_cmpunord_ps((__a), (__b))); \ - if ((__imm) == _CMP_LT_OS || (__imm) == _CMP_LT_OQ) __ret = _mm_cmplt_ps((__a), (__b)); \ - if ((__imm) == _CMP_LE_OS || (__imm) == _CMP_LE_OQ) __ret = _mm_cmple_ps((__a), (__b)); \ - if ((__imm) == _CMP_UNORD_Q || (__imm) == _CMP_UNORD_S) __ret = _mm_cmpunord_ps((__a), (__b)); \ - if ((__imm) == _CMP_NEQ_UQ || (__imm) == _CMP_NEQ_US) __ret = _mm_cmpneq_ps((__a), (__b)); \ - if ((__imm) == _CMP_NEQ_OQ || (__imm) == _CMP_NEQ_OS) __ret = _mm_andnot_ps(_mm_cmpunord_ps((__a), (__b)), _mm_cmpneq_ps((__a), (__b))); \ - if ((__imm) == _CMP_NLT_US || (__imm) == _CMP_NLT_UQ) __ret = _mm_cmpnlt_ps((__a), (__b)); \ - if ((__imm) == _CMP_ORD_Q || (__imm) == _CMP_ORD_S) __ret = _mm_cmpord_ps((__a), (__b)); \ - if ((__imm) == _CMP_NGE_US || (__imm) == _CMP_NGE_UQ) __ret = _mm_cmpnge_ps((__a), (__b)); \ - if ((__imm) == _CMP_NGT_US || (__imm) == _CMP_NGT_UQ) __ret = _mm_cmpngt_ps((__a), (__b)); \ - if ((__imm) == _CMP_FALSE_OQ || (__imm) == _CMP_FALSE_OS) __ret = _mm_setzero_ps(); \ - if ((__imm) == _CMP_GE_OS || (__imm) == _CMP_GE_OQ) __ret = _mm_cmpge_ps((__a), (__b)); \ - if ((__imm) == _CMP_GT_OS || (__imm) == _CMP_GT_OQ) __ret = _mm_cmpgt_ps((__a), (__b)); \ - if ((__imm) == _CMP_TRUE_UQ || (__imm) == _CMP_TRUE_US) __ret = (__m128)wasm_i8x16_splat(0xFF); \ - if ((__imm) == _CMP_NLE_US || (__imm) == _CMP_NLE_UQ) __ret = _mm_cmpnle_ps((__a), (__b)); \ - __ret; }) - -#define _mm_cmp_sd(__a, __b, __imm) __extension__ ({ \ - __m128d __ret; \ - if ((__imm) == _CMP_EQ_OQ || (__imm) == _CMP_EQ_OS) __ret = _mm_cmpeq_sd((__a), (__b)); \ - if ((__imm) == _CMP_EQ_UQ || (__imm) == _CMP_EQ_US) __ret = _mm_move_sd((__a), _mm_or_pd(_mm_cmpeq_sd((__a), (__b)), _mm_cmpunord_sd((__a), (__b)))); \ - if ((__imm) == _CMP_LT_OS || (__imm) == _CMP_LT_OQ) __ret = _mm_cmplt_sd((__a), (__b)); \ - if ((__imm) == _CMP_LE_OS || (__imm) == _CMP_LE_OQ) __ret = _mm_cmple_sd((__a), (__b)); \ - if ((__imm) == _CMP_UNORD_Q || (__imm) == _CMP_UNORD_S) __ret = _mm_cmpunord_sd((__a), (__b)); \ - if ((__imm) == _CMP_NEQ_UQ || (__imm) == _CMP_NEQ_US) __ret = _mm_cmpneq_sd((__a), (__b)); \ - if ((__imm) == _CMP_NEQ_OQ || (__imm) == _CMP_NEQ_OS) __ret = _mm_move_sd((__a), _mm_andnot_pd(_mm_cmpunord_sd((__a), (__b)), _mm_cmpneq_sd((__a), (__b)))); \ - if ((__imm) == _CMP_NLT_US || (__imm) == _CMP_NLT_UQ) __ret = _mm_cmpnlt_sd((__a), (__b)); \ - if ((__imm) == _CMP_ORD_Q || (__imm) == _CMP_ORD_S) __ret = _mm_cmpord_sd((__a), (__b)); \ - if ((__imm) == _CMP_NGE_US || (__imm) == _CMP_NGE_UQ) __ret = _mm_cmpnge_sd((__a), (__b)); \ - if ((__imm) == _CMP_NGT_US || (__imm) == _CMP_NGT_UQ) __ret = _mm_cmpngt_sd((__a), (__b)); \ - if ((__imm) == _CMP_FALSE_OQ || (__imm) == 
_CMP_FALSE_OS) __ret = _mm_move_sd((__a), _mm_setzero_pd()); \ - if ((__imm) == _CMP_GE_OS || (__imm) == _CMP_GE_OQ) __ret = _mm_cmpge_sd((__a), (__b)); \ - if ((__imm) == _CMP_GT_OS || (__imm) == _CMP_GT_OQ) __ret = _mm_cmpgt_sd((__a), (__b)); \ - if ((__imm) == _CMP_TRUE_UQ || (__imm) == _CMP_TRUE_US) __ret = _mm_move_sd((__a), (__m128d)wasm_i8x16_splat(0xFF)); \ - if ((__imm) == _CMP_NLE_US || (__imm) == _CMP_NLE_UQ) __ret = _mm_cmpnle_sd((__a), (__b)); \ - __ret; }) - -#define _mm_cmp_ss(__a, __b, __imm) __extension__ ({ \ - __m128 __ret; \ - if ((__imm) == _CMP_EQ_OQ || (__imm) == _CMP_EQ_OS) __ret = _mm_cmpeq_ss((__a), (__b)); \ - if ((__imm) == _CMP_EQ_UQ || (__imm) == _CMP_EQ_US) __ret = _mm_move_ss((__a), _mm_or_ps(_mm_cmpeq_ss((__a), (__b)), _mm_cmpunord_ss((__a), (__b)))); \ - if ((__imm) == _CMP_LT_OS || (__imm) == _CMP_LT_OQ) __ret = _mm_cmplt_ss((__a), (__b)); \ - if ((__imm) == _CMP_LE_OS || (__imm) == _CMP_LE_OQ) __ret = _mm_cmple_ss((__a), (__b)); \ - if ((__imm) == _CMP_UNORD_Q || (__imm) == _CMP_UNORD_S) __ret = _mm_cmpunord_ss((__a), (__b)); \ - if ((__imm) == _CMP_NEQ_UQ || (__imm) == _CMP_NEQ_US) __ret = _mm_cmpneq_ss((__a), (__b)); \ - if ((__imm) == _CMP_NEQ_OQ || (__imm) == _CMP_NEQ_OS) __ret = _mm_move_ss((__a), _mm_andnot_ps(_mm_cmpunord_ss((__a), (__b)), _mm_cmpneq_ss((__a), (__b)))); \ - if ((__imm) == _CMP_NLT_US || (__imm) == _CMP_NLT_UQ) __ret = _mm_cmpnlt_ss((__a), (__b)); \ - if ((__imm) == _CMP_ORD_Q || (__imm) == _CMP_ORD_S) __ret = _mm_cmpord_ss((__a), (__b)); \ - if ((__imm) == _CMP_NGE_US || (__imm) == _CMP_NGE_UQ) __ret = _mm_cmpnge_ss((__a), (__b)); \ - if ((__imm) == _CMP_NGT_US || (__imm) == _CMP_NGT_UQ) __ret = _mm_cmpngt_ss((__a), (__b)); \ - if ((__imm) == _CMP_FALSE_OQ || (__imm) == _CMP_FALSE_OS) __ret = _mm_move_ss((__a), _mm_setzero_ps()); \ - if ((__imm) == _CMP_GE_OS || (__imm) == _CMP_GE_OQ) __ret = _mm_cmpge_ss((__a), (__b)); \ - if ((__imm) == _CMP_GT_OS || (__imm) == _CMP_GT_OQ) __ret = _mm_cmpgt_ss((__a), (__b)); \ - if ((__imm) == _CMP_TRUE_UQ || (__imm) == _CMP_TRUE_US) __ret = _mm_move_ss((__a), (__m128)wasm_i8x16_splat(0xFF)); \ - if ((__imm) == _CMP_NLE_US || (__imm) == _CMP_NLE_UQ) __ret = _mm_cmpnle_ss((__a), (__b)); \ - __ret; }) - -static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) -_mm_maskload_pd(const double *__mem_addr, __m128i __mask) -{ - // This may cause an out-of-bounds memory load since we first load and - // then mask, but since there are no segmentation faults in Wasm memory - // accesses, that is ok (as long as we are within the heap bounds - - // a negligible limitation in practice) - return _mm_and_pd(_mm_load_pd(__mem_addr), (__m128d)wasm_i64x2_shr(__mask, 63)); +#define _mm_cmp_pd(__a, __b, __imm) \ + __extension__({ \ + __m128d __ret; \ + if ((__imm) == _CMP_EQ_OQ || (__imm) == _CMP_EQ_OS) \ + __ret = _mm_cmpeq_pd((__a), (__b)); \ + if ((__imm) == _CMP_EQ_UQ || (__imm) == _CMP_EQ_US) \ + __ret = \ + _mm_or_pd(_mm_cmpeq_pd((__a), (__b)), _mm_cmpunord_pd((__a), (__b))); \ + if ((__imm) == _CMP_LT_OS || (__imm) == _CMP_LT_OQ) \ + __ret = _mm_cmplt_pd((__a), (__b)); \ + if ((__imm) == _CMP_LE_OS || (__imm) == _CMP_LE_OQ) \ + __ret = _mm_cmple_pd((__a), (__b)); \ + if ((__imm) == _CMP_UNORD_Q || (__imm) == _CMP_UNORD_S) \ + __ret = _mm_cmpunord_pd((__a), (__b)); \ + if ((__imm) == _CMP_NEQ_UQ || (__imm) == _CMP_NEQ_US) \ + __ret = _mm_cmpneq_pd((__a), (__b)); \ + if ((__imm) == _CMP_NEQ_OQ || (__imm) == _CMP_NEQ_OS) \ + __ret = _mm_andnot_pd(_mm_cmpunord_pd((__a), (__b)), \ + 
_mm_cmpneq_pd((__a), (__b))); \ + if ((__imm) == _CMP_NLT_US || (__imm) == _CMP_NLT_UQ) \ + __ret = _mm_cmpnlt_pd((__a), (__b)); \ + if ((__imm) == _CMP_ORD_Q || (__imm) == _CMP_ORD_S) \ + __ret = _mm_cmpord_pd((__a), (__b)); \ + if ((__imm) == _CMP_NGE_US || (__imm) == _CMP_NGE_UQ) \ + __ret = _mm_cmpnge_pd((__a), (__b)); \ + if ((__imm) == _CMP_NGT_US || (__imm) == _CMP_NGT_UQ) \ + __ret = _mm_cmpngt_pd((__a), (__b)); \ + if ((__imm) == _CMP_FALSE_OQ || (__imm) == _CMP_FALSE_OS) \ + __ret = _mm_setzero_pd(); \ + if ((__imm) == _CMP_GE_OS || (__imm) == _CMP_GE_OQ) \ + __ret = _mm_cmpge_pd((__a), (__b)); \ + if ((__imm) == _CMP_GT_OS || (__imm) == _CMP_GT_OQ) \ + __ret = _mm_cmpgt_pd((__a), (__b)); \ + if ((__imm) == _CMP_TRUE_UQ || (__imm) == _CMP_TRUE_US) \ + __ret = (__m128d)wasm_i8x16_splat(0xFF); \ + if ((__imm) == _CMP_NLE_US || (__imm) == _CMP_NLE_UQ) \ + __ret = _mm_cmpnle_pd((__a), (__b)); \ + __ret; \ + }) + +#define _mm_cmp_ps(__a, __b, __imm) \ + __extension__({ \ + __m128 __ret; \ + if ((__imm) == _CMP_EQ_OQ || (__imm) == _CMP_EQ_OS) \ + __ret = _mm_cmpeq_ps((__a), (__b)); \ + if ((__imm) == _CMP_EQ_UQ || (__imm) == _CMP_EQ_US) \ + __ret = \ + _mm_or_ps(_mm_cmpeq_ps((__a), (__b)), _mm_cmpunord_ps((__a), (__b))); \ + if ((__imm) == _CMP_LT_OS || (__imm) == _CMP_LT_OQ) \ + __ret = _mm_cmplt_ps((__a), (__b)); \ + if ((__imm) == _CMP_LE_OS || (__imm) == _CMP_LE_OQ) \ + __ret = _mm_cmple_ps((__a), (__b)); \ + if ((__imm) == _CMP_UNORD_Q || (__imm) == _CMP_UNORD_S) \ + __ret = _mm_cmpunord_ps((__a), (__b)); \ + if ((__imm) == _CMP_NEQ_UQ || (__imm) == _CMP_NEQ_US) \ + __ret = _mm_cmpneq_ps((__a), (__b)); \ + if ((__imm) == _CMP_NEQ_OQ || (__imm) == _CMP_NEQ_OS) \ + __ret = _mm_andnot_ps(_mm_cmpunord_ps((__a), (__b)), \ + _mm_cmpneq_ps((__a), (__b))); \ + if ((__imm) == _CMP_NLT_US || (__imm) == _CMP_NLT_UQ) \ + __ret = _mm_cmpnlt_ps((__a), (__b)); \ + if ((__imm) == _CMP_ORD_Q || (__imm) == _CMP_ORD_S) \ + __ret = _mm_cmpord_ps((__a), (__b)); \ + if ((__imm) == _CMP_NGE_US || (__imm) == _CMP_NGE_UQ) \ + __ret = _mm_cmpnge_ps((__a), (__b)); \ + if ((__imm) == _CMP_NGT_US || (__imm) == _CMP_NGT_UQ) \ + __ret = _mm_cmpngt_ps((__a), (__b)); \ + if ((__imm) == _CMP_FALSE_OQ || (__imm) == _CMP_FALSE_OS) \ + __ret = _mm_setzero_ps(); \ + if ((__imm) == _CMP_GE_OS || (__imm) == _CMP_GE_OQ) \ + __ret = _mm_cmpge_ps((__a), (__b)); \ + if ((__imm) == _CMP_GT_OS || (__imm) == _CMP_GT_OQ) \ + __ret = _mm_cmpgt_ps((__a), (__b)); \ + if ((__imm) == _CMP_TRUE_UQ || (__imm) == _CMP_TRUE_US) \ + __ret = (__m128)wasm_i8x16_splat(0xFF); \ + if ((__imm) == _CMP_NLE_US || (__imm) == _CMP_NLE_UQ) \ + __ret = _mm_cmpnle_ps((__a), (__b)); \ + __ret; \ + }) + +#define _mm_cmp_sd(__a, __b, __imm) \ + __extension__({ \ + __m128d __ret; \ + if ((__imm) == _CMP_EQ_OQ || (__imm) == _CMP_EQ_OS) \ + __ret = _mm_cmpeq_sd((__a), (__b)); \ + if ((__imm) == _CMP_EQ_UQ || (__imm) == _CMP_EQ_US) \ + __ret = _mm_move_sd( \ + (__a), \ + _mm_or_pd(_mm_cmpeq_sd((__a), (__b)), _mm_cmpunord_sd((__a), (__b)))); \ + if ((__imm) == _CMP_LT_OS || (__imm) == _CMP_LT_OQ) \ + __ret = _mm_cmplt_sd((__a), (__b)); \ + if ((__imm) == _CMP_LE_OS || (__imm) == _CMP_LE_OQ) \ + __ret = _mm_cmple_sd((__a), (__b)); \ + if ((__imm) == _CMP_UNORD_Q || (__imm) == _CMP_UNORD_S) \ + __ret = _mm_cmpunord_sd((__a), (__b)); \ + if ((__imm) == _CMP_NEQ_UQ || (__imm) == _CMP_NEQ_US) \ + __ret = _mm_cmpneq_sd((__a), (__b)); \ + if ((__imm) == _CMP_NEQ_OQ || (__imm) == _CMP_NEQ_OS) \ + __ret = _mm_move_sd((__a), \ + _mm_andnot_pd(_mm_cmpunord_sd((__a), 
(__b)), \ + _mm_cmpneq_sd((__a), (__b)))); \ + if ((__imm) == _CMP_NLT_US || (__imm) == _CMP_NLT_UQ) \ + __ret = _mm_cmpnlt_sd((__a), (__b)); \ + if ((__imm) == _CMP_ORD_Q || (__imm) == _CMP_ORD_S) \ + __ret = _mm_cmpord_sd((__a), (__b)); \ + if ((__imm) == _CMP_NGE_US || (__imm) == _CMP_NGE_UQ) \ + __ret = _mm_cmpnge_sd((__a), (__b)); \ + if ((__imm) == _CMP_NGT_US || (__imm) == _CMP_NGT_UQ) \ + __ret = _mm_cmpngt_sd((__a), (__b)); \ + if ((__imm) == _CMP_FALSE_OQ || (__imm) == _CMP_FALSE_OS) \ + __ret = _mm_move_sd((__a), _mm_setzero_pd()); \ + if ((__imm) == _CMP_GE_OS || (__imm) == _CMP_GE_OQ) \ + __ret = _mm_cmpge_sd((__a), (__b)); \ + if ((__imm) == _CMP_GT_OS || (__imm) == _CMP_GT_OQ) \ + __ret = _mm_cmpgt_sd((__a), (__b)); \ + if ((__imm) == _CMP_TRUE_UQ || (__imm) == _CMP_TRUE_US) \ + __ret = _mm_move_sd((__a), (__m128d)wasm_i8x16_splat(0xFF)); \ + if ((__imm) == _CMP_NLE_US || (__imm) == _CMP_NLE_UQ) \ + __ret = _mm_cmpnle_sd((__a), (__b)); \ + __ret; \ + }) + +#define _mm_cmp_ss(__a, __b, __imm) \ + __extension__({ \ + __m128 __ret; \ + if ((__imm) == _CMP_EQ_OQ || (__imm) == _CMP_EQ_OS) \ + __ret = _mm_cmpeq_ss((__a), (__b)); \ + if ((__imm) == _CMP_EQ_UQ || (__imm) == _CMP_EQ_US) \ + __ret = _mm_move_ss( \ + (__a), \ + _mm_or_ps(_mm_cmpeq_ss((__a), (__b)), _mm_cmpunord_ss((__a), (__b)))); \ + if ((__imm) == _CMP_LT_OS || (__imm) == _CMP_LT_OQ) \ + __ret = _mm_cmplt_ss((__a), (__b)); \ + if ((__imm) == _CMP_LE_OS || (__imm) == _CMP_LE_OQ) \ + __ret = _mm_cmple_ss((__a), (__b)); \ + if ((__imm) == _CMP_UNORD_Q || (__imm) == _CMP_UNORD_S) \ + __ret = _mm_cmpunord_ss((__a), (__b)); \ + if ((__imm) == _CMP_NEQ_UQ || (__imm) == _CMP_NEQ_US) \ + __ret = _mm_cmpneq_ss((__a), (__b)); \ + if ((__imm) == _CMP_NEQ_OQ || (__imm) == _CMP_NEQ_OS) \ + __ret = _mm_move_ss((__a), \ + _mm_andnot_ps(_mm_cmpunord_ss((__a), (__b)), \ + _mm_cmpneq_ss((__a), (__b)))); \ + if ((__imm) == _CMP_NLT_US || (__imm) == _CMP_NLT_UQ) \ + __ret = _mm_cmpnlt_ss((__a), (__b)); \ + if ((__imm) == _CMP_ORD_Q || (__imm) == _CMP_ORD_S) \ + __ret = _mm_cmpord_ss((__a), (__b)); \ + if ((__imm) == _CMP_NGE_US || (__imm) == _CMP_NGE_UQ) \ + __ret = _mm_cmpnge_ss((__a), (__b)); \ + if ((__imm) == _CMP_NGT_US || (__imm) == _CMP_NGT_UQ) \ + __ret = _mm_cmpngt_ss((__a), (__b)); \ + if ((__imm) == _CMP_FALSE_OQ || (__imm) == _CMP_FALSE_OS) \ + __ret = _mm_move_ss((__a), _mm_setzero_ps()); \ + if ((__imm) == _CMP_GE_OS || (__imm) == _CMP_GE_OQ) \ + __ret = _mm_cmpge_ss((__a), (__b)); \ + if ((__imm) == _CMP_GT_OS || (__imm) == _CMP_GT_OQ) \ + __ret = _mm_cmpgt_ss((__a), (__b)); \ + if ((__imm) == _CMP_TRUE_UQ || (__imm) == _CMP_TRUE_US) \ + __ret = _mm_move_ss((__a), (__m128)wasm_i8x16_splat(0xFF)); \ + if ((__imm) == _CMP_NLE_US || (__imm) == _CMP_NLE_UQ) \ + __ret = _mm_cmpnle_ss((__a), (__b)); \ + __ret; \ + }) + +#define _mm256_cmp_pd(__a, __b, imm) \ + __extension__({ \ + _mm256_set_m128d(_mm_cmp_pd((__a).v1, (__b).v1, (imm)), \ + _mm_cmp_pd((__a).v0, (__b).v0, (imm))); \ + }) + +#define _mm256_cmp_ps(__a, __b, imm) \ + __extension__({ \ + _mm256_set_m128(_mm_cmp_ps((__a).v1, (__b).v1, (imm)), \ + _mm_cmp_ps((__a).v0, (__b).v0, (imm))); \ + }) + +#define _mm256_extract_epi32(__a, index) \ + __extension__({ \ + (index) < 4 ? _mm_extract_epi32((__a).v0, (index)) \ + : _mm_extract_epi32((__a).v1, (index) - 4); \ + }) + +#define _mm256_extract_epi16(X, N) \ + __extension__({ \ + (N) < 8 ? 
_mm_extract_epi16((X).v0, (N)) \ + : _mm_extract_epi16((X).v1, (N) - 8); \ + }) + +#define _mm256_extract_epi8(X, N) \ + __extension__({ \ + (N) < 16 ? _mm_extract_epi8((X).v0, (N)) \ + : _mm_extract_epi8((X).v1, (N) - 16); \ + }) + +#define _mm256_extract_epi64(X, N) \ + __extension__({ \ + (N) < 2 ? _mm_extract_epi64((X).v0, (N)) \ + : _mm_extract_epi64((X).v1, (N) - 2); \ + }) + +#define _mm256_insert_epi32(X, I, N) \ + __extension__({ \ + ((N) & 7) < 4 \ + ? _mm256_set_m128i((X).v1, _mm_insert_epi32((X).v0, (I), (N) & 3)) \ + : _mm256_set_m128i(_mm_insert_epi32((X).v1, (I), (N) & 3), (X).v0); \ + }) + +#define _mm256_insert_epi16(X, I, N) \ + ({ \ + ((N) & 0xF) < 8 \ + ? _mm256_set_m128i((X).v1, _mm_insert_epi16((X).v0, (I), (N) & 0x7)) \ + : _mm256_set_m128i(_mm_insert_epi16((X).v1, (I), (N) & 0x7), (X).v0); \ + }) + +#define _mm256_insert_epi8(X, I, N) \ + ({ \ + ((N) & 0x1F) < 16 \ + ? _mm256_set_m128i((X).v1, _mm_insert_epi8((X).v0, (I), (N) & 0xF)) \ + : _mm256_set_m128i(_mm_insert_epi8((X).v1, (I), (N) & 0xF), (X).v0); \ + }) + +#define _mm256_insert_epi64(X, I, N) \ + ({ \ + ((N) & 3) < 2 \ + ? _mm256_set_m128i((X).v1, _mm_insert_epi64((X).v0, (I), (N) & 1)) \ + : _mm256_set_m128i(_mm_insert_epi64((X).v1, (I), (N) & 1), (X).v0); \ + }) + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_cvtepi32_pd(__m128i __a) { + __m256d ret; + ret.v0 = _mm_cvtepi32_pd(__a); + __m128i __a1 = wasm_i32x4_shuffle(__a, __a, 2, 3, 0, 0); + ret.v1 = _mm_cvtepi32_pd(__a1); + return ret; } -static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) -_mm_maskload_ps(const float *__mem_addr, __m128i __mask) -{ - // This may cause an out-of-bounds memory load since we first load and - // then mask, but since there are no segmentation faults in Wasm memory - // accesses, that is ok (as long as we are within the heap bounds - - // a negligible limitation in practice) - return _mm_and_ps(_mm_load_ps(__mem_addr), (__m128)_mm_srai_epi32(__mask, 31)); -} - -static __inline__ void __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW)) -_mm_maskstore_pd(double *__mem_addr, __m128i __mask, __m128d __a) -{ - if ((wasm_i64x2_extract_lane(__mask, 0) & 0x8000000000000000ull) != 0) - __mem_addr[0] = wasm_f64x2_extract_lane((v128_t)__a, 0); - if ((wasm_i64x2_extract_lane(__mask, 1) & 0x8000000000000000ull) != 0) - __mem_addr[1] = wasm_f64x2_extract_lane((v128_t)__a, 1); -} - -static __inline__ void __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW)) -_mm_maskstore_ps(float *__mem_addr, __m128i __mask, __m128 __a) -{ - if ((wasm_i32x4_extract_lane(__mask, 0) & 0x80000000ull) != 0) - __mem_addr[0] = wasm_f32x4_extract_lane((v128_t)__a, 0); - if ((wasm_i32x4_extract_lane(__mask, 1) & 0x80000000ull) != 0) - __mem_addr[1] = wasm_f32x4_extract_lane((v128_t)__a, 1); - if ((wasm_i32x4_extract_lane(__mask, 2) & 0x80000000ull) != 0) - __mem_addr[2] = wasm_f32x4_extract_lane((v128_t)__a, 2); - if ((wasm_i32x4_extract_lane(__mask, 3) & 0x80000000ull) != 0) - __mem_addr[3] = wasm_f32x4_extract_lane((v128_t)__a, 3); -} - -#define _mm_permute_pd(__a, __imm) __extension__ ({ \ - (__m128d)wasm_i64x2_shuffle((__m128d)(__a), (__m128d)(__a), \ - ((__imm) & 1), (((__imm) >> 1) & 1)); }) - -#define _mm_permute_ps(__a, __imm) __extension__ ({ \ - (__m128)wasm_i32x4_shuffle((__m128)(__a), (__m128)(__a), \ - ((__imm) & 3), (((__imm) >> 2) & 3), \ - (((__imm) >> 4) & 3), (((__imm) >> 6) & 3)); }) - -static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) -_mm_permutevar_pd(__m128d __a, __m128d 
__b) -{ - return (__m128d)wasm_f64x2_make( - ((__f64x2)__a)[(wasm_i64x2_extract_lane((v128_t)__b, 0) >> 1) & 1], - ((__f64x2)__a)[(wasm_i64x2_extract_lane((v128_t)__b, 1) >> 1) & 1]); +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_cvtepi32_ps(__m256i __a) { + __m256 ret; + ret.v0 = _mm_cvtepi32_ps(__a.v0); + ret.v1 = _mm_cvtepi32_ps(__a.v1); + return ret; +} + +static __inline __m128 __DEFAULT_FN_ATTRS _mm256_cvtpd_ps(__m256d __a) { + __m128 low = _mm_cvtpd_ps(__a.v0); + __m128 high = _mm_cvtpd_ps(__a.v1); + __m128 ret = (__m128)wasm_i32x4_shuffle(low, high, 0, 1, 4, 5); + return ret; +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvtps_epi32(__m256 __a) { + __m256i ret; + ret.v0 = _mm_cvtps_epi32(__a.v0); + ret.v1 = _mm_cvtps_epi32(__a.v1); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_cvtps_pd(__m128 __a) { + __m256d ret; + ret.v0 = _mm_cvtps_pd(__a); + __m128 __a1 = (__m128)wasm_i32x4_shuffle(__a, __a, 2, 3, 0, 0); + ret.v1 = _mm_cvtps_pd(__a1); + return ret; +} + +static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvttpd_epi32(__m256d __a) { + __m128i low = _mm_cvttpd_epi32(__a.v0); + __m128i high = _mm_cvttpd_epi32(__a.v1); + __m128i ret = wasm_i32x4_shuffle(low, high, 0, 1, 4, 5); + return ret; +} + +static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvtpd_epi32(__m256d __a) { + __m128i low = _mm_cvtpd_epi32(__a.v0); + __m128i high = _mm_cvtpd_epi32(__a.v1); + __m128i ret = wasm_i32x4_shuffle(low, high, 0, 1, 4, 5); + return ret; +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvttps_epi32(__m256 __a) { + __m256i ret; + ret.v0 = _mm_cvttps_epi32(__a.v0); + ret.v1 = _mm_cvttps_epi32(__a.v1); + return ret; +} + +static __inline double __DEFAULT_FN_ATTRS _mm256_cvtsd_f64(__m256d __a) { + return _mm_cvtsd_f64(__a.v0); } -static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) -_mm_permutevar_ps(__m128 __a, __m128 __b) -{ - return (__m128)wasm_f32x4_make(((__f32x4)__a)[wasm_i32x4_extract_lane((v128_t)__b, 0) & 3], - ((__f32x4)__a)[wasm_i32x4_extract_lane((v128_t)__b, 1) & 3], - ((__f32x4)__a)[wasm_i32x4_extract_lane((v128_t)__b, 2) & 3], - ((__f32x4)__a)[wasm_i32x4_extract_lane((v128_t)__b, 3) & 3]); +static __inline int __DEFAULT_FN_ATTRS _mm256_cvtsi256_si32(__m256i __a) { + return _mm_cvtsi128_si32(__a.v0); } -static __inline__ int __attribute__((__always_inline__, __nodebug__)) -_mm_testc_pd(__m128d __a, __m128d __b) -{ - v128_t __m = wasm_u64x2_shr(wasm_v128_or(wasm_v128_not((v128_t)__b), (v128_t)__a), 63); +static __inline float __DEFAULT_FN_ATTRS _mm256_cvtss_f32(__m256 __a) { + return _mm_cvtss_f32(__a.v0); +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_movehdup_ps(__m256 __a) { + __m256 ret; + ret.v0 = _mm_movehdup_ps(__a.v0); + ret.v1 = _mm_movehdup_ps(__a.v1); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_moveldup_ps(__m256 __a) { + __m256 ret; + ret.v0 = _mm_moveldup_ps(__a.v0); + ret.v1 = _mm_moveldup_ps(__a.v1); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_movedup_pd(__m256d __a) { + __m256d ret; + ret.v0 = _mm_movedup_pd(__a.v0); + ret.v1 = _mm_movedup_pd(__a.v1); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_unpackhi_pd(__m256d __a, + __m256d __b) { + __m256d ret; + ret.v0 = _mm_unpackhi_pd(__a.v0, __b.v0); + ret.v1 = _mm_unpackhi_pd(__a.v1, __b.v1); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_unpacklo_pd(__m256d __a, + __m256d __b) { + __m256d ret; + ret.v0 = _mm_unpacklo_pd(__a.v0, __b.v0); + ret.v1 = 
_mm_unpacklo_pd(__a.v1, __b.v1); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_unpackhi_ps(__m256 __a, + __m256 __b) { + __m256 ret; + ret.v0 = _mm_unpackhi_ps(__a.v0, __b.v0); + ret.v1 = _mm_unpackhi_ps(__a.v1, __b.v1); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_unpacklo_ps(__m256 __a, + __m256 __b) { + __m256 ret; + ret.v0 = _mm_unpacklo_ps(__a.v0, __b.v0); + ret.v1 = _mm_unpacklo_ps(__a.v1, __b.v1); + return ret; +} + +static __inline int __DEFAULT_FN_ATTRS _mm_testz_pd(__m128d __a, __m128d __b) { + v128_t __m = + wasm_u64x2_shr(wasm_v128_not(wasm_v128_and((v128_t)__a, (v128_t)__b)), 63); + return wasm_i64x2_extract_lane(__m, 0) & wasm_i64x2_extract_lane(__m, 1); +} + +static __inline int __DEFAULT_FN_ATTRS _mm_testc_pd(__m128d __a, __m128d __b) { + v128_t __m = + wasm_u64x2_shr(wasm_v128_or(wasm_v128_not((v128_t)__b), (v128_t)__a), 63); return wasm_i64x2_extract_lane(__m, 0) & wasm_i64x2_extract_lane(__m, 1); } -static __inline__ int __attribute__((__always_inline__, __nodebug__)) -_mm_testc_ps(__m128 __a, __m128 __b) -{ - v128_t __m = wasm_u32x4_shr(wasm_v128_or(wasm_v128_not((v128_t)__b), (v128_t)__a), 31); +static __inline int __DEFAULT_FN_ATTRS _mm_testnzc_pd(__m128d __a, + __m128d __b) { + v128_t __m = wasm_u64x2_shr(wasm_v128_and((v128_t)__a, (v128_t)__b), 63); + v128_t __m2 = wasm_u64x2_shr(wasm_v128_andnot((v128_t)__b, (v128_t)__a), 63); + return (wasm_i64x2_extract_lane(__m, 0) | wasm_i64x2_extract_lane(__m, 1)) & + (wasm_i64x2_extract_lane(__m2, 0) | wasm_i64x2_extract_lane(__m2, 1)); +} + +static __inline int __DEFAULT_FN_ATTRS _mm_testz_ps(__m128 __a, __m128 __b) { + v128_t __m = + wasm_u32x4_shr(wasm_v128_not(wasm_v128_and((v128_t)__a, (v128_t)__b)), 31); __m = wasm_v128_and(__m, (v128_t)_mm_movehl_ps((__m128)__m, (__m128)__m)); __m = wasm_v128_and(__m, _mm_shuffle_epi32(__m, _MM_SHUFFLE(3, 2, 0, 1))); return wasm_i32x4_extract_lane(__m, 0); } -static __inline__ int __attribute__((__always_inline__, __nodebug__)) -_mm_testnzc_pd(__m128d __a, __m128d __b) -{ - v128_t __m = wasm_u64x2_shr(wasm_v128_and((v128_t)__a, (v128_t)__b), 63); - v128_t __m2 = wasm_u64x2_shr(wasm_v128_andnot((v128_t)__b, (v128_t)__a), 63); - return (wasm_i64x2_extract_lane(__m, 0) | wasm_i64x2_extract_lane(__m, 1)) - & (wasm_i64x2_extract_lane(__m2, 0) | wasm_i64x2_extract_lane(__m2, 1)); +static __inline int __DEFAULT_FN_ATTRS _mm_testc_ps(__m128 __a, __m128 __b) { + v128_t __m = + wasm_u32x4_shr(wasm_v128_or(wasm_v128_not((v128_t)__b), (v128_t)__a), 31); + __m = wasm_v128_and(__m, (v128_t)_mm_movehl_ps((__m128)__m, (__m128)__m)); + __m = wasm_v128_and(__m, _mm_shuffle_epi32(__m, _MM_SHUFFLE(3, 2, 0, 1))); + return wasm_i32x4_extract_lane(__m, 0); } -static __inline__ int __attribute__((__always_inline__, __nodebug__)) -_mm_testnzc_ps(__m128 __a, __m128 __b) -{ - v128_t __m = wasm_u32x4_shr(wasm_v128_and((v128_t)__a, (v128_t)__b), 31); +static __inline int __DEFAULT_FN_ATTRS _mm_testnzc_ps(__m128 __a, __m128 __b) { + v128_t __m = wasm_u32x4_shr(wasm_v128_and((v128_t)__a, (v128_t)__b), 31); v128_t __m2 = wasm_u32x4_shr(wasm_v128_andnot((v128_t)__b, (v128_t)__a), 31); - __m = wasm_v128_or(__m, (v128_t)_mm_movehl_ps((__m128)__m, (__m128)__m)); + __m = wasm_v128_or(__m, (v128_t)_mm_movehl_ps((__m128)__m, (__m128)__m)); __m2 = wasm_v128_or(__m2, (v128_t)_mm_movehl_ps((__m128)__m2, (__m128)__m2)); - __m = wasm_v128_or(__m, _mm_shuffle_epi32(__m, _MM_SHUFFLE(3, 2, 0, 1))); + __m = wasm_v128_or(__m, _mm_shuffle_epi32(__m, _MM_SHUFFLE(3, 2, 0, 1))); __m2 = 
wasm_v128_or(__m2, _mm_shuffle_epi32(__m2, _MM_SHUFFLE(3, 2, 0, 1))); return wasm_i32x4_extract_lane(__m, 0) & wasm_i32x4_extract_lane(__m2, 0); } -static __inline__ int __attribute__((__always_inline__, __nodebug__)) -_mm_testz_pd(__m128d __a, __m128d __b) -{ - v128_t __m = wasm_u64x2_shr(wasm_v128_not(wasm_v128_and((v128_t)__a, (v128_t)__b)), 63); - return wasm_i64x2_extract_lane(__m, 0) & wasm_i64x2_extract_lane(__m, 1); +static __inline int __DEFAULT_FN_ATTRS _mm256_testz_pd(__m256d __a, + __m256d __b) { + return _mm_testz_pd(__a.v0, __b.v0) & _mm_testz_pd(__a.v1, __b.v1); } -static __inline__ int __attribute__((__always_inline__, __nodebug__)) -_mm_testz_ps(__m128 __a, __m128 __b) -{ - v128_t __m = wasm_u32x4_shr(wasm_v128_not(wasm_v128_and((v128_t)__a, (v128_t)__b)), 31); - __m = wasm_v128_and(__m, (v128_t)_mm_movehl_ps((__m128)__m, (__m128)__m)); - __m = wasm_v128_and(__m, _mm_shuffle_epi32(__m, _MM_SHUFFLE(3, 2, 0, 1))); - return wasm_i32x4_extract_lane(__m, 0); +static __inline int __DEFAULT_FN_ATTRS _mm256_testc_pd(__m256d __a, + __m256d __b) { + return _mm_testc_pd(__a.v0, __b.v0) & _mm_testc_pd(__a.v1, __b.v1); +} + +static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_pd(__m256d __a, + __m256d __b) { + v128_t __m = + wasm_u64x2_shr(wasm_v128_and((v128_t)__a.v0, (v128_t)__b.v0), 63); + v128_t __m1 = + wasm_u64x2_shr(wasm_v128_and((v128_t)__a.v1, (v128_t)__b.v1), 63); + v128_t __m2 = + wasm_u64x2_shr(wasm_v128_andnot((v128_t)__b.v0, (v128_t)__a.v0), 63); + v128_t __m3 = + wasm_u64x2_shr(wasm_v128_andnot((v128_t)__b.v1, (v128_t)__a.v1), 63); + return wasm_v128_any_true(wasm_v128_or(__m, __m1)) & + wasm_v128_any_true(wasm_v128_or(__m2, __m3)); +} + +static __inline int __DEFAULT_FN_ATTRS _mm256_testz_ps(__m256 __a, __m256 __b) { + return _mm_testz_ps(__a.v0, __b.v0) & _mm_testz_ps(__a.v1, __b.v1); +} + +static __inline int __DEFAULT_FN_ATTRS _mm256_testc_ps(__m256 __a, __m256 __b) { + return _mm_testc_ps(__a.v0, __b.v0) & _mm_testc_ps(__a.v1, __b.v1); +} + +static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_ps(__m256 __a, + __m256 __b) { + v128_t __m = + wasm_u32x4_shr(wasm_v128_and((v128_t)__a.v0, (v128_t)__b.v0), 31); + v128_t __m1 = + wasm_u32x4_shr(wasm_v128_and((v128_t)__a.v1, (v128_t)__b.v1), 31); + v128_t __m2 = + wasm_u32x4_shr(wasm_v128_andnot((v128_t)__b.v0, (v128_t)__a.v0), 31); + v128_t __m3 = + wasm_u32x4_shr(wasm_v128_andnot((v128_t)__b.v1, (v128_t)__a.v1), 31); + + return wasm_v128_any_true(wasm_v128_or(__m, __m1)) & + wasm_v128_any_true(wasm_v128_or(__m2, __m3)); +} + +static __inline int __DEFAULT_FN_ATTRS _mm256_testz_si256(__m256i __a, + __m256i __b) { + return _mm_testz_si128(__a.v0, __b.v0) & _mm_testz_si128(__a.v1, __b.v1); +} + +static __inline int __DEFAULT_FN_ATTRS _mm256_testc_si256(__m256i __a, + __m256i __b) { + return _mm_testc_si128(__a.v0, __b.v0) & _mm_testc_si128(__a.v1, __b.v1); +} + +static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_si256(__m256i __a, + __m256i __b) { + v128_t __m = wasm_v128_and(__a.v0, __b.v0); + v128_t __m1 = wasm_v128_and(__a.v1, __b.v1); + v128_t __m2 = wasm_v128_andnot(__b.v0, __a.v0); + v128_t __m3 = wasm_v128_andnot(__b.v1, __a.v1); + return wasm_v128_any_true(wasm_v128_or(__m, __m1)) & + wasm_v128_any_true(wasm_v128_or(__m2, __m3)); +} + +static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_pd(__m256d __a) { + return _mm_movemask_pd(__a.v0) | (_mm_movemask_pd(__a.v1) << 2); } +static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_ps(__m256 __a) { + return _mm_movemask_ps(__a.v0) | (_mm_movemask_ps(__a.v1) << 
4); +} + +static __inline void __DEFAULT_FN_ATTRS _mm256_zeroall(void) { + UNIMPLEMENTED("_mm256_zeroall"); +} + +static __inline void __DEFAULT_FN_ATTRS _mm256_zeroupper(void) { + UNIMPLEMENTED("_mm256_zeroupper"); +} + +static __inline __m128 __DEFAULT_FN_ATTRS _mm_broadcast_ss(float const* __a) { + return (__m128)wasm_v128_load32_splat(__a); +} + +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_broadcast_sd(double const* __a) { + __m256d ret; + ret.v1 = ret.v0 = (__m128d)wasm_v128_load64_splat(__a); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_broadcast_ss(float const* __a) { + __m256 ret; + ret.v1 = ret.v0 = _mm_broadcast_ss(__a); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_broadcast_pd(__m128d const* __a) { + __m256d ret; + ret.v1 = ret.v0 = (__m128d)wasm_v128_load(__a); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_broadcast_ps(__m128 const* __a) { + __m256 ret; + ret.v1 = ret.v0 = (__m128)wasm_v128_load(__a); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_load_pd(double const* __p) { + __m256d ret; + ret.v0 = _mm_load_pd(__p); + ret.v1 = _mm_load_pd(__p + 2); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_load_ps(float const* __p) { + __m256 ret; + ret.v0 = _mm_load_ps(__p); + ret.v1 = _mm_load_ps(__p + 4); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu_pd(double const* __p) { + __m256d ret; + ret.v0 = _mm_loadu_pd(__p); + ret.v1 = _mm_loadu_pd(__p + 2); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu_ps(float const* __p) { + __m256 ret; + ret.v0 = _mm_loadu_ps(__p); + ret.v1 = _mm_loadu_ps(__p + 4); + return ret; +} + +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_load_si256(__m256i const* __p) { + __m256i ret; + ret.v0 = _mm_load_si128((__m128i const*)__p); + ret.v1 = _mm_load_si128(((__m128i const*)__p) + 1); + return ret; +} + +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_loadu_si256(__m256i_u const* __p) { + __m256i ret; + ret.v0 = _mm_loadu_si128((__m128i const*)__p); + ret.v1 = _mm_loadu_si128(((__m128i const*)__p) + 1); + return ret; +} + +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_lddqu_si256(__m256i_u const* __p) { + __m256i ret; + ret.v0 = _mm_lddqu_si128((__m128i const*)__p); + ret.v1 = _mm_lddqu_si128(((__m128i const*)__p) + 1); + return ret; +} + +static __inline void __DEFAULT_FN_ATTRS _mm256_store_pd(double* __p, + __m256d __a) { + _mm_store_pd(__p, __a.v0); + _mm_store_pd(__p + 2, __a.v1); +} + +static __inline void __DEFAULT_FN_ATTRS _mm256_store_ps(float* __p, + __m256 __a) { + _mm_store_ps(__p, __a.v0); + _mm_store_ps(__p + 4, __a.v1); +} + +static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_pd(double* __p, + __m256d __a) { + _mm_storeu_pd(__p, __a.v0); + _mm_storeu_pd(__p + 2, __a.v1); +} + +static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_ps(float* __p, + __m256 __a) { + _mm_storeu_ps(__p, __a.v0); + _mm_storeu_ps(__p + 4, __a.v1); +} + +static __inline void __DEFAULT_FN_ATTRS _mm256_store_si256(__m256i* __p, + __m256i __a) { + _mm_store_si128((__m128i*)__p, __a.v0); + _mm_store_si128(((__m128i*)__p) + 1, __a.v1); +} + +static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_si256(__m256i_u* __p, + __m256i __a) { + _mm_storeu_si128((__m128i*)__p, __a.v0); + _mm_storeu_si128(((__m128i*)__p) + 1, __a.v1); +} + +static __inline __m128d __DEFAULT_FN_ATTRS _mm_maskload_pd(double const* __p, + __m128i __m) { + // This may cause an out-of-bounds memory load since we 
first load and + // then mask, but since there are no segmentation faults in Wasm memory + // accesses, that is ok (as long as we are within the heap bounds - + // a negligible limitation in practice) + return _mm_and_pd(_mm_load_pd(__p), (__m128d)wasm_i64x2_shr(__m, 63)); +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_maskload_pd(double const* __p, + __m256i __m) { + __m256d ret; + ret.v0 = _mm_maskload_pd(__p, __m.v0); + ret.v1 = _mm_maskload_pd(__p + 2, __m.v1); + return ret; +} + +static __inline __m128 __DEFAULT_FN_ATTRS _mm_maskload_ps(float const* __p, + __m128i __m) { + // This may cause an out-of-bounds memory load since we first load and + // then mask, but since there are no segmentation faults in Wasm memory + // accesses, that is ok (as long as we are within the heap bounds - + // a negligible limitation in practice) + return _mm_and_ps(_mm_load_ps(__p), (__m128)_mm_srai_epi32(__m, 31)); +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_maskload_ps(float const* __p, + __m256i __m) { + __m256 ret; + ret.v0 = _mm_maskload_ps(__p, __m.v0); + ret.v1 = _mm_maskload_ps(__p + 4, __m.v1); + return ret; +} + +static __inline void + __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW)) + _mm_maskstore_ps(float* __p, __m128i __m, __m128 __a) { + if ((wasm_i32x4_extract_lane(__m, 0) & 0x80000000ull) != 0) + __p[0] = wasm_f32x4_extract_lane((v128_t)__a, 0); + if ((wasm_i32x4_extract_lane(__m, 1) & 0x80000000ull) != 0) + __p[1] = wasm_f32x4_extract_lane((v128_t)__a, 1); + if ((wasm_i32x4_extract_lane(__m, 2) & 0x80000000ull) != 0) + __p[2] = wasm_f32x4_extract_lane((v128_t)__a, 2); + if ((wasm_i32x4_extract_lane(__m, 3) & 0x80000000ull) != 0) + __p[3] = wasm_f32x4_extract_lane((v128_t)__a, 3); +} + +static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_ps(float* __p, + __m256i __m, + __m256 __a) { + _mm_maskstore_ps(__p, __m.v0, __a.v0); + _mm_maskstore_ps(__p + 4, __m.v1, __a.v1); +} + +static __inline void + __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW)) + _mm_maskstore_pd(double* __p, __m128i __m, __m128d __a) { + if ((wasm_i64x2_extract_lane(__m, 0) & 0x8000000000000000ull) != 0) + __p[0] = wasm_f64x2_extract_lane((v128_t)__a, 0); + if ((wasm_i64x2_extract_lane(__m, 1) & 0x8000000000000000ull) != 0) + __p[1] = wasm_f64x2_extract_lane((v128_t)__a, 1); +} + +static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_pd(double* __p, + __m256i __m, + __m256d __a) { + _mm_maskstore_pd(__p, __m.v0, __a.v0); + _mm_maskstore_pd(__p + 2, __m.v1, __a.v1); +} + +static __inline void __DEFAULT_FN_ATTRS _mm256_stream_si256(void* __a, + __m256i __b) { + _mm_stream_si128((__m128i*)__a, __b.v0); + _mm_stream_si128(((__m128i*)__a) + 1, __b.v1); +} + +static __inline void __DEFAULT_FN_ATTRS _mm256_stream_pd(void* __a, + __m256d __b) { + _mm_stream_pd((double*)__a, __b.v0); + _mm_stream_pd(((double*)__a) + 2, __b.v1); +} + +static __inline void __DEFAULT_FN_ATTRS _mm256_stream_ps(void* __p, + __m256 __a) { + _mm_stream_ps((float*)__p, __a.v0); + _mm_stream_ps(((float*)__p) + 4, __a.v1); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_undefined_pd(void) { + __m256d val; + return val; +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_undefined_ps(void) { + __m256 val; + return val; +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_undefined_si256(void) { + __m256i val; + return val; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set_pd(double __a, + double __b, + double __c, + double __d) { + __m256d ret; + ret.v0 = _mm_set_pd(__c, 
__d); + ret.v1 = _mm_set_pd(__a, __b); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set_ps(float __a, + float __b, + float __c, + float __d, + float __e, + float __f, + float __g, + float __h) { + __m256 ret; + ret.v0 = _mm_set_ps(__e, __f, __g, __h); + ret.v1 = _mm_set_ps(__a, __b, __c, __d); + return ret; +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi32(int __i0, + int __i1, + int __i2, + int __i3, + int __i4, + int __i5, + int __i6, + int __i7) { + __m256i ret; + ret.v0 = _mm_set_epi32(__i4, __i5, __i6, __i7); + ret.v1 = _mm_set_epi32(__i0, __i1, __i2, __i3); + return ret; +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi16(short __w15, + short __w14, + short __w13, + short __w12, + short __w11, + short __w10, + short __w09, + short __w08, + short __w07, + short __w06, + short __w05, + short __w04, + short __w03, + short __w02, + short __w01, + short __w00) { + __m256i ret; + ret.v0 = + _mm_set_epi16(__w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00); + ret.v1 = + _mm_set_epi16(__w15, __w14, __w13, __w12, __w11, __w10, __w09, __w08); + return ret; +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi8(char __b31, + char __b30, + char __b29, + char __b28, + char __b27, + char __b26, + char __b25, + char __b24, + char __b23, + char __b22, + char __b21, + char __b20, + char __b19, + char __b18, + char __b17, + char __b16, + char __b15, + char __b14, + char __b13, + char __b12, + char __b11, + char __b10, + char __b09, + char __b08, + char __b07, + char __b06, + char __b05, + char __b04, + char __b03, + char __b02, + char __b01, + char __b00) { + __m256i ret; + ret.v0 = _mm_set_epi8(__b15, + __b14, + __b13, + __b12, + __b11, + __b10, + __b09, + __b08, + __b07, + __b06, + __b05, + __b04, + __b03, + __b02, + __b01, + __b00); + ret.v1 = _mm_set_epi8(__b31, + __b30, + __b29, + __b28, + __b27, + __b26, + __b25, + __b24, + __b23, + __b22, + __b21, + __b20, + __b19, + __b18, + __b17, + __b16); + return ret; +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi64x(long long __a, + long long __b, + long long __c, + long long __d) { + __m256i ret; + ret.v0 = _mm_set_epi64x(__c, __d); + ret.v1 = _mm_set_epi64x(__a, __b); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setr_pd(double __a, + double __b, + double __c, + double __d) { + return _mm256_set_pd(__d, __c, __b, __a); +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setr_ps(float __a, + float __b, + float __c, + float __d, + float __e, + float __f, + float __g, + float __h) { + return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a); +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi32(int __i0, + int __i1, + int __i2, + int __i3, + int __i4, + int __i5, + int __i6, + int __i7) { + return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0); +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi16(short __w15, + short __w14, + short __w13, + short __w12, + short __w11, + short __w10, + short __w09, + short __w08, + short __w07, + short __w06, + short __w05, + short __w04, + short __w03, + short __w02, + short __w01, + short __w00) { + return _mm256_set_epi16(__w00, + __w01, + __w02, + __w03, + __w04, + __w05, + __w06, + __w07, + __w08, + __w09, + __w10, + __w11, + __w12, + __w13, + __w14, + __w15); +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi8(char __b31, + char __b30, + char __b29, + char __b28, + char __b27, + char __b26, + char __b25, + char __b24, + char __b23, + char __b22, 
+ char __b21, + char __b20, + char __b19, + char __b18, + char __b17, + char __b16, + char __b15, + char __b14, + char __b13, + char __b12, + char __b11, + char __b10, + char __b09, + char __b08, + char __b07, + char __b06, + char __b05, + char __b04, + char __b03, + char __b02, + char __b01, + char __b00) { + return _mm256_set_epi8(__b00, + __b01, + __b02, + __b03, + __b04, + __b05, + __b06, + __b07, + __b08, + __b09, + __b10, + __b11, + __b12, + __b13, + __b14, + __b15, + __b16, + __b17, + __b18, + __b19, + __b20, + __b21, + __b22, + __b23, + __b24, + __b25, + __b26, + __b27, + __b28, + __b29, + __b30, + __b31); +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi64x(long long __a, + long long __b, + long long __c, + long long __d) { + return _mm256_set_epi64x(__d, __c, __b, __a); +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set1_pd(double __w) { + __m256d ret; + ret.v1 = ret.v0 = (__m128d)wasm_f64x2_splat(__w); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set1_ps(float __w) { + __m256 ret; + ret.v1 = ret.v0 = (__m128)wasm_f32x4_splat(__w); + return ret; +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi32(int __i) { + __m256i ret; + ret.v1 = ret.v0 = wasm_i32x4_splat(__i); + return ret; +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi16(short __w) { + __m256i ret; + ret.v1 = ret.v0 = wasm_i16x8_splat(__w); + return ret; +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi8(char __b) { + __m256i ret; + ret.v1 = ret.v0 = wasm_i8x16_splat(__b); + return ret; +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi64x(long long __q) { + __m256i ret; + ret.v1 = ret.v0 = wasm_i64x2_splat(__q); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setzero_pd(void) { + __m256d ret; + ret.v1 = ret.v0 = _mm_setzero_pd(); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setzero_ps(void) { + __m256 ret; + ret.v1 = ret.v0 = _mm_setzero_ps(); + return ret; +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setzero_si256(void) { + __m256i ret; + ret.v1 = ret.v0 = _mm_setzero_si128(); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castpd_ps(__m256d __a) { + m256_data ret; + ret.double_view = __a; + return ret.float_view; +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castpd_si256(__m256d __a) { + m256_data ret; + ret.double_view = __a; + return ret.int_view; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castps_pd(__m256 __a) { + m256_data ret; + ret.float_view = __a; + return ret.double_view; +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castps_si256(__m256 __a) { + m256_data ret; + ret.float_view = __a; + return ret.int_view; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castsi256_ps(__m256i __a) { + m256_data ret; + ret.int_view = __a; + return ret.float_view; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castsi256_pd(__m256i __a) { + m256_data ret; + ret.int_view = __a; + return ret.double_view; +} + +static __inline __m128d __DEFAULT_FN_ATTRS _mm256_castpd256_pd128(__m256d __a) { + return __a.v0; +} + +static __inline __m128 __DEFAULT_FN_ATTRS _mm256_castps256_ps128(__m256 __a) { + return __a.v0; +} + +static __inline __m128i __DEFAULT_FN_ATTRS _mm256_castsi256_si128(__m256i __a) { + return __a.v0; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castpd128_pd256(__m128d __a) { + __m256d ret; + ret.v0 = __a; + ret.v1 = _mm_setzero_pd(); + return ret; +} + +static __inline __m256 
__DEFAULT_FN_ATTRS _mm256_castps128_ps256(__m128 __a) { + __m256 ret; + ret.v0 = __a; + ret.v1 = _mm_setzero_ps(); + return ret; +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castsi128_si256(__m128i __a) { + __m256i ret; + ret.v0 = __a; + ret.v1 = _mm_setzero_si128(); + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_zextpd128_pd256(__m128d __a) { + __m256d ret; + ret.v0 = __a; + ret.v1 = _mm_setzero_pd(); + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_zextps128_ps256(__m128 __a) { + __m256 ret; + ret.v0 = __a; + ret.v1 = _mm_setzero_ps(); + return ret; +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_zextsi128_si256(__m128i __a) { + __m256i ret; + ret.v0 = __a; + ret.v1 = _mm_setzero_si128(); + return ret; +} + +static inline __m256 +_mm256_insertf128_ps(__m256 __a, __m128 __b, const int imm8) { + __m256 ret = __a; + if (imm8 & 0x1) { + ret.v1 = __b; + } else { + ret.v0 = __b; + } + return ret; +} + +static inline __m256d +_mm256_insertf128_pd(__m256d __a, __m128d __b, const int imm8) { + __m256d ret = __a; + if (imm8 & 0x1) { + ret.v1 = __b; + } else { + ret.v0 = __b; + } + return ret; +} + +static inline __m256i +_mm256_insertf128_si256(__m256i __a, __m128i __b, const int imm8) { + __m256i ret = __a; + if (imm8 & 0x1) { + ret.v1 = __b; + } else { + ret.v0 = __b; + } + return ret; +} + +static inline __m128 _mm256_extractf128_ps(__m256 __a, const int imm8) { + if (imm8 & 0x1) { + return __a.v1; + } else { + return __a.v0; + } +} + +static inline __m128d _mm256_extractf128_pd(__m256d __a, const int imm8) { + if (imm8 & 0x1) { + return __a.v1; + } else { + return __a.v0; + } +} + +static inline __m128i _mm256_extractf128_si256(__m256i __a, const int imm8) { + if (imm8 & 0x1) { + return __a.v1; + } else { + return __a.v0; + } +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set_m128(__m128 __hi, + __m128 __lo) { + __m256 ret; + ret.v0 = __lo; + ret.v1 = __hi; + return ret; +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set_m128d(__m128d __hi, + __m128d __lo) { + __m256d ret; + ret.v0 = __lo; + ret.v1 = __hi; + return ret; +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_m128i(__m128i __hi, + __m128i __lo) { + __m256i ret; + ret.v0 = __lo; + ret.v1 = __hi; + return ret; +} + +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setr_m128(__m128 __lo, + __m128 __hi) { + return _mm256_set_m128(__hi, __lo); +} + +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setr_m128d(__m128d __lo, + __m128d __hi) { + return (__m256d)_mm256_set_m128d(__hi, __lo); +} + +static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_m128i(__m128i __lo, + __m128i __hi) { + return (__m256i)_mm256_set_m128i(__hi, __lo); +} + +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_loadu2_m128(float const* __addr_hi, float const* __addr_lo) { + return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo)); +} + +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_loadu2_m128d(double const* __addr_hi, double const* __addr_lo) { + return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo)); +} + +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_loadu2_m128i(__m128i_u const* __addr_hi, __m128i_u const* __addr_lo) { + return _mm256_set_m128i(_mm_loadu_si128((__m128i const*)__addr_hi), + _mm_loadu_si128((__m128i const*)__addr_lo)); +} + +static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128(float* __addr_hi, + float* __addr_lo, + __m256 __a) { + _mm_storeu_ps(__addr_lo, __a.v0); + _mm_storeu_ps(__addr_hi, __a.v1); +} + 
+static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128d(double* __addr_hi, + double* __addr_lo, + __m256d __a) { + _mm_storeu_pd(__addr_lo, __a.v0); + _mm_storeu_pd(__addr_hi, __a.v1); +} + +static __inline void __DEFAULT_FN_ATTRS +_mm256_storeu2_m128i(__m128i_u* __addr_hi, __m128i_u* __addr_lo, __m256i __a) { + _mm_storeu_si128((__m128i*)__addr_lo, __a.v0); + _mm_storeu_si128((__m128i*)__addr_hi, __a.v1); +} + +#undef __DEFAULT_FN_ATTRS + #endif /* __emscripten_avxintrin_h__ */ diff --git a/test/sse/test_avx.cpp b/test/sse/test_avx.cpp index 44410d75e856b..e9d84e04629b9 100644 --- a/test/sse/test_avx.cpp +++ b/test/sse/test_avx.cpp @@ -4,43 +4,351 @@ * University of Illinois/NCSA Open Source License. Both these licenses can be * found in the LICENSE file. */ -// This file uses AVX by calling different functions with different interesting inputs and prints the results. -// Use a diff tool to compare the results between platforms. +// This file uses AVX by calling different functions with different interesting +// inputs and prints the results. Use a diff tool to compare the results between +// platforms. +// immintrin.h must be included before test_sse.h +// clang-format off #include #include "test_sse.h" +// clang-format on bool testNaNBits = true; -float *interesting_floats = get_interesting_floats(); -int numInterestingFloats = sizeof(interesting_floats_)/sizeof(interesting_floats_[0]); -uint32_t *interesting_ints = get_interesting_ints(); -int numInterestingInts = sizeof(interesting_ints_)/sizeof(interesting_ints_[0]); -double *interesting_doubles = get_interesting_doubles(); -int numInterestingDoubles = sizeof(interesting_doubles_)/sizeof(interesting_doubles_[0]); +float* interesting_floats = get_interesting_floats(); +int numInterestingFloats = + sizeof(interesting_floats_) / sizeof(interesting_floats_[0]); +uint32_t* interesting_ints = get_interesting_ints(); +int numInterestingInts = + sizeof(interesting_ints_) / sizeof(interesting_ints_[0]); +double* interesting_doubles = get_interesting_doubles(); +int numInterestingDoubles = + sizeof(interesting_doubles_) / sizeof(interesting_doubles_[0]); -int main() { - assert(numInterestingFloats % 4 == 0); - assert(numInterestingInts % 4 == 0); - assert(numInterestingDoubles % 4 == 0); +void test_arithmetic(void) { + testNaNBits = false; + Ret_M256d_M256d(__m256d, _mm256_add_pd); + Ret_M256_M256(__m256, _mm256_add_ps); - Ret_FloatPtr(__m128, _mm_broadcast_ss, 1, 1); + testNaNBits = true; + Ret_M256d_M256d(__m256d, _mm256_sub_pd); + Ret_M256_M256(__m256, _mm256_sub_ps); + + testNaNBits = false; + Ret_M256d_M256d(__m256d, _mm256_addsub_pd); + Ret_M256_M256(__m256, _mm256_addsub_ps); + + testNaNBits = true; + Ret_M256d_M256d(__m256d, _mm256_div_pd); + Ret_M256_M256(__m256, _mm256_div_ps); + + testNaNBits = false; + Ret_M256d_M256d(__m256d, _mm256_mul_pd); + Ret_M256_M256(__m256, _mm256_mul_ps); + + Ret_M256d_M256d(__m256d, _mm256_hadd_pd); + Ret_M256_M256(__m256, _mm256_hadd_ps); + + testNaNBits = true; + Ret_M256d_M256d(__m256d, _mm256_hsub_pd); + Ret_M256_M256(__m256, _mm256_hsub_ps); + + testNaNBits = false; + Ret_M256_M256_Tint(__m256, _mm256_dp_ps); +} + +void test_special_math(void) { + Ret_M256d_M256d(__m256d, _mm256_max_pd); + Ret_M256_M256(__m256, _mm256_max_ps); + + Ret_M256d_M256d(__m256d, _mm256_min_pd); + Ret_M256_M256(__m256, _mm256_min_ps); + + Ret_M256d_Tint(__m256d, _mm256_round_pd); + Ret_M256_Tint(__m256, _mm256_round_ps); + + Ret_M256d(__m256d, _mm256_ceil_pd); + Ret_M256(__m256, _mm256_ceil_ps); + + Ret_M256d(__m256d, 
_mm256_floor_pd); + Ret_M256(__m256, _mm256_floor_ps); +} + +void test_elementary_math(void) { + Ret_M256d(__m256d, _mm256_sqrt_pd); + Ret_M256approx(__m256, _mm256_sqrt_ps); + Ret_M256approx(__m256, _mm256_rsqrt_ps); + Ret_M256approx(__m256, _mm256_rcp_ps); +} + +void test_logical(void) { + Ret_M128d_M128d(__m128d, _mm_and_pd); + Ret_M128_M128(__m128, _mm_and_ps); + + Ret_M128d_M128d(__m128d, _mm_andnot_pd); + Ret_M128_M128(__m128, _mm_andnot_ps); + + Ret_M128d_M128d(__m128d, _mm_or_pd); + Ret_M128_M128(__m128, _mm_or_ps); + + Ret_M128d_M128d(__m128d, _mm_xor_pd); + Ret_M128_M128(__m128, _mm_xor_ps); + + Ret_M128d_M128d(int, _mm_testz_pd); + Ret_M128d_M128d(int, _mm_testc_pd); + Ret_M128d_M128d(int, _mm_testnzc_pd); + + Ret_M128_M128(int, _mm_testz_ps); + Ret_M128_M128(int, _mm_testc_ps); + Ret_M128_M128(int, _mm_testnzc_ps); + + Ret_M256d_M256d(int, _mm256_testz_pd); + Ret_M256d_M256d(int, _mm256_testc_pd); + Ret_M256d_M256d(int, _mm256_testnzc_pd); + + Ret_M256_M256(int, _mm256_testz_ps); + Ret_M256_M256(int, _mm256_testc_ps); + Ret_M256_M256(int, _mm256_testnzc_ps); + + Ret_M256i_M256i(int, _mm256_testz_si256); + Ret_M256i_M256i(int, _mm256_testc_si256); + Ret_M256i_M256i(int, _mm256_testnzc_si256); +} + +void test_swizzle(void) { + Ret_M128d_M128i(__m128d, _mm_permutevar_pd); + Ret_M256d_M256i(__m256d, _mm256_permutevar_pd); + + Ret_M128_M128i(__m128, _mm_permutevar_ps); + Ret_M256_M256i(__m256, _mm256_permutevar_ps); + + Ret_M128d_Tint(__m128d, _mm_permute_pd); + Ret_M256d_Tint(__m256d, _mm256_permute_pd); + + Ret_M128_Tint(__m128, _mm_permute_ps); + Ret_M256_Tint(__m256, _mm256_permute_ps); + + Ret_M256d_M256d_Tint(__m256d, _mm256_permute2f128_pd); + Ret_M256_M256_Tint(__m256, _mm256_permute2f128_ps); + Ret_M256i_M256i_Tint(__m256i, _mm256_permute2f128_si256); + + Ret_M256d_M256d_Tint(__m256d, _mm256_blend_pd); + Ret_M256_M256_Tint(__m256, _mm256_blend_ps); + + Ret_M256d_M256d_M256d(__m256d, _mm256_blendv_pd); + Ret_M256_M256_M256(__m256, _mm256_blendv_ps); + + Ret_M256d_M256d_Tint(__m256d, _mm256_shuffle_pd); + Ret_M256_M256_Tint(__m256, _mm256_shuffle_ps); + + Ret_M256i_Tint(int, _mm256_extract_epi32); + Ret_M256i_Tint(int, _mm256_extract_epi16); + Ret_M256i_Tint(int, _mm256_extract_epi8); + Ret_M256i_Tint(int64_t, _mm256_extract_epi64); + + Ret_M256i_int_Tint(__m256i, _mm256_insert_epi32); + Ret_M256i_int_Tint(__m256i, _mm256_insert_epi16); + Ret_M256i_int_Tint(__m256i, _mm256_insert_epi8); + Ret_M256i_int_Tint(__m256i, _mm256_insert_epi64); + + Ret_M256d_M256d(__m256d, _mm256_unpackhi_pd); + Ret_M256d_M256d(__m256d, _mm256_unpacklo_pd); + Ret_M256_M256(__m256, _mm256_unpackhi_ps); + Ret_M256_M256(__m256, _mm256_unpacklo_ps); + + Ret_M256d_M128d_Tint(__m256d, _mm256_insertf128_pd); + Ret_M256_M128_Tint(__m256, _mm256_insertf128_ps); + Ret_M256i_M128i_Tint(__m256i, _mm256_insertf128_si256); + + Ret_M256d_Tint(__m128d, _mm256_extractf128_pd); + Ret_M256_Tint(__m128, _mm256_extractf128_ps); + Ret_M256i_Tint(__m128i, _mm256_extractf128_si256); +} + +void test_convert(void) { + Ret_M128i(__m256d, _mm256_cvtepi32_pd); + Ret_M256i(__m256, _mm256_cvtepi32_ps); + + Ret_M256d(__m128, _mm256_cvtpd_ps); + + Ret_M256(__m256i, _mm256_cvtps_epi32); + Ret_M128(__m256d, _mm256_cvtps_pd); + + Ret_M256d(__m128i, _mm256_cvttpd_epi32); + + Ret_M256d(__m128i, _mm256_cvtpd_epi32); + + Ret_M256(__m256i, _mm256_cvttps_epi32); + + Ret_M256d(double, _mm256_cvtsd_f64); + + Ret_M256i(int, _mm256_cvtsi256_si32); + + Ret_M256(float, _mm256_cvtss_f32); +} + +void test_move(void) { + Ret_M256(__m256, 
_mm256_movehdup_ps); + Ret_M256(__m256, _mm256_moveldup_ps); + Ret_M256d(__m256d, _mm256_movedup_pd); +} + +void test_compare(void) { Ret_M128d_M128d_Tint_5bits(__m128d, _mm_cmp_pd); Ret_M128_M128_Tint_5bits(__m128, _mm_cmp_ps); Ret_M128d_M128d_Tint_5bits(__m128d, _mm_cmp_sd); Ret_M128_M128_Tint_5bits(__m128, _mm_cmp_ss); + + Ret_M256d_M256d_Tint_5bits(__m256d, _mm256_cmp_pd); + Ret_M256_M256_Tint_5bits(__m256, _mm256_cmp_ps); +} + +void test_misc(void) { + Ret_M256d(int, _mm256_movemask_pd); + Ret_M256(int, _mm256_movemask_ps); +} + +void test_load(void) { + Ret_FloatPtr(__m128, _mm_broadcast_ss, 1, 1); + Ret_DoublePtr(__m256d, _mm256_broadcast_sd, 1, 1); + Ret_FloatPtr(__m256, _mm256_broadcast_ss, 1, 1); + + // TODO reuse Ret_DoublePtr? + Ret_M128dPtr(__m256d, _mm256_broadcast_pd); + Ret_M128Ptr(__m256, _mm256_broadcast_ps); // must aligned? not sure + + Ret_DoublePtr(__m256d, _mm256_load_pd, 4, 4); // error, input not aligned + Ret_FloatPtr(__m256, _mm256_load_ps, 8, 8); // error, align + Ret_DoublePtr(__m256d, _mm256_loadu_pd, 4, 1); + Ret_FloatPtr(__m256, _mm256_loadu_ps, 8, 1); + + Ret_IntPtr(__m256i, _mm256_load_si256, __m256i*, 8, 8); // error, align + Ret_IntPtr(__m256i, _mm256_loadu_si256, __m256i_u*, 8, 1); + Ret_IntPtr(__m256i, _mm256_lddqu_si256, __m256i_u*, 8, 1); + Ret_DoublePtr_M128i(__m128d, _mm_maskload_pd, 2, 2); + Ret_DoublePtr_M256i(__m256d, _mm256_maskload_pd, 4, 4); Ret_FloatPtr_M128i(__m128, _mm_maskload_ps, 4, 4); + Ret_FloatPtr_M256i(__m256, _mm256_maskload_ps, 8, 8); + + Ret_DoublePtr_DoublePtr(__m256d, _mm256_loadu2_m128d, 2, 2); + Ret_FloatPtr_FloatPtr(__m256, _mm256_loadu2_m128, 4, 4); + Ret_IntPtr_IntPtr(__m256i, _mm256_loadu2_m128i, __m128i_u*, 4, 4); +} + +void test_store(void) { + + void_OutDoublePtr_M256d(_mm256_store_pd, double*, 32, 32); + void_OutFloatPtr_M256(_mm256_store_ps, float*, 32, 32); + void_OutDoublePtr_M256d(_mm256_storeu_pd, double*, 32, 1); + void_OutFloatPtr_M256(_mm256_storeu_ps, float*, 32, 1); + void_OutIntPtr_M256i(_mm256_store_si256, __m256i*, 32, 32); + void_OutIntPtr_M256i(_mm256_storeu_si256, __m256i_u*, 32, 1); + void_OutDoublePtr_M128i_M128d(_mm_maskstore_pd, double*, 16, 8); + void_OutDoublePtr_M256i_M256d(_mm256_maskstore_pd, double*, 32, 8); void_OutFloatPtr_M128i_M128(_mm_maskstore_ps, float*, 16, 4); - Ret_M128d_Tint(__m128d, _mm_permute_pd); - Ret_M128_Tint(__m128, _mm_permute_ps); - Ret_M128d_M128d(__m128d, _mm_permutevar_pd); - Ret_M128_M128(__m128, _mm_permutevar_ps); - Ret_M128d_M128d(int, _mm_testc_pd); - Ret_M128_M128(int, _mm_testc_ps); - Ret_M128d_M128d(int, _mm_testnzc_pd); - Ret_M128_M128(int, _mm_testnzc_ps); - Ret_M128d_M128d(int, _mm_testz_pd); - Ret_M128_M128(int, _mm_testz_ps); + void_OutFloatPtr_M256i_M256(_mm256_maskstore_ps, float*, 32, 4); + + void_OutIntPtr_M256i(_mm256_stream_si256, __m256i*, 32, 32); + void_OutDoublePtr_M256d(_mm256_stream_pd, double*, 32, 32); + void_OutFloatPtr_M256(_mm256_stream_ps, float*, 32, 32); + + void_OutFloatPtr_OutFloatPtr_M256(_mm256_storeu2_m128, float*, 32, 1); + void_OutDoublePtr_OutDoublePtr_M256d(_mm256_storeu2_m128d, double*, 32, 1); + void_OutIntPtr_OutIntPtr_M256i(_mm256_storeu2_m128i, __m128i_u*, 32, 1) +} + +void test_undef(void) { +#ifdef __EMSCRIPTEN__ + _mm256_undefined_pd(); + _mm256_undefined_ps(); + _mm256_undefined_si256(); +#endif +} + +void test_set(void) { + Ret_Double4(__m256d, _mm256_set_pd, 1); + Ret_Float8(__m256, _mm256_set_ps, 1); + Ret_Int8(__m256i, _mm256_set_epi32, 1); + Ret_Short16(__m256i, _mm256_set_epi16, 2); + Ret_Char32(__m256i, 
_mm256_set_epi8, 4); + Ret_Longlong4(__m256i, _mm256_set_epi64x, 1); + + Ret_Double4(__m256d, _mm256_setr_pd, 1); + Ret_Float8(__m256, _mm256_setr_ps, 1); + Ret_Int8(__m256i, _mm256_setr_epi32, 1); + Ret_Short16(__m256i, _mm256_setr_epi16, 2); + Ret_Char32(__m256i, _mm256_setr_epi8, 4); + Ret_Longlong4(__m256i, _mm256_setr_epi64x, 1); + + Ret_Double(__m256d, _mm256_set1_pd, 1); + Ret_Float(__m256, _mm256_set1_ps, 1); + Ret_Int(__m256i, _mm256_set1_epi32, 1); + Ret_Int(__m256i, _mm256_set1_epi16, 1); + Ret_Int(__m256i, _mm256_set1_epi8, 1); + Ret_Int(__m256i, _mm256_set1_epi64x, 1); + + char str[256] = {}; + __m256d zerod = _mm256_setzero_pd(); + tostr(&zerod, str); + printf("_mm256_setzero_pd() = %s\n", str); + + __m256 zero = _mm256_setzero_ps(); + tostr(&zero, str); + printf("_mm256_setzero_ps() = %s\n", str); + + __m256i zeroi = _mm256_setzero_si256(); + tostr(&zeroi, str); + printf("_mm256_setzero_si256() = %s\n", str); + + Ret_M128_M128(__m256, _mm256_set_m128); + Ret_M128d_M128d(__m256d, _mm256_set_m128d); + Ret_M128i_M128i(__m256i, _mm256_set_m128i); + + Ret_M128_M128(__m256, _mm256_setr_m128); + Ret_M128d_M128d(__m256d, _mm256_setr_m128d); + Ret_M128i_M128i(__m256i, _mm256_setr_m128i); +} + +void test_cast(void) { + Ret_M256d(__m256, _mm256_castpd_ps); + Ret_M256d(__m256i, _mm256_castpd_si256); + Ret_M256(__m256d, _mm256_castps_pd); + Ret_M256(__m256i, _mm256_castps_si256); + Ret_M256i(__m256d, _mm256_castsi256_pd); + Ret_M256i(__m256, _mm256_castsi256_ps); + + Ret_M256d(__m128d, _mm256_castpd256_pd128); + Ret_M256(__m128, _mm256_castps256_ps128); + Ret_M256i(__m128i, _mm256_castsi256_si128); + Ret_M128d(__m256d, _mm256_castpd128_pd256); + Ret_M128(__m256, _mm256_castps128_ps256); + Ret_M128i(__m256i, _mm256_castsi128_si256); + + Ret_M128d(__m256d, _mm256_zextpd128_pd256); + Ret_M128(__m256, _mm256_zextps128_ps256); + Ret_M128i(__m256i, _mm256_zextsi128_si256); +} + +int main() { + assert(numInterestingFloats % 8 == 0); + assert(numInterestingInts % 8 == 0); + assert(numInterestingDoubles % 4 == 0); + + test_arithmetic(); + test_special_math(); + test_elementary_math(); + test_logical(); + test_swizzle(); + test_convert(); + test_move(); + test_compare(); + test_misc(); + test_load(); + test_store(); + test_undef(); + test_set(); + test_cast(); } diff --git a/test/sse/test_sse.h b/test/sse/test_sse.h index ca162c703fdde..efe2e53a8f35b 100644 --- a/test/sse/test_sse.h +++ b/test/sse/test_sse.h @@ -36,23 +36,118 @@ double ucastd(uint64_t t) { return *(double*)&t; } // Data used in test. Store them global and access via a getter to confuse optimizer to not "solve" the whole test suite at compile-time, // so that the operation will actually be performed at runtime, and not at compile-time. 
(Testing the capacity of the compiler to perform // SIMD ops at compile-time would be interesting as well, but that's for another test) -float interesting_floats_[] = { -INFINITY, -FLT_MAX, -2.5f, -1.5f, -1.4f, -1.0f, -0.5f, -0.2f, -FLT_MIN, -0.f, 0.f, - 1.401298464e-45f, FLT_MIN, 0.3f, 0.5f, 0.8f, 1.0f, 1.5f, 2.5f, 3.5f, 3.6f, FLT_MAX, INFINITY, NAN, - ucastf(0x01020304), ucastf(0x80000000), ucastf(0x7FFFFFFF), ucastf(0xFFFFFFFF) - }; - -double interesting_doubles_[] = { -INFINITY, -FLT_MAX, -2.5, -1.5, -1.4, -1.0, -0.5, -0.2, -FLT_MIN, -0.0, 0.0, - 1.401298464e-45, FLT_MIN, 0.3, 0.5, 0.8, 1.0, 1.5, 2.5, 3.5, 3.6, FLT_MAX, INFINITY, NAN, - ucastd(0x0102030405060708ULL), ucastd(0x8000000000000000ULL), - ucastd(0x7FFFFFFFFFFFFFFFULL), ucastd(0xFFFFFFFFFFFFFFFFULL) - }; - -uint32_t interesting_ints_[] = { 0, 1, 2, 3, 0x01020304, 0x10203040, 0x7FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x12345678, 0x9ABCDEF1, 0x80000000, - 0x80808080, 0x7F7F7F7F, 0x01010101, 0x11111111, 0x20202020, 0x0F0F0F0F, 0xF0F0F0F0, - fcastu(-INFINITY), fcastu(-FLT_MAX), fcastu(-2.5f), fcastu(-1.5f), fcastu(-1.4f), fcastu(-1.0f), fcastu(-0.5f), - fcastu(-0.2f), fcastu(-FLT_MIN), 0xF9301AB9, 0x0039AB12, 0x19302BCD, - fcastu(1.401298464e-45f), fcastu(FLT_MIN), fcastu(0.3f), fcastu(0.5f), fcastu(0.8f), fcastu(1.0f), fcastu(1.5f), - fcastu(2.5f), fcastu(3.5f), fcastu(3.6f), fcastu(FLT_MAX), fcastu(INFINITY), fcastu(NAN) }; +float interesting_floats_[] __attribute__((aligned(32))) = {-INFINITY, + -FLT_MAX, + -2.5f, + -1.5f, + -1.4f, + -1.0f, + -0.5f, + -0.2f, + -FLT_MIN, + -0.f, + 0.f, + 1.401298464e-45f, + FLT_MIN, + 0.3f, + 0.5f, + 0.8f, + 1.0f, + 1.5f, + 2.5f, + 3.5f, + 3.6f, + FLT_MAX, + INFINITY, + NAN, + ucastf(0x01020304), + ucastf(0x80000000), + ucastf(0x7FFFFFFF), + ucastf(0xFFFFFFFF), + -2.70497e+38f, + -3.2995e-21f, + 3.40282e+38f, + 3.38211e+19f}; + +double interesting_doubles_[] + __attribute__((aligned(32))) = {-INFINITY, + -FLT_MAX, + -2.5, + -1.5, + -1.4, + -1.0, + -0.5, + -0.2, + -FLT_MIN, + -0.0, + 0.0, + 1.401298464e-45, + FLT_MIN, + 0.3, + 0.5, + 0.8, + 1.0, + 1.5, + 2.5, + 3.5, + 3.6, + FLT_MAX, + INFINITY, + NAN, + ucastd(0x0102030405060708ULL), + ucastd(0x8000000000000000ULL), + ucastd(0x7FFFFFFFFFFFFFFFULL), + ucastd(0xFFFFFFFFFFFFFFFFULL)}; + +uint32_t interesting_ints_[] + __attribute__((aligned(32))) = {0, + 1, + 2, + 3, + 0x01020304, + 0x10203040, + 0x7FFFFFFF, + 0xFFFFFFFF, + 0xFFFFFFFE, + 0x12345678, + 0x9ABCDEF1, + 0x80000000, + 0x80808080, + 0x7F7F7F7F, + 0x01010101, + 0x11111111, + 0x20202020, + 0x0F0F0F0F, + 0xF0F0F0F0, + fcastu(-INFINITY), + fcastu(-FLT_MAX), + fcastu(-2.5f), + fcastu(-1.5f), + fcastu(-1.4f), + fcastu(-1.0f), + fcastu(-0.5f), + fcastu(-0.2f), + fcastu(-FLT_MIN), + 0xF9301AB9, + 0x0039AB12, + 0x19302BCD, + fcastu(1.401298464e-45f), + fcastu(FLT_MIN), + fcastu(0.3f), + fcastu(0.5f), + fcastu(0.8f), + fcastu(1.0f), + fcastu(1.5f), + fcastu(2.5f), + fcastu(3.5f), + fcastu(3.6f), + fcastu(FLT_MAX), + fcastu(INFINITY), + fcastu(NAN), + 0x000003FF, + 0xDDDDDDDD, + 0x88888888, + 0xEEEEEEEE}; bool always_true() { return time(NULL) != 0; } // This function always returns true, but the compiler should not know this. 
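/* A minimal sketch, assuming the shape of the getters that test_sse.h defines
   outside this hunk: get_interesting_floats() (used by test_avx.cpp above) is
   expected to hand the global array back through a branch on always_true(),
   which the optimizer cannot resolve, so the "interesting" inputs stay runtime
   values and the intrinsics under test are not constant-folded. The body below
   is illustrative only, not part of this patch. */
#if 0
float* get_interesting_floats() {
  /* Assumed body: always_true() reads time(NULL), so the compiler cannot prove
     which pointer is returned and must perform the SIMD ops at run time. */
  return always_true() ? interesting_floats_ : (float*)0;
}
#endif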
@@ -192,17 +287,25 @@ void tostr(align1_double *m, char *outstr) { } void tostr(align1_double *m, int numElems, char *outstr) { - char s[2][64]; + assert(numElems <= 4); + char s[4][64]; for(int i = 0; i < numElems; ++i) SerializeDouble(m[i], s[i]); switch(numElems) { case 1: sprintf(outstr, "{%s}", s[0]); break; case 2: sprintf(outstr, "{%s,%s}", s[0], s[1]); break; + case 3: + sprintf(outstr, "{%s,%s,%s}", s[0], s[1], s[2]); + break; + case 4: + sprintf(outstr, "{%s,%s,%s,%s}", s[0], s[1], s[2], s[3]); + break; } } void tostr(align1_float *m, int numElems, char *outstr) { - char s[4][64]; + assert(numElems <= 8); + char s[8][64]; for(int i = 0; i < numElems; ++i) SerializeFloat(m[i], s[i]); switch(numElems) { @@ -210,22 +313,125 @@ void tostr(align1_float *m, int numElems, char *outstr) { case 2: sprintf(outstr, "{%s,%s}", s[0], s[1]); break; case 3: sprintf(outstr, "{%s,%s,%s}", s[0], s[1], s[2]); break; case 4: sprintf(outstr, "{%s,%s,%s,%s}", s[0], s[1], s[2], s[3]); break; + case 5: + sprintf(outstr, "{%s,%s,%s,%s,%s}", s[0], s[1], s[2], s[3], s[4]); + break; + case 6: + sprintf( + outstr, "{%s,%s,%s,%s,%s,%s}", s[0], s[1], s[2], s[3], s[4], s[5]); + break; + case 7: + sprintf(outstr, + "{%s,%s,%s,%s,%s,%s,%s}", + s[0], + s[1], + s[2], + s[3], + s[4], + s[5], + s[6]); + break; + case 8: + sprintf(outstr, + "{%s,%s,%s,%s,%s,%s,%s,%s}", + s[0], + s[1], + s[2], + s[3], + s[4], + s[5], + s[6], + s[7]); + break; } } void tostr(align1_int *s, int numElems, char *outstr) { + assert(numElems <= 8); switch(numElems) { case 1: sprintf(outstr, "{0x%08X}", s[0]); break; case 2: sprintf(outstr, "{0x%08X,0x%08X}", s[0], s[1]); break; case 3: sprintf(outstr, "{0x%08X,0x%08X,0x%08X}", s[0], s[1], s[2]); break; case 4: sprintf(outstr, "{0x%08X,0x%08X,0x%08X,0x%08X}", s[0], s[1], s[2], s[3]); break; + case 5: + sprintf(outstr, + "{0x%08X,0x%08X,0x%08X,0x%08X,0x%08X}", + s[0], + s[1], + s[2], + s[3], + s[4]); + break; + case 6: + sprintf(outstr, + "{0x%08X,0x%08X,0x%08X,0x%08X,0x%08X,0x%08X}", + s[0], + s[1], + s[2], + s[3], + s[4], + s[5]); + break; + case 7: + sprintf(outstr, + "{0x%08X,0x%08X,0x%08X,0x%08X,0x%08X,0x%08X,0x%08X}", + s[0], + s[1], + s[2], + s[3], + s[4], + s[5], + s[6]); + break; + case 8: + sprintf(outstr, + "{0x%08X,0x%08X,0x%08X,0x%08X,0x%08X,0x%08X,0x%08X,0x%08X}", + s[0], + s[1], + s[2], + s[3], + s[4], + s[5], + s[6], + s[7]); + break; } } void tostr(align1_int64 *m, int numElems, char *outstr) { + assert(numElems <= 4); switch(numElems) { case 1: sprintf(outstr, "{0x%08X%08X}", (int)(*m >> 32), (int)*m); break; - case 2: sprintf(outstr, "{0x%08X%08X,0x%08X%08X}", (int)(*m >> 32), (int)*m, (int)(m[1] >> 32), (int)m[1]); + case 2: + sprintf(outstr, + "{0x%08X%08X,0x%08X%08X}", + (int)(*m >> 32), + (int)*m, + (int)(m[1] >> 32), + (int)m[1]); + break; + case 3: + sprintf(outstr, + "{0x%08X%08X,0x%08X%08X,0x%08X%08X}", + (int)(*m >> 32), + (int)*m, + (int)(m[1] >> 32), + (int)m[1], + (int)(m[2] >> 32), + (int)m[2]); + break; + case 4: + sprintf(outstr, + "{0x%08X%08X,0x%08X%08X,0x%08X%08X,0x%08X%08X}", + (int)(*m >> 32), + (int)*m, + (int)(m[1] >> 32), + (int)m[1], + (int)(m[2] >> 32), + (int)m[2], + (int)(m[3] >> 32), + (int)m[3]); + break; } } @@ -593,7 +799,7 @@ __m128 ExtractIntInRandomOrder(unsigned int *arr, int i, int n, int prime) { printf("%s(%s) = %s\n", #func, str, str2); \ } -float tempOutFloatStore[16]; +float tempOutFloatStore[32]; float *getTempOutFloatStore(int alignmentBytes) { memset(tempOutFloatStore, 0, sizeof(tempOutFloatStore)); uintptr_t addr = 
(uintptr_t)tempOutFloatStore; @@ -933,3 +1139,976 @@ double *getTempOutDoubleStore(int alignmentBytes) { return (double*)getTempOutFl char str3[256]; tostr(&ret, str3); \ printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \ } + +#ifdef __AVX__ + +void tostr(__m256* m, char* outstr) { + union { + __m256 m; + float val[8]; + } u; + u.m = *m; + char s[8][32]; + for (int i = 0; i < 8; i++) { + SerializeFloat(u.val[i], s[i]); + } + sprintf(outstr, + "[%s,%s,%s,%s,%s,%s,%s,%s]", + s[7], + s[6], + s[5], + s[4], + s[3], + s[2], + s[1], + s[0]); +} + +void tostr(__m256i* m, char* outstr) { + union { + __m256i m; + uint32_t val[8]; + } u; + u.m = *m; + sprintf(outstr, + "[0x%08X,0x%08X,0x%08X,0x%08X,0x%08X,0x%08X,0x%08X,0x%08X]", + u.val[7], + u.val[6], + u.val[5], + u.val[4], + u.val[3], + u.val[2], + u.val[1], + u.val[0]); +} + +void tostr(__m256d* m, char* outstr) { + union { + __m256d m; + double val[4]; + } u; + u.m = *m; + char s[4][64]; + SerializeDouble(u.val[0], s[0]); + SerializeDouble(u.val[1], s[1]); + SerializeDouble(u.val[2], s[2]); + SerializeDouble(u.val[3], s[3]); + sprintf(outstr, "[%s,%s,%s,%s]", s[3], s[2], s[1], s[0]); +} + +void tostr_approx(__m256* m, char* outstr, bool approximate) { + union { + __m256 m; + float val[8]; + } u; + u.m = *m; + char s[8][32]; + + for (int i = 0; i < 8; i++) { + SerializeFloat(u.val[i], s[i], approximate); + } + sprintf(outstr, + "[%s,%s,%s,%s,%s,%s,%s,%s]", + s[7], + s[6], + s[5], + s[4], + s[3], + s[2], + s[1], + s[0]); +} + +#define Ret_M128_M128i(Ret_type, func) \ + for (int i = 0; i < numInterestingFloats / 4; ++i) \ + for (int k = 0; k < 4; ++k) \ + for (int j = 0; j < numInterestingInts / 4; ++j) { \ + __m128 m1 = E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m128i m2 = \ + (__m128i)E1_Int(interesting_ints, j * 4, numInterestingInts); \ + Ret_type ret = func(m1, m2); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \ + } + +#define Ret_M128d_M128i(Ret_type, func) \ + for (int i = 0; i < numInterestingDoubles / 2; ++i) \ + for (int k = 0; k < 2; ++k) \ + for (int j = 0; j < numInterestingInts / 4; ++j) { \ + __m128d m1 = \ + E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \ + __m128i m2 = \ + (__m128i)E1_Int(interesting_ints, j * 4, numInterestingInts); \ + Ret_type ret = func(m1, m2); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \ + } + +#define Ret_M256d(Ret_type, func) \ + for (int i = 0; i < numInterestingDoubles / 2; ++i) \ + for (int k = 0; k < 2; ++k) { \ + __m128d tmp = \ + E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \ + __m256d m1 = _mm256_set_m128d(tmp, tmp); \ + Ret_type ret = func(m1); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_M256(Ret_type, func) \ + for (int i = 0; i < numInterestingFloats / 4; ++i) \ + for (int k = 0; k < 4; ++k) { \ + __m128 tmp = E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m256 m1 = _mm256_set_m128(tmp, tmp); \ + Ret_type ret = func(m1); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_M256approx(Ret_type, func) \ + for (int i = 0; i < numInterestingFloats / 
4; ++i) \ + for (int k = 0; k < 4; ++k) { \ + __m128 tmp = E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m256 m1 = _mm256_set_m128(tmp, tmp); \ + Ret_type ret = func(m1); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr_approx(&ret, str2, true /*approximate*/); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_M256d_M256d(Ret_type, func) \ + for (int i = 0; i < numInterestingDoubles / 2; ++i) \ + for (int k = 0; k < 2; ++k) \ + for (int j = 0; j < numInterestingDoubles / 2; ++j) { \ + __m128d tmp = \ + E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \ + __m256d m1 = _mm256_set_m128d(tmp, tmp); \ + tmp = E2_Double(interesting_doubles, j * 2, numInterestingDoubles); \ + __m256d m2 = _mm256_set_m128d(tmp, tmp); \ + Ret_type ret = func(m1, m2); \ + /* a op b */ \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \ + /* b op a */ \ + ret = func(m2, m1); \ + tostr(&m1, str); \ + tostr(&m2, str2); \ + tostr(&ret, str3); \ + printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \ + } + +#define Ret_M256_M256(Ret_type, func) \ + for (int i = 0; i < numInterestingFloats / 4; ++i) \ + for (int k = 0; k < 4; ++k) \ + for (int j = 0; j < numInterestingFloats / 4; ++j) { \ + __m128 tmp = E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m256 m1 = _mm256_set_m128(tmp, tmp); \ + tmp = E2(interesting_floats, j * 4, numInterestingFloats); \ + __m256 m2 = _mm256_set_m128(tmp, tmp); \ + Ret_type ret = func(m1, m2); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \ + } + +#define Ret_M256i_M256i(Ret_type, func) \ + for (int i = 0; i < numInterestingInts / 4; ++i) \ + for (int k = 0; k < 4; ++k) \ + for (int j = 0; j < numInterestingInts / 4; ++j) { \ + __m128i tmp = \ + (__m128i)E1_Int(interesting_ints, i * 4 + k, numInterestingInts); \ + __m256i m1 = _mm256_set_m128i(tmp, tmp); \ + tmp = (__m128i)E2_Int(interesting_ints, j * 4, numInterestingInts); \ + __m256i m2 = _mm256_set_m128i(tmp, tmp); \ + Ret_type ret = func(m1, m2); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \ + } + +#define Ret_M256d_M256i(Ret_type, func) \ + for (int i = 0; i < numInterestingDoubles / 2; ++i) \ + for (int k = 0; k < 2; ++k) \ + for (int j = 0; j < numInterestingInts / 4; ++j) { \ + __m128d tmp1 = \ + E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \ + __m256d m1 = _mm256_set_m128d(tmp1, tmp1); \ + __m128i tmp2 = \ + (__m128i)E1_Int(interesting_ints, j * 4, numInterestingInts); \ + __m256i m2 = _mm256_set_m128i(tmp2, tmp2); \ + Ret_type ret = func(m1, m2); \ + /* a op b */ \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \ + } + +#define Ret_M256_M256i(Ret_type, func) \ + for (int i = 0; i < numInterestingFloats / 4; ++i) \ + for (int k = 0; k < 4; ++k) \ + for (int j = 0; j < numInterestingInts / 4; ++j) { \ + __m128 tmp1 = E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m256 m1 = _mm256_set_m128(tmp1, tmp1); \ + __m128i tmp2 = \ + (__m128i)E1_Int(interesting_ints, j * 4, 
numInterestingInts); \ + __m256i m2 = _mm256_set_m128i(tmp2, tmp2); \ + Ret_type ret = func(m1, m2); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \ + } + +#define Ret_M256_M256_M256(Ret_type, func) \ + for (int i = 0; i < numInterestingFloats / 4; ++i) \ + for (int k = 0; k < 4; ++k) \ + for (int j = 0; j < numInterestingFloats / 4; ++j) \ + for (int l = 0; l < numInterestingFloats / 4; ++l) { \ + __m128 tmp = \ + E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m256 m1 = _mm256_set_m128(tmp, tmp); \ + tmp = E2(interesting_floats, j * 4, numInterestingFloats); \ + __m256 m2 = _mm256_set_m128(tmp, tmp); \ + tmp = E1(interesting_floats, l * 4, numInterestingFloats); \ + __m256 m3 = _mm256_set_m128(tmp, tmp); \ + Ret_type ret = func(m1, m2, m3); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&m3, str3); \ + char str4[256]; \ + tostr(&ret, str4); \ + printf("%s(%s, %s, %s) = %s\n", #func, str, str2, str3, str4); \ + } + +#define Ret_M256d_M256d_M256d(Ret_type, func) \ + for (int i = 0; i < numInterestingDoubles / 2; ++i) \ + for (int k = 0; k < 2; ++k) \ + for (int j = 0; j < numInterestingDoubles / 2; ++j) \ + for (int l = 0; l < numInterestingDoubles / 2; ++l) { \ + __m128d tmp = \ + E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \ + __m256d m1 = _mm256_set_m128d(tmp, tmp); \ + tmp = E2_Double(interesting_doubles, j * 2, numInterestingDoubles); \ + __m256d m2 = _mm256_set_m128d(tmp, tmp); \ + tmp = E1_Double(interesting_doubles, l * 2, numInterestingDoubles); \ + __m256d m3 = _mm256_set_m128d(tmp, tmp); \ + Ret_type ret = func(m1, m2, m3); \ + /* a, b, c */ \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&m3, str3); \ + char str4[256]; \ + tostr(&ret, str4); \ + printf("%s(%s, %s, %s) = %s\n", #func, str, str2, str3, str4); \ + /* b, c, a */ \ + ret = func(m2, m3, m1); \ + tostr(&m1, str); \ + tostr(&m2, str2); \ + tostr(&m3, str3); \ + tostr(&ret, str4); \ + printf("%s(%s, %s, %s) = %s\n", #func, str, str2, str3, str4); \ + /* c, a, b */ \ + ret = func(m3, m1, m2); \ + tostr(&m1, str); \ + tostr(&m2, str2); \ + tostr(&m3, str3); \ + tostr(&ret, str4); \ + printf("%s(%s, %s, %s) = %s\n", #func, str, str2, str3, str4); \ + } + +#define Ret_M256i(Ret_type, func) \ + for (int i = 0; i < numInterestingInts / 4; ++i) \ + for (int k = 0; k < 4; ++k) { \ + __m128i tmp = \ + (__m128i)E1_Int(interesting_ints, i * 4 + k, numInterestingInts); \ + __m256i m1 = _mm256_set_m128i(tmp, tmp); \ + Ret_type ret = func(m1); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_M128dPtr(Ret_type, func) \ + for (int i = 0; i + 2 <= numInterestingDoubles; i += 2) { \ + double* ptr = interesting_doubles + i; \ + Ret_type ret = func((__m128d*)ptr); \ + char str[256]; \ + tostr(ptr, 2, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_M128Ptr(Ret_type, func) \ + for (int i = 0; i + 4 <= numInterestingFloats; i += 4) { \ + float* ptr = interesting_floats + i; \ + Ret_type ret = func((__m128*)ptr); \ + char str[256]; \ + tostr(ptr, 4, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define 
Ret_DoublePtr_DoublePtr(Ret_type, func, numElemsAccessed, inc) \ + for (int i = 0; i + numElemsAccessed <= numInterestingDoubles; i += inc) { \ + double* ptr1 = interesting_doubles + i; \ + for (int j = 0; j + numElemsAccessed <= numInterestingDoubles; j += inc) { \ + double* ptr2 = interesting_doubles + j; \ + Ret_type ret = func(ptr1, ptr2); \ + char str1[256]; \ + tostr(ptr1, numElemsAccessed, str1); \ + char str2[256]; \ + tostr(ptr2, numElemsAccessed, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s) = %s\n", #func, str1, str2, str3); \ + } \ + } + +#define Ret_FloatPtr_FloatPtr(Ret_type, func, numElemsAccessed, inc) \ + for (int i = 0; i + numElemsAccessed <= numInterestingFloats; i += inc) { \ + float* ptr1 = interesting_floats + i; \ + for (int j = 0; j + numElemsAccessed <= numInterestingFloats; j += inc) { \ + float* ptr2 = interesting_floats + j; \ + Ret_type ret = func(ptr1, ptr2); \ + char str1[256]; \ + tostr(ptr1, numElemsAccessed, str1); \ + char str2[256]; \ + tostr(ptr2, numElemsAccessed, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s,%s) = %s\n", #func, str1, str2, str3); \ + } \ + } + +#define Ret_IntPtr_IntPtr(Ret_type, func, Ptr_type, numElemsAccessed, inc) \ + for (int i = 0; i + numElemsAccessed <= numInterestingInts; i += inc) { \ + uint32_t* ptr1 = interesting_ints + i; \ + for (int j = 0; j + numElemsAccessed <= numInterestingInts; j += inc) { \ + uint32_t* ptr2 = interesting_ints + j; \ + Ret_type ret = func((Ptr_type)ptr1, (Ptr_type)ptr2); \ + char str1[256]; \ + tostr((int*)ptr1, numElemsAccessed, str1); \ + char str2[256]; \ + tostr((int*)ptr2, numElemsAccessed, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s) = %s\n", #func, str1, str2, str3); \ + } \ + } + +#define Ret_DoublePtr_M256i(Ret_type, func, numElemsAccessed, inc) \ + for (int i = 0; i + numElemsAccessed <= numInterestingDoubles; i += inc) \ + for (int j = 0; j < numInterestingInts / 4; ++j) { \ + double* ptr = interesting_doubles + i; \ + __m128i tmp = \ + (__m128i)E2_Int(interesting_ints, j * 4, numInterestingInts); \ + __m256i m1 = _mm256_set_m128i(tmp, tmp); \ + Ret_type ret = func(ptr, m1); \ + char str[256]; \ + tostr(ptr, numElemsAccessed, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_FloatPtr_M256i(Ret_type, func, numElemsAccessed, inc) \ + for (int i = 0; i + numElemsAccessed <= numInterestingFloats; i += inc) \ + for (int j = 0; j < numInterestingInts / 4; ++j) { \ + float* ptr = interesting_floats + i; \ + __m128i tmp = \ + (__m128i)E1_Int(interesting_ints, j * 4, numInterestingInts); \ + __m256i m1 = _mm256_set_m128i(tmp, tmp); \ + Ret_type ret = func(ptr, m1); \ + char str[256]; \ + tostr(ptr, numElemsAccessed, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_M256d_M256d_Tint_body(Ret_type, func, Tint) \ + for (int i = 0; i < numInterestingDoubles / 2; ++i) \ + for (int k = 0; k < 2; ++k) \ + for (int j = 0; j < numInterestingDoubles / 2; ++j) { \ + __m128d tmp = \ + E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \ + __m256d m1 = _mm256_set_m128d(tmp, tmp); \ + tmp = E2_Double(interesting_doubles, j * 2, numInterestingDoubles); \ + __m256d m2 = _mm256_set_m128d(tmp, tmp); \ + Ret_type ret = func(m1, m2, Tint); \ + /* a op b */ \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + 
printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \ + /* b op a */ \ + ret = func(m2, m1, Tint); \ + tostr(&m1, str); \ + tostr(&m2, str2); \ + tostr(&ret, str3); \ + printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \ + } + +#define Ret_M256_M256_Tint_body(Ret_type, func, Tint) \ + for (int i = 0; i < numInterestingFloats / 4; ++i) \ + for (int k = 0; k < 4; ++k) \ + for (int j = 0; j < numInterestingFloats / 4; ++j) { \ + __m128 tmp = E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m256 m1 = _mm256_set_m128(tmp, tmp); \ + tmp = E2(interesting_floats, j * 4, numInterestingFloats); \ + __m256 m2 = _mm256_set_m128(tmp, tmp); \ + Ret_type ret = func(m1, m2, Tint); \ + /* a op b */ \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \ + /* b op a */ \ + ret = func(m2, m1, Tint); \ + tostr(&m1, str); \ + tostr(&m2, str2); \ + tostr(&ret, str3); \ + printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \ + } + +#define Ret_M256i_M256i_Tint_body(Ret_type, func, Tint) \ + for (int i = 0; i < numInterestingInts / 4; ++i) \ + for (int k = 0; k < 4; ++k) \ + for (int j = 0; j < numInterestingInts / 4; ++j) { \ + __m128i tmp = \ + (__m128i)E1_Int(interesting_ints, i * 4 + k, numInterestingInts); \ + __m256i m1 = _mm256_set_m128i(tmp, tmp); \ + tmp = (__m128i)E2_Int(interesting_ints, j * 4, numInterestingInts); \ + __m256i m2 = _mm256_set_m128i(tmp, tmp); \ + Ret_type ret = func(m1, m2, Tint); \ + /* a op b */ \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \ + /* b op a */ \ + ret = func(m2, m1, Tint); \ + tostr(&m1, str); \ + tostr(&m2, str2); \ + tostr(&ret, str3); \ + printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \ + } + +#define Ret_M256_Tint_body(Ret_type, func, Tint) \ + for (int i = 0; i < numInterestingFloats / 4; ++i) \ + for (int k = 0; k < 4; ++k) { \ + __m128 tmp = E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m256 m1 = _mm256_set_m128(tmp, tmp); \ + Ret_type ret = func(m1, Tint); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s, %d) = %s\n", #func, str, Tint, str2); \ + } + +#define Ret_M256d_Tint_body(Ret_type, func, Tint) \ + for (int i = 0; i < numInterestingDoubles / 2; ++i) \ + for (int k = 0; k < 2; ++k) { \ + __m128d tmp = \ + E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \ + __m256d m1 = _mm256_set_m128d(tmp, tmp); \ + Ret_type ret = func(m1, Tint); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s, %d) = %s\n", #func, str, Tint, str2); \ + } + +#define Ret_M256i_Tint_body(Ret_type, func, Tint) \ + for (int i = 0; i < numInterestingInts / 4; ++i) \ + for (int k = 0; k < 4; ++k) { \ + __m128i tmp = \ + (__m128i)E1_Int(interesting_ints, i * 4 + k, numInterestingInts); \ + __m256i m1 = _mm256_set_m128i(tmp, tmp); \ + Ret_type ret = func(m1, Tint); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s, %d) = %s\n", #func, str, Tint, str2); \ + } + +#define Ret_M256i_int_Tint_body(Ret_type, func, Tint) \ + for (int i = 0; i < numInterestingInts / 4; ++i) \ + for (int j = 0; j < numInterestingInts; ++j) \ + for (int k = 0; k < 4; ++k) { \ + __m128i tmp = \ + 
(__m128i)E1_Int(interesting_ints, i * 4 + k, numInterestingInts); \ + __m256i m1 = _mm256_set_m128i(tmp, tmp); \ + Ret_type ret = func(m1, interesting_ints[j], Tint); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s, 0x%08X, %d) = %s\n", \ + #func, \ + str, \ + interesting_ints[j], \ + Tint, \ + str2); \ + } + +#define Ret_M256i_M128i_Tint_body(Ret_type, func, Tint) \ + for (int i = 0; i < numInterestingInts / 4; ++i) \ + for (int k = 0; k < 4; ++k) \ + for (int j = 0; j < numInterestingInts / 4; ++j) { \ + __m128i tmp = \ + (__m128i)E1_Int(interesting_ints, i * 4 + k, numInterestingInts); \ + __m256i m1 = _mm256_set_m128i(tmp, tmp); \ + __m128i m2 = \ + (__m128i)E2_Int(interesting_ints, j * 4, numInterestingInts); \ + Ret_type ret = func(m1, m2, Tint); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \ + } + +#define Ret_M256d_M128d_Tint_body(Ret_type, func, Tint) \ + for (int i = 0; i < numInterestingDoubles / 2; ++i) \ + for (int k = 0; k < 2; ++k) \ + for (int j = 0; j < numInterestingDoubles / 2; ++j) { \ + __m128d tmp = \ + E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \ + __m256d m1 = _mm256_set_m128d(tmp, tmp); \ + __m128d m2 = \ + E2_Double(interesting_doubles, j * 2, numInterestingDoubles); \ + Ret_type ret = func(m1, m2, Tint); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \ + } + +#define Ret_M256_M128_Tint_body(Ret_type, func, Tint) \ + for (int i = 0; i < numInterestingFloats / 4; ++i) \ + for (int k = 0; k < 4; ++k) \ + for (int j = 0; j < numInterestingFloats / 4; ++j) { \ + __m128 tmp = E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m256 m1 = _mm256_set_m128(tmp, tmp); \ + __m128 m2 = E2(interesting_floats, j * 4, numInterestingFloats); \ + Ret_type ret = func(m1, m2, Tint); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(&ret, str3); \ + printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \ + } + +#define Ret_M256_Tint(Ret_type, func) \ + const_int8_unroll(Ret_type, Ret_M256_Tint_body, func) +#define Ret_M256d_Tint(Ret_type, func) \ + const_int8_unroll(Ret_type, Ret_M256d_Tint_body, func) + +#define Ret_M256i_M256i_Tint(Ret_type, func) \ + const_int8_unroll(Ret_type, Ret_M256i_M256i_Tint_body, func) +#define Ret_M256d_M256d_Tint(Ret_type, func) \ + const_int8_unroll(Ret_type, Ret_M256d_M256d_Tint_body, func) +#define Ret_M256_M256_Tint(Ret_type, func) \ + const_int8_unroll(Ret_type, Ret_M256_M256_Tint_body, func) +#define Ret_M256i_Tint(Ret_type, func) \ + const_int8_unroll(Ret_type, Ret_M256i_Tint_body, func) + +#define Ret_M256i_int_Tint(Ret_type, func) \ + const_int8_unroll(Ret_type, Ret_M256i_int_Tint_body, func) + +#define Ret_M256i_M128i_Tint(Ret_type, func) \ + const_int8_unroll(Ret_type, Ret_M256i_M128i_Tint_body, func) +#define Ret_M256d_M128d_Tint(Ret_type, func) \ + const_int8_unroll(Ret_type, Ret_M256d_M128d_Tint_body, func) +#define Ret_M256_M128_Tint(Ret_type, func) \ + const_int8_unroll(Ret_type, Ret_M256_M128_Tint_body, func) + +#define Ret_M256d_M256d_Tint_5bits(Ret_type, func) \ + const_int5_full_unroll(Ret_type, Ret_M256d_M256d_Tint_body, func) +#define Ret_M256_M256_Tint_5bits(Ret_type, func) \ + 
const_int5_full_unroll(Ret_type, Ret_M256_M256_Tint_body, func) + +#define void_OutDoublePtr_M256d( \ + func, Ptr_type, numBytesWritten, alignmentBytes) \ + for (int i = 0; i < numInterestingDoubles / 2; ++i) \ + for (int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \ + for (int k = 0; k < 2; ++k) { \ + uintptr_t base = (uintptr_t)getTempOutDoubleStore(32); \ + __m128d tmp = \ + E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \ + __m256d m1 = _mm256_set_m128d(tmp, tmp); \ + align1_double* out = (align1_double*)(base + offset); \ + func((Ptr_type)out, m1); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(out, numBytesWritten / sizeof(double), str2); \ + printf("%s(p:align=%d, %s) = %s\n", #func, offset, str, str2); \ + } + +#define void_OutFloatPtr_M256(func, Ptr_type, numBytesWritten, alignmentBytes) \ + for (int i = 0; i < numInterestingFloats / 4; ++i) \ + for (int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \ + for (int k = 0; k < 4; ++k) { \ + uintptr_t base = (uintptr_t)getTempOutFloatStore(32); \ + __m128 tmp = E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m256 m1 = _mm256_set_m128(tmp, tmp); \ + align1_float* out = (align1_float*)(base + offset); \ + func((Ptr_type)out, m1); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(out, numBytesWritten / sizeof(float), str2); \ + printf("%s(p:align=%d, %s) = %s\n", #func, offset, str, str2); \ + } + +#define void_OutIntPtr_M256i(func, Ptr_type, numBytesWritten, alignmentBytes) \ + for (int i = 0; i < numInterestingInts / 4; ++i) \ + for (int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \ + for (int k = 0; k < 4; ++k) { \ + uintptr_t base = (uintptr_t)getTempOutIntStore(32); \ + __m128i tmp = \ + (__m128i)E1_Int(interesting_ints, i * 4 + k, numInterestingInts); \ + __m256i m1 = _mm256_set_m128i(tmp, tmp); \ + align1_int* out = (align1_int*)(base + offset); \ + func((Ptr_type)out, m1); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(out, (numBytesWritten + sizeof(int) - 1) / sizeof(int), str2); \ + printf("%s(p:align=%d, %s) = %s\n", #func, offset, str, str2); \ + } + +#define void_OutDoublePtr_M256i_M256d( \ + func, Ptr_type, numBytesWritten, alignmentBytes) \ + for (int i = 0; i < numInterestingDoubles / 2; ++i) \ + for (int j = 0; j < numInterestingInts / 4; ++j) \ + for (int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \ + for (int k = 0; k < 2; ++k) { \ + uintptr_t base = (uintptr_t)getTempOutDoubleStore(32); \ + __m128i tmp1 = \ + (__m128i)E1_Int(interesting_ints, j * 4, numInterestingInts); \ + __m256i m1 = _mm256_set_m128i(tmp1, tmp1); \ + __m128d tmp2 = \ + E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \ + __m256d m2 = _mm256_set_m128d(tmp2, tmp2); \ + align1_double* out = (align1_double*)(base + offset); \ + func((Ptr_type)out, m1, m2); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(out, numBytesWritten / sizeof(double), str3); \ + printf( \ + "%s(p:align=%d, %s, %s) = %s\n", #func, offset, str, str2, str3); \ + } + +#define void_OutFloatPtr_M256i_M256( \ + func, Ptr_type, numBytesWritten, alignmentBytes) \ + for (int i = 0; i < numInterestingFloats / 4; ++i) \ + for (int j = 0; j < numInterestingInts / 4; ++j) \ + for (int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \ + for (int k = 0; k < 4; ++k) { \ + uintptr_t base = (uintptr_t)getTempOutFloatStore(16); 
\ + __m128i tmp1 = \ + (__m128i)E1_Int(interesting_ints, j * 4, numInterestingInts); \ + __m256i m1 = _mm256_set_m128i(tmp1, tmp1); \ + __m128 tmp2 = \ + E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m256 m2 = _mm256_set_m128(tmp2, tmp2); \ + align1_float* out = (align1_float*)(base + offset); \ + func((Ptr_type)out, m1, m2); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(&m2, str2); \ + char str3[256]; \ + tostr(out, numBytesWritten / sizeof(float), str3); \ + printf( \ + "%s(p:align=%d, %s, %s) = %s\n", #func, offset, str, str2, str3); \ + } + +#define void_OutFloatPtr_OutFloatPtr_M256( \ + func, Ptr_type, numBytesWritten, alignmentBytes) \ + for (int i = 0; i < numInterestingFloats / 4; ++i) \ + for (int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \ + for (int k = 0; k < 4; ++k) { \ + uintptr_t base = (uintptr_t)getTempOutFloatStore(32); \ + __m128 tmp = E1(interesting_floats, i * 4 + k, numInterestingFloats); \ + __m256 m1 = _mm256_set_m128(tmp, tmp); \ + align1_float* out1 = (align1_float*)(base + offset); \ + align1_float* out2 = out1 + 4; \ + func((Ptr_type)out1, (Ptr_type)out2, m1); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(out1, numBytesWritten / 2 / sizeof(float), str2); \ + char str3[256]; \ + tostr(out2, numBytesWritten / 2 / sizeof(float), str3); \ + printf( \ + "%s(p:align=%d, %s) = %s,%s\n", #func, offset, str, str2, str3); \ + } + +#define void_OutDoublePtr_OutDoublePtr_M256d( \ + func, Ptr_type, numBytesWritten, alignmentBytes) \ + for (int i = 0; i < numInterestingDoubles / 2; ++i) \ + for (int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \ + for (int k = 0; k < 2; ++k) { \ + uintptr_t base = (uintptr_t)getTempOutDoubleStore(32); \ + __m128d tmp = \ + E1_Double(interesting_doubles, i * 2 + k, numInterestingDoubles); \ + __m256d m1 = _mm256_set_m128d(tmp, tmp); \ + align1_double* out1 = (align1_double*)(base + offset); \ + align1_double* out2 = out1 + 2; \ + func((Ptr_type)out1, (Ptr_type)out2, m1); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr(out1, numBytesWritten / 2 / sizeof(double), str2); \ + char str3[256]; \ + tostr(out2, numBytesWritten / 2 / sizeof(double), str3); \ + printf( \ + "%s(p:align=%d, %s) = %s,%s\n", #func, offset, str, str2, str3); \ + } + +#define void_OutIntPtr_OutIntPtr_M256i( \ + func, Ptr_type, numBytesWritten, alignmentBytes) \ + for (int i = 0; i < numInterestingInts / 4; ++i) \ + for (int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \ + for (int k = 0; k < 4; ++k) { \ + uintptr_t base = (uintptr_t)getTempOutIntStore(32); \ + __m128i tmp = \ + (__m128i)E1_Int(interesting_ints, i * 4 + k, numInterestingInts); \ + __m256i m1 = _mm256_set_m128i(tmp, tmp); \ + align1_int* out1 = (align1_int*)(base + offset); \ + align1_int* out2 = out1 + 4; \ + func((Ptr_type)out1, (Ptr_type)out2, m1); \ + char str[256]; \ + tostr(&m1, str); \ + char str2[256]; \ + tostr( \ + out1, (numBytesWritten + sizeof(int) - 1) / 2 / sizeof(int), str2); \ + char str3[256]; \ + tostr( \ + out2, (numBytesWritten + sizeof(int) - 1) / 2 / sizeof(int), str3); \ + printf( \ + "%s(p:align=%d, %s) = %s,%s\n", #func, offset, str, str2, str3); \ + } + +#define Ret_Double2(Ret_type, func, inc) \ + for (int i = 0; i + 2 <= numInterestingDoubles; i += inc) { \ + double* ptr = interesting_doubles + i; \ + Ret_type ret = func(ptr[0], ptr[1]); \ + char str[256]; \ + tostr(ptr, 2, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + 
printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_Double4(Ret_type, func, inc) \ + for (int i = 0; i + 4 <= numInterestingDoubles; i += inc) { \ + double* ptr = interesting_doubles + i; \ + Ret_type ret = func(ptr[0], ptr[1], ptr[2], ptr[3]); \ + char str[256]; \ + tostr(ptr, 4, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_Float8(Ret_type, func, inc) \ + for (int i = 0; i + 8 <= numInterestingFloats; i += inc) { \ + float* ptr = interesting_floats + i; \ + Ret_type ret = \ + func(ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7]); \ + char str[256]; \ + tostr(ptr, 8, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_Int8(Ret_type, func, inc) \ + for (int i = 0; i + 8 <= numInterestingInts; i += inc) { \ + int* ptr = (int*)interesting_ints + i; \ + Ret_type ret = \ + func(ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7]); \ + char str[256]; \ + tostr(ptr, 8, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_Short16(Ret_type, func, inc) \ + for (int i = 0; i + 16 <= numInterestingInts * 2; i += inc) { \ + short* ptr = ((short*)interesting_ints) + i; \ + Ret_type ret = func(ptr[0], \ + ptr[1], \ + ptr[2], \ + ptr[3], \ + ptr[4], \ + ptr[5], \ + ptr[6], \ + ptr[7], \ + ptr[8], \ + ptr[9], \ + ptr[10], \ + ptr[11], \ + ptr[12], \ + ptr[13], \ + ptr[14], \ + ptr[15]); \ + char str[256]; \ + tostr((int*)ptr, 8, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_Char32(Ret_type, func, inc) \ + for (int i = 0; i + 32 <= numInterestingInts * 4; i += inc) { \ + char* ptr = ((char*)interesting_ints) + i; \ + Ret_type ret = func(ptr[0], \ + ptr[1], \ + ptr[2], \ + ptr[3], \ + ptr[4], \ + ptr[5], \ + ptr[6], \ + ptr[7], \ + ptr[8], \ + ptr[9], \ + ptr[10], \ + ptr[11], \ + ptr[12], \ + ptr[13], \ + ptr[14], \ + ptr[15], \ + ptr[16], \ + ptr[17], \ + ptr[18], \ + ptr[19], \ + ptr[20], \ + ptr[21], \ + ptr[22], \ + ptr[23], \ + ptr[24], \ + ptr[25], \ + ptr[26], \ + ptr[27], \ + ptr[28], \ + ptr[29], \ + ptr[30], \ + ptr[31]); \ + char str[256]; \ + tostr((int*)ptr, 8, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_Longlong4(Ret_type, func, inc) \ + for (int i = 0; i + 4 <= numInterestingInts / 2; i += inc) { \ + long long* ptr = ((long long*)interesting_ints) + i; \ + Ret_type ret = func(ptr[0], ptr[1], ptr[2], ptr[3]); \ + char str[256]; \ + tostr((int*)ptr, 8, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_Double(Ret_type, func, inc) \ + for (int i = 0; i + 1 <= numInterestingDoubles; i += inc) { \ + double* ptr = interesting_doubles + i; \ + Ret_type ret = func(*ptr); \ + char str[256]; \ + tostr(ptr, 1, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#define Ret_Int(Ret_type, func, inc) \ + for (int i = 0; i + 1 <= numInterestingInts; i += inc) { \ + int* ptr = ((int*)interesting_ints) + i; \ + Ret_type ret = func(*ptr); \ + char str[256]; \ + tostr(ptr, 1, str); \ + char str2[256]; \ + tostr(&ret, str2); \ + printf("%s(%s) = %s\n", #func, str, str2); \ + } + +#endif