diff --git a/.gitignore b/.gitignore index 5d155efb..9e41d884 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ cmake_install.cmake *~ .DS_Store build +tags diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 00000000..b8570dd3 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,100 @@ +before_script: + #- export http_proxy=http://proxyout.lanl.gov:8080 + #- export HTTP_PROXY=http://proxyout.lanl.gov:8080 + #- export https_proxy=https://proxyout.lanl.gov:8080 + #- export HTTPS_PROXY=https://proxyout.lanl.gov:8080 + - . /mnt/local/ssd/vpic/spack/share/spack/setup-env.sh + - . $(spack location -i lmod)/lmod/lmod/init/zsh + - . /mnt/local/ssd/vpic/spack/share/spack/setup-env.sh + - module unuse /mnt/local/ssd/vpic/spack/share/spack/modules/linux-centos7-x86_64 + - module use /mnt/local/ssd/vpic/spack/share/spack/lmod/linux-centos7-x86_64 + +stages: + - build_intel + - build_gcc + +# TODO: add werror build +# TODO: Add v8/v16/etc + +.build_gcc: + variables: + GIT_SUBMODULE_STRATEGY: recursive + stage: build_gcc + script: + - module load Core/gcc/8.2.0-sxbf4jq + - module load cmake/3.13.3-zd4lpat + - module load openmpi/3.1.3-atscp4j + - j="$(grep -c processor /proc/cpuinfo 2>/dev/null)" || j=0; ((j++)) + - build_dir=build-${CI_JOB_NAME}-${CI_JOB_ID} + - mkdir -p ${build_dir} + - pushd ${build_dir} + - CC=${_CC} CXX=${_CXX} cmake + -DENABLE_INTEGRATED_TESTS=ON + -DENABLE_UNIT_TESTS=ON + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + .. + - make -j ${j} -l ${l} VERBOSE=1 + - make test CTEST_OUTPUT_ON_FAILURE=1 + - make install DESTDIR=${PWD}/install + +.build_intel: + variables: + GIT_SUBMODULE_STRATEGY: recursive + stage: build_intel + script: + - module load Core/gcc/8.2.0-sxbf4jq + - module load gcc/8.2.0/cmake/3.13.3-zd4lpat + - source /mnt/local/ssd/vpic/spack/opt/spack/linux-centos7-x86_64/gcc-8.2.0/intel-19.0.1-p7galop7xyykgiz67bwmth44xwitbbfg/compilers_and_libraries_2019.1.144/linux/bin/compilervars.sh intel64 + - module load intel/19.0.1.144/openmpi/3.1.3-dhknpg2 + - j="$(grep -c processor /proc/cpuinfo 2>/dev/null)" || j=0; ((j++)) + - build_dir=build-${CI_JOB_NAME}-${CI_JOB_ID} + - mkdir -p ${build_dir} + - pushd ${build_dir} + - CC=${_CC} CXX=${_CXX} cmake + -DENABLE_INTEGRATED_TESTS=ON + -DENABLE_UNIT_TESTS=ON + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + .. 
+ - make -j ${j} -l ${l} VERBOSE=1 + - make test CTEST_OUTPUT_ON_FAILURE=1 + - make install DESTDIR=${PWD}/install + +.gcc: + variables: + _CC: "gcc" + _CXX: "g++" + extends: .build_gcc + +Release-GCC: + variables: + CMAKE_BUILD_TYPE: "Release" + extends: .gcc + +Debug-GCC: + variables: + CMAKE_BUILD_TYPE: "Debug" + extends: .gcc + + #.clang: + #variables: + #_CC: "clang" + #_CXX: "clang++" + #extends: .build + + #Release-Clang: + #variables: + #CMAKE_BUILD_TYPE: "Release" + #extends: .clang + +.intel: + variables: + #_CC: "icc" + #_CXX: "icpc" + _CC: "mpicc" + _CXX: "mpic++" + extends: .build_intel + +Release-Intel: + variables: + CMAKE_BUILD_TYPE: "Release" + extends: .intel diff --git a/.travis.yml b/.travis.yml index 16e1a095..232151e6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,15 +9,15 @@ addons: packages: - ccache - cmake + - libhwloc-dev - libopenmpi-dev - openmpi-bin - gcc-4.9 - g++-4.9 - before_install: - mkdir -p "$HOME/bin" - if [[ ${CMAKE_VERSION} ]]; then wget --no-check-certificate -qO- http://www.cmake.org/files/v${CMAKE_VERSION:0:3}/cmake-${CMAKE_VERSION}.tar.gz | tar -xz && ln -s $PWD/cmake-${CMAKE_VERSION}/bin/cmake "$HOME/bin/cmake"; fi - - if [[ ${COVERAGE} ]]; then pip install --user codecov; fi + - if [[ ${COVERAGE} ]]; then pip install --user codecov; fi env: #maybe add mpich later global: @@ -25,12 +25,17 @@ env: #maybe add mpich later - CMAKE_VERSION=3.1.3-Linux-x86_64 - GVER=4.9 matrix: - - - COVERAGE=ON + - USE_V4_SSE=ON + - USE_V4_AVX2=ON + - USE_V4_PORTABLE=ON + - USE_V4_AVX2=ON USE_V8_AVX2=ON + - USE_V4_PORTABLE=ON USE_V8_PORTABLE=ON + - USE_V4_PORTABLE=ON USE_V16_PORTABLE=ON script: - - mkdir build && cd build && - PATH="$HOME/bin:/usr/lib/ccache:$PATH" CC=gcc-${GVER} CXX=g++-${GVER} cmake -DENABLE_INTEGRATED_TESTS=ON -DENABLE_UNIT_TESTS=ON ${COVERAGE:+-DENABLE_COVERAGE_BUILD=ON} .. && + - mkdir build && cd build && + PATH="$HOME/bin:/usr/lib/ccache:$PATH" CC=gcc-${GVER} CXX=g++-${GVER} cmake -DENABLE_INTEGRATED_TESTS=ON -DENABLE_UNIT_TESTS=ON ${USE_V4_SSE:+-DUSE_V4_SSE=ON} ${USE_V4_PORTABLE:+-DUSE_V4_PORTABLE=ON} ${COVERAGE:+-DENABLE_COVERAGE_BUILD=ON} .. 
&& make -j4 VERBOSE=1 && make test CTEST_OUTPUT_ON_FAILURE=1 && make install DESTDIR=$PWD after_success: diff --git a/CMakeLists.txt b/CMakeLists.txt index 28921e11..2f9902c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,7 +37,13 @@ include_directories(${MPI_C_INCLUDE_PATH}) # Add build options #------------------------------------------------------------------------------# -option(ENABLE_INTEGRATED_TESTS "enable integrated tests" OFF) +option(ENABLE_INTEGRATED_TESTS "Enable integrated tests" OFF) + +option(ENABLE_UNIT_TESTS "Enable unit tests" OFF) + +option(USE_OPENMP "Use OpenMP" OFF) + +option(USE_PTHREADS "Use Pthreads" ON) option(USE_V4_ALTIVEC "Enable V4 Altivec" OFF) @@ -45,8 +51,34 @@ option(USE_V4_PORTABLE "Enable V4 Portable" OFF) option(USE_V4_SSE "Enable V4 SSE" OFF) +option(USE_V4_AVX "Enable V4 AVX" OFF) + +option(USE_V4_AVX2 "Enable V4 AVX2" OFF) + +option(USE_V8_PORTABLE "Enable V8 Portable" OFF) + +option(USE_V8_AVX "Enable V8 AVX" OFF) + +option(USE_V8_AVX2 "Enable V8 AVX2" OFF) + +option(USE_V16_PORTABLE "Enable V16 Portable" OFF) + +option(USE_V16_AVX512 "Enable V16 AVX512" OFF) + +option(USE_LEGACY_SORT "Enable Legacy Sort Implementation" OFF) + +#option(USE_ADVANCE_P_AUTOVEC "Enable Explicit Autovec" OFF) + +option(VPIC_PRINT_MORE_DIGITS "Print more digits in VPIC timer info" OFF) + option(ENABLE_OPENSSL "Enable OpenSSL support for checksums" OFF) +option(DISABLE_DYNAMIC_RESIZING "Prevent particle arrays from dynamically resizing during a run" OFF) + +# option to set minimum number of particles +set(SET_MIN_NUM_PARTICLES AUTO CACHE STRING "Select minimum number of particles to use, if using dynamic particle array resizing") + + #------------------------------------------------------------------------------# # Create include and link aggregates # @@ -72,6 +104,14 @@ endif("${CMAKE_BUILD_TYPE}" STREQUAL "RelWithDebInfo") string(REPLACE ";" " " string_libraries "${MPI_CXX_LIBRARIES} ${MPI_C_LIBRARIES}") set(VPIC_CXX_LIBRARIES "${string_libraries}") +if(DISABLE_DYNAMIC_RESIZING) + add_definitions(-DDISABLE_DYNAMIC_RESIZING) +endif(DISABLE_DYNAMIC_RESIZING) + +if(NOT SET_MIN_NUM_PARTICLES STREQUAL "AUTO") + add_definitions(-DMIN_NP=${SET_MIN_NUM_PARTICLES}) +endif() + #------------------------------------------------------------------------------# # OpenSSL #------------------------------------------------------------------------------# @@ -90,28 +130,156 @@ find_package(Threads REQUIRED) # Act on build options set in project.cmake #------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +# Add options for building with the legacy particle sort implementation. +#------------------------------------------------------------------------------# + +if(USE_LEGACY_SORT) + add_definitions(-DVPIC_USE_LEGACY_SORT) + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DVPIC_USE_LEGACY_SORT") +endif(USE_LEGACY_SORT) + +#------------------------------------------------------------------------------# +# Add options for building with a threading model. 
+#------------------------------------------------------------------------------# + +# We don't want both PTHREADS and OpenMP +if ((USE_PTHREADS) AND (USE_OPENMP)) + message( FATAL_ERROR "Only one threading model can be selected" ) +endif() + + +if(USE_PTHREADS) + add_definitions(-DVPIC_USE_PTHREADS) + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DVPIC_USE_PTHREADS") +endif(USE_PTHREADS) + +if(USE_OPENMP) + find_package(OpenMP) + if(OPENMP_FOUND) + add_definitions(-DVPIC_USE_OPENMP) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DVPIC_USE_OPENMP") + endif(OPENMP_FOUND) +endif(USE_OPENMP) + +#------------------------------------------------------------------------------# +# Add options for building with v4 simd vector support. +#------------------------------------------------------------------------------# + set(USE_V4) -if(USE_V4_ALTIVEC) - add_definitions(-DUSE_V4_ALTIVEC) - set(USE_V4 True) -endif(USE_V4_ALTIVEC) if(USE_V4_PORTABLE) add_definitions(-DUSE_V4_PORTABLE) + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DUSE_V4_PORTABLE") set(USE_V4 True) endif(USE_V4_PORTABLE) if(USE_V4_SSE) add_definitions(-DUSE_V4_SSE) + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DUSE_V4_SSE") set(USE_V4 True) endif(USE_V4_SSE) +if(USE_V4_AVX) + add_definitions(-DUSE_V4_AVX) + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DUSE_V4_AVX") + set(USE_V4 True) +endif(USE_V4_AVX) + +if(USE_V4_AVX2) + add_definitions(-DUSE_V4_AVX2) + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DUSE_V4_AVX2") + set(USE_V4 True) +endif(USE_V4_AVX2) + +if(USE_V4_ALTIVEC) + add_definitions(-DUSE_V4_ALTIVEC) + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DUSE_V4_ALTIVEC") + set(USE_V4 True) +endif(USE_V4_ALTIVEC) + +#------------------------------------------------------------------------------# +# Add options for building with v8 simd vector support. +#------------------------------------------------------------------------------# + +set(USE_V8) + +if(USE_V8_PORTABLE) + add_definitions(-DUSE_V8_PORTABLE) + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DUSE_V8_PORTABLE") + set(USE_V8 True) +endif(USE_V8_PORTABLE) + +if(USE_V8_AVX) + add_definitions(-DUSE_V8_AVX) + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DUSE_V8_AVX") + set(USE_V8 True) +endif(USE_V8_AVX) + +if(USE_V8_AVX2) + add_definitions(-DUSE_V8_AVX2) + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DUSE_V8_AVX2") + set(USE_V8 True) +endif(USE_V8_AVX2) + +#------------------------------------------------------------------------------# +# Add options for building with v16 simd vector support. +#------------------------------------------------------------------------------# + +set(USE_V16) + +if(USE_V16_PORTABLE) + add_definitions(-DUSE_V16_PORTABLE) + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DUSE_V16_PORTABLE") + set(USE_V16 True) +endif(USE_V16_PORTABLE) + +if(USE_V16_AVX512) + add_definitions(-DUSE_V16_AVX512) + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DUSE_V16_AVX512") + set(USE_V16 True) +endif(USE_V16_AVX512) + +# TODO: Can we improve the way this is done so it's detection of a positive not +# multiple negatives? +if (NOT USE_V4 AND NOT USE_V8 AND NOT USE_V16) + # This option is intended to indicate that we're not using any of the + # specialized intrinsics code paths, and we can expect numerical answer to + # be the same as the "reference" (serial/auto-vec) implementation. 
This can + # be especially useful when testing small differences in expressions + # and we want bitwise agreement with a known implementation. + set(NO_EXPLICIT_VECTOR True) +endif() + + +#------------------------------------------------------------------------------# +# Add options for building with explicit autovec support. +#------------------------------------------------------------------------------# + +#if(USE_ADVANCE_P_AUTOVEC) + #add_definitions(-DADVANCE_P_AUTOVEC) + #set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DADVANCE_P_AUTOVEC") + #set(ADVANCE_P_AUTOVEC True) +#endif(USE_ADVANCE_P_AUTOVEC) + +#------------------------------------------------------------------------------# +# Miscellaneous options. +#------------------------------------------------------------------------------# + if(ENABLE_OPENSSL) add_definitions(-DENABLE_OPENSSL) endif(ENABLE_OPENSSL) +if(VPIC_PRINT_MORE_DIGITS) + add_definitions(-DVPIC_PRINT_MORE_DIGITS) + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DVPIC_PRINT_MORE_DIGITS") +endif(VPIC_PRINT_MORE_DIGITS) + #------------------------------------------------------------------------------# -# Handle vpic compile script last +# Handle vpic compile script last. #------------------------------------------------------------------------------# # Allow files to be references from the root, (eg #include "src/vpic/vpic.h") @@ -166,7 +334,11 @@ file(COPY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${VPIC_CXX_FLAGS}") file(GLOB_RECURSE VPIC_SRC src/*.c src/*.cc) -file(GLOB_RECURSE VPIC_NOT_SRC src/util/v4/test/v4.cc src/util/rng/test/rng.cc) +file(GLOB_RECURSE VPIC_NOT_SRC + src/util/v4/test/v4.cc + src/util/v8/test/v8.cc + src/util/v16/test/v16.cc + src/util/rng/test/rng.cc) list(REMOVE_ITEM VPIC_SRC ${VPIC_NOT_SRC}) option(NO_LIBVPIC "Don't build a libvpic, but all in one" OFF) if(NO_LIBVPIC) @@ -230,6 +402,8 @@ if(ENABLE_UNIT_TESTS) target_link_libraries(rng vpic) add_test(NAME rng COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 1 ${MPIEXEC_PREFLAGS} ./rng) + add_subdirectory(test/unit) + endif(ENABLE_UNIT_TESTS) #~---------------------------------------------------------------------------~-# # vim: set tabstop=2 shiftwidth=2 expandtab : diff --git a/README.md b/README.md index 25cf577e..d11a8856 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ description for the electric and magnetic fields evolved via a second- order finite-difference-time-domain (FDTD) solve. The VPIC code has been optimized for modern computing architectures and uses Message Passing Interface (MPI) calls for multi-node application as well as data -parallelism using pthreads. VPIC employs a variety of short-vector, +parallelism using threads. VPIC employs a variety of short-vector, single-instruction-multiple-data (SIMD) intrinsics for high performance and has been designed so that the data structures align with cache boundaries. The current feature set for VPIC includes a flexible input @@ -52,12 +52,17 @@ with VPIC and Roadrunner, Journal of Physics: Conference Series 180, # Getting the Code -VPIC uses nested submodules. This requires the addition of the *--recursive* -flag when cloning the repository: +To check out the VPIC source, do the following: - % git clone https://github.com/lanl/vpic.git +```bash + git clone https://github.com/lanl/vpic.git +``` -This command will check out the VPIC source code. +## Branches + +The stable release of VPIC is on `master`, the default branch. + +For more cutting-edge features, consider using the `devel` branch.
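+ +For example, to switch a fresh clone to the development branch (a minimal sketch; the repository directory and the `devel` branch name are the ones given above): + +```bash + cd vpic + git checkout devel +``` +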
# Requirements @@ -66,42 +71,84 @@ an up-to-date version of MPI. # Build Instructions - % cd vpic +```bash + cd vpic +``` VPIC uses the CMake build system. To configure a build, do the following from the top-level source directory: - % mkdir build - % cd build +```bash + mkdir build + cd build +``` + +The `./arch` directory also contains various cmake scripts (including specific build options) which can help with building, but the user is left to select which compiler they wish to use. The scripts are largely organized into folders by compiler, with specific flags and options set to match the target compiler. + +Any of the arch scripts can be invoked specifying the file name from inside a build directory: + +```bash + ../arch/reference-Debug +``` + +After configuration, simply type: -Then call the curses version of CMake: +```bash + make +``` - % ccmake .. +Three scripts in the `./arch` directory are of particular note: lanl-ats1-hsw, lanl-ats1-knl and lanl-cts1. These scripts +provide a default way to build VPIC on LANL ATS-1 clusters such as Trinity and Trinitite and LANL CTS-1 clusters. The LANL +ATS-1 clusters are the first generation of DOE Advanced Technology Systems and consist of a partition of dual socket Intel +Haswell nodes and a partition of single socket Intel Knights Landing nodes. The LANL CTS-1 clusters are the first generation +of DOE Commodity Technology Systems and consist of dual socket Intel Broadwell nodes running the TOSS 3.3 operating system. +The lanl-ats1-hsw, lanl-ats1-knl and lanl-cts1 scripts are heavily documented and can be configured to provide a large +variety of custom builds for their respective platform types. These scripts could also serve as a good starting point for +development of a build script for other platform types. Because these scripts also configure the users build environment +via the use of module commands, the scripts run both the cmake and make commands. -The `./arch` directory also contains various cmake scripts (including specific build options) which can help with building +From the user created build directory, these scripts can be invoked as follows: -They can be invoked using something like: +```bash + ../arch/lanl-ats1-hsw +``` - % ../arch/generic-Release +or -GCC users should ensure the `-fno-strict-aliasing` compiler flag is set (as shown in `./arch/generic-gcc-sse`) +```bash + ../arch/lanl-ats1-knl +``` + +or + +```bash + ../arch/lanl-cts1 +``` + +Advanced users may choose to instead invoke `cmake` directly and hand select options. Documentation on valid ways +to select these options may be found in the lanl-ats1 and lanl-cts1 build scripts mentioned above. + +GCC users should ensure the `-fno-strict-aliasing` compiler flag is set (as shown in `./arch/generic-gcc-sse`). -After configuration, simply type 'make'. # Building an example input deck After you have successfully built VPIC, you should have an executable in -the *bin* directory called *vpic*. To build an executable from one of -the sample input decks, simply run: +the `bin` directory called `vpic` (`./bin/vpic`). To build an executable from one of +the sample input decks (found in `./sample`), simply run: - % bin/vpic input_deck +```bash + ./bin/vpic input_deck +``` where *input_deck* is the name of your sample deck. 
For example, to build the *harris* input deck in the *sample* subdirectory *(assuming that your build directory is located in the top-level source directory)*: - % bin/vpic ../sample/harris +```bash + ./bin/vpic ../sample/harris +``` Beginners are advised to read the harris deck thoroughly, as it provides many examples of common uses cases. @@ -117,13 +164,18 @@ The following specific syntax is available to the users: Threading (per MPI rank) can be enabled using the following syntax: -`./binary.Linux --tpp n` +```bash + ./binary.Linux --tpp n +``` Where n specifies the number of threads ### Example: -`mpirun -n 2 ./binary.Linux --tpp 2` +```bash + mpirun -n 2 ./binary.Linux --tpp 2 +``` + To run with VPIC with two threads per MPI rank. @@ -131,17 +183,136 @@ To run with VPIC with two threads per MPI rank. VPIC can restart from a checkpoint dump file, using the following syntax: -`./binary.Linux --restore ` +```bash + ./binary.Linux --restore +``` ### Example: -`./binary.Linux --restore ./restart/restart0` +```bash + ./binary.Linux --restore ./restart/restart0 +``` To restart VPIC using the restart file `./restart/restart0` +# Compile Time Arguments + +Currently, the following options are exposed at compile time for the user's consideration: + +## Particle Array Resizing + +- `DISABLE_DYNAMIC_RESIZING` (default `OFF`): Disable dynamic resizing of the particle arrays +- `SET_MIN_NUM_PARTICLES` (default 128 [4 KB]): Set the minimum number of particles allowed when dynamically resizing + +## Threading Model + + - `USE_PTHREADS`: Use Pthreads for the threading model (default `ON`) + - `USE_OPENMP`: Use OpenMP for the threading model + +## Vectorization + +The following CMake variables are used to control the vector implementation that +VPIC uses for each SIMD width. Currently, there is support for 128-bit, 256-bit +and 512-bit SIMD widths. The default is for each of these CMake variables to be +disabled, which means that an unvectorized reference implementation of functions +will be used. + + - `USE_V4_SSE`: Enable 4-wide (128-bit) SSE + - `USE_V4_AVX`: Enable 4-wide (128-bit) AVX + - `USE_V4_AVX2`: Enable 4-wide (128-bit) AVX2 + - `USE_V4_ALTIVEC`: Enable 4-wide (128-bit) Altivec + - `USE_V4_PORTABLE`: Enable 4-wide (128-bit) portable implementation + + - `USE_V8_AVX`: Enable 8-wide (256-bit) AVX + - `USE_V8_AVX2`: Enable 8-wide (256-bit) AVX2 + - `USE_V8_PORTABLE`: Enable 8-wide (256-bit) portable implementation + + - `USE_V16_AVX512`: Enable 16-wide (512-bit) AVX512 + - `USE_V16_PORTABLE`: Enable 16-wide (512-bit) portable implementation + +Several functions in VPIC have vector implementations for each of the three SIMD +widths. Some only have a single implementation. An example of the latter is +`move_p`, which only has a reference implementation and a V4 implementation. + +It is possible to have a single CMake vector variable configured as ON for each +of the three supported SIMD vector widths. It is recommended to always have a +CMake variable configured as ON for the 128-bit SIMD vector width so that `move_p` +will be vectorized. In addition, it is recommended to configure as ON the CMake +variable that is associated with the native SIMD vector width of the processor +that VPIC is targeting. If a CMake variable is configured as ON for each of the +three available SIMD vector widths, then for a given function in VPIC, the +implementation that supports the largest SIMD vector length will be chosen. If +a V16 implementation exists, it will be chosen.
If a V16 implementation does not +exist but V8 and V4 implementations exist, the V8 implementation will be chosen. +If V16 and V8 implementations do not exist but a V4 implementation does, it will +be chosen. If no SIMD vector implementation exists, the unvectorized reference +implementation will be chosen. + +In summary, when using vector versions on a machine with 256-bit SIMD, the +V4 and V8 implementations should be configured as ON. When using a machine +with 512-bit SIMD, V4 and V16 implementations should be configured as ON. +When choosing a vector implementation for a given SIMD vector length, the +implementation that is closest to the SIMD instruction set for the targeted +processor should be chosen. The portable versions are most commonly used for +debugging the implementation of new intrinsics versions. However, the portable +versions are generally more performant than the unvectorized reference +implementation. So, one might consider using the V4_PORTABLE version on ARM +processors until a V4_NEON implementation becomes available. + +## Output + + - `VPIC_PRINT_MORE_DIGITS`: Enable more digits in timing output of status reports + +## Particle sorting implementation + +The CMake variable below allows building VPIC to use the legacy, thread serial +implementation of the particle sort algorithm. + + - `USE_LEGACY_SORT`: Use legacy thread serial particle sort (default `OFF`) + +The legacy particle sort implementation is the thread serial particle sort +implementation from the legacy v407 version of VPIC. This implementation +supports both in-place and out-of-place sorting of the particles. It is very +competitive with the thread parallel sort implementation for a small number +of threads per MPI rank, i.e., 4 or fewer, especially on KNL because sorting +the particles in-place allows the fraction of particles stored in High +Bandwidth Memory (HBM) to remain stored in HBM. Also, the memory footprint +of VPIC is reduced by the memory of a particle array, which can be significant +for particle-dominated problems. + +The default particle sort implementation is a thread parallel implementation. +Currently, it can only perform out-of-place sorting of the particles. It will +be more performant than the legacy implementation when using many threads per +MPI rank but uses more memory because of the out-of-place sort. + +# Workflow + +Contributors are asked to be aware of the following workflow: + +1) Pull requests are accepted into `devel` upon tests passing +2) `master` should reflect the *stable* state of the code +3) Periodic releases will be made from `devel` into `master` + # Feedback -Feedback, comments, or issues can be raised through [GitHub issues](https://github.com/lanl/vpic/issues) +Feedback, comments, or issues can be raised through [GitHub issues](https://github.com/lanl/vpic/issues). + +A mailing list for open collaboration can also be found [here](https://groups.google.com/forum/#!forum/vpic-users). + +# Versioning + +Version release summary: + +## V1.1 (March 2019) + +- Added V8 and V16 functionality +- Improved documentation and build processes +- Significantly improved testing and correctness capabilities + +## V1.0 + +Initial release + # Release diff --git a/arch/cray-Release-sse b/arch/cray/cray-Release-sse similarity index 68% rename from arch/cray-Release-sse rename to arch/cray/cray-Release-sse index 9cd73d36..e4f9b187 100755 --- a/arch/cray-Release-sse +++ b/arch/cray/cray-Release-sse @@ -1,7 +1,4 @@ #!
/usr/bin/env bash -#~----------------------------------------------------------------------------~# -# placeholder -#~----------------------------------------------------------------------------~# # To build on ciel(it)o: # module swap PrgEnv-pgi PrgEnv-intel @@ -13,16 +10,11 @@ # ../arch/cray-Release-sse # make -j - #------------------------------------------------------------------------------# # Get the path to the project from which this script was called #------------------------------------------------------------------------------# -src_dir="${0%/*}/.." - -#------------------------------------------------------------------------------# -# Check required environment variables -#------------------------------------------------------------------------------# +src_dir="${0%/*}/../.." #------------------------------------------------------------------------------# # Call CMake command @@ -56,12 +48,3 @@ cmake \ # -DENABLE_INTEGRATED_TESTS=OFF \ # -DMPIEXEC=`which aprun` \ # -DMPIEXEC_NUMPROC_FLAG="-n" \ - - -#------------------------------------------------------------------------------# -# vim: syntax=sh -#------------------------------------------------------------------------------# - -#~---------------------------------------------------------------------------~-# -# placeholder -#~---------------------------------------------------------------------------~-# diff --git a/arch/generic-gcc b/arch/gcc/reference-Release similarity index 54% rename from arch/generic-gcc rename to arch/gcc/reference-Release index 030dc7bc..489d60e2 100755 --- a/arch/generic-gcc +++ b/arch/gcc/reference-Release @@ -1,17 +1,9 @@ #! /usr/bin/env bash -#~----------------------------------------------------------------------------~# -# placeholder -#~----------------------------------------------------------------------------~# - #------------------------------------------------------------------------------# # Get the path to the project from which this script was called #------------------------------------------------------------------------------# -src_dir="${0%/*}/.." - -#------------------------------------------------------------------------------# -# Check required environment variables -#------------------------------------------------------------------------------# +src_dir="${0%/*}/../.." #------------------------------------------------------------------------------# # Call CMake command @@ -28,11 +20,3 @@ cmake \ -DCMAKE_C_FLAGS="-rdynamic -fno-strict-aliasing" \ -DCMAKE_CXX_FLAGS="-rdynamic -fno-strict-aliasing" \ $src_dir - -#------------------------------------------------------------------------------# -# vim: syntax=sh -#------------------------------------------------------------------------------# - -#~---------------------------------------------------------------------------~-# -# placeholder -#~---------------------------------------------------------------------------~-# diff --git a/arch/generic-gcc-sse b/arch/gcc/v4-sse similarity index 54% rename from arch/generic-gcc-sse rename to arch/gcc/v4-sse index 2220448c..dfaa4b6c 100755 --- a/arch/generic-gcc-sse +++ b/arch/gcc/v4-sse @@ -1,17 +1,9 @@ #! 
/usr/bin/env bash -#~----------------------------------------------------------------------------~# -# placeholder -#~----------------------------------------------------------------------------~# - #------------------------------------------------------------------------------# # Get the path to the project from which this script was called #------------------------------------------------------------------------------# -src_dir="${0%/*}/.." - -#------------------------------------------------------------------------------# -# Check required environment variables -#------------------------------------------------------------------------------# +src_dir="${0%/*}/../.." #------------------------------------------------------------------------------# # Call CMake command @@ -29,11 +21,3 @@ cmake \ -DCMAKE_C_FLAGS="-rdynamic -fno-strict-aliasing" \ -DCMAKE_CXX_FLAGS="-rdynamic -fno-strict-aliasing" \ $src_dir - -#------------------------------------------------------------------------------# -# vim: syntax=sh -#------------------------------------------------------------------------------# - -#~---------------------------------------------------------------------------~-# -# placeholder -#~---------------------------------------------------------------------------~-# diff --git a/arch/gcc/v8-avx2 b/arch/gcc/v8-avx2 new file mode 100755 index 00000000..632eaeda --- /dev/null +++ b/arch/gcc/v8-avx2 @@ -0,0 +1,24 @@ +#! /usr/bin/env bash +#------------------------------------------------------------------------------# +# Get the path to the project from which this script was called +#------------------------------------------------------------------------------# + +src_dir="${0%/*}/../.." + +#------------------------------------------------------------------------------# +# Call CMake command +#------------------------------------------------------------------------------# + +# The flag -rdynamic removes warnings of the form: +# Unable to find a safely writable symbol that corresponds to address 432af0 +# (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). Writing out +# the raw address instead and keeping my fingers crossed. + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DENABLE_INTEGRATED_TESTS=ON \ + -DUSE_V4_AVX2=ON \ + -DUSE_V8_AVX2=ON \ + -DCMAKE_C_FLAGS="-rdynamic -fno-strict-aliasing" \ + -DCMAKE_CXX_FLAGS="-rdynamic -fno-strict-aliasing" \ + $src_dir diff --git a/arch/generic-Debug-sse b/arch/generic-Debug-sse deleted file mode 100755 index 22c1d57d..00000000 --- a/arch/generic-Debug-sse +++ /dev/null @@ -1,39 +0,0 @@ -#! /usr/bin/env bash -#~----------------------------------------------------------------------------~# -# placeholder -#~----------------------------------------------------------------------------~# - -#------------------------------------------------------------------------------# -# Get the path to the project from which this script was called -#------------------------------------------------------------------------------# - -src_dir="${0%/*}/.." 
- -#------------------------------------------------------------------------------# -# Check required environment variables -#------------------------------------------------------------------------------# - -#------------------------------------------------------------------------------# -# Call CMake command -#------------------------------------------------------------------------------# - -# The flag -rdynamic removes warnings of the form: -# Unable to find a safely writable symbol that corresponds to address 432af0 -# (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). Writing out -# the raw address instead and keeping my fingers crossed. - -cmake \ - -DCMAKE_BUILD_TYPE=Debug \ - -DENABLE_INTEGRATED_TESTS=ON \ - -DUSE_V4_SSE=ON \ - -DCMAKE_C_FLAGS="-rdynamic" \ - -DCMAKE_CXX_FLAGS="-rdynamic" \ - $src_dir - -#------------------------------------------------------------------------------# -# vim: syntax=sh -#------------------------------------------------------------------------------# - -#~---------------------------------------------------------------------------~-# -# placeholder -#~---------------------------------------------------------------------------~-# diff --git a/arch/generic-RelWithDebInfo b/arch/generic-RelWithDebInfo deleted file mode 100755 index c4fd0114..00000000 --- a/arch/generic-RelWithDebInfo +++ /dev/null @@ -1,38 +0,0 @@ -#! /usr/bin/env bash -#~----------------------------------------------------------------------------~# -# placeholder -#~----------------------------------------------------------------------------~# - -#------------------------------------------------------------------------------# -# Get the path to the project from which this script was called -#------------------------------------------------------------------------------# - -src_dir="${0%/*}/.." - -#------------------------------------------------------------------------------# -# Check required environment variables -#------------------------------------------------------------------------------# - -#------------------------------------------------------------------------------# -# Call CMake command -#------------------------------------------------------------------------------# - -# The flag -rdynamic removes warnings of the form: -# Unable to find a safely writable symbol that corresponds to address 432af0 -# (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). Writing out -# the raw address instead and keeping my fingers crossed. - -cmake \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - -DENABLE_INTEGRATED_TESTS=ON \ - -DCMAKE_C_FLAGS="-rdynamic" \ - -DCMAKE_CXX_FLAGS="-rdynamic" \ - $src_dir - -#------------------------------------------------------------------------------# -# vim: syntax=sh -#------------------------------------------------------------------------------# - -#~---------------------------------------------------------------------------~-# -# placeholder -#~---------------------------------------------------------------------------~-# diff --git a/arch/generic-RelWithDebInfo-sse b/arch/generic-RelWithDebInfo-sse deleted file mode 100755 index 089363da..00000000 --- a/arch/generic-RelWithDebInfo-sse +++ /dev/null @@ -1,39 +0,0 @@ -#! 
/usr/bin/env bash -#~----------------------------------------------------------------------------~# -# placeholder -#~----------------------------------------------------------------------------~# - -#------------------------------------------------------------------------------# -# Get the path to the project from which this script was called -#------------------------------------------------------------------------------# - -src_dir="${0%/*}/.." - -#------------------------------------------------------------------------------# -# Check required environment variables -#------------------------------------------------------------------------------# - -#------------------------------------------------------------------------------# -# Call CMake command -#------------------------------------------------------------------------------# - -# The flag -rdynamic removes warnings of the form: -# Unable to find a safely writable symbol that corresponds to address 432af0 -# (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). Writing out -# the raw address instead and keeping my fingers crossed. - -cmake \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - -DENABLE_INTEGRATED_TESTS=ON \ - -DUSE_V4_SSE=ON \ - -DCMAKE_C_FLAGS="-rdynamic" \ - -DCMAKE_CXX_FLAGS="-rdynamic" \ - $src_dir - -#------------------------------------------------------------------------------# -# vim: syntax=sh -#------------------------------------------------------------------------------# - -#~---------------------------------------------------------------------------~-# -# placeholder -#~---------------------------------------------------------------------------~-# diff --git a/arch/intel/v16-avx2 b/arch/intel/v16-avx2 new file mode 100755 index 00000000..bd4573de --- /dev/null +++ b/arch/intel/v16-avx2 @@ -0,0 +1,24 @@ +#! /usr/bin/env bash +#------------------------------------------------------------------------------# +# Get the path to the project from which this script was called +#------------------------------------------------------------------------------# + +src_dir="${0%/*}/../.." + +#------------------------------------------------------------------------------# +# Call CMake command +#------------------------------------------------------------------------------# + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DENABLE_INTEGRATED_TESTS=OFF \ + -DENABLE_UNIT_TESTS=OFF \ + -DCMAKE_C_COMPILER=mpiicc \ + -DCMAKE_CXX_COMPILER=mpiicpc \ + -DCMAKE_C_FLAGS="-O3 -rdynamic -inline-forceinline \ + -qoverride-limits -no-ansi-alias -xHost" \ + -DCMAKE_CXX_FLAGS="-O3 -rdynamic -inline-forceinline \ + -qoverride-limits -no-ansi-alias -xHost" \ + -DUSE_V4_AVX2=ON \ + -DUSE_V16_AVX512=ON \ + $src_dir diff --git a/arch/intel/v16-knl b/arch/intel/v16-knl new file mode 100755 index 00000000..28d6bdca --- /dev/null +++ b/arch/intel/v16-knl @@ -0,0 +1,24 @@ +#! /usr/bin/env bash +#------------------------------------------------------------------------------# +# Get the path to the project from which this script was called +#------------------------------------------------------------------------------# + +src_dir="${0%/*}/../.." 
+ +#------------------------------------------------------------------------------# +# Call CMake command +#------------------------------------------------------------------------------# + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DENABLE_INTEGRATED_TESTS=OFF \ + -DENABLE_UNIT_TESTS=OFF \ + -DCMAKE_C_COMPILER=mpiicc \ + -DCMAKE_CXX_COMPILER=mpiicpc \ + -DCMAKE_C_FLAGS="-O3 -rdynamic -inline-forceinline \ + -qoverride-limits -no-ansi-alias -xMIC-AVX512" \ + -DCMAKE_CXX_FLAGS="-O3 -rdynamic -inline-forceinline \ + -qoverride-limits -no-ansi-alias -xMIC-AVX512" \ + -DUSE_V4_AVX2=ON \ + -DUSE_V16_AVX512=ON \ + $src_dir diff --git a/arch/intel/v4-sse b/arch/intel/v4-sse new file mode 100755 index 00000000..d0726dbb --- /dev/null +++ b/arch/intel/v4-sse @@ -0,0 +1,23 @@ +#! /usr/bin/env bash +#------------------------------------------------------------------------------# +# Get the path to the project from which this script was called +#------------------------------------------------------------------------------# + +src_dir="${0%/*}/../.." + +#------------------------------------------------------------------------------# +# Call CMake command +#------------------------------------------------------------------------------# + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DENABLE_INTEGRATED_TESTS=OFF \ + -DENABLE_UNIT_TESTS=OFF \ + -DCMAKE_C_COMPILER=mpiicc \ + -DCMAKE_CXX_COMPILER=mpiicpc \ + -DCMAKE_C_FLAGS="-O3 -rdynamic -inline-forceinline \ + -qoverride-limits -no-ansi-alias -xHost" \ + -DCMAKE_CXX_FLAGS="-O3 -rdynamic -inline-forceinline \ + -qoverride-limits -no-ansi-alias -xHost" \ + -DUSE_V4_SSE=ON \ + $src_dir diff --git a/arch/intel/v8-avx2 b/arch/intel/v8-avx2 new file mode 100755 index 00000000..b53a1e41 --- /dev/null +++ b/arch/intel/v8-avx2 @@ -0,0 +1,24 @@ +#! /usr/bin/env bash +#------------------------------------------------------------------------------# +# Get the path to the project from which this script was called +#------------------------------------------------------------------------------# + +src_dir="${0%/*}/../.." + +#------------------------------------------------------------------------------# +# Call CMake command +#------------------------------------------------------------------------------# + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DENABLE_INTEGRATED_TESTS=OFF \ + -DENABLE_UNIT_TESTS=OFF \ + -DCMAKE_C_COMPILER=mpiicc \ + -DCMAKE_CXX_COMPILER=mpiicpc \ + -DCMAKE_C_FLAGS="-O3 -rdynamic -inline-forceinline \ + -qoverride-limits -no-ansi-alias -xHost" \ + -DCMAKE_CXX_FLAGS="-O3 -rdynamic -inline-forceinline \ + -qoverride-limits -no-ansi-alias -xHost" \ + -DUSE_V4_AVX2=ON \ + -DUSE_V8_AVX2=ON \ + $src_dir diff --git a/arch/knl-whitebox-RelWithDebInfo-sse b/arch/knl-whitebox-RelWithDebInfo-sse deleted file mode 100755 index 976c1d47..00000000 --- a/arch/knl-whitebox-RelWithDebInfo-sse +++ /dev/null @@ -1,41 +0,0 @@ -#! /usr/bin/env bash -#~----------------------------------------------------------------------------~# -# placeholder -#~----------------------------------------------------------------------------~# - -#------------------------------------------------------------------------------# -# Get the path to the project from which this script was called -#------------------------------------------------------------------------------# - -src_dir="${0%/*}/.." 
- -#------------------------------------------------------------------------------# -# Check required environment variables -#------------------------------------------------------------------------------# - -#------------------------------------------------------------------------------# -# Call CMake command -#------------------------------------------------------------------------------# - -# The flag -rdynamic removes warnings of the form: -# Unable to find a safely writable symbol that corresponds to address 432af0 -# (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). Writing out -# the raw address instead and keeping my fingers crossed. - -cmake \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - -DENABLE_INTEGRATED_TESTS=ON \ - -DUSE_V4_SSE=ON \ - -DCMAKE_C_COMPILER=mpiicc \ - -DCMAKE_CXX_COMPILER=mpiicpc \ - -DCMAKE_C_FLAGS="-xmic-avx512 -rdynamic" \ - -DCMAKE_CXX_FLAGS="-xmic-avx512 -rdynamic" \ - $src_dir - -#------------------------------------------------------------------------------# -# vim: syntax=sh -#------------------------------------------------------------------------------# - -#~---------------------------------------------------------------------------~-# -# placeholder -#~---------------------------------------------------------------------------~-# diff --git a/arch/lanl-ats1-hsw b/arch/lanl-ats1-hsw new file mode 100755 index 00000000..56a18692 --- /dev/null +++ b/arch/lanl-ats1-hsw @@ -0,0 +1,963 @@ +#! /usr/bin/env bash +#------------------------------------------------------------------------------# +# This script supports building VPIC on ATS-1 machines at Los Alamos National +# Laboratory (LANL) for Haswell nodes. These machines run the Cray Linux +# Environment Operating System and have two compute partitions, a Haswell +# partition and a Knights Landing (KNL) partition. Both processor types are +# Intel processors. These machines provide three compiler choices: Intel, GNU +# and Cray compilers. Two MPI implementations are provided: Cray Mpich and Open +# MPI. +# +# Normal users should not need to change this script if building VPIC to run +# on the Haswell nodes of ATS-1 machines and happy with defaults. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Get the path to the project from which this script was called. +#------------------------------------------------------------------------------# + +src_dir="${0%/*}/.." + +#------------------------------------------------------------------------------# +# Configure the type of build that we want to perform. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Choose a compiler. +#------------------------------------------------------------------------------# +# One of the compiler choices in this section must be chosen. Valid options +# are the following. +# +# INT: Intel compilers +# GNU: GNU compilers +# CCE: Cray compilers +# +# Note that selecting CCE for the Cray compilers currently does not work. The +# main reason why you might want to compile with the Cray compilers is to use +# some of the Cray specific tools like Reveal or a small set of features in +# the CrayPat profiling software. This is not a common use case for users. 
+#------------------------------------------------------------------------------# + +VCOM="INT" +#VCOM="GNU" +#VCOM="CCE" + +#------------------------------------------------------------------------------# +# Choose an MPI implementation. +#------------------------------------------------------------------------------# +# One of the MPI library choices must be chosen. Valid options are the +# following. +# +# CMPI: Cray Mpich, the Cray supported MPI library +# OMPI: Open MPI +#------------------------------------------------------------------------------# + +VMPI="CMPI" +#VMPI="OMPI" + +#------------------------------------------------------------------------------# +# Choose a thread model. +#------------------------------------------------------------------------------# +# One of the two available thread models must be chosen. Valid options are the +# following. +# +# PTH: Pthreads +# OMP: OpenMP +#------------------------------------------------------------------------------# + +VTHR="PTH" +#VTHR="OMP" + +#------------------------------------------------------------------------------# +# Choose type of vector intrinsics support. +#------------------------------------------------------------------------------# +# Note the following constraints. +# +# Each of the nine variables in this section must have a configured value. +# This is because the corresponding "USE" cmake variable is set on the cmake +# command line below to allow any possible combinations to be configured using +# a single cmake command. +# +# If all values are configured as OFF, the scalar implementations of VPIC +# functions which are not vectorized will be used. +# +# It is possible to have a vector version configured as ON for each of the +# three vector widths i.e. V4, V8 and V16. In that scenario, if a given VPIC +# function has a V16 implementation, that will be used. If there is not a V16 +# implementation but there is a V8 implementation, that will be used. If there +# is not a V16 or V8 implementation but there is a V4 implementation, that +# will be used. Finally, for functions that have no vector implementations, +# the scalar version will be used. +# +# Currently, it is recommended to always configure the appropriate V4 version +# as on if using vector versions because there are key functions that only +# have a V4 version because the current algorithm does not generalize to +# longer vector lengths. An example is the move_p function. Since the V4 +# versions are generally more performant than the scalar versions, it makes +# sense to use them even when using the longer vector length implementations +# for other VPIC functions. +# +# In summary, when using vector versions on a machine with 256 bit SIMD, the +# V4 and V8 implementations should be configured as ON. When using a machine +# with 512 bit SIMD, V4 and V16 implementations should be configured as ON. +# +# First, we turn all of the vector options OFF. Then, we turn on the ones we +# want. 
+#------------------------------------------------------------------------------# + +SET_V4_PORTABLE="OFF" +SET_V4_SSE="OFF" +SET_V4_AVX="OFF" +SET_V4_AVX2="OFF" +SET_V8_PORTABLE="OFF" +SET_V8_AVX="OFF" +SET_V8_AVX2="OFF" +SET_V16_PORTABLE="OFF" +SET_V16_AVX512="OFF" + +#SET_V4_PORTABLE="ON" +#SET_V4_SSE="ON" +#SET_V4_AVX="ON" +SET_V4_AVX2="ON" +#SET_V8_PORTABLE="ON" +#SET_V8_AVX="ON" +SET_V8_AVX2="ON" +#SET_V16_PORTABLE="ON" +#SET_V16_AVX512="ON" + +#------------------------------------------------------------------------------# +# Choose format of status update output. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_MORE_DIGITS=OFF, the output has two significant figures. +# +# If SET_MORE_DIGITS=ON, the output has four significant figures. +#------------------------------------------------------------------------------# + +SET_MORE_DIGITS="OFF" +#SET_MORE_DIGITS="ON" + +#------------------------------------------------------------------------------# +# Choose a particle sort implementation. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are the +# following. +# +# LSORT: legacy, thread serial sort +# TSORT: thread parallel sort +# +# The LSORT particle sort implementation is the thread serial particle sort +# implementation from the legacy v407 version of VPIC. This implementation +# supports both in-place and out-of-place sorting of the particles. It is very +# competitive with the thread parallel sort implementation for a small number +# of threads per MPI rank, i.e. 4 or less, especially on KNL because sorting +# the particles in-place allows the fraction of particles stored in High +# Bandwidth Memory (HBM) to remain stored in HBM. Also, the memory footprint +# of VPIC is reduced by the memory of a particle array which can be significant +# for particle dominated problems. +# +# The TSORT particle sort implementation is a thread parallel implementation. +# Currently, it can only perform out-of-place sorting of the particles. It will +# be more performant than the LSORT implementation when using many threads per +# MPI rank but uses more memory because of the out-of-place sort. +#------------------------------------------------------------------------------# + +VSORT="LSORT" +#VSORT="TSORT" + +#------------------------------------------------------------------------------# +# Choose type of library to build. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is to build a static library, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_SHARED_LIBS="OFF" +#SET_SHARED_LIBS="ON" + +#------------------------------------------------------------------------------# +# Choose integrated test support. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is not to build the integrated tests, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_INTEGRATED_TESTS="OFF" +#SET_INTEGRATED_TESTS="ON" + +#------------------------------------------------------------------------------# +# Choose unit test support. 
+#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is not to build the unit tests, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_UNIT_TESTS="OFF" +#SET_UNIT_TESTS="ON" + +#------------------------------------------------------------------------------# +# Choose OpenSSL support for checksums. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_ENABLE_OPENSSL=OFF, use of checksums is turned off. +# +# If SET_ENABLE_OPENSSL=ON, use of checksums is turned on. +#------------------------------------------------------------------------------# + +SET_ENABLE_OPENSSL="OFF" +#SET_ENABLE_OPENSSL="ON" + +#------------------------------------------------------------------------------# +# Choose support for dynamic resizing of particle arrays. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_DISABLE_DYNAMIC_RESIZING=OFF, particle arrays will be resized +# dynamically. +# +# If SET_DISABLE_DYNAMIC_RESIZING=ON, particle arrays will not be resized +# dynamically and the user will be responsible for ensuring that particle +# arrays have enough space to handle the evolution of a non-uniform particle +# distribution. +#------------------------------------------------------------------------------# + +SET_DISABLE_DYNAMIC_RESIZING="OFF" +#SET_DISABLE_DYNAMIC_RESIZING="ON" + +#------------------------------------------------------------------------------# +# Choose the minimum number of particles to dynamically allocate space for. +#------------------------------------------------------------------------------# +# A value must be chosen. The default is 128 particles which allocates space +# equal to a 4 KByte page size. +#------------------------------------------------------------------------------# + +SET_PARTICLE_MIN_NUM="128" +#SET_PARTICLE_MIN_NUM="32768" + +#------------------------------------------------------------------------------# +# Choose the CMake build type. +#------------------------------------------------------------------------------# +# One of the available options must be chosen. Valid options depend on build +# types available in the CMake version but include at least the following. +# +# Release: In general, the default for CMake. +# None: Tells CMake not to use any pre-defined build type and gives VPIC build +# system total control of CMake variables defined on cmake command line. +#------------------------------------------------------------------------------# + +SET_BUILD_TYPE="Release" +#SET_BUILD_TYPE="None" + +#------------------------------------------------------------------------------# +# Choose number of parallel make processes for build. +#------------------------------------------------------------------------------# +# If NJ variable is not defined, "make" will perform a parallel build using +# maximum number of processors on the compilation machine. +# +# If using VERBOSE = 1 and NJ > 1, verbose output will be garbled by many +# processes writing to STDOUT at the same time and will be difficult to +# interpret. +# +# When using VERBOSE = 1, use of NJ = 1 is recommended. 
+# +# The default is to use a modest number of processes in the parallel build. +# +# Comment out default below to use all processors on compilation machine. +#------------------------------------------------------------------------------# + +NJ=8 +#NJ=1 + +#------------------------------------------------------------------------------# +# Choose verbosity of "make" output. +#------------------------------------------------------------------------------# +# Setting VERBOSE = 1 causes "make" to output commands it is executing. +# +# This information is useful if debugging a failed build. +# +# Setting VERBOSE = 0 or leaving VERBOSE undefined results in a quiet build. +# +# The default is a quiet build. +#------------------------------------------------------------------------------# + +SET_VERBOSE=0 +#SET_VERBOSE=1 + +#------------------------------------------------------------------------------# +# Choose versions of modules to use if default is not desired. +#------------------------------------------------------------------------------# +# No choice is required in this section. +# +# Some possible alternative module versions are provided below. Change as +# needed or desired. +# +# This section may need to be updated periodically as the module enviroment +# evolves because of updates to operating system and programming environment. +#------------------------------------------------------------------------------# + +#VERSION_CMAKE=3.12.1 + +#VERSION_INTEL=19.0.1 +#VERSION_INTEL_VTUNE_AMPLIFIER=2019.1.0 +#VERSION_INTEL_VECTOR_ADVISOR=2019.1.0 +#VERSION_INTEL_INSPECTOR=2019.1.0 +#VERSION_INTEL_TRACE_ANALYZER=2019.1.022 + +#VERSION_GNU=7.3.0 + +#VERSION_CCE=9.0.0.21672 +#VERSION_CRAY_MPICH=7.7.4.4 +#VERSION_CRAY_PERF_TOOLS=7.0.4 + +#VERSION_OPEN_MPI=3.1.2 + +#VERSION_FORGE=18.3 + +#------------------------------------------------------------------------------# +# Unless the user wants to modify options to the compiler, no changes should +# be needed below this point. +# +# If the user desires to configure compiler options, proceed to the section +# below for the chosen compiler. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Configure default compiler names to use Cray wrapper scripts. +#------------------------------------------------------------------------------# + +VPIC_COMPILER_C="cc" +VPIC_COMPILER_CXX="CC" + +if [ "$VMPI" = "OMPI" ] +then + VPIC_COMPILER_C="mpicc" + VPIC_COMPILER_CXX="mpicxx" +fi + +#------------------------------------------------------------------------------# +# Configure options for the Intel compilers. +#------------------------------------------------------------------------------# + +if [ "$VCOM" = "INT" ] +then + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O3" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O3" could be + # "-O2" or "-Ofast". These alternatives should be benchmarked sometime. 
+ #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O3" + + #--------------------------------------------------------------------------# + # Use of "-inline-forceinline" overrides default heuristics of compiler + # and forces inlining of functions marked with inline keyword if compiler + # is able to inline. For VPIC, this option has mainly been used when using + # a portable implementation to force inlining by compiler and also when + # use of "-Winline" option identifies functions not being inlined that are + # marked with inline keyword. + # + # Use of "-qoverride-limits" cause certain internal compiler limits to be + # ignored that are used to limit memory usage and excessive compile times + # by the compiler. + # + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops + # which can be vectorized to always be vectorized, regardless of the + # amount of computational work in the loop. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -inline-forceinline" + #FLAGS_CXX_COMPILER+=" -vec-threshold0" + FLAGS_CXX_COMPILER+=" -qoverride-limits" + + #--------------------------------------------------------------------------# + # Use of "-no-ansi-alias" informs compiler that VPIC does not obey ANSI + # aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -no-ansi-alias" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + # + # Use of "-craype-verbose" causes Cray compiler wrapper script to print + # command it is forwarding to actual compiler for invocation. This is very + # useful for producing a build log to make sure compiler is being invoked + # with expected options. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Winline" + FLAGS_CXX_COMPILER+=" -craype-verbose" + + #--------------------------------------------------------------------------# + # Use of "-qopt-report=5" specifies level of detail in compiler reports. + # This is the maximum level of detail. + # + # Use of "-qopt-report-phase=all" causes all phases of compilation process + # to provide output for compiler reports. Compiler reports are useful for + # understanding how compiler is optimizing various parts of VPIC. + # + # Use of "-diag-disable 10397" disables printing of diagnostic message + # that compiler reports are being generated. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -qopt-report=5" + FLAGS_CXX_COMPILER+=" -qopt-report-phase=all" + FLAGS_CXX_COMPILER+=" -diag-disable 10397" + + #--------------------------------------------------------------------------# + # Use of "-Wl,--export-dynamic" removes following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. 
+ #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Wl,--export-dynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O3" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O3" could be + # "-O2" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O3" + + #--------------------------------------------------------------------------# + # Use of "-inline-forceinline" overrides default heuristics of compiler + # and forces inlining of functions marked with inline keyword if compiler + # is able to inline. For VPIC, this option has mainly been used when using + # a portable implementation to force inlining by compiler and also when + # use of "-Winline" option identifies functions not being inlined that are + # marked with inline keyword. + # + # Use of "-qoverride-limits" cause certain internal compiler limits to be + # ignored that are used to limit memory usage and excessive compile times + # by the compiler. + # + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops + # which can be vectorized to always be vectorized, regardless of the + # amount of computational work in the loop. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -inline-forceinline" + #FLAGS_C_COMPILER+=" -vec-threshold0" + FLAGS_C_COMPILER+=" -qoverride-limits" + + #--------------------------------------------------------------------------# + # Use of "-no-ansi-alias" informs compiler that VPIC does not obey ANSI + # aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -no-ansi-alias" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + # + # Use of "-craype-verbose" causes Cray compiler wrapper script to print + # command it is forwarding to actual compiler for invocation. This is very + # useful for producing a build log to make sure compiler is being invoked + # with expected options. 
+ #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Winline" + FLAGS_C_COMPILER+=" -craype-verbose" + + #--------------------------------------------------------------------------# + # Use of "-qopt-report=5" specifies level of detail in compiler reports. + # This is the maximum level of detail. + # + # Use of "-qopt-report-phase=all" causes all phases of compilation process + # to provide output for compiler reports. Compiler reports are useful for + # understanding how compiler is optimizing various parts of VPIC. + # + # Use of "-diag-disable 10397" disables printing of diagnostic message + # that compiler reports are being generated. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -qopt-report=5" + FLAGS_C_COMPILER+=" -qopt-report-phase=all" + FLAGS_C_COMPILER+=" -diag-disable 10397" + + #--------------------------------------------------------------------------# + # Use of "-Wl,--export-dynamic" removes following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Wl,--export-dynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -dynamic" +fi + +#------------------------------------------------------------------------------# +# Configure options for the GNU compilers. +#------------------------------------------------------------------------------# + +if [ "$VCOM" = "GNU" ] +then + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O2" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O2" could be + # "-O3" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O2" + + #--------------------------------------------------------------------------# + # Use of "-ffast-math" causes compiler to relax various IEEE or ISO rules + # and specifications for math functions which can result in faster code. + # + # Use of "-fno-unsafe-math-optimizations" turns off some unsafe math + # optimizations that got turned on by use of "-ffast-math" option. Some + # comments in VPIC source code indicate need for this with older compilers. + # This should be checked some time to see if it is still a relevant issue. 
+ #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -ffast-math" + FLAGS_CXX_COMPILER+=" -fno-unsafe-math-optimizations" + + #--------------------------------------------------------------------------# + # Use of "-fomit-frame-pointer" prevents keeping the frame pointer in a + # register for functions that do not need one. This can make an extra + # register available in many functions and reduce number of overall + # instructions. Some profiling should be done to measure the benefit of + # using this option. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -fomit-frame-pointer" + + #--------------------------------------------------------------------------# + # Use of "-fno-strict-aliasing" informs compiler that VPIC does not obey + # ANSI aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -fno-strict-aliasing" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Winline" + + #--------------------------------------------------------------------------# + # Use of "-rdynamic" removes the following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + # + # From g++ man page: Pass the flag -export-dynamic to the ELF linker, on + # targets that support it. This instructs the linker to add all symbols, + # not only used ones, to the dynamic symbol table. This option is needed + # for some uses of "dlopen" or to allow obtaining backtraces from within + # a program. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -rdynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use of "-march=knl" or "-march=haswell" causes g++ to generate code + # specific to and optimized for the specific architecture of either KNL + # or Haswell. It appears that the Cray wrappers already do this correctly + # for KNL but it seems they may not for Haswell. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -march=haswell" + + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. 
In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O2" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O2" could be + # "-O3" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O2" + + #--------------------------------------------------------------------------# + # Use of "-ffast-math" causes compiler to relax various IEEE or ISO rules + # and specifications for math functions which can result in faster code. + # + # Use of "-fno-unsafe-math-optimizations" turns off some unsafe math + # optimizations that got turned on by use of "-ffast-math" option. Some + # comments in VPIC source code indicate need for this with older compilers. + # This should be checked some time to see if it is still a relevant issue. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -ffast-math" + FLAGS_C_COMPILER+=" -fno-unsafe-math-optimizations" + + #--------------------------------------------------------------------------# + # Use of "-fomit-frame-pointer" prevents keeping the frame pointer in a + # register for functions that do not need one. This can make an extra + # register available in many functions and reduce number of overall + # instructions. Some profiling should be done to measure the benefit of + # using this option. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -fomit-frame-pointer" + + #--------------------------------------------------------------------------# + # Use of "-fno-strict-aliasing" informs compiler that VPIC does not obey + # ANSI aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -fno-strict-aliasing" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Winline" + + #--------------------------------------------------------------------------# + # Use of "-rdynamic" removes the following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + # + # From gcc man page: Pass the flag -export-dynamic to the ELF linker, on + # targets that support it. This instructs the linker to add all symbols, + # not only used ones, to the dynamic symbol table. This option is needed + # for some uses of "dlopen" or to allow obtaining backtraces from within + # a program. 
+ #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -rdynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use of "-march=knl" or "-march=haswell" causes gcc to generate code + # specific to and optimized for the specific architecture of either KNL + # or Haswell. It appears that the Cray wrappers already do this correctly + # for KNL but it seems they may not for Haswell. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -march=haswell" +fi + +#------------------------------------------------------------------------------# +# Configure options for the Cray compilers. +#------------------------------------------------------------------------------# + +if [ "$VCOM" = "CCE" ] +then + #--------------------------------------------------------------------------# + # + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O2" + #FLAGS_CXX_COMPILER+=" -hlist=ad" + #FLAGS_CXX_COMPILER+=" -hipa5" + FLAGS_CXX_COMPILER+=" -Wl,--export-dynamic" + #FLAGS_CXX_COMPILER+=" -rdynamic" + FLAGS_CXX_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O2" + #FLAGS_C_COMPILER+=" -hlist=ad" + #FLAGS_C_COMPILER+=" -hipa5" + FLAGS_C_COMPILER+=" -Wl,--export-dynamic" + #FLAGS_C_COMPILER+=" -rdynamic" + FLAGS_C_COMPILER+=" -dynamic" +fi + +#------------------------------------------------------------------------------# +# This ends user configuration section. +# +# No changes required below unless VPIC build system has been extended or the +# module system on ATS-1 machines has changed in some fundamental way. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Configure thread model. +#------------------------------------------------------------------------------# + +if [ "$VTHR" = "PTH" ] +then + SET_OPENMP="OFF" + SET_PTHREADS="ON" +fi + +if [ "$VTHR" = "OMP" ] +then + SET_OPENMP="ON" + SET_PTHREADS="OFF" +fi + +#------------------------------------------------------------------------------# +# Configure particle sort method. +#------------------------------------------------------------------------------# + +if [ "$VSORT" = "LSORT" ] +then + SET_LEGACY_SORT="ON" +fi + +#------------------------------------------------------------------------------# +# Make sure the Cray programming environment is configured as the default of +# PrgEnv-intel. 
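+#
+# The CRAY_PRGENV* variables tested below are set to "loaded" by the
+# corresponding PrgEnv-* modules. As a hedged illustration of checking the
+# currently loaded programming environment by hand (note that "module list"
+# writes its output to stderr):
+#
+#   module list 2>&1 | grep PrgEnv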
+#------------------------------------------------------------------------------# + +if [ "$CRAY_PRGENVGNU" = "loaded" ] +then + module swap PrgEnv-gnu PrgEnv-intel +fi + +if [ "$CRAY_PRGENVCRAY" = "loaded" ] +then + module swap PrgEnv-cray PrgEnv-intel +fi + +#------------------------------------------------------------------------------# +# Configure environment using modules. +#------------------------------------------------------------------------------# +# Note that the user may want to modify the module configuration. +# +# Note that module commands used to define the build environment are captured +# in a Bash script named bashrc.modules which is written into the top level +# build directory. This script can be used in run scripts and other scenarios +# where there is a need to reproduce the environment used to build VPIC. +#------------------------------------------------------------------------------# + +echo '#!/bin/bash' >> bashrc.modules +echo "" >> bashrc.modules + +module load friendly-testing +echo "module load friendly-testing" >> bashrc.modules + +module load sandbox +echo "module load sandbox" >> bashrc.modules + +module load cmake +echo "module load cmake" >> bashrc.modules + +if [ ! "x$VERSION_CMAKE" = "x" ] +then + module swap cmake cmake/$VERSION_CMAKE + echo "module swap cmake cmake/$VERSION_CMAKE" >> bashrc.modules +fi + +module unload craype-hugepages2M +echo "module unload craype-hugepages2M" >> bashrc.modules + +if [ "$VCOM" = "INT" ] +then + if [ ! "x$VERSION_INTEL" = "x" ] + then + module swap intel intel/$VERSION_INTEL + echo "module swap intel intel/$VERSION_INTEL" >> bashrc.modules + fi +fi + +if [ "$VCOM" = "GNU" ] +then + module swap PrgEnv-intel PrgEnv-gnu + echo "module swap PrgEnv-intel PrgEnv-gnu" >> bashrc.modules + + if [ ! "x$VERSION_GNU" = "x" ] + then + module swap gcc gcc/$VERSION_GNU + echo "module swap gcc gcc/$VERSION_GNU" >> bashrc.modules + fi +fi + +if [ "$VCOM" = "CCE" ] +then + module swap PrgEnv-intel PrgEnv-cray + echo "module swap PrgEnv-intel PrgEnv-cray" >> bashrc.modules + + if [ ! "x$VERSION_CCE" = "x" ] + then + module swap cce cce/$VERSION_CCE + echo "module swap cce cce/$VERSION_CCE" >> bashrc.modules + fi +fi + +if [ "$VMPI" = "CMPI" ] +then + if [ ! "x$VERSION_CRAY_MPICH" = "x" ] + then + module swap cray-mpich cray-mpich/$VERSION_CRAY_MPICH + echo "module swap cray-mpich cray-mpich/$VERSION_CRAY_MPICH" >> bashrc.modules + fi + + export MPI_ROOT=$MPICH_DIR +fi + +if [ "$VMPI" = "OMPI" ] +then + module unload cray-mpich + echo "module unload cray-mpich" >> bashrc.modules + + module unload cray-libsci + echo "module unload cray-libsci" >> bashrc.modules + + module load openmpi + echo "module load openmpi" >> bashrc.modules + + if [ ! "x$VERSION_OPEN_MPI" = "x" ] + then + module swap openmpi openmpi/$VERSION_OPEN_MPI + echo "module swap openmpi openmpi/$VERSION_OPEN_MPI" >> bashrc.modules + fi +fi + +module list +echo "" >> bashrc.modules +echo "module list" >> bashrc.modules + +#------------------------------------------------------------------------------# +# Call cmake command. +#------------------------------------------------------------------------------# +# Notes: +# +# Use of the "-LAH" command line option to cmake causes cmake to output the +# values of all of its variables. This is useful information when debugging +# a failed build. 
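+#
+# As a hedged example of a related check (CMakeCache.txt is the standard cache
+# file that CMake writes into the build directory), the value that cmake
+# actually cached for any option can be inspected after this script has run,
+# e.g.
+#
+#   grep -E "USE_V(4|8|16)" CMakeCache.txt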
+# +# Note that all of the possible VPIC cmake variables relevant to an ATS-1 +# system are set on the command line so that they can all be conditionally +# configured above through user selections. +#------------------------------------------------------------------------------# + +cmake \ + -LAH \ + -DCMAKE_BUILD_TYPE=$SET_BUILD_TYPE \ + -DENABLE_INTEGRATED_TESTS=$SET_INTEGRATED_TESTS \ + -DENABLE_UNIT_TESTS=$SET_UNIT_TESTS \ + -DENABLE_OPENSSL=$SET_ENABLE_OPENSSL \ + -DDISABLE_DYNAMIC_RESIZING=$SET_DISABLE_DYNAMIC_RESIZING \ + -DSET_MIN_NUM_PARTICLES=$SET_PARTICLE_MIN_NUM \ + -DUSE_LEGACY_SORT=$SET_LEGACY_SORT \ + -DUSE_V4_PORTABLE=$SET_V4_PORTABLE \ + -DUSE_V4_SSE=$SET_V4_SSE \ + -DUSE_V4_AVX=$SET_V4_AVX \ + -DUSE_V4_AVX2=$SET_V4_AVX2 \ + -DUSE_V8_PORTABLE=$SET_V8_PORTABLE \ + -DUSE_V8_AVX=$SET_V8_AVX \ + -DUSE_V8_AVX2=$SET_V8_AVX2 \ + -DUSE_V16_PORTABLE=$SET_V16_PORTABLE \ + -DUSE_V16_AVX512=$SET_V16_AVX512 \ + -DVPIC_PRINT_MORE_DIGITS=$SET_MORE_DIGITS \ + -DUSE_OPENMP=$SET_OPENMP \ + -DUSE_PTHREADS=$SET_PTHREADS \ + -DBUILD_SHARED_LIBS=$SET_SHARED_LIBS \ + -DCMAKE_C_COMPILER=$VPIC_COMPILER_C \ + -DCMAKE_CXX_COMPILER=$VPIC_COMPILER_CXX \ + -DCMAKE_C_FLAGS="$FLAGS_C_COMPILER" \ + -DCMAKE_CXX_FLAGS="$FLAGS_CXX_COMPILER" \ + $src_dir + +#------------------------------------------------------------------------------# +# Call make command. +#------------------------------------------------------------------------------# +# Notes: +# +# In general, it is necessary to call the "make" command within this script +# because the module environment has been configured within this script. +# +# Setting VERBOSE=1 causes "make" to output the commands it is executing. +# This information is useful if debugging a failed build. +# +# If the NJ variable is not defined, "make" will perform a parallel build +# using maximum number of processors on the compilation machine. If using +# VERBOSE=1, the verbose output will be garbled by many processes writing +# to STDOUT at the same time and will be difficult to interpret. When using +# VERBOSE=1, it can be helpful to also use NJ=1. +#------------------------------------------------------------------------------# + +make -j $NJ VERBOSE=$SET_VERBOSE + +#------------------------------------------------------------------------------# +# Done. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# vim: syntax=sh +#------------------------------------------------------------------------------# diff --git a/arch/lanl-ats1-knl b/arch/lanl-ats1-knl new file mode 100755 index 00000000..68c2e12a --- /dev/null +++ b/arch/lanl-ats1-knl @@ -0,0 +1,988 @@ +#! /usr/bin/env bash +#------------------------------------------------------------------------------# +# This script supports building VPIC on ATS-1 machines at Los Alamos National +# Laboratory (LANL) for Knights Landing nodes. These machines run the Cray +# Linux Environment Operating System and have two compute partitions, a Haswell +# partition and a Knights Landing (KNL) partition. Both processor types are +# Intel processors. These machines provide three compiler choices: Intel, GNU +# and Cray compilers. Two MPI implementations are provided: Cray Mpich and Open +# MPI. +# +# Normal users should not need to change this script if building VPIC to run +# on the KNL nodes of ATS-1 machines and happy with defaults. 
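+#
+# A typical invocation (an illustrative sketch; the build directory name is
+# arbitrary) is to run this script from an empty build directory at the top
+# of the VPIC source tree, since the script invokes cmake and make itself:
+#
+#   mkdir build
+#   cd build
+#   ../arch/lanl-ats1-knl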
+#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Get the path to the project from which this script was called. +#------------------------------------------------------------------------------# + +src_dir="${0%/*}/.." + +#------------------------------------------------------------------------------# +# Configure the type of build that we want to perform. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Choose a compiler. +#------------------------------------------------------------------------------# +# One of the compiler choices in this section must be chosen. Valid options +# are the following. +# +# INT: Intel compilers +# GNU: GNU compilers +# CCE: Cray compilers +# +# Note that selecting CCE for the Cray compilers currently does not work. The +# main reason why you might want to compile with the Cray compilers is to use +# some of the Cray specific tools like Reveal or a small set of features in +# the CrayPat profiling software. This is not a common use case for users. +#------------------------------------------------------------------------------# + +VCOM="INT" +#VCOM="GNU" +#VCOM="CCE" + +#------------------------------------------------------------------------------# +# Choose an MPI implementation. +#------------------------------------------------------------------------------# +# One of the MPI library choices must be chosen. Valid options are the +# following. +# +# CMPI: Cray Mpich, the Cray supported MPI library +# OMPI: Open MPI +#------------------------------------------------------------------------------# + +VMPI="CMPI" +#VMPI="OMPI" + +#------------------------------------------------------------------------------# +# Choose a thread model. +#------------------------------------------------------------------------------# +# One of the two available thread models must be chosen. Valid options are the +# following. +# +# PTH: Pthreads +# OMP: OpenMP +#------------------------------------------------------------------------------# + +VTHR="PTH" +#VTHR="OMP" + +#------------------------------------------------------------------------------# +# Choose type of vector intrinsics support. +#------------------------------------------------------------------------------# +# Note the following constraints. +# +# Each of the nine variables in this section must have a configured value. +# This is because the corresponding "USE" cmake variable is set on the cmake +# command line below to allow any possible combinations to be configured using +# a single cmake command. +# +# If all values are configured as OFF, the scalar implementations of VPIC +# functions which are not vectorized will be used. +# +# It is possible to have a vector version configured as ON for each of the +# three vector widths i.e. V4, V8 and V16. In that scenario, if a given VPIC +# function has a V16 implementation, that will be used. If there is not a V16 +# implementation but there is a V8 implementation, that will be used. If there +# is not a V16 or V8 implementation but there is a V4 implementation, that +# will be used. Finally, for functions that have no vector implementations, +# the scalar version will be used. 
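+#
+# As a concrete illustration of this fallback (using the settings chosen for
+# KNL below, SET_V4_AVX2=ON and SET_V16_AVX512=ON): a function with a V16
+# implementation uses the AVX-512 version, a function such as move_p that has
+# only a V4 implementation uses the AVX2 version, and a function with no
+# vector implementation falls back to the scalar version.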
+#
+# Currently, it is recommended to always configure the appropriate V4 version
+# as ON when using the vector versions because some key functions have only a
+# V4 implementation, since their current algorithms do not generalize to
+# longer vector lengths. An example is the move_p function. Since the V4
+# versions are generally more performant than the scalar versions, it makes
+# sense to use them even when using the longer vector length implementations
+# for other VPIC functions.
+#
+# In summary, when using vector versions on a machine with 256-bit SIMD, the
+# V4 and V8 implementations should be configured as ON. When using a machine
+# with 512-bit SIMD, the V4 and V16 implementations should be configured as ON.
+#
+# First, we turn all of the vector options OFF. Then, we turn on the ones we
+# want.
+#------------------------------------------------------------------------------#
+
+SET_V4_PORTABLE="OFF"
+SET_V4_SSE="OFF"
+SET_V4_AVX="OFF"
+SET_V4_AVX2="OFF"
+SET_V8_PORTABLE="OFF"
+SET_V8_AVX="OFF"
+SET_V8_AVX2="OFF"
+SET_V16_PORTABLE="OFF"
+SET_V16_AVX512="OFF"
+
+#SET_V4_PORTABLE="ON"
+#SET_V4_SSE="ON"
+#SET_V4_AVX="ON"
+SET_V4_AVX2="ON"
+#SET_V8_PORTABLE="ON"
+#SET_V8_AVX="ON"
+#SET_V8_AVX2="ON"
+#SET_V16_PORTABLE="ON"
+SET_V16_AVX512="ON"
+
+#------------------------------------------------------------------------------#
+# Choose format of status update output.
+#------------------------------------------------------------------------------#
+# One of the two available options must be chosen. Valid options are ON and
+# OFF.
+#
+# If SET_MORE_DIGITS=OFF, the output has two significant figures.
+#
+# If SET_MORE_DIGITS=ON, the output has four significant figures.
+#------------------------------------------------------------------------------#
+
+SET_MORE_DIGITS="OFF"
+#SET_MORE_DIGITS="ON"
+
+#------------------------------------------------------------------------------#
+# Choose a particle sort implementation.
+#------------------------------------------------------------------------------#
+# One of the two available options must be chosen. Valid options are the
+# following.
+#
+# LSORT: legacy, thread serial sort
+# TSORT: thread parallel sort
+#
+# The LSORT particle sort implementation is the thread serial particle sort
+# implementation from the legacy v407 version of VPIC. This implementation
+# supports both in-place and out-of-place sorting of the particles. It is very
+# competitive with the thread parallel sort implementation for a small number
+# of threads per MPI rank, i.e. 4 or fewer, especially on KNL because sorting
+# the particles in-place allows the fraction of particles stored in High
+# Bandwidth Memory (HBM) to remain stored in HBM. Also, the memory footprint of
+# VPIC is reduced by the memory of a particle array, which can be significant
+# for particle-dominated problems.
+#
+# The TSORT particle sort implementation is a thread parallel implementation.
+# Currently, it can only perform out-of-place sorting of the particles. It will
+# be more performant than the LSORT implementation when using many threads per
+# MPI rank but uses more memory because of the out-of-place sort.
+#------------------------------------------------------------------------------#
+
+VSORT="LSORT"
+#VSORT="TSORT"
+
+#------------------------------------------------------------------------------#
+# Choose type of library to build.
+#------------------------------------------------------------------------------#
+# One of the two available options must be chosen. 
Valid options are ON or OFF. +# +# The default is to build a static library, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_SHARED_LIBS="OFF" +#SET_SHARED_LIBS="ON" + +#------------------------------------------------------------------------------# +# Choose integrated test support. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is not to build the integrated tests, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_INTEGRATED_TESTS="OFF" +#SET_INTEGRATED_TESTS="ON" + +#------------------------------------------------------------------------------# +# Choose unit test support. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is not to build the unit tests, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_UNIT_TESTS="OFF" +#SET_UNIT_TESTS="ON" + +#------------------------------------------------------------------------------# +# Choose OpenSSL support for checksums. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_ENABLE_OPENSSL=OFF, use of checksums is turned off. +# +# If SET_ENABLE_OPENSSL=ON, use of checksums is turned on. +#------------------------------------------------------------------------------# + +SET_ENABLE_OPENSSL="OFF" +#SET_ENABLE_OPENSSL="ON" + +#------------------------------------------------------------------------------# +# Choose support for dynamic resizing of particle arrays. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_DISABLE_DYNAMIC_RESIZING=OFF, particle arrays will be resized +# dynamically. +# +# If SET_DISABLE_DYNAMIC_RESIZING=ON, particle arrays will not be resized +# dynamically and the user will be responsible for ensuring that particle +# arrays have enough space to handle the evolution of a non-uniform particle +# distribution. +#------------------------------------------------------------------------------# + +SET_DISABLE_DYNAMIC_RESIZING="OFF" +#SET_DISABLE_DYNAMIC_RESIZING="ON" + +#------------------------------------------------------------------------------# +# Choose the minimum number of particles to dynamically allocate space for. +#------------------------------------------------------------------------------# +# A value must be chosen. The default is 128 particles which allocates space +# equal to a 4 KByte page size. +#------------------------------------------------------------------------------# + +SET_PARTICLE_MIN_NUM="128" +#SET_PARTICLE_MIN_NUM="32768" + +#------------------------------------------------------------------------------# +# Choose the CMake build type. +#------------------------------------------------------------------------------# +# One of the available options must be chosen. Valid options depend on build +# types available in the CMake version but include at least the following. +# +# Release: In general, the default for CMake. 
+# None: Tells CMake not to use any pre-defined build type and gives the VPIC
+# build system total control of the CMake variables defined on the cmake
+# command line.
+#------------------------------------------------------------------------------#
+
+SET_BUILD_TYPE="Release"
+#SET_BUILD_TYPE="None"
+
+#------------------------------------------------------------------------------#
+# Choose the number of parallel make processes for the build.
+#------------------------------------------------------------------------------#
+# If the NJ variable is not defined, "make" will perform a parallel build using
+# the maximum number of processors on the compilation machine.
+#
+# If using VERBOSE = 1 and NJ > 1, verbose output will be garbled by many
+# processes writing to STDOUT at the same time and will be difficult to
+# interpret.
+#
+# When using VERBOSE = 1, use of NJ = 1 is recommended.
+#
+# The default is to use a modest number of processes in the parallel build.
+#
+# Comment out the default below to use all processors on the compilation
+# machine.
+#------------------------------------------------------------------------------#
+
+NJ=8
+#NJ=1
+
+#------------------------------------------------------------------------------#
+# Choose verbosity of "make" output.
+#------------------------------------------------------------------------------#
+# Setting VERBOSE = 1 causes "make" to output the commands it is executing.
+#
+# This information is useful when debugging a failed build.
+#
+# Setting VERBOSE = 0 or leaving VERBOSE undefined results in a quiet build.
+#
+# The default is a quiet build.
+#------------------------------------------------------------------------------#
+
+SET_VERBOSE=0
+#SET_VERBOSE=1
+
+#------------------------------------------------------------------------------#
+# Choose versions of modules to use if the defaults are not desired.
+#------------------------------------------------------------------------------#
+# No choice is required in this section.
+#
+# Some possible alternative module versions are provided below. Change as
+# needed or desired.
+#
+# This section may need to be updated periodically as the module environment
+# evolves because of updates to the operating system and programming
+# environment.
+#------------------------------------------------------------------------------#
+
+#VERSION_CMAKE=3.12.1
+
+#VERSION_INTEL=19.0.1
+#VERSION_INTEL_VTUNE_AMPLIFIER=2019.1.0
+#VERSION_INTEL_VECTOR_ADVISOR=2019.1.0
+#VERSION_INTEL_INSPECTOR=2019.1.0
+#VERSION_INTEL_TRACE_ANALYZER=2019.1.022
+
+#VERSION_GNU=7.3.0
+
+#VERSION_CCE=9.0.0.21672
+#VERSION_CRAY_MPICH=7.7.4.4
+#VERSION_CRAY_PERF_TOOLS=7.0.4
+
+#VERSION_OPEN_MPI=3.1.2
+
+#VERSION_FORGE=18.3
+
+#------------------------------------------------------------------------------#
+# Unless the user wants to modify the options passed to the compiler, no
+# changes should be needed below this point.
+#
+# If the user desires to configure compiler options, proceed to the section
+# below for the chosen compiler.
+#------------------------------------------------------------------------------#
+
+#------------------------------------------------------------------------------#
+# Configure default compiler names to use Cray wrapper scripts. 
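+#
+# With Cray MPICH, the cc/CC wrappers automatically add the MPI and other
+# libraries selected by the loaded modules, so they are used directly; the
+# mpicc/mpicxx wrappers are only substituted below for the Open MPI case.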
+#------------------------------------------------------------------------------# + +VPIC_COMPILER_C="cc" +VPIC_COMPILER_CXX="CC" + +if [ "$VMPI" = "OMPI" ] +then + VPIC_COMPILER_C="mpicc" + VPIC_COMPILER_CXX="mpicxx" +fi + +#------------------------------------------------------------------------------# +# Configure options for the Intel compilers. +#------------------------------------------------------------------------------# + +if [ "$VCOM" = "INT" ] +then + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O3" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O3" could be + # "-O2" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O3" + + #--------------------------------------------------------------------------# + # Use of "-inline-forceinline" overrides default heuristics of compiler + # and forces inlining of functions marked with inline keyword if compiler + # is able to inline. For VPIC, this option has mainly been used when using + # a portable implementation to force inlining by compiler and also when + # use of "-Winline" option identifies functions not being inlined that are + # marked with inline keyword. + # + # Use of "-qoverride-limits" cause certain internal compiler limits to be + # ignored that are used to limit memory usage and excessive compile times + # by the compiler. + # + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops + # which can be vectorized to always be vectorized, regardless of the + # amount of computational work in the loop. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -inline-forceinline" + #FLAGS_CXX_COMPILER+=" -vec-threshold0" + FLAGS_CXX_COMPILER+=" -qoverride-limits" + + #--------------------------------------------------------------------------# + # Use of "-no-ansi-alias" informs compiler that VPIC does not obey ANSI + # aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -no-ansi-alias" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + # + # Use of "-craype-verbose" causes Cray compiler wrapper script to print + # command it is forwarding to actual compiler for invocation. This is very + # useful for producing a build log to make sure compiler is being invoked + # with expected options. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Winline" + FLAGS_CXX_COMPILER+=" -craype-verbose" + + #--------------------------------------------------------------------------# + # Use of "-qopt-report=5" specifies level of detail in compiler reports. 
+ # This is the maximum level of detail. + # + # Use of "-qopt-report-phase=all" causes all phases of compilation process + # to provide output for compiler reports. Compiler reports are useful for + # understanding how compiler is optimizing various parts of VPIC. + # + # Use of "-diag-disable 10397" disables printing of diagnostic message + # that compiler reports are being generated. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -qopt-report=5" + FLAGS_CXX_COMPILER+=" -qopt-report-phase=all" + FLAGS_CXX_COMPILER+=" -diag-disable 10397" + + #--------------------------------------------------------------------------# + # Use of "-Wl,--export-dynamic" removes following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Wl,--export-dynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use of "-qopt-zmm-usage=high" causes the compiler to generate zmm code, + # i.e. AVX-512 code, without any restrictions. Extensive use of AVX-512 + # code causes the CPU core to down clock or throttle to avoid overheating. + # The default is for the compiler to use some internal limits on how much + # AVX-512 instructions are used. This is relevant on ATS-1 systems only + # for KNL processors. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -qopt-zmm-usage=high" + + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O3" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O3" could be + # "-O2" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O3" + + #--------------------------------------------------------------------------# + # Use of "-inline-forceinline" overrides default heuristics of compiler + # and forces inlining of functions marked with inline keyword if compiler + # is able to inline. For VPIC, this option has mainly been used when using + # a portable implementation to force inlining by compiler and also when + # use of "-Winline" option identifies functions not being inlined that are + # marked with inline keyword. 
+ # + # Use of "-qoverride-limits" cause certain internal compiler limits to be + # ignored that are used to limit memory usage and excessive compile times + # by the compiler. + # + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops + # which can be vectorized to always be vectorized, regardless of the + # amount of computational work in the loop. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -inline-forceinline" + #FLAGS_C_COMPILER+=" -vec-threshold0" + FLAGS_C_COMPILER+=" -qoverride-limits" + + #--------------------------------------------------------------------------# + # Use of "-no-ansi-alias" informs compiler that VPIC does not obey ANSI + # aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -no-ansi-alias" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + # + # Use of "-craype-verbose" causes Cray compiler wrapper script to print + # command it is forwarding to actual compiler for invocation. This is very + # useful for producing a build log to make sure compiler is being invoked + # with expected options. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Winline" + FLAGS_C_COMPILER+=" -craype-verbose" + + #--------------------------------------------------------------------------# + # Use of "-qopt-report=5" specifies level of detail in compiler reports. + # This is the maximum level of detail. + # + # Use of "-qopt-report-phase=all" causes all phases of compilation process + # to provide output for compiler reports. Compiler reports are useful for + # understanding how compiler is optimizing various parts of VPIC. + # + # Use of "-diag-disable 10397" disables printing of diagnostic message + # that compiler reports are being generated. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -qopt-report=5" + FLAGS_C_COMPILER+=" -qopt-report-phase=all" + FLAGS_C_COMPILER+=" -diag-disable 10397" + + #--------------------------------------------------------------------------# + # Use of "-Wl,--export-dynamic" removes following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Wl,--export-dynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. 
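+ #
+ # As a hedged check (ldd is a standard Linux utility and the binary name
+ # below is just the test executable quoted in the warnings above), dynamic
+ # linkage of a resulting executable can be confirmed after a build with:
+ #
+ #   ldd ./lpi_2d_F6_test.Linux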
+ #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use of "-qopt-zmm-usage=high" causes the compiler to generate zmm code, + # i.e. AVX-512 code, without any restrictions. Extensive use of AVX-512 + # code causes the CPU core to down clock or throttle to avoid overheating. + # The default is for the compiler to use some internal limits on how much + # AVX-512 instructions are used. This is relevant on ATS-1 systems only + # for KNL processors. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -qopt-zmm-usage=high" +fi + +#------------------------------------------------------------------------------# +# Configure options for the GNU compilers. +#------------------------------------------------------------------------------# + +if [ "$VCOM" = "GNU" ] +then + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O2" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O2" could be + # "-O3" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O2" + + #--------------------------------------------------------------------------# + # Use of "-ffast-math" causes compiler to relax various IEEE or ISO rules + # and specifications for math functions which can result in faster code. + # + # Use of "-fno-unsafe-math-optimizations" turns off some unsafe math + # optimizations that got turned on by use of "-ffast-math" option. Some + # comments in VPIC source code indicate need for this with older compilers. + # This should be checked some time to see if it is still a relevant issue. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -ffast-math" + FLAGS_CXX_COMPILER+=" -fno-unsafe-math-optimizations" + + #--------------------------------------------------------------------------# + # Use of "-fomit-frame-pointer" prevents keeping the frame pointer in a + # register for functions that do not need one. This can make an extra + # register available in many functions and reduce number of overall + # instructions. Some profiling should be done to measure the benefit of + # using this option. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -fomit-frame-pointer" + + #--------------------------------------------------------------------------# + # Use of "-fno-strict-aliasing" informs compiler that VPIC does not obey + # ANSI aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -fno-strict-aliasing" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. 
Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Winline" + + #--------------------------------------------------------------------------# + # Use of "-rdynamic" removes the following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + # + # From g++ man page: Pass the flag -export-dynamic to the ELF linker, on + # targets that support it. This instructs the linker to add all symbols, + # not only used ones, to the dynamic symbol table. This option is needed + # for some uses of "dlopen" or to allow obtaining backtraces from within + # a program. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -rdynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use of "-march=knl" or "-march=haswell" causes g++ to generate code + # specific to and optimized for the specific architecture of either KNL + # or Haswell. It appears that the Cray wrappers already do this correctly + # for KNL but it seems they may not for Haswell. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -march=knl" + + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O2" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O2" could be + # "-O3" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O2" + + #--------------------------------------------------------------------------# + # Use of "-ffast-math" causes compiler to relax various IEEE or ISO rules + # and specifications for math functions which can result in faster code. + # + # Use of "-fno-unsafe-math-optimizations" turns off some unsafe math + # optimizations that got turned on by use of "-ffast-math" option. Some + # comments in VPIC source code indicate need for this with older compilers. + # This should be checked some time to see if it is still a relevant issue. 
+ #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -ffast-math" + FLAGS_C_COMPILER+=" -fno-unsafe-math-optimizations" + + #--------------------------------------------------------------------------# + # Use of "-fomit-frame-pointer" prevents keeping the frame pointer in a + # register for functions that do not need one. This can make an extra + # register available in many functions and reduce number of overall + # instructions. Some profiling should be done to measure the benefit of + # using this option. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -fomit-frame-pointer" + + #--------------------------------------------------------------------------# + # Use of "-fno-strict-aliasing" informs compiler that VPIC does not obey + # ANSI aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -fno-strict-aliasing" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Winline" + + #--------------------------------------------------------------------------# + # Use of "-rdynamic" removes the following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + # + # From gcc man page: Pass the flag -export-dynamic to the ELF linker, on + # targets that support it. This instructs the linker to add all symbols, + # not only used ones, to the dynamic symbol table. This option is needed + # for some uses of "dlopen" or to allow obtaining backtraces from within + # a program. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -rdynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use of "-march=knl" or "-march=haswell" causes gcc to generate code + # specific to and optimized for the specific architecture of either KNL + # or Haswell. It appears that the Cray wrappers already do this correctly + # for KNL but it seems they may not for Haswell. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -march=knl" +fi + +#------------------------------------------------------------------------------# +# Configure options for the Cray compilers. 
+#------------------------------------------------------------------------------# + +if [ "$VCOM" = "CCE" ] +then + #--------------------------------------------------------------------------# + # + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O2" + #FLAGS_CXX_COMPILER+=" -hlist=ad" + #FLAGS_CXX_COMPILER+=" -hipa5" + FLAGS_CXX_COMPILER+=" -Wl,--export-dynamic" + #FLAGS_CXX_COMPILER+=" -rdynamic" + FLAGS_CXX_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O2" + #FLAGS_C_COMPILER+=" -hlist=ad" + #FLAGS_C_COMPILER+=" -hipa5" + FLAGS_C_COMPILER+=" -Wl,--export-dynamic" + #FLAGS_C_COMPILER+=" -rdynamic" + FLAGS_C_COMPILER+=" -dynamic" +fi + +#------------------------------------------------------------------------------# +# This ends user configuration section. +# +# No changes required below unless VPIC build system has been extended or the +# module system on ATS-1 machines has changed in some fundamental way. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Configure thread model. +#------------------------------------------------------------------------------# + +if [ "$VTHR" = "PTH" ] +then + SET_OPENMP="OFF" + SET_PTHREADS="ON" +fi + +if [ "$VTHR" = "OMP" ] +then + SET_OPENMP="ON" + SET_PTHREADS="OFF" +fi + +#------------------------------------------------------------------------------# +# Configure particle sort method. +#------------------------------------------------------------------------------# + +if [ "$VSORT" = "LSORT" ] +then + SET_LEGACY_SORT="ON" +fi + +#------------------------------------------------------------------------------# +# Make sure the Cray programming environment is configured as the default of +# PrgEnv-intel. +#------------------------------------------------------------------------------# + +if [ "$CRAY_PRGENVGNU" = "loaded" ] +then + module swap PrgEnv-gnu PrgEnv-intel +fi + +if [ "$CRAY_PRGENVCRAY" = "loaded" ] +then + module swap PrgEnv-cray PrgEnv-intel +fi + +#------------------------------------------------------------------------------# +# Configure environment using modules. +#------------------------------------------------------------------------------# +# Note that the user may want to modify the module configuration. +# +# Note that module commands used to define the build environment are captured +# in a Bash script named bashrc.modules which is written into the top level +# build directory. This script can be used in run scripts and other scenarios +# where there is a need to reproduce the environment used to build VPIC. +#------------------------------------------------------------------------------# + +echo '#!/bin/bash' >> bashrc.modules +echo "" >> bashrc.modules + +module load friendly-testing +echo "module load friendly-testing" >> bashrc.modules + +module load sandbox +echo "module load sandbox" >> bashrc.modules + +module load cmake +echo "module load cmake" >> bashrc.modules + +if [ ! 
"x$VERSION_CMAKE" = "x" ] +then + module swap cmake cmake/$VERSION_CMAKE + echo "module swap cmake cmake/$VERSION_CMAKE" >> bashrc.modules +fi + +module unload craype-hugepages2M +echo "module unload craype-hugepages2M" >> bashrc.modules + +module swap craype-haswell craype-mic-knl +echo "module swap craype-haswell craype-mic-knl" >> bashrc.modules + +if [ "$VCOM" = "INT" ] +then + if [ ! "x$VERSION_INTEL" = "x" ] + then + module swap intel intel/$VERSION_INTEL + echo "module swap intel intel/$VERSION_INTEL" >> bashrc.modules + fi +fi + +if [ "$VCOM" = "GNU" ] +then + module swap PrgEnv-intel PrgEnv-gnu + echo "module swap PrgEnv-intel PrgEnv-gnu" >> bashrc.modules + + if [ ! "x$VERSION_GNU" = "x" ] + then + module swap gcc gcc/$VERSION_GNU + echo "module swap gcc gcc/$VERSION_GNU" >> bashrc.modules + fi +fi + +if [ "$VCOM" = "CCE" ] +then + module swap PrgEnv-intel PrgEnv-cray + echo "module swap PrgEnv-intel PrgEnv-cray" >> bashrc.modules + + if [ ! "x$VERSION_CCE" = "x" ] + then + module swap cce cce/$VERSION_CCE + echo "module swap cce cce/$VERSION_CCE" >> bashrc.modules + fi +fi + +if [ "$VMPI" = "CMPI" ] +then + if [ ! "x$VERSION_CRAY_MPICH" = "x" ] + then + module swap cray-mpich cray-mpich/$VERSION_CRAY_MPICH + echo "module swap cray-mpich cray-mpich/$VERSION_CRAY_MPICH" >> bashrc.modules + fi + + export MPI_ROOT=$MPICH_DIR +fi + +if [ "$VMPI" = "OMPI" ] +then + module unload cray-mpich + echo "module unload cray-mpich" >> bashrc.modules + + module unload cray-libsci + echo "module unload cray-libsci" >> bashrc.modules + + module load openmpi + echo "module load openmpi" >> bashrc.modules + + if [ ! "x$VERSION_OPEN_MPI" = "x" ] + then + module swap openmpi openmpi/$VERSION_OPEN_MPI + echo "module swap openmpi openmpi/$VERSION_OPEN_MPI" >> bashrc.modules + fi +fi + +module list +echo "" >> bashrc.modules +echo "module list" >> bashrc.modules + +#------------------------------------------------------------------------------# +# Call cmake command. +#------------------------------------------------------------------------------# +# Notes: +# +# Use of the "-LAH" command line option to cmake causes cmake to output the +# values of all of its variables. This is useful information when debugging +# a failed build. +# +# Note that all of the possible VPIC cmake variables relevant to an ATS-1 +# system are set on the command line so that they can all be conditionally +# configured above through user selections. 
+#------------------------------------------------------------------------------# + +cmake \ + -LAH \ + -DCMAKE_BUILD_TYPE=$SET_BUILD_TYPE \ + -DENABLE_INTEGRATED_TESTS=$SET_INTEGRATED_TESTS \ + -DENABLE_UNIT_TESTS=$SET_UNIT_TESTS \ + -DENABLE_OPENSSL=$SET_ENABLE_OPENSSL \ + -DDISABLE_DYNAMIC_RESIZING=$SET_DISABLE_DYNAMIC_RESIZING \ + -DSET_MIN_NUM_PARTICLES=$SET_PARTICLE_MIN_NUM \ + -DUSE_LEGACY_SORT=$SET_LEGACY_SORT \ + -DUSE_V4_PORTABLE=$SET_V4_PORTABLE \ + -DUSE_V4_SSE=$SET_V4_SSE \ + -DUSE_V4_AVX=$SET_V4_AVX \ + -DUSE_V4_AVX2=$SET_V4_AVX2 \ + -DUSE_V8_PORTABLE=$SET_V8_PORTABLE \ + -DUSE_V8_AVX=$SET_V8_AVX \ + -DUSE_V8_AVX2=$SET_V8_AVX2 \ + -DUSE_V16_PORTABLE=$SET_V16_PORTABLE \ + -DUSE_V16_AVX512=$SET_V16_AVX512 \ + -DVPIC_PRINT_MORE_DIGITS=$SET_MORE_DIGITS \ + -DUSE_OPENMP=$SET_OPENMP \ + -DUSE_PTHREADS=$SET_PTHREADS \ + -DBUILD_SHARED_LIBS=$SET_SHARED_LIBS \ + -DCMAKE_C_COMPILER=$VPIC_COMPILER_C \ + -DCMAKE_CXX_COMPILER=$VPIC_COMPILER_CXX \ + -DCMAKE_C_FLAGS="$FLAGS_C_COMPILER" \ + -DCMAKE_CXX_FLAGS="$FLAGS_CXX_COMPILER" \ + $src_dir + +#------------------------------------------------------------------------------# +# Call make command. +#------------------------------------------------------------------------------# +# Notes: +# +# In general, it is necessary to call the "make" command within this script +# because the module environment has been configured within this script. +# +# Setting VERBOSE=1 causes "make" to output the commands it is executing. +# This information is useful if debugging a failed build. +# +# If the NJ variable is not defined, "make" will perform a parallel build +# using maximum number of processors on the compilation machine. If using +# VERBOSE=1, the verbose output will be garbled by many processes writing +# to STDOUT at the same time and will be difficult to interpret. When using +# VERBOSE=1, it can be helpful to also use NJ=1. +#------------------------------------------------------------------------------# + +make -j $NJ VERBOSE=$SET_VERBOSE + +#------------------------------------------------------------------------------# +# Done. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# vim: syntax=sh +#------------------------------------------------------------------------------# diff --git a/arch/lanl-cts1 b/arch/lanl-cts1 new file mode 100755 index 00000000..6e17d194 --- /dev/null +++ b/arch/lanl-cts1 @@ -0,0 +1,875 @@ +#! /usr/bin/env bash +#------------------------------------------------------------------------------# +# This script supports building VPIC on CTS-1 machines at Los Alamos National +# Laboratory (LANL). These machines run the Tri-lab TOSS 3.3 Operating System, +# a customized version of Red Hat Enterprise Linux 7.5. CTS-1 machines have +# dual socket 18 core Broadwell nodes. These machines provide three compiler +# choices: Intel, GNU and PGI. Three MPI implementations are provided: Open +# MPI, Intel MPI and Mvapich. +# +# Normal users should not need to change this script if happy with defaults. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Get the path to the project from which this script was called. +#------------------------------------------------------------------------------# + +src_dir="${0%/*}/.." 
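+
+#------------------------------------------------------------------------------#
+# Note that "${0%/*}" strips the script file name from the path used to invoke
+# this script, so src_dir points at the top level VPIC source directory. For
+# example, invoking this script from a separate build directory (hypothetical
+# directory names):
+#
+#   mkdir build && cd build
+#   ../arch/lanl-cts1
+#------------------------------------------------------------------------------#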
+ +#------------------------------------------------------------------------------# +# Configure the type of build that we want to perform. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Choose a compiler. +#------------------------------------------------------------------------------# +# One of the compiler choices in this section must be chosen. Valid options +# are the following. +# +# INT: Intel compilers +# GNU: GNU compilers +# PGI: Portland Group compilers, now part of Nvidia +# +# Note that selecting PGI for Portland Group compilers has not been tested +# and probably does not work. +#------------------------------------------------------------------------------# + +VCOM="INT" +#VCOM="GNU" +#VCOM="PGI" + +#------------------------------------------------------------------------------# +# Choose an MPI implementation. +#------------------------------------------------------------------------------# +# One of the MPI library choices must be chosen. Valid options are the +# following. +# +# OMPI: Open MPI, most commonly used MPI implementation on LANL CTS-1 machines +# IMPI: Intel MPI +# +# Choose Intel MPI if you want to use the Intel Application Performance +# Snapshot performance analysis tool to analyze MPI performance of VPIC or +# other Intel analysis tools which provide analysis of MPI usage. +#------------------------------------------------------------------------------# + +VMPI="OMPI" +#VMPI="IMPI" + +#------------------------------------------------------------------------------# +# Choose a thread model. +#------------------------------------------------------------------------------# +# One of the two available thread models must be chosen. Valid options are the +# following. +# +# PTH: Pthreads +# OMP: OpenMP +#------------------------------------------------------------------------------# + +VTHR="PTH" +#VTHR="OMP" + +#------------------------------------------------------------------------------# +# Choose type of vector intrinsics support. +#------------------------------------------------------------------------------# +# Note the following constraints. +# +# Each of the eight variables in this section must have a configured value. +# This is because the corresponding "USE" cmake variable is set on the cmake +# command line below to allow any possible combinations to be configured using +# a single cmake command. +# +# If all values are configured as OFF, the scalar implementations of VPIC +# functions which are not vectorized will be used. +# +# It is possible to have a vector version configured as ON for each of the +# three vector widths i.e. V4, V8 and V16. In that scenario, if a given VPIC +# function has a V16 implementation, that will be used. If there is not a V16 +# implementation but there is a V8 implementation, that will be used. If there +# is not a V16 or V8 implementation but there is a V4 implementation, that +# will be used. Finally, for functions that have no vector implementations, +# the scalar version will be used. +# +# Currently, it is recommended to always configure the appropriate V4 version +# as on if using vector versions because there are key functions that only +# have a V4 version because the current algorithm does not generalize to +# longer vector lengths. An example is the move_p function. 
Since the V4 +# versions are generally more performant than the scalar versions, it makes +# sense to use them even when using the longer vector length implementations +# for other VPIC functions. +# +# In summary, when using vector versions on a machine with 256 bit SIMD, the +# V4 and V8 implementations should be configured as ON. +# +# First, we turn all of the vector options OFF. Then, we turn on the ones we +# want. +#------------------------------------------------------------------------------# + +SET_V4_PORTABLE="OFF" +SET_V4_SSE="OFF" +SET_V4_AVX="OFF" +SET_V4_AVX2="OFF" +SET_V8_PORTABLE="OFF" +SET_V8_AVX="OFF" +SET_V8_AVX2="OFF" +SET_V16_PORTABLE="OFF" + +#SET_V4_PORTABLE="ON" +#SET_V4_SSE="ON" +#SET_V4_AVX="ON" +SET_V4_AVX2="ON" +#SET_V8_PORTABLE="ON" +#SET_V8_AVX="ON" +SET_V8_AVX2="ON" +#SET_V16_PORTABLE="ON" + +#------------------------------------------------------------------------------# +# Choose format of status update output. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_MORE_DIGITS=OFF, the output has two significant figures. +# +# If SET_MORE_DIGITS=ON, the output has four significant figures. +#------------------------------------------------------------------------------# + +SET_MORE_DIGITS="OFF" +#SET_MORE_DIGITS="ON" + +#------------------------------------------------------------------------------# +# Choose a particle sort implementation. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are the +# following. +# +# LSORT: legacy, thread serial sort +# TSORT: thread parallel sort +# +# The LSORT particle sort implementation is the thread serial particle sort +# implementation from the legacy v407 version of VPIC. This implementation +# supports both in-place and out-of-place sorting of the particles. It is very +# competitive with the thread parallel sort implementation for a small number +# of threads per MPI rank, i.e. 4 or less. Also, the memory footprint of VPIC +# is reduced by the memory of a particle array which can be significant for +# particle dominated problems. +# +# The TSORT particle sort implementation is a thread parallel implementation. +# Currently, it can only perform out-of-place sorting of the particles. It will +# be more performant than the LSORT implementation when using many threads per +# MPI rank but uses more memory because of the out-of-place sort. +#------------------------------------------------------------------------------# + +VSORT="LSORT" +#VSORT="TSORT" + +#------------------------------------------------------------------------------# +# Choose type of library to build. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is to build a static library, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_SHARED_LIBS="OFF" +#SET_SHARED_LIBS="ON" + +#------------------------------------------------------------------------------# +# Choose integrated test support. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is not to build the integrated tests, i.e. OFF. 
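+#
+# If the integrated tests (or the unit tests below) are enabled, they can
+# typically be run after the build finishes with "make test" or with
+# "ctest --output-on-failure" from the build directory.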
+#------------------------------------------------------------------------------# + +SET_INTEGRATED_TESTS="OFF" +#SET_INTEGRATED_TESTS="ON" + +#------------------------------------------------------------------------------# +# Choose unit test support. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is not to build the unit tests, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_UNIT_TESTS="OFF" +#SET_UNIT_TESTS="ON" + +#------------------------------------------------------------------------------# +# Choose OpenSSL support for checksums. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_ENABLE_OPENSSL=OFF, use of checksums is turned off. +# +# If SET_ENABLE_OPENSSL=ON, use of checksums is turned on. +#------------------------------------------------------------------------------# + +SET_ENABLE_OPENSSL="OFF" +#SET_ENABLE_OPENSSL="ON" + +#------------------------------------------------------------------------------# +# Choose support for dynamic resizing of particle arrays. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_DISABLE_DYNAMIC_RESIZING=OFF, particle arrays will be resized +# dynamically. +# +# If SET_DISABLE_DYNAMIC_RESIZING=ON, particle arrays will not be resized +# dynamically and the user will be responsible for ensuring that particle +# arrays have enough space to handle the evolution of a non-uniform particle +# distribution. +#------------------------------------------------------------------------------# + +SET_DISABLE_DYNAMIC_RESIZING="OFF" +#SET_DISABLE_DYNAMIC_RESIZING="ON" + +#------------------------------------------------------------------------------# +# Choose the minimum number of particles to dynamically allocate space for. +#------------------------------------------------------------------------------# +# A value must be chosen. The default is 128 particles which allocates space +# equal to a 4 KByte page size. +#------------------------------------------------------------------------------# + +SET_PARTICLE_MIN_NUM="128" +#SET_PARTICLE_MIN_NUM="32768" + +#------------------------------------------------------------------------------# +# Choose the CMake build type. +#------------------------------------------------------------------------------# +# One of the available options must be chosen. Valid options depend on build +# types available in the CMake version but include at least the following. +# +# Release: In general, the default for CMake. +# None: Tells CMake not to use any pre-defined build type and gives VPIC build +# system total control of CMake variables defined on cmake command line. +#------------------------------------------------------------------------------# + +SET_BUILD_TYPE="Release" +#SET_BUILD_TYPE="None" + +#------------------------------------------------------------------------------# +# Choose number of parallel make processes for build. +#------------------------------------------------------------------------------# +# If NJ variable is not defined, "make" will perform a parallel build using +# maximum number of processors on the compilation machine. 
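+#
+# For example, with the defaults below the final build step in this script is
+# effectively "make -j 8 VERBOSE=0"; leaving NJ undefined would instead run
+# "make -j VERBOSE=0", i.e. an unbounded parallel build.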
+#
+# If using VERBOSE = 1 and NJ > 1, verbose output will be garbled by many
+# processes writing to STDOUT at the same time and will be difficult to
+# interpret.
+#
+# When using VERBOSE = 1, use of NJ = 1 is recommended.
+#
+# The default is to use a modest number of processes in the parallel build.
+#
+# Comment out default below to use all processors on compilation machine.
+#------------------------------------------------------------------------------#
+
+NJ=8
+#NJ=1
+
+#------------------------------------------------------------------------------#
+# Choose verbosity of "make" output.
+#------------------------------------------------------------------------------#
+# Setting VERBOSE = 1 causes "make" to output commands it is executing.
+#
+# This information is useful if debugging a failed build.
+#
+# Setting VERBOSE = 0 or leaving VERBOSE undefined results in a quiet build.
+#
+# The default is a quiet build.
+#------------------------------------------------------------------------------#
+
+SET_VERBOSE=0
+#SET_VERBOSE=1
+
+#------------------------------------------------------------------------------#
+# Choose versions of modules to use if default is not desired.
+#------------------------------------------------------------------------------#
+# No choice is required in this section.
+#
+# Some possible alternative module versions are provided below. Change as
+# needed or desired.
+#
+# This section may need to be updated periodically as the module environment
+# evolves because of updates to operating system and programming environment.
+#------------------------------------------------------------------------------#
+
+#VERSION_CMAKE=3.12.1
+
+#VERSION_INTEL=18.0.3
+#VERSION_INTEL_VTUNE_AMPLIFIER=2019.1.0
+#VERSION_INTEL_VECTOR_ADVISOR=2019.1.0
+#VERSION_INTEL_INSPECTOR=2019.1.0
+#VERSION_INTEL_TRACE_ANALYZER=2019.1.022
+#VERSION_INTEL_MPI=2019.1
+
+#VERSION_GNU=7.3.0
+
+#VERSION_PGI=18.10
+
+#VERSION_OPEN_MPI=3.1.2
+
+#VERSION_FORGE=18.3
+
+#------------------------------------------------------------------------------#
+# Unless the user wants to modify options to the compiler, no changes should
+# be needed below this point.
+#
+# If the user desires to configure compiler options, proceed to the section
+# below for the chosen compiler.
+#------------------------------------------------------------------------------#
+
+#------------------------------------------------------------------------------#
+# Configure default compiler names to use the MPI compiler wrapper scripts.
+#------------------------------------------------------------------------------#
+
+VPIC_COMPILER_C="mpicc"
+VPIC_COMPILER_CXX="mpicxx"
+
+if [ "$VMPI" = "IMPI" ]
+then
+  VPIC_COMPILER_C="mpiicc"
+  VPIC_COMPILER_CXX="mpiicpc"
+fi
+
+#------------------------------------------------------------------------------#
+# Configure options for the Intel compilers.
+#------------------------------------------------------------------------------#
+
+if [ "$VCOM" = "INT" ]
+then
+  #--------------------------------------------------------------------------#
+  # Use "-g" to provide debug symbols in the executable. In general, use of
+  # "-g" with modern compilers does not degrade performance and provides
+  # information required by many tools such as debugging and performance
+  # analysis tools.
+  #
+  # Use of "-O3" provides fairly aggressive optimization. When using vector
+  # intrinsics versions, most of the optimization is explicit in the
+  # intrinsics implementations.
Reasonable alternatives to "-O3" could be + # "-O2" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O3" + + #--------------------------------------------------------------------------# + # Use of "-inline-forceinline" overrides default heuristics of compiler + # and forces inlining of functions marked with inline keyword if compiler + # is able to inline. For VPIC, this option has mainly been used when using + # a portable implementation to force inlining by compiler and also when + # use of "-Winline" option identifies functions not being inlined that are + # marked with inline keyword. + # + # Use of "-qoverride-limits" cause certain internal compiler limits to be + # ignored that are used to limit memory usage and excessive compile times + # by the compiler. + # + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops + # which can be vectorized to always be vectorized, regardless of the + # amount of computational work in the loop. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -inline-forceinline" + #FLAGS_CXX_COMPILER+=" -vec-threshold0" + FLAGS_CXX_COMPILER+=" -qoverride-limits" + + #--------------------------------------------------------------------------# + # Use of "-no-ansi-alias" informs compiler that VPIC does not obey ANSI + # aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -no-ansi-alias" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Winline" + + #--------------------------------------------------------------------------# + # Use of "-qopt-report=5" specifies level of detail in compiler reports. + # This is the maximum level of detail. + # + # Use of "-qopt-report-phase=all" causes all phases of compilation process + # to provide output for compiler reports. Compiler reports are useful for + # understanding how compiler is optimizing various parts of VPIC. + # + # Use of "-diag-disable 10397" disables printing of diagnostic message + # that compiler reports are being generated. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -qopt-report=5" + FLAGS_CXX_COMPILER+=" -qopt-report-phase=all" + FLAGS_CXX_COMPILER+=" -diag-disable 10397" + + #--------------------------------------------------------------------------# + # Use of "-Wl,--export-dynamic" removes following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Wl,--export-dynamic" + + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. 
In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O3" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O3" could be + # "-O2" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O3" + + #--------------------------------------------------------------------------# + # Use of "-inline-forceinline" overrides default heuristics of compiler + # and forces inlining of functions marked with inline keyword if compiler + # is able to inline. For VPIC, this option has mainly been used when using + # a portable implementation to force inlining by compiler and also when + # use of "-Winline" option identifies functions not being inlined that are + # marked with inline keyword. + # + # Use of "-qoverride-limits" cause certain internal compiler limits to be + # ignored that are used to limit memory usage and excessive compile times + # by the compiler. + # + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops + # which can be vectorized to always be vectorized, regardless of the + # amount of computational work in the loop. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -inline-forceinline" + #FLAGS_C_COMPILER+=" -vec-threshold0" + FLAGS_C_COMPILER+=" -qoverride-limits" + + #--------------------------------------------------------------------------# + # Use of "-no-ansi-alias" informs compiler that VPIC does not obey ANSI + # aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -no-ansi-alias" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Winline" + + #--------------------------------------------------------------------------# + # Use of "-qopt-report=5" specifies level of detail in compiler reports. + # This is the maximum level of detail. + # + # Use of "-qopt-report-phase=all" causes all phases of compilation process + # to provide output for compiler reports. Compiler reports are useful for + # understanding how compiler is optimizing various parts of VPIC. + # + # Use of "-diag-disable 10397" disables printing of diagnostic message + # that compiler reports are being generated. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -qopt-report=5" + FLAGS_C_COMPILER+=" -qopt-report-phase=all" + FLAGS_C_COMPILER+=" -diag-disable 10397" + + #--------------------------------------------------------------------------# + # Use of "-Wl,--export-dynamic" removes following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). 
+ # Writing out the raw address instead and keeping my fingers crossed. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Wl,--export-dynamic" +fi + +#------------------------------------------------------------------------------# +# Configure options for the GNU compilers. +#------------------------------------------------------------------------------# + +if [ "$VCOM" = "GNU" ] +then + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O2" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O2" could be + # "-O3" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O2" + + #--------------------------------------------------------------------------# + # Use of "-ffast-math" causes compiler to relax various IEEE or ISO rules + # and specifications for math functions which can result in faster code. + # + # Use of "-fno-unsafe-math-optimizations" turns off some unsafe math + # optimizations that got turned on by use of "-ffast-math" option. Some + # comments in VPIC source code indicate need for this with older compilers. + # This should be checked some time to see if it is still a relevant issue. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -ffast-math" + FLAGS_CXX_COMPILER+=" -fno-unsafe-math-optimizations" + + #--------------------------------------------------------------------------# + # Use of "-fomit-frame-pointer" prevents keeping the frame pointer in a + # register for functions that do not need one. This can make an extra + # register available in many functions and reduce number of overall + # instructions. Some profiling should be done to measure the benefit of + # using this option. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -fomit-frame-pointer" + + #--------------------------------------------------------------------------# + # Use of "-fno-strict-aliasing" informs compiler that VPIC does not obey + # ANSI aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -fno-strict-aliasing" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Winline" + + #--------------------------------------------------------------------------# + # Use of "-rdynamic" removes the following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). 
+ # Writing out the raw address instead and keeping my fingers crossed. + # + # From g++ man page: Pass the flag -export-dynamic to the ELF linker, on + # targets that support it. This instructs the linker to add all symbols, + # not only used ones, to the dynamic symbol table. This option is needed + # for some uses of "dlopen" or to allow obtaining backtraces from within + # a program. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -rdynamic" + + #--------------------------------------------------------------------------# + # Use of "-march=broadwell" causes g++ to generate code specific to and + # optimized for the architecture of Broadwell. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -march=broadwell" + + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O2" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O2" could be + # "-O3" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O2" + + #--------------------------------------------------------------------------# + # Use of "-ffast-math" causes compiler to relax various IEEE or ISO rules + # and specifications for math functions which can result in faster code. + # + # Use of "-fno-unsafe-math-optimizations" turns off some unsafe math + # optimizations that got turned on by use of "-ffast-math" option. Some + # comments in VPIC source code indicate need for this with older compilers. + # This should be checked some time to see if it is still a relevant issue. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -ffast-math" + FLAGS_C_COMPILER+=" -fno-unsafe-math-optimizations" + + #--------------------------------------------------------------------------# + # Use of "-fomit-frame-pointer" prevents keeping the frame pointer in a + # register for functions that do not need one. This can make an extra + # register available in many functions and reduce number of overall + # instructions. Some profiling should be done to measure the benefit of + # using this option. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -fomit-frame-pointer" + + #--------------------------------------------------------------------------# + # Use of "-fno-strict-aliasing" informs compiler that VPIC does not obey + # ANSI aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -fno-strict-aliasing" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. 
+ #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Winline" + + #--------------------------------------------------------------------------# + # Use of "-rdynamic" removes the following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + # + # From gcc man page: Pass the flag -export-dynamic to the ELF linker, on + # targets that support it. This instructs the linker to add all symbols, + # not only used ones, to the dynamic symbol table. This option is needed + # for some uses of "dlopen" or to allow obtaining backtraces from within + # a program. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -rdynamic" + + #--------------------------------------------------------------------------# + # Use of "-march=broadwell" causes gcc to generate code specific to and + # optimized for the architecture of Broadwell. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -march=broadwell" +fi + +#------------------------------------------------------------------------------# +# Configure options for the PGI compilers. +#------------------------------------------------------------------------------# + +if [ "$VCOM" = "PGI" ] +then + #--------------------------------------------------------------------------# + # + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O2" + FLAGS_CXX_COMPILER+=" -Wl,--export-dynamic" + + #--------------------------------------------------------------------------# + # + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O2" + FLAGS_C_COMPILER+=" -Wl,--export-dynamic" +fi + +#------------------------------------------------------------------------------# +# This ends user configuration section. +# +# No changes required below unless VPIC build system has been extended or the +# module system on CTS-1 machines has changed in some fundamental way. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Configure thread model. +#------------------------------------------------------------------------------# + +if [ "$VTHR" = "PTH" ] +then + SET_OPENMP="OFF" + SET_PTHREADS="ON" +fi + +if [ "$VTHR" = "OMP" ] +then + SET_OPENMP="ON" + SET_PTHREADS="OFF" +fi + +#------------------------------------------------------------------------------# +# Configure particle sort method. +#------------------------------------------------------------------------------# + +if [ "$VSORT" = "LSORT" ] +then + SET_LEGACY_SORT="ON" +fi + +#------------------------------------------------------------------------------# +# Configure environment using modules. +#------------------------------------------------------------------------------# +# Note that the user may want to modify the module configuration. +# +# Note that module commands used to define the build environment are captured +# in a Bash script named bashrc.modules which is written into the top level +# build directory. This script can be used in run scripts and other scenarios +# where there is a need to reproduce the environment used to build VPIC. 
+#------------------------------------------------------------------------------# + +echo '#!/bin/bash' >> bashrc.modules +echo "" >> bashrc.modules + +module purge +echo "module purge" >> bashrc.modules + +module load friendly-testing +echo "module load friendly-testing" >> bashrc.modules + +module load sandbox +echo "module load sandbox" >> bashrc.modules + +if [ ! "x$VERSION_CMAKE" = "x" ] +then + module load cmake/$VERSION_CMAKE + echo "module load cmake/$VERSION_CMAKE" >> bashrc.modules +else + module load cmake + echo "module load cmake" >> bashrc.modules +fi + +if [ "$VCOM" = "INT" ] +then + if [ ! "x$VERSION_INTEL" = "x" ] + then + module load intel/$VERSION_INTEL + echo "module load intel/$VERSION_INTEL" >> bashrc.modules + else + module load intel + echo "module load intel" >> bashrc.modules + fi +fi + +if [ "$VCOM" = "GNU" ] +then + if [ ! "x$VERSION_GNU" = "x" ] + then + module load gcc/$VERSION_GNU + echo "module load gcc/$VERSION_GNU" >> bashrc.modules + else + module load gcc + echo "module load gcc" >> bashrc.modules + fi +fi + +if [ "$VCOM" = "PGI" ] +then + if [ ! "x$VERSION_PGI" = "x" ] + then + module load pgi/$VERSION_PGI + echo "module load pgi/$VERSION_PGI" >> bashrc.modules + else + module load pgi + echo "module load pgi" >> bashrc.modules + fi +fi + +if [ "$VMPI" = "OMPI" ] +then + if [ ! "x$VERSION_OPEN_MPI" = "x" ] + then + module load openmpi/$VERSION_OPEN_MPI + echo "module load openmpi/$VERSION_OPEN_MPI" >> bashrc.modules + else + module load openmpi + echo "module load openmpi" >> bashrc.modules + fi +fi + +if [ "$VMPI" = "IMPI" ] +then + if [ ! "x$VERSION_INTEL_MPI" = "x" ] + then + module load intel-mpi/$VERSION_INTEL_MPI + echo "module load intel-mpi/$VERSION_INTEL_MPI" >> bashrc.modules + else + module load intel-mpi + echo "module load intel-mpi" >> bashrc.modules + fi +fi + +module list +echo "" >> bashrc.modules +echo "module list" >> bashrc.modules + +#------------------------------------------------------------------------------# +# Call cmake command. +#------------------------------------------------------------------------------# +# Notes: +# +# Use of the "-LAH" command line option to cmake causes cmake to output the +# values of all of its variables. This is useful information when debugging +# a failed build. +# +# Note that all of the possible VPIC cmake variables relevant to a CTS-1 +# system are set on the command line so that they can all be conditionally +# configured above through user selections. 
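+#
+# For example, the cached value of any option can be checked after this script
+# runs with something like "grep USE_V4_AVX2 CMakeCache.txt" in the build
+# directory.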
+#------------------------------------------------------------------------------# + +cmake \ + -LAH \ + -DCMAKE_BUILD_TYPE=$SET_BUILD_TYPE \ + -DENABLE_INTEGRATED_TESTS=$SET_INTEGRATED_TESTS \ + -DENABLE_UNIT_TESTS=$SET_UNIT_TESTS \ + -DENABLE_OPENSSL=$SET_ENABLE_OPENSSL \ + -DDISABLE_DYNAMIC_RESIZING=$SET_DISABLE_DYNAMIC_RESIZING \ + -DSET_MIN_NUM_PARTICLES=$SET_PARTICLE_MIN_NUM \ + -DUSE_LEGACY_SORT=$SET_LEGACY_SORT \ + -DUSE_V4_PORTABLE=$SET_V4_PORTABLE \ + -DUSE_V4_SSE=$SET_V4_SSE \ + -DUSE_V4_AVX=$SET_V4_AVX \ + -DUSE_V4_AVX2=$SET_V4_AVX2 \ + -DUSE_V8_PORTABLE=$SET_V8_PORTABLE \ + -DUSE_V8_AVX=$SET_V8_AVX \ + -DUSE_V8_AVX2=$SET_V8_AVX2 \ + -DUSE_V16_PORTABLE=$SET_V16_PORTABLE \ + -DVPIC_PRINT_MORE_DIGITS=$SET_MORE_DIGITS \ + -DUSE_OPENMP=$SET_OPENMP \ + -DUSE_PTHREADS=$SET_PTHREADS \ + -DBUILD_SHARED_LIBS=$SET_SHARED_LIBS \ + -DCMAKE_C_COMPILER=$VPIC_COMPILER_C \ + -DCMAKE_CXX_COMPILER=$VPIC_COMPILER_CXX \ + -DCMAKE_C_FLAGS="$FLAGS_C_COMPILER" \ + -DCMAKE_CXX_FLAGS="$FLAGS_CXX_COMPILER" \ + $src_dir + +#------------------------------------------------------------------------------# +# Call make command. +#------------------------------------------------------------------------------# +# Notes: +# +# In general, it is necessary to call the "make" command within this script +# because the module environment has been configured within this script. +# +# Setting VERBOSE=1 causes "make" to output the commands it is executing. +# This information is useful if debugging a failed build. +# +# If the NJ variable is not defined, "make" will perform a parallel build +# using maximum number of processors on the compilation machine. If using +# VERBOSE=1, the verbose output will be garbled by many processes writing +# to STDOUT at the same time and will be difficult to interpret. When using +# VERBOSE=1, it can be helpful to also use NJ=1. +#------------------------------------------------------------------------------# + +make -j $NJ VERBOSE=$SET_VERBOSE + +#------------------------------------------------------------------------------# +# Done. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# vim: syntax=sh +#------------------------------------------------------------------------------# diff --git a/arch/CrayConfig.cmake b/arch/other/CrayConfig.cmake similarity index 100% rename from arch/CrayConfig.cmake rename to arch/other/CrayConfig.cmake diff --git a/arch/generic-Debug b/arch/reference-Debug similarity index 54% rename from arch/generic-Debug rename to arch/reference-Debug index 8b4992c4..2d5699ba 100755 --- a/arch/generic-Debug +++ b/arch/reference-Debug @@ -1,18 +1,10 @@ #! /usr/bin/env bash -#~----------------------------------------------------------------------------~# -# placeholder -#~----------------------------------------------------------------------------~# - #------------------------------------------------------------------------------# # Get the path to the project from which this script was called #------------------------------------------------------------------------------# src_dir="${0%/*}/.." 
-#------------------------------------------------------------------------------# -# Check required environment variables -#------------------------------------------------------------------------------# - #------------------------------------------------------------------------------# # Call CMake command #------------------------------------------------------------------------------# @@ -29,11 +21,3 @@ cmake \ -DCMAKE_C_FLAGS="-rdynamic" \ -DCMAKE_CXX_FLAGS="-rdynamic" \ $src_dir - -#------------------------------------------------------------------------------# -# vim: syntax=sh -#------------------------------------------------------------------------------# - -#~---------------------------------------------------------------------------~-# -# placeholder -#~---------------------------------------------------------------------------~-# diff --git a/arch/generic-Release b/arch/reference-Release similarity index 54% rename from arch/generic-Release rename to arch/reference-Release index 6dcc99e3..4f29eb05 100755 --- a/arch/generic-Release +++ b/arch/reference-Release @@ -1,18 +1,10 @@ #! /usr/bin/env bash -#~----------------------------------------------------------------------------~# -# placeholder -#~----------------------------------------------------------------------------~# - #------------------------------------------------------------------------------# # Get the path to the project from which this script was called #------------------------------------------------------------------------------# src_dir="${0%/*}/.." -#------------------------------------------------------------------------------# -# Check required environment variables -#------------------------------------------------------------------------------# - #------------------------------------------------------------------------------# # Call CMake command #------------------------------------------------------------------------------# @@ -28,11 +20,3 @@ cmake \ -DCMAKE_C_FLAGS="-rdynamic" \ -DCMAKE_CXX_FLAGS="-rdynamic" \ $src_dir - -#------------------------------------------------------------------------------# -# vim: syntax=sh -#------------------------------------------------------------------------------# - -#~---------------------------------------------------------------------------~-# -# placeholder -#~---------------------------------------------------------------------------~-# diff --git a/arch/generic-Release-sse b/arch/v4 similarity index 54% rename from arch/generic-Release-sse rename to arch/v4 index 841ae663..efc8287c 100755 --- a/arch/generic-Release-sse +++ b/arch/v4 @@ -1,7 +1,4 @@ #! /usr/bin/env bash -#~----------------------------------------------------------------------------~# -# placeholder -#~----------------------------------------------------------------------------~# #------------------------------------------------------------------------------# # Get the path to the project from which this script was called @@ -9,10 +6,6 @@ src_dir="${0%/*}/.." 
-#------------------------------------------------------------------------------# -# Check required environment variables -#------------------------------------------------------------------------------# - #------------------------------------------------------------------------------# # Call CMake command #------------------------------------------------------------------------------# @@ -29,11 +22,3 @@ cmake \ -DCMAKE_C_FLAGS="-rdynamic" \ -DCMAKE_CXX_FLAGS="-rdynamic" \ $src_dir - -#------------------------------------------------------------------------------# -# vim: syntax=sh -#------------------------------------------------------------------------------# - -#~---------------------------------------------------------------------------~-# -# placeholder -#~---------------------------------------------------------------------------~-# diff --git a/deck/main.cc b/deck/main.cc index c796e943..001baff4 100644 --- a/deck/main.cc +++ b/deck/main.cc @@ -1,4 +1,4 @@ -/* +/* * Written by: * Kevin J. Bowers, Ph.D. * Plasma Physics Group (X-1) @@ -10,98 +10,142 @@ #include "vpic/vpic.h" -/* The simulation variable is set up this way so both the checkpt - service and main can see it. This allows main to find where - the restored objects are after a restore. */ - +// The simulation variable is set up this way so both the checkpt +// service and main can see it. This allows main to find where +// the restored objects are after a restore. vpic_simulation * simulation = NULL; -void -checkpt_main( vpic_simulation ** _simulation ) { - CHECKPT_PTR( simulation ); -} -vpic_simulation ** -restore_main( void ) { - RESTORE_PTR( simulation ); - return &simulation; +/** + * @brief Function to checkout main simulation object for restarting + * + * @param _simulation Simulation object to checkpoint + */ +void checkpt_main(vpic_simulation** _simulation) +{ + CHECKPT_PTR( simulation ); } -void -checkpt( const char * fbase, - int tag ) { - char fname[256]; - if( !fbase ) ERROR(( "NULL filename base" )); - sprintf( fname, "%s.%i.%i", fbase, tag, world_rank ); - if( world_rank==0 ) log_printf( "*** Checkpointing to \"%s\"\n", fbase ); - checkpt_objects( fname ); +/** + * @brief Function to handle the recovery of the main simulation object at + * restart + * + * @return Returns a double pointer (**) to the simulation object + */ +vpic_simulation** restore_main(void) +{ + RESTORE_PTR( simulation ); + return &simulation; } -int -main( int argc, - char **argv ) { - boot_services( &argc, &argv ); - - const char * fbase = strip_cmdline_string(&argc, &argv, "--restore", NULL); - if( fbase ) { - - // We are restoring from a checkpoint. Determine checkpt file - // for this process, restore all the objects in that file, - // wait for all other processes to finishing restoring (such - // that communication within reanimate functions is safe), - // reanimate all the objects and issue a final barrier to - // so that all processes come of a restore together. - - if( world_rank==0 ) log_printf( "*** Restoring from \"%s\"\n", fbase ); +/** + * @brief Main checkpoint function to trigger a full checkpointing + * + * @param fbase File name base for dumping + * @param tag File tag to label what this checkpoint is (often used: time step) + */ +void checkpt(const char* fbase, int tag) +{ char fname[256]; - sprintf( fname, "%s.%i", fbase, world_rank ); - restore_objects( fname ); - mp_barrier(); - reanimate_objects(); - mp_barrier(); - - } else { - - // We are initializing from scratch. 
- - if( world_rank==0 ) log_printf( "*** Initializing\n" ); - simulation = new vpic_simulation; - simulation->initialize( argc, argv ); - REGISTER_OBJECT( &simulation, checkpt_main, restore_main, NULL ); - - } - - // Do any post init/restore simulation modifications - // FIXME-KJB: STRIP_CMDLINE COULD MAKE THIS CLEANER AND MORE POWERFUL. - - fbase = strip_cmdline_string( &argc, &argv, "--modify", NULL ); - if( fbase ) { - if( world_rank==0 ) log_printf( "*** Modifying from \"%s\"\n", fbase ); - simulation->modify( fbase ); - } - - // Advance the simulation - - if( world_rank==0 ) log_printf( "*** Advancing\n" ); - double elapsed = wallclock(); - while( simulation->advance() ); - elapsed = wallclock() - elapsed; - if( world_rank==0 ) { - int s = (int)elapsed, m = s/60, h = m/60, d = h/24, w = d/ 7; - /**/ s -= m*60, m -= h*60, h -= d*24, d -= w*7; - log_printf( "*** Done (%gs / %iw:%id:%ih:%im:%is elapsed)\n", - elapsed, w, d, h, m, s ); - } - - // Cleaning up - - if( world_rank==0 ) log_printf( "*** Cleaning up\n" ); - UNREGISTER_OBJECT( &simulation ); - simulation->finalize(); - delete simulation; - if( world_rank==0 ) log_printf( "normal exit\n" ); - - halt_services(); - return 0; + if( !fbase ) ERROR(( "NULL filename base" )); + sprintf( fname, "%s.%i.%i", fbase, tag, world_rank ); + if( world_rank==0 ) log_printf( "*** Checkpointing to \"%s\"\n", fbase ); + checkpt_objects( fname ); } +/** + * @brief Program main which triggers a vpic run + * + * @param argc Standard arguments + * @param argv Standard arguments + * + * @return Application error code + */ +int main(int argc, char** argv) +{ + + // Initialize underlying threads and services + boot_services( &argc, &argv ); + + // TODO: this would be better if it was bool-like in nature + const char * fbase = strip_cmdline_string(&argc, &argv, "--restore", NULL); + + // Detect if we should perform a restore as per the user request + if( fbase ) + { + + // We are restoring from a checkpoint. Determine checkpt file + // for this process, restore all the objects in that file, + // wait for all other processes to finishing restoring (such + // that communication within reanimate functions is safe), + // reanimate all the objects and issue a final barrier to + // so that all processes come of a restore together. + if( world_rank==0 ) log_printf( "*** Restoring from \"%s\"\n", fbase ); + char fname[256]; + sprintf( fname, "%s.%i", fbase, world_rank ); + restore_objects( fname ); + mp_barrier(); + reanimate_objects(); + mp_barrier(); + + } + else // We are initializing from scratch. 
+ { + // Perform basic initialization + if( world_rank==0 ) + { + log_printf( "*** Initializing\n" ); + } + simulation = new vpic_simulation(); + simulation->initialize( argc, argv ); + REGISTER_OBJECT( &simulation, checkpt_main, restore_main, NULL ); + } + + // Do any post init/restore simulation modifications + + // Detec if the "modify" option is passed, which allows users to change + // options (such as quota, num_step, etc) when restoring + fbase = strip_cmdline_string( &argc, &argv, "--modify", NULL ); + if( fbase ) + { + if( world_rank==0 ) log_printf( "*** Modifying from \"%s\"\n", fbase ); + simulation->modify( fbase ); + } + + // Perform the main simulation + if( world_rank==0 ) log_printf( "*** Advancing\n" ); + double elapsed = wallclock(); + + // Call the actual advance until it's done + // TODO: Can we make this into a bounded loop + while( simulation->advance() ); + + elapsed = wallclock() - elapsed; + + // Report run time information on rank 0 + if( world_rank==0 ) + { + // Calculate time info + int s = (int)elapsed, m = s/60, h = m/60, d = h/24, w = d/ 7; + s -= m*60; + m -= h*60; + h -= d*24; + d -= w*7; + + log_printf( "*** Done (%gs / %iw:%id:%ih:%im:%is elapsed)\n", + elapsed, w, d, h, m, s ); + } + + if( world_rank==0 ) log_printf( "*** Cleaning up\n" ); + + // Perform Clean up, including de-registering objects + UNREGISTER_OBJECT( &simulation ); + simulation->finalize(); + delete simulation; + + // Check everything went well + if( world_rank==0 ) log_printf( "normal exit\n" ); + + halt_services(); + return 0; +} diff --git a/deck/wrapper.cc b/deck/wrapper.cc index f8098f63..633502c3 100644 --- a/deck/wrapper.cc +++ b/deck/wrapper.cc @@ -1,415 +1,4 @@ -/* - * Written by: - * Kevin J. Bowers, Ph.D. - * Plasma Physics Group (X-1) - * Applied Physics Division - * Los Alamos National Lab - * March/April 2004 - Revised and extended from earlier V4PIC versions - * - */ - -#include // For std::cerr and friends - -#include "vpic/vpic.h" -#include "util/util_base.h" - -//----------------------------------------------------------------------------- - -#define begin_globals struct user_global_t -#define global ((struct user_global_t *)user_global) - -#define begin_initialization \ -void \ -vpic_simulation::user_initialization( int num_cmdline_arguments, \ - char ** cmdline_argument ) - -#define begin_diagnostics \ -void \ -vpic_simulation::user_diagnostics( void ) - -#define begin_particle_injection \ -void \ -vpic_simulation::user_particle_injection( void ) - -#define begin_current_injection \ -void \ -vpic_simulation::user_current_injection( void ) - -#define begin_field_injection \ -void \ -vpic_simulation::user_field_injection( void ) - -#define begin_particle_collisions \ -void \ -vpic_simulation::user_particle_collisions( void ) - -#define repeat( count ) for( int64_t _remain=(int64_t)(count); _remain; _remain-- ) - -#define _SIM_LOG_PREFIX \ - __FILE__ "(" EXPAND_AND_STRINGIFY(__LINE__) ")[" << rank() << "]: " -#define sim_log_local(x) std::cerr << _SIM_LOG_PREFIX << x << std::endl -#define sim_log(x) do { \ - if( rank()==0 ) { \ - std::cerr << _SIM_LOG_PREFIX << x << std::endl; \ - std::cerr.flush(); \ - } \ - } while(0) - -//----------------------------------------------------------------------------- - -// These macros provide support for setting materials, boundary -// conditions and field values inside and on the surface of regions. 
-// -// Most macros work by providing a logical expression in terms of -// double precision coordinates x,y,z that are non-zero if a point is -// inside the intended region and 0 if not. The field macros also take -// several other equations to set field values. For example: -// -// set_region_field( x>0 && sqrt(x*x+y*y+z*z)<1, // A half-sphere region -// sin(k*x), 0, 0, // electric field -// 0, sin(k*x), bz ); // magnetic field -// -// There are two types of regions, point regions and regular regions. -// -// A material value or field component is inside a point region if its -// location is inside the region. A boundary condition face is inside -// a point region if all corners of the face are inside the -// region. Otherwise, a face is on the partially inside a point region -// if some corner of the face is inside the region. -// -// A regular region has two parts: an interior and a -// surface. set_region_bc further divides the surface into an interior -// surface and an exterior surface. The mapping of the region to the -// grid is dictated by the solely by the location of cell centers. -// -// Interior cells are cells whose centers are inside the -// region. Exterior cells are cells whose centers are outside the -// region. -// -// Surface faces are faces for which one associated cell-center is -// inside the region. Interior faces are faces where both associated -// cell-centers are inside the region. Interior surface faces are -// faces whose associated cell-center is inside the region but -// neighbor cell-center is outside the region. The exterior surface -// faces are faces whose associated cell-center is outside the region -// but neighbor cell-center is inside the region. -// -// Surface edges are edges for which up to 3 associated cell-centers -// are inside the region. Interior edges are edges where all -// associated cell-centers are inside the region. -// -// Surface nodes are nodes for which up to 7 one associated -// cell-centers are inside the region. Interior nodes are nodes where -// all associated cell-centers are inside the region. - -// Define a region that fills the whole simulation - -#define everywhere (x == x) && (y == y) && (z == z) -//#define everywhere 1 - -// Define a macro to allow different parts of a region to be selected. 
-// Note: get_particle_bc_id returns 0 if NULL is passed to it -// Note: get_material_bc_id returns -1 if NULL is passed to it - -#define leave_unchanged NULL - -// FIXME: THESE GLOBAL POSITION CALCULATIONS NEED TO BE MADE MORE RIGOROUS - -#define set_point_region_material( rgn, rmat ) do { \ - const material_id _rmat = get_material_id( (rmat) ); \ - if( _rmat==-1 ) break; \ - const double _x0 = grid->x0, _y0 = grid->y0, _z0 = grid->z0; \ - const double _dx = grid->dx, _dy = grid->dy, _dz = grid->dz; \ - const int _nx = grid->nx, _ny = grid->ny, _nz = grid->nz; \ - for( int _k=0; _k<_nz+2; _k++ ) { const double _zn = _z0 + _dz*(_k-1), _zc = _z0 + _dz*(_k-0.5); \ - for( int _j=0; _j<_ny+2; _j++ ) { const double _yn = _y0 + _dy*(_j-1), _yc = _y0 + _dy*(_j-0.5); field_t * _f = &field(0,_j,_k); \ - for( int _i=0; _i<_nx+2; _i++ ) { const double _xn = _x0 + _dx*(_i-1), _xc = _x0 + _dx*(_i-0.5); double x, y, z; \ - x = _xn; y = _yn; z = _zn; if( (rgn) ) _f->nmat = _rmat; \ - x = _xc; if( (rgn) ) _f->ematx = _rmat; \ - y = _yc; if( (rgn) ) _f->fmatz = _rmat; \ - z = _zc; if( (rgn) ) _f->cmat = _rmat; \ - y = _yn; if( (rgn) ) _f->fmaty = _rmat; \ - x = _xn; if( (rgn) ) _f->ematz = _rmat; \ - y = _yc; if( (rgn) ) _f->fmatx = _rmat; \ - z = _zn; if( (rgn) ) _f->ematy = _rmat; \ - _f++; \ - }}} \ - } while(0) - -#define set_point_region_bc( rgn, ipbc, epbc ) do { \ - const int64_t _ipbc = get_particle_bc_id( (particle_bc_t *)(ipbc) ); \ - const int64_t _epbc = get_particle_bc_id( (particle_bc_t *)(epbc) ); \ - if( !_ipbc && !_epbc ) break; \ - const double _x0 = grid->x0, _y0 = grid->y0, _z0 = grid->z0; \ - const double _dx = grid->dx, _dy = grid->dy, _dz = grid->dz; \ - const int _nx = grid->nx, _ny = grid->ny, _nz = grid->nz; \ - int64_t * _n0 = grid->neighbor; \ - for( int _k=1; _k<_nz+1; _k++ ) { const double _zn = _z0 + _dz*(_k-1), _zh = _z0 + _dz*_k; \ - for( int _j=1; _j<_ny+1; _j++ ) { const double _yn = _y0 + _dy*(_j-1), _yh = _y0 + _dy*_j; int64_t * _n = _n0 + 6*voxel(1,_j,_k); \ - for( int _i=1; _i<_nx+1; _i++ ) { const double _xn = _x0 + _dx*(_i-1), _xh = _x0 + _dx*_i; double x, y, z; \ - int _r000, _r100, _r010, _r110, _r001, _r101, _r011, _r111; \ - x = _xn; y = _yn; z = _zn; _r000 = (rgn); \ - x = _xh; _r100 = (rgn); \ - x = _xn; y = _yh; _r010 = (rgn); \ - x = _xh; _r110 = (rgn); \ - x = _xn; y = _yn; z = _zh; _r001 = (rgn); \ - x = _xh; _r101 = (rgn); \ - x = _xn; y = _yh; _r011 = (rgn); \ - x = _xh; _r111 = (rgn); \ - if( _epbc ) { \ - if( _r000 || _r010 || _r001 || _r011 ) _n[0] = _epbc; \ - if( _r000 || _r001 || _r100 || _r101 ) _n[1] = _epbc; \ - if( _r000 || _r100 || _r010 || _r110 ) _n[2] = _epbc; \ - if( _r100 || _r110 || _r101 || _r111 ) _n[3] = _epbc; \ - if( _r010 || _r011 || _r110 || _r111 ) _n[4] = _epbc; \ - if( _r001 || _r101 || _r011 || _r111 ) _n[5] = _epbc; \ - } \ - if( _ipbc ) { \ - if( _r000 && _r010 && _r001 && _r011 ) _n[0] = _ipbc \ - if( _r000 && _r001 && _r100 && _r101 ) _n[1] = _ipbc \ - if( _r000 && _r100 && _r010 && _r110 ) _n[2] = _ipbc \ - if( _r100 && _r110 && _r101 && _r111 ) _n[3] = _ipbc \ - if( _r010 && _r011 && _r110 && _r111 ) _n[4] = _ipbc \ - if( _r001 && _r101 && _r011 && _r111 ) _n[5] = _ipbc \ - } \ - _n += 6; \ - }}} \ - } while(0) - -// The equations are strictly evaluated inside the region -#define set_point_region_field( rgn, \ - eqn_ex, eqn_ey, eqn_ez, \ - eqn_bx, eqn_by, eqn_bz ) do { \ - const double _x0 = grid->x0, _y0 = grid->y0, _z0 = grid->z0; \ - const double _dx = grid->dx, _dy = grid->dy, _dz = grid->dz; \ - const double _c = 
grid->cvac; \ - const int _nx = grid->nx, _ny = grid->ny, _nz = grid->nz; \ - for( int _k=0; _k<_nz+2; _k++ ) { const double _zn = _z0 + _dz*(_k-1), _zc = _z0 + _dz*(_k-0.5); \ - for( int _j=0; _j<_ny+2; _j++ ) { const double _yn = _y0 + _dy*(_j-1), _yc = _y0 + _dy*(_j-0.5); field_t * _f = &field(0,_j,_k); \ - for( int _i=0; _i<_nx+2; _i++ ) { const double _xn = _x0 + _dx*(_i-1), _xc = _x0 + _dx*(_i-0.5); double x, y, z; \ - x = _xn; y = _yn; z = _zn; /* No node fields */ \ - x = _xc; if( (rgn) ) _f->ex = (eqn_ex); \ - y = _yc; if( (rgn) ) _f->cbz = _c*(eqn_bz); \ - z = _zc; /* No cell fields */ \ - y = _yn; if( (rgn) ) _f->cby = _c*(eqn_by); \ - x = _xn; if( (rgn) ) _f->ez = (eqn_ez); \ - y = _yc; if( (rgn) ) _f->cbx = _c*(eqn_bx); \ - z = _zn; if( (rgn) ) _f->ey = (eqn_ey); \ - _f++; \ - }}} \ - } while(0) - -#define set_region_material( rgn, vmat, smat ) do { \ - const material_id _vmat = get_material_id( (vmat) ); \ - const material_id _smat = get_material_id( (smat) ); \ - if( _vmat==-1 && _smat==-1 ) break; \ - const double _x0 = grid->x0, _y0 = grid->y0, _z0 = grid->z0; \ - const double _dx = grid->dx, _dy = grid->dy, _dz = grid->dz; \ - const int _nx = grid->nx, _ny = grid->ny, _nz = grid->nz; \ - for( int _k=0; _k<_nz+2; _k++ ) { const double _zl = _z0 + _dz*(_k-1.5), _zc = _z0 + _dz*(_k-0.5); \ - for( int _j=0; _j<_ny+2; _j++ ) { const double _yl = _y0 + _dy*(_j-1.5), _yc = _y0 + _dy*(_j-0.5); field_t *_f = &field(0,_j,_k); \ - for( int _i=0; _i<_nx+2; _i++ ) { const double _xl = _x0 + _dx*(_i-1.5), _xc = _x0 + _dx*(_i-0.5); double x, y, z; \ - int _rccc, _rlcc, _rclc, _rllc, _rccl, _rlcl, _rcll, _rlll; \ - x = _xc; y = _yc; z = _zc; _rccc = (rgn); \ - x = _xl; _rlcc = (rgn); \ - x = _xc; y = _yl; _rclc = (rgn); \ - x = _xl; _rllc = (rgn); \ - x = _xc; y = _yc; z = _zl; _rccl = (rgn); \ - x = _xl; _rlcl = (rgn); \ - x = _xc; y = _yl; _rcll = (rgn); \ - x = _xl; _rlll = (rgn); \ - if( _smat!=-1 ) { \ - if( _rccc || _rclc || _rccl || _rcll ) _f->ematx = _smat; \ - if( _rccc || _rccl || _rlcc || _rlcl ) _f->ematy = _smat; \ - if( _rccc || _rlcc || _rclc || _rllc ) _f->ematz = _smat; \ - if( _rccc || _rlcc ) _f->fmatx = _smat; \ - if( _rccc || _rclc ) _f->fmaty = _smat; \ - if( _rccc || _rccl ) _f->fmatz = _smat; \ - if( _rccc || _rlcc || _rclc || _rllc || \ - _rccl || _rlcl || _rcll || _rlll ) _f->nmat = _smat; \ - } \ - if( _vmat!=-1 ) { \ - if( _rccc && _rclc && _rccl && _rcll ) _f->ematx = _vmat; \ - if( _rccc && _rccl && _rlcc && _rlcl ) _f->ematy = _vmat; \ - if( _rccc && _rlcc && _rclc && _rllc ) _f->ematz = _vmat; \ - if( _rccc && _rlcc ) _f->fmatx = _vmat; \ - if( _rccc && _rclc ) _f->fmaty = _vmat; \ - if( _rccc && _rccl ) _f->fmatz = _vmat; \ - if( _rccc && _rlcc && _rclc && _rllc && \ - _rccl && _rlcl && _rcll && _rlll ) _f->nmat = _vmat; \ - if( _rccc ) _f->cmat = _vmat; \ - } \ - _f++; \ - }}} \ - } while(0) - -#define set_region_bc( rgn, vpbc, ipbc, epbc ) do { \ - const int64_t _vpbc = get_particle_bc_id( (particle_bc_t *)(vpbc) ); \ - const int64_t _ipbc = get_particle_bc_id( (particle_bc_t *)(ipbc) ); \ - const int64_t _epbc = get_particle_bc_id( (particle_bc_t *)(epbc) ); \ - if( !_vpbc && !_ipbc && !_epbc ) break; \ - const double _x0 = grid->x0, _y0 = grid->y0, _z0 = grid->z0; \ - const double _dx = grid->dx, _dy = grid->dy, _dz = grid->dz; \ - const int _nx = grid->nx, _ny = grid->ny, _nz = grid->nz; \ - int64_t * _n0 = grid->neighbor; \ - for( int _k=1; _k<_nz+1; _k++ ) { const double _zl = _z0 + _dz*(_k-1.5), _zc = _z0 + _dz*(_k-0.5), _zh = _z0 + _dz*(_k+0.5); 
\ - for( int _j=1; _j<_ny+1; _j++ ) { const double _yl = _y0 + _dy*(_j-1.5), _yc = _y0 + _dy*(_j-0.5), _yh = _y0 + _dy*(_j+0.5); int64_t * _n = _n0 + 6*voxel(1,_j,_k); \ - for( int _i=1; _i<_nx+1; _i++ ) { const double _xl = _x0 + _dx*(_i-1.5), _xc = _x0 + _dx*(_i-0.5), _xh = _x0 + _dx*(_i+0.5); double x, y, z; \ - int _rc, _r0, _r1, _r2, _r3, _r4, _r5; \ - x = _xc; y = _yc; z = _zc; _rc = (rgn); \ - x = _xl; y = _yc; z = _zc; _r0 = (rgn); \ - x = _xc; y = _yl; z = _zc; _r1 = (rgn); \ - x = _xc; y = _yc; z = _zl; _r2 = (rgn); \ - x = _xh; y = _yc; z = _zc; _r3 = (rgn); \ - x = _xc; y = _yh; z = _zc; _r4 = (rgn); \ - x = _xc; y = _yc; z = _zh; _r5 = (rgn); \ - if( _vpbc ) { \ - if( _rc && _r0 ) _n[0] = _vpbc; \ - if( _rc && _r1 ) _n[1] = _vpbc; \ - if( _rc && _r2 ) _n[2] = _vpbc; \ - if( _rc && _r3 ) _n[3] = _vpbc; \ - if( _rc && _r4 ) _n[4] = _vpbc; \ - if( _rc && _r5 ) _n[5] = _vpbc; \ - } \ - if( _ipbc ) { \ - if( _rc && !_r0 ) _n[0] = _ipbc; \ - if( _rc && !_r1 ) _n[1] = _ipbc; \ - if( _rc && !_r2 ) _n[2] = _ipbc; \ - if( _rc && !_r3 ) _n[3] = _ipbc; \ - if( _rc && !_r4 ) _n[4] = _ipbc; \ - if( _rc && !_r5 ) _n[5] = _ipbc; \ - } \ - if( _epbc ) { \ - if( !_rc && _r0 ) _n[0] = _epbc; \ - if( !_rc && _r1 ) _n[1] = _epbc; \ - if( !_rc && _r2 ) _n[2] = _epbc; \ - if( !_rc && _r3 ) _n[3] = _epbc; \ - if( !_rc && _r4 ) _n[4] = _epbc; \ - if( !_rc && _r5 ) _n[5] = _epbc; \ - } \ - _n += 6; \ - }}} \ - } while(0) - -// rgn is a logical equation that specifies the interior of the volume -// emitter. This mechanism is only efficient for volumeteric emission -// processes that occupy a small portion of the simulation volume. -// For volumetric emission processes that occupy the entire simulation -// volume, recommend using the begin_particle_injection { }; input -// deck segment. - -#define define_volume_emitter( e, rgn ) do { \ - /* Count the number of cells in the emitter */ \ - const double _x0 = grid->x0, _y0 = grid->y0, _z0 = grid->z0; \ - const double _dx = grid->dx, _dy = grid->dy, _dz = grid->dz; \ - const int _nx = grid->nx, _ny = grid->ny, _nz = grid->nz; \ - int _nc = 0; \ - for( int _k=1; _k<_nz+1; _k++ ) { const double _zc = _z0 + _dz*(_k-0.5); \ - for( int _j=1; _j<_ny+1; _j++ ) { const double _yc = _y0 + _dy*(_j-0.5); \ - for( int _i=1; _i<_nx+1; _i++ ) { const double _xc = _x0 + _dx*(_i-0.5); double x, y, z; \ - x = _xc; y = _yc; z = _zc; if( (rgn) ) _nc++; \ - }}} \ - /* Define the emitter */ \ - int32_t * _c = size_emitter( define_emitter( (e) ), _nc ); \ - _nc = 0; \ - for( int _k=1; _k<_nz+1; _k++ ) { const double _zc = _z0 + _dz*(_k-0.5); \ - for( int _j=1; _j<_ny+1; _j++ ) { const double _yc = _y0 + _dy*(_j-0.5); \ - for( int _i=1; _i<_nx+1; _i++ ) { const double _xc = _x0 + _dx*(_i-0.5); double x, y, z; \ - x = _xc; y = _yc; z = _zc; \ - if( (rgn) ) _c[_nc++] = \ - COMPONENT_ID( voxel(_i,_j,_k), BOUNDARY(0,0,0) ); \ - }}} \ - } while(0) - -// rgn is a logical equation. -// rgn = true for interior of region -// rgn = false for exterior of region -// A surface emitter emits into the exterior of the region. 
- -#define define_surface_emitter( e, rgn ) do { \ - /* Count the number of faces in emitter surface */ \ - const double _x0 = grid->x0, _y0 = grid->y0, _z0 = grid->z0; \ - const double _dx = grid->dx, _dy = grid->dy, _dz = grid->dz; \ - const int _nx = grid->nx, _ny = grid->ny, _nz = grid->nz; \ - int _nf = 0; \ - for( int _k=1; _k<_nz+1; _k++ ) { const double _zl = _z0 + _dz*(_k-1.5), _zc = _z0 + _dz*(_k-0.5), _zh = _z0 + _dz*(_k+0.5); \ - for( int _j=1; _j<_ny+1; _j++ ) { const double _yl = _y0 + _dy*(_j-1.5), _yc = _y0 + _dy*(_j-0.5), _yh = _y0 + _dy*(_j+0.5); \ - for( int _i=1; _i<_nx+1; _i++ ) { const double _xl = _x0 + _dx*(_i-1.5), _xc = _x0 + _dx*(_i-0.5), _xh = _x0 + _dx*(_i+0.5); double x, y, z; \ - int _rc, _r0, _r1, _r2, _r3, _r4, _r5; \ - x = _xc; y = _yc; z = _zc; _rc = (rgn); \ - x = _xl; y = _yc; z = _zc; _r0 = (rgn); \ - x = _xc; y = _yl; z = _zc; _r1 = (rgn); \ - x = _xc; y = _yc; z = _zl; _r2 = (rgn); \ - x = _xh; y = _yc; z = _zc; _r3 = (rgn); \ - x = _xc; y = _yh; z = _zc; _r4 = (rgn); \ - x = _xc; y = _yc; z = _zh; _r5 = (rgn); \ - if( !_rc && _r0 ) _nf++; \ - if( !_rc && _r1 ) _nf++; \ - if( !_rc && _r2 ) _nf++; \ - if( !_rc && _r3 ) _nf++; \ - if( !_rc && _r4 ) _nf++; \ - if( !_rc && _r5 ) _nf++; \ - }}} \ - /* Define the emitter */ \ - int32_t * _c = size_emitter( define_emitter( (e) ), _nf ); \ - _nf = 0; \ - for( int _k=1; _k<_nz+1; _k++ ) { const double _zl = _z0 + _dz*(_k-1.5), _zc = _z0 + _dz*(_k-0.5), _zh = _z0 + _dz*(_k+0.5); \ - for( int _j=1; _j<_ny+1; _j++ ) { const double _yl = _y0 + _dy*(_j-1.5), _yc = _y0 + _dy*(_j-0.5), _yh = _y0 + _dy*(_j+0.5); \ - for( int _i=1; _i<_nx+1; _i++ ) { const double _xl = _x0 + _dx*(_i-1.5), _xc = _x0 + _dx*(_i-0.5), _xh = _x0 + _dx*(_i+0.5); double x, y, z; \ - int _rc, _r0, _r1, _r2, _r3, _r4, _r5; \ - x = _xc; y = _yc; z = _zc; _rc = (rgn); \ - x = _xl; y = _yc; z = _zc; _r0 = (rgn); \ - x = _xc; y = _yl; z = _zc; _r1 = (rgn); \ - x = _xc; y = _yc; z = _zl; _r2 = (rgn); \ - x = _xh; y = _yc; z = _zc; _r3 = (rgn); \ - x = _xc; y = _yh; z = _zc; _r4 = (rgn); \ - x = _xc; y = _yc; z = _zh; _r5 = (rgn); \ - if( !_rc && _r0 ) _c[_nf++] = COMPONENT_ID( voxel(_i,_j,_k), BOUNDARY(-1, 0, 0) ); \ - if( !_rc && _r1 ) _c[_nf++] = COMPONENT_ID( voxel(_i,_j,_k), BOUNDARY( 0,-1, 0) ); \ - if( !_rc && _r2 ) _c[_nf++] = COMPONENT_ID( voxel(_i,_j,_k), BOUNDARY( 0, 0,-1) ); \ - if( !_rc && _r3 ) _c[_nf++] = COMPONENT_ID( voxel(_i,_j,_k), BOUNDARY( 1, 0, 0) ); \ - if( !_rc && _r4 ) _c[_nf++] = COMPONENT_ID( voxel(_i,_j,_k), BOUNDARY( 0, 1, 0) ); \ - if( !_rc && _r5 ) _c[_nf++] = COMPONENT_ID( voxel(_i,_j,_k), BOUNDARY( 0, 0, 1) ); \ - }}} \ - } while(0) - -// The equations are only evaluated inside the mesh-mapped region -// (This is not strictly inside the region) -#define set_region_field( rgn, \ - eqn_ex, eqn_ey, eqn_ez, \ - eqn_bx, eqn_by, eqn_bz ) do { \ - const double _x0 = grid->x0, _y0 = grid->y0, _z0 = grid->z0; \ - const double _dx = grid->dx, _dy = grid->dy, _dz = grid->dz; \ - const double _c = grid->cvac; \ - const int _nx = grid->nx, _ny = grid->ny, _nz = grid->nz; \ - for( int _k=0; _k<_nz+2; _k++ ) { const double _zl = _z0 + _dz*(_k-1.5), _ze = _z0 + _dz*_k, _zc = _z0 + _dz*(_k-0.5); \ - for( int _j=0; _j<_ny+2; _j++ ) { const double _yl = _y0 + _dy*(_j-1.5), _ye = _y0 + _dy*_j, _yc = _y0 + _dy*(_j-0.5); field_t *_f = &field(0,_j,_k); \ - for( int _i=0; _i<_nx+2; _i++ ) { const double _xl = _x0 + _dx*(_i-1.5), _xe = _x0 + _dx*_i, _xc = _x0 + _dx*(_i-0.5); double x, y, z; \ - int _rccc, _rlcc, _rclc, _rllc, _rccl, _rlcl, 
_rcll; \ - x = _xc; y = _yc; z = _zc; _rccc = (rgn); \ - x = _xl; _rlcc = (rgn); \ - x = _xc; y = _yl; _rclc = (rgn); \ - x = _xl; _rllc = (rgn); \ - x = _xc; y = _yc; z = _zl; _rccl = (rgn); \ - x = _xl; _rlcl = (rgn); \ - x = _xc; y = _yl; _rcll = (rgn); \ - x = _xc; y = _ye; z = _ze; if( _rccc || _rclc || _rccl || _rcll ) _f->ex = (eqn_ex); \ - x = _xe; y = _yc; z = _ze; if( _rccc || _rccl || _rlcc || _rlcl ) _f->ey = (eqn_ey); \ - x = _xe; y = _ye; z = _zc; if( _rccc || _rlcc || _rclc || _rllc ) _f->ez = (eqn_ez); \ - x = _xe; y = _yc; z = _zc; if( _rccc || _rlcc ) _f->cbx = _c*(eqn_bx); \ - x = _xc; y = _ye; z = _zc; if( _rccc || _rclc ) _f->cby = _c*(eqn_by); \ - x = _xc; y = _yc; z = _ze; if( _rccc || _rccl ) _f->cbz = _c*(eqn_bz); \ - _f++; \ - }}} \ - } while(0) - -// In main.cxx - -void -checkpt( const char * fbase, - int tag ); - -//----------------------------------------------------------------------------- +#include "wrapper.h" // Include the users input deck #include EXPAND_AND_STRINGIFY(INPUT_DECK) diff --git a/deck/wrapper.h b/deck/wrapper.h new file mode 100644 index 00000000..e495aaf4 --- /dev/null +++ b/deck/wrapper.h @@ -0,0 +1,420 @@ +/* + * Written by: + * Kevin J. Bowers, Ph.D. + * Plasma Physics Group (X-1) + * Applied Physics Division + * Los Alamos National Lab + * March/April 2004 - Revised and extended from earlier V4PIC versions + * + */ + +#include // For std::cerr and friends + +#include "vpic/vpic.h" +#include "util/util_base.h" + +//----------------------------------------------------------------------------- + +#define begin_globals struct user_global_t +#define DECLARE_GLOBAL_STRUCT user_global_t* global = new user_global_t(); + +// RFB: This macro *also* declares the global struct. global cannot be used +// before begin_initialization in input decks. If you see something like: +// "error: use of undeclared identifier 'global'" it is because you are trying +// to use the global variable before begin_initialization is declared. This can +// most likely be fixed by hoisting begin_globals and begin_initialization to +// the top of your input deck. Alternatively, you can manually invoke +// DECLARE_GLOBAL_STRUCT. +#define begin_initialization \ +DECLARE_GLOBAL_STRUCT \ +void \ +vpic_simulation::user_initialization( int num_cmdline_arguments, \ + char ** cmdline_argument ) + +#define begin_diagnostics \ +void \ +vpic_simulation::user_diagnostics( void ) + +#define begin_particle_injection \ +void \ +vpic_simulation::user_particle_injection( void ) + +#define begin_current_injection \ +void \ +vpic_simulation::user_current_injection( void ) + +#define begin_field_injection \ +void \ +vpic_simulation::user_field_injection( void ) + +#define begin_particle_collisions \ +void \ +vpic_simulation::user_particle_collisions( void ) + +#define repeat( count ) for( int64_t _remain=(int64_t)(count); _remain; _remain-- ) + +#define _SIM_LOG_PREFIX \ + __FILE__ "(" EXPAND_AND_STRINGIFY(__LINE__) ")[" << rank() << "]: " +#define sim_log_local(x) std::cerr << _SIM_LOG_PREFIX << x << std::endl +#define sim_log(x) do { \ + if( rank()==0 ) { \ + std::cerr << _SIM_LOG_PREFIX << x << std::endl; \ + std::cerr.flush(); \ + } \ + } while(0) + +//----------------------------------------------------------------------------- + +// These macros provide support for setting materials, boundary +// conditions and field values inside and on the surface of regions. 
+// +// Most macros work by providing a logical expression in terms of +// double precision coordinates x,y,z that are non-zero if a point is +// inside the intended region and 0 if not. The field macros also take +// several other equations to set field values. For example: +// +// set_region_field( x>0 && sqrt(x*x+y*y+z*z)<1, // A half-sphere region +// sin(k*x), 0, 0, // electric field +// 0, sin(k*x), bz ); // magnetic field +// +// There are two types of regions, point regions and regular regions. +// +// A material value or field component is inside a point region if its +// location is inside the region. A boundary condition face is inside +// a point region if all corners of the face are inside the +// region. Otherwise, a face is on the partially inside a point region +// if some corner of the face is inside the region. +// +// A regular region has two parts: an interior and a +// surface. set_region_bc further divides the surface into an interior +// surface and an exterior surface. The mapping of the region to the +// grid is dictated by the solely by the location of cell centers. +// +// Interior cells are cells whose centers are inside the +// region. Exterior cells are cells whose centers are outside the +// region. +// +// Surface faces are faces for which one associated cell-center is +// inside the region. Interior faces are faces where both associated +// cell-centers are inside the region. Interior surface faces are +// faces whose associated cell-center is inside the region but +// neighbor cell-center is outside the region. The exterior surface +// faces are faces whose associated cell-center is outside the region +// but neighbor cell-center is inside the region. +// +// Surface edges are edges for which up to 3 associated cell-centers +// are inside the region. Interior edges are edges where all +// associated cell-centers are inside the region. +// +// Surface nodes are nodes for which up to 7 one associated +// cell-centers are inside the region. Interior nodes are nodes where +// all associated cell-centers are inside the region. + +// Define a region that fills the whole simulation + +#define everywhere (x == x) && (y == y) && (z == z) +//#define everywhere 1 + +// Define a macro to allow different parts of a region to be selected. 
+// Note: get_particle_bc_id returns 0 if NULL is passed to it +// Note: get_material_bc_id returns -1 if NULL is passed to it + +#define leave_unchanged NULL + +// FIXME: THESE GLOBAL POSITION CALCULATIONS NEED TO BE MADE MORE RIGOROUS + +#define set_point_region_material( rgn, rmat ) do { \ + const material_id _rmat = get_material_id( (rmat) ); \ + if( _rmat==-1 ) break; \ + const double _x0 = grid->x0, _y0 = grid->y0, _z0 = grid->z0; \ + const double _dx = grid->dx, _dy = grid->dy, _dz = grid->dz; \ + const int _nx = grid->nx, _ny = grid->ny, _nz = grid->nz; \ + for( int _k=0; _k<_nz+2; _k++ ) { const double _zn = _z0 + _dz*(_k-1), _zc = _z0 + _dz*(_k-0.5); \ + for( int _j=0; _j<_ny+2; _j++ ) { const double _yn = _y0 + _dy*(_j-1), _yc = _y0 + _dy*(_j-0.5); field_t * _f = &field(0,_j,_k); \ + for( int _i=0; _i<_nx+2; _i++ ) { const double _xn = _x0 + _dx*(_i-1), _xc = _x0 + _dx*(_i-0.5); double x, y, z; \ + x = _xn; y = _yn; z = _zn; if( (rgn) ) _f->nmat = _rmat; \ + x = _xc; if( (rgn) ) _f->ematx = _rmat; \ + y = _yc; if( (rgn) ) _f->fmatz = _rmat; \ + z = _zc; if( (rgn) ) _f->cmat = _rmat; \ + y = _yn; if( (rgn) ) _f->fmaty = _rmat; \ + x = _xn; if( (rgn) ) _f->ematz = _rmat; \ + y = _yc; if( (rgn) ) _f->fmatx = _rmat; \ + z = _zn; if( (rgn) ) _f->ematy = _rmat; \ + _f++; \ + }}} \ + } while(0) + +#define set_point_region_bc( rgn, ipbc, epbc ) do { \ + const int64_t _ipbc = get_particle_bc_id( (particle_bc_t *)(ipbc) ); \ + const int64_t _epbc = get_particle_bc_id( (particle_bc_t *)(epbc) ); \ + if( !_ipbc && !_epbc ) break; \ + const double _x0 = grid->x0, _y0 = grid->y0, _z0 = grid->z0; \ + const double _dx = grid->dx, _dy = grid->dy, _dz = grid->dz; \ + const int _nx = grid->nx, _ny = grid->ny, _nz = grid->nz; \ + int64_t * _n0 = grid->neighbor; \ + for( int _k=1; _k<_nz+1; _k++ ) { const double _zn = _z0 + _dz*(_k-1), _zh = _z0 + _dz*_k; \ + for( int _j=1; _j<_ny+1; _j++ ) { const double _yn = _y0 + _dy*(_j-1), _yh = _y0 + _dy*_j; int64_t * _n = _n0 + 6*voxel(1,_j,_k); \ + for( int _i=1; _i<_nx+1; _i++ ) { const double _xn = _x0 + _dx*(_i-1), _xh = _x0 + _dx*_i; double x, y, z; \ + int _r000, _r100, _r010, _r110, _r001, _r101, _r011, _r111; \ + x = _xn; y = _yn; z = _zn; _r000 = (rgn); \ + x = _xh; _r100 = (rgn); \ + x = _xn; y = _yh; _r010 = (rgn); \ + x = _xh; _r110 = (rgn); \ + x = _xn; y = _yn; z = _zh; _r001 = (rgn); \ + x = _xh; _r101 = (rgn); \ + x = _xn; y = _yh; _r011 = (rgn); \ + x = _xh; _r111 = (rgn); \ + if( _epbc ) { \ + if( _r000 || _r010 || _r001 || _r011 ) _n[0] = _epbc; \ + if( _r000 || _r001 || _r100 || _r101 ) _n[1] = _epbc; \ + if( _r000 || _r100 || _r010 || _r110 ) _n[2] = _epbc; \ + if( _r100 || _r110 || _r101 || _r111 ) _n[3] = _epbc; \ + if( _r010 || _r011 || _r110 || _r111 ) _n[4] = _epbc; \ + if( _r001 || _r101 || _r011 || _r111 ) _n[5] = _epbc; \ + } \ + if( _ipbc ) { \ + if( _r000 && _r010 && _r001 && _r011 ) _n[0] = _ipbc \ + if( _r000 && _r001 && _r100 && _r101 ) _n[1] = _ipbc \ + if( _r000 && _r100 && _r010 && _r110 ) _n[2] = _ipbc \ + if( _r100 && _r110 && _r101 && _r111 ) _n[3] = _ipbc \ + if( _r010 && _r011 && _r110 && _r111 ) _n[4] = _ipbc \ + if( _r001 && _r101 && _r011 && _r111 ) _n[5] = _ipbc \ + } \ + _n += 6; \ + }}} \ + } while(0) + +// The equations are strictly evaluated inside the region +#define set_point_region_field( rgn, \ + eqn_ex, eqn_ey, eqn_ez, \ + eqn_bx, eqn_by, eqn_bz ) do { \ + const double _x0 = grid->x0, _y0 = grid->y0, _z0 = grid->z0; \ + const double _dx = grid->dx, _dy = grid->dy, _dz = grid->dz; \ + const double _c = 
grid->cvac; \ + const int _nx = grid->nx, _ny = grid->ny, _nz = grid->nz; \ + for( int _k=0; _k<_nz+2; _k++ ) { const double _zn = _z0 + _dz*(_k-1), _zc = _z0 + _dz*(_k-0.5); \ + for( int _j=0; _j<_ny+2; _j++ ) { const double _yn = _y0 + _dy*(_j-1), _yc = _y0 + _dy*(_j-0.5); field_t * _f = &field(0,_j,_k); \ + for( int _i=0; _i<_nx+2; _i++ ) { const double _xn = _x0 + _dx*(_i-1), _xc = _x0 + _dx*(_i-0.5); double x, y, z; \ + x = _xn; y = _yn; z = _zn; /* No node fields */ \ + x = _xc; if( (rgn) ) _f->ex = (eqn_ex); \ + y = _yc; if( (rgn) ) _f->cbz = _c*(eqn_bz); \ + z = _zc; /* No cell fields */ \ + y = _yn; if( (rgn) ) _f->cby = _c*(eqn_by); \ + x = _xn; if( (rgn) ) _f->ez = (eqn_ez); \ + y = _yc; if( (rgn) ) _f->cbx = _c*(eqn_bx); \ + z = _zn; if( (rgn) ) _f->ey = (eqn_ey); \ + _f++; \ + }}} \ + } while(0) + +#define set_region_material( rgn, vmat, smat ) do { \ + const material_id _vmat = get_material_id( (vmat) ); \ + const material_id _smat = get_material_id( (smat) ); \ + if( _vmat==-1 && _smat==-1 ) break; \ + const double _x0 = grid->x0, _y0 = grid->y0, _z0 = grid->z0; \ + const double _dx = grid->dx, _dy = grid->dy, _dz = grid->dz; \ + const int _nx = grid->nx, _ny = grid->ny, _nz = grid->nz; \ + for( int _k=0; _k<_nz+2; _k++ ) { const double _zl = _z0 + _dz*(_k-1.5), _zc = _z0 + _dz*(_k-0.5); \ + for( int _j=0; _j<_ny+2; _j++ ) { const double _yl = _y0 + _dy*(_j-1.5), _yc = _y0 + _dy*(_j-0.5); field_t *_f = &field(0,_j,_k); \ + for( int _i=0; _i<_nx+2; _i++ ) { const double _xl = _x0 + _dx*(_i-1.5), _xc = _x0 + _dx*(_i-0.5); double x, y, z; \ + int _rccc, _rlcc, _rclc, _rllc, _rccl, _rlcl, _rcll, _rlll; \ + x = _xc; y = _yc; z = _zc; _rccc = (rgn); \ + x = _xl; _rlcc = (rgn); \ + x = _xc; y = _yl; _rclc = (rgn); \ + x = _xl; _rllc = (rgn); \ + x = _xc; y = _yc; z = _zl; _rccl = (rgn); \ + x = _xl; _rlcl = (rgn); \ + x = _xc; y = _yl; _rcll = (rgn); \ + x = _xl; _rlll = (rgn); \ + if( _smat!=-1 ) { \ + if( _rccc || _rclc || _rccl || _rcll ) _f->ematx = _smat; \ + if( _rccc || _rccl || _rlcc || _rlcl ) _f->ematy = _smat; \ + if( _rccc || _rlcc || _rclc || _rllc ) _f->ematz = _smat; \ + if( _rccc || _rlcc ) _f->fmatx = _smat; \ + if( _rccc || _rclc ) _f->fmaty = _smat; \ + if( _rccc || _rccl ) _f->fmatz = _smat; \ + if( _rccc || _rlcc || _rclc || _rllc || \ + _rccl || _rlcl || _rcll || _rlll ) _f->nmat = _smat; \ + } \ + if( _vmat!=-1 ) { \ + if( _rccc && _rclc && _rccl && _rcll ) _f->ematx = _vmat; \ + if( _rccc && _rccl && _rlcc && _rlcl ) _f->ematy = _vmat; \ + if( _rccc && _rlcc && _rclc && _rllc ) _f->ematz = _vmat; \ + if( _rccc && _rlcc ) _f->fmatx = _vmat; \ + if( _rccc && _rclc ) _f->fmaty = _vmat; \ + if( _rccc && _rccl ) _f->fmatz = _vmat; \ + if( _rccc && _rlcc && _rclc && _rllc && \ + _rccl && _rlcl && _rcll && _rlll ) _f->nmat = _vmat; \ + if( _rccc ) _f->cmat = _vmat; \ + } \ + _f++; \ + }}} \ + } while(0) + +#define set_region_bc( rgn, vpbc, ipbc, epbc ) do { \ + const int64_t _vpbc = get_particle_bc_id( (particle_bc_t *)(vpbc) ); \ + const int64_t _ipbc = get_particle_bc_id( (particle_bc_t *)(ipbc) ); \ + const int64_t _epbc = get_particle_bc_id( (particle_bc_t *)(epbc) ); \ + if( !_vpbc && !_ipbc && !_epbc ) break; \ + const double _x0 = grid->x0, _y0 = grid->y0, _z0 = grid->z0; \ + const double _dx = grid->dx, _dy = grid->dy, _dz = grid->dz; \ + const int _nx = grid->nx, _ny = grid->ny, _nz = grid->nz; \ + int64_t * _n0 = grid->neighbor; \ + for( int _k=1; _k<_nz+1; _k++ ) { const double _zl = _z0 + _dz*(_k-1.5), _zc = _z0 + _dz*(_k-0.5), _zh = _z0 + _dz*(_k+0.5); 
\ + for( int _j=1; _j<_ny+1; _j++ ) { const double _yl = _y0 + _dy*(_j-1.5), _yc = _y0 + _dy*(_j-0.5), _yh = _y0 + _dy*(_j+0.5); int64_t * _n = _n0 + 6*voxel(1,_j,_k); \ + for( int _i=1; _i<_nx+1; _i++ ) { const double _xl = _x0 + _dx*(_i-1.5), _xc = _x0 + _dx*(_i-0.5), _xh = _x0 + _dx*(_i+0.5); double x, y, z; \ + int _rc, _r0, _r1, _r2, _r3, _r4, _r5; \ + x = _xc; y = _yc; z = _zc; _rc = (rgn); \ + x = _xl; y = _yc; z = _zc; _r0 = (rgn); \ + x = _xc; y = _yl; z = _zc; _r1 = (rgn); \ + x = _xc; y = _yc; z = _zl; _r2 = (rgn); \ + x = _xh; y = _yc; z = _zc; _r3 = (rgn); \ + x = _xc; y = _yh; z = _zc; _r4 = (rgn); \ + x = _xc; y = _yc; z = _zh; _r5 = (rgn); \ + if( _vpbc ) { \ + if( _rc && _r0 ) _n[0] = _vpbc; \ + if( _rc && _r1 ) _n[1] = _vpbc; \ + if( _rc && _r2 ) _n[2] = _vpbc; \ + if( _rc && _r3 ) _n[3] = _vpbc; \ + if( _rc && _r4 ) _n[4] = _vpbc; \ + if( _rc && _r5 ) _n[5] = _vpbc; \ + } \ + if( _ipbc ) { \ + if( _rc && !_r0 ) _n[0] = _ipbc; \ + if( _rc && !_r1 ) _n[1] = _ipbc; \ + if( _rc && !_r2 ) _n[2] = _ipbc; \ + if( _rc && !_r3 ) _n[3] = _ipbc; \ + if( _rc && !_r4 ) _n[4] = _ipbc; \ + if( _rc && !_r5 ) _n[5] = _ipbc; \ + } \ + if( _epbc ) { \ + if( !_rc && _r0 ) _n[0] = _epbc; \ + if( !_rc && _r1 ) _n[1] = _epbc; \ + if( !_rc && _r2 ) _n[2] = _epbc; \ + if( !_rc && _r3 ) _n[3] = _epbc; \ + if( !_rc && _r4 ) _n[4] = _epbc; \ + if( !_rc && _r5 ) _n[5] = _epbc; \ + } \ + _n += 6; \ + }}} \ + } while(0) + +// rgn is a logical equation that specifies the interior of the volume +// emitter. This mechanism is only efficient for volumeteric emission +// processes that occupy a small portion of the simulation volume. +// For volumetric emission processes that occupy the entire simulation +// volume, recommend using the begin_particle_injection { }; input +// deck segment. + +#define define_volume_emitter( e, rgn ) do { \ + /* Count the number of cells in the emitter */ \ + const double _x0 = grid->x0, _y0 = grid->y0, _z0 = grid->z0; \ + const double _dx = grid->dx, _dy = grid->dy, _dz = grid->dz; \ + const int _nx = grid->nx, _ny = grid->ny, _nz = grid->nz; \ + int _nc = 0; \ + for( int _k=1; _k<_nz+1; _k++ ) { const double _zc = _z0 + _dz*(_k-0.5); \ + for( int _j=1; _j<_ny+1; _j++ ) { const double _yc = _y0 + _dy*(_j-0.5); \ + for( int _i=1; _i<_nx+1; _i++ ) { const double _xc = _x0 + _dx*(_i-0.5); double x, y, z; \ + x = _xc; y = _yc; z = _zc; if( (rgn) ) _nc++; \ + }}} \ + /* Define the emitter */ \ + int32_t * _c = size_emitter( define_emitter( (e) ), _nc ); \ + _nc = 0; \ + for( int _k=1; _k<_nz+1; _k++ ) { const double _zc = _z0 + _dz*(_k-0.5); \ + for( int _j=1; _j<_ny+1; _j++ ) { const double _yc = _y0 + _dy*(_j-0.5); \ + for( int _i=1; _i<_nx+1; _i++ ) { const double _xc = _x0 + _dx*(_i-0.5); double x, y, z; \ + x = _xc; y = _yc; z = _zc; \ + if( (rgn) ) _c[_nc++] = \ + COMPONENT_ID( voxel(_i,_j,_k), BOUNDARY(0,0,0) ); \ + }}} \ + } while(0) + +// rgn is a logical equation. +// rgn = true for interior of region +// rgn = false for exterior of region +// A surface emitter emits into the exterior of the region. 
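+//
+// A minimal usage sketch (illustrative; the emitter handle "e" below is
+// assumed to have been created earlier in the input deck): the exterior
+// surface of the half-space x<0 can be turned into an emitter with
+//
+//   define_surface_emitter( e, x<0 );
+//
+// Each face whose own cell-center is outside the region but whose
+// neighbor's cell-center is inside it is added to the emitter.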
+ +#define define_surface_emitter( e, rgn ) do { \ + /* Count the number of faces in emitter surface */ \ + const double _x0 = grid->x0, _y0 = grid->y0, _z0 = grid->z0; \ + const double _dx = grid->dx, _dy = grid->dy, _dz = grid->dz; \ + const int _nx = grid->nx, _ny = grid->ny, _nz = grid->nz; \ + int _nf = 0; \ + for( int _k=1; _k<_nz+1; _k++ ) { const double _zl = _z0 + _dz*(_k-1.5), _zc = _z0 + _dz*(_k-0.5), _zh = _z0 + _dz*(_k+0.5); \ + for( int _j=1; _j<_ny+1; _j++ ) { const double _yl = _y0 + _dy*(_j-1.5), _yc = _y0 + _dy*(_j-0.5), _yh = _y0 + _dy*(_j+0.5); \ + for( int _i=1; _i<_nx+1; _i++ ) { const double _xl = _x0 + _dx*(_i-1.5), _xc = _x0 + _dx*(_i-0.5), _xh = _x0 + _dx*(_i+0.5); double x, y, z; \ + int _rc, _r0, _r1, _r2, _r3, _r4, _r5; \ + x = _xc; y = _yc; z = _zc; _rc = (rgn); \ + x = _xl; y = _yc; z = _zc; _r0 = (rgn); \ + x = _xc; y = _yl; z = _zc; _r1 = (rgn); \ + x = _xc; y = _yc; z = _zl; _r2 = (rgn); \ + x = _xh; y = _yc; z = _zc; _r3 = (rgn); \ + x = _xc; y = _yh; z = _zc; _r4 = (rgn); \ + x = _xc; y = _yc; z = _zh; _r5 = (rgn); \ + if( !_rc && _r0 ) _nf++; \ + if( !_rc && _r1 ) _nf++; \ + if( !_rc && _r2 ) _nf++; \ + if( !_rc && _r3 ) _nf++; \ + if( !_rc && _r4 ) _nf++; \ + if( !_rc && _r5 ) _nf++; \ + }}} \ + /* Define the emitter */ \ + int32_t * _c = size_emitter( define_emitter( (e) ), _nf ); \ + _nf = 0; \ + for( int _k=1; _k<_nz+1; _k++ ) { const double _zl = _z0 + _dz*(_k-1.5), _zc = _z0 + _dz*(_k-0.5), _zh = _z0 + _dz*(_k+0.5); \ + for( int _j=1; _j<_ny+1; _j++ ) { const double _yl = _y0 + _dy*(_j-1.5), _yc = _y0 + _dy*(_j-0.5), _yh = _y0 + _dy*(_j+0.5); \ + for( int _i=1; _i<_nx+1; _i++ ) { const double _xl = _x0 + _dx*(_i-1.5), _xc = _x0 + _dx*(_i-0.5), _xh = _x0 + _dx*(_i+0.5); double x, y, z; \ + int _rc, _r0, _r1, _r2, _r3, _r4, _r5; \ + x = _xc; y = _yc; z = _zc; _rc = (rgn); \ + x = _xl; y = _yc; z = _zc; _r0 = (rgn); \ + x = _xc; y = _yl; z = _zc; _r1 = (rgn); \ + x = _xc; y = _yc; z = _zl; _r2 = (rgn); \ + x = _xh; y = _yc; z = _zc; _r3 = (rgn); \ + x = _xc; y = _yh; z = _zc; _r4 = (rgn); \ + x = _xc; y = _yc; z = _zh; _r5 = (rgn); \ + if( !_rc && _r0 ) _c[_nf++] = COMPONENT_ID( voxel(_i,_j,_k), BOUNDARY(-1, 0, 0) ); \ + if( !_rc && _r1 ) _c[_nf++] = COMPONENT_ID( voxel(_i,_j,_k), BOUNDARY( 0,-1, 0) ); \ + if( !_rc && _r2 ) _c[_nf++] = COMPONENT_ID( voxel(_i,_j,_k), BOUNDARY( 0, 0,-1) ); \ + if( !_rc && _r3 ) _c[_nf++] = COMPONENT_ID( voxel(_i,_j,_k), BOUNDARY( 1, 0, 0) ); \ + if( !_rc && _r4 ) _c[_nf++] = COMPONENT_ID( voxel(_i,_j,_k), BOUNDARY( 0, 1, 0) ); \ + if( !_rc && _r5 ) _c[_nf++] = COMPONENT_ID( voxel(_i,_j,_k), BOUNDARY( 0, 0, 1) ); \ + }}} \ + } while(0) + +// The equations are only evaluated inside the mesh-mapped region +// (This is not strictly inside the region) +#define set_region_field( rgn, \ + eqn_ex, eqn_ey, eqn_ez, \ + eqn_bx, eqn_by, eqn_bz ) do { \ + const double _x0 = grid->x0, _y0 = grid->y0, _z0 = grid->z0; \ + const double _dx = grid->dx, _dy = grid->dy, _dz = grid->dz; \ + const double _c = grid->cvac; \ + const int _nx = grid->nx, _ny = grid->ny, _nz = grid->nz; \ + for( int _k=0; _k<_nz+2; _k++ ) { const double _zl = _z0 + _dz*(_k-1.5), _ze = _z0 + _dz*_k, _zc = _z0 + _dz*(_k-0.5); \ + for( int _j=0; _j<_ny+2; _j++ ) { const double _yl = _y0 + _dy*(_j-1.5), _ye = _y0 + _dy*_j, _yc = _y0 + _dy*(_j-0.5); field_t *_f = &field(0,_j,_k); \ + for( int _i=0; _i<_nx+2; _i++ ) { const double _xl = _x0 + _dx*(_i-1.5), _xe = _x0 + _dx*_i, _xc = _x0 + _dx*(_i-0.5); double x, y, z; \ + int _rccc, _rlcc, _rclc, _rllc, _rccl, _rlcl, 
_rcll; \ + x = _xc; y = _yc; z = _zc; _rccc = (rgn); \ + x = _xl; _rlcc = (rgn); \ + x = _xc; y = _yl; _rclc = (rgn); \ + x = _xl; _rllc = (rgn); \ + x = _xc; y = _yc; z = _zl; _rccl = (rgn); \ + x = _xl; _rlcl = (rgn); \ + x = _xc; y = _yl; _rcll = (rgn); \ + x = _xc; y = _ye; z = _ze; if( _rccc || _rclc || _rccl || _rcll ) _f->ex = (eqn_ex); \ + x = _xe; y = _yc; z = _ze; if( _rccc || _rccl || _rlcc || _rlcl ) _f->ey = (eqn_ey); \ + x = _xe; y = _ye; z = _zc; if( _rccc || _rlcc || _rclc || _rllc ) _f->ez = (eqn_ez); \ + x = _xe; y = _yc; z = _zc; if( _rccc || _rlcc ) _f->cbx = _c*(eqn_bx); \ + x = _xc; y = _ye; z = _zc; if( _rccc || _rclc ) _f->cby = _c*(eqn_by); \ + x = _xc; y = _yc; z = _ze; if( _rccc || _rccl ) _f->cbz = _c*(eqn_bz); \ + _f++; \ + }}} \ + } while(0) + +// In main.cxx + +void +checkpt( const char * fbase, + int tag ); + +//----------------------------------------------------------------------------- diff --git a/sample/harris b/sample/harris index 364544da..0ecaae6d 100644 --- a/sample/harris +++ b/sample/harris @@ -138,9 +138,9 @@ begin_initialization { global->fields_interval = status_interval; global->ehydro_interval = status_interval; global->ihydro_interval = status_interval; - global->eparticle_interval = status_interval; // Do not dump - global->iparticle_interval = status_interval; // Do not dump - global->restart_interval = status_interval; // Do not dump + global->eparticle_interval = status_interval; + global->iparticle_interval = status_interval; + global->restart_interval = status_interval; /////////////////////////// // Setup the space and time diff --git a/src/boundary/boundary_p.cc b/src/boundary/boundary_p.cc index b14517a2..25d87b41 100644 --- a/src/boundary/boundary_p.cc +++ b/src/boundary/boundary_p.cc @@ -3,16 +3,34 @@ // If this is defined particle and mover buffers will not resize dynamically // (This is the common case for the users) -#define DISABLE_DYNAMIC_RESIZING +//#define DISABLE_DYNAMIC_RESIZING // FIXME: ARCHITECTURAL FLAW! CUSTOM BCS AND SHARED FACES CANNOT // COEXIST ON THE SAME FACE! THIS MEANS THAT CUSTOM BOUNDARYS MUST // REINJECT ALL ABSORBED PARTICLES IN THE SAME DOMAIN! + +// Updated by Scott V. Luedtke, XCP-6, December 6, 2018. +// The mover array is now resized along with the particle array. The mover +// array is filled during advance_p and is most likely to overflow there, not +// here. Both arrays will now resize down as well. +// 12/20/18: The mover array is no longer resized with the particle array, as +// this actually uses more RAM than having static mover arrays. The mover will +// still size up if there are too many incoming particles, but I have not +// encountered this. Some hard-to-understand bit shifts have been replaced with +// cleaner code that the compiler should have no trouble optimizing. +// Spits out lots of warnings. TODO: Remove warnings after testing. + #ifdef V4_ACCELERATION using namespace v4; #endif +#ifndef MIN_NP +#define MIN_NP 128 // Default to 4kb (~1 page worth of memory) +//#define MIN_NP 32768 // 32768 particles is 1 MiB of memory. 
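+// (Illustrative note: the #ifndef guard means this default can be
+// overridden at compile time, e.g. with something like -DMIN_NP=32768 on
+// the compiler command line; no build-system plumbing for that is implied
+// here.)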
+#endif + + enum { MAX_PBC = 32, MAX_SP = 32 }; void @@ -53,9 +71,9 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, species_t * sp; int face; - // Check input args + // Check input args - if( !sp_list ) return; // Nothing to do if no species + if( !sp_list ) return; // Nothing to do if no species if( !fa || !aa || sp_list->g!=aa->g || fa->g!=aa->g ) ERROR(( "Bad args" )); @@ -92,7 +110,7 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, bc[face] = g->bc[f2b[face]]; shared[face] = (bc[face]>=0) && (bc[face]range[bc[face]]; + if( shared[face] ) range[face] = g->range[bc[face]]; } // Begin receiving the particle counts @@ -104,7 +122,7 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, } // Load the particle send and local injection buffers - + do { particle_injector_t * RESTRICT ALIGNED(16) pi_send[6]; @@ -131,7 +149,7 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, // be satisfied (if the handlers conform that it). We should be // more flexible though in the future (especially given above the // above overalloc). - + int nm = 0; LIST_FOR_EACH( sp, sp_list ) nm += sp->nm; for( face=0; face<6; face++ ) @@ -165,7 +183,7 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, particle_injector_t * RESTRICT ALIGNED(16) pi; int i, voxel; int64_t nn; - + // Note that particle movers for each species are processed in // reverse order. This allows us to backfill holes in the // particle list created by boundary conditions and/or @@ -182,7 +200,7 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, voxel >>= 3; p0[i].i = voxel; nn = neighbor[ 6*voxel + face ]; - + // Absorb if( nn==absorb_particles ) { @@ -202,11 +220,9 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, copy_4x1( &pi->dispx, &pm->dispx ); # else pi->dx=p0[i].dx; pi->dy=p0[i].dy; pi->dz=p0[i].dz; - pi->i =nn - range[face]; pi->ux=p0[i].ux; pi->uy=p0[i].uy; pi->uz=p0[i].uz; pi->w=p0[i].w; pi->dispx = pm->dispx; pi->dispy = pm->dispy; pi->dispz = pm->dispz; - pi->sp_id = sp_id; -# endif +# endif (&pi->dx)[axis[face]] = dir[face]; pi->i = nn - range[face]; pi->sp_id = sp_id; @@ -254,7 +270,7 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, # endif } - + sp->np = np; sp->nm = 0; } @@ -263,7 +279,7 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, // Finish exchanging particle counts and start exchanging actual // particles. - + // Note: This is wasteful of communications. A better protocol // would fuse the exchange of the counts with the exchange of the // messages. in a slightly more complex protocol. However, the MP @@ -272,7 +288,7 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, // prohibits such (specifically, in both, you can't do the // equilvanet of a MPI_Getcount to determine how much data you // actually received. - + for( face=0; face<6; face++ ) if( shared[face] ) { *((int *)mp_send_buffer( mp, f2b[face] )) = n_send[face]; @@ -303,13 +319,13 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, // Resize particle storage to accomodate worst case inject do { - int n; - + int n, nm; + // Resize each species's particle and mover storage to be large // enough to guarantee successful injection. (If we broke down // the n_recv[face] by species before sending it, we could be // tighter on memory footprint here.) 
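// Worked illustration of the growth policy used below (numbers are
// hypothetical): if a species needs n = 1,000,000 slots and that exceeds
// max_np, the buffer is grown to n + 0.3125*n = 1,312,500 slots, i.e. the
// same ~31.25% "silver ratio" overallocation that the old
// n + (n>>2) + (n>>4) bit-shift form produced.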
- + int max_inj = n_ci; for( face=0; face<6; face++ ) if( shared[face] ) max_inj += n_recv[face]; @@ -317,31 +333,73 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, LIST_FOR_EACH( sp, sp_list ) { particle_mover_t * new_pm; particle_t * new_p; - + n = sp->np + max_inj; if( n>sp->max_np ) { - n = n + (n>>2) + (n>>4); // Increase by 31.25% (~<"silver + n += 0.3125*n; // Increase by 31.25% (~<"silver /**/ // ratio") to minimize resizes (max /**/ // rate that avoids excessive heap /**/ // fragmentation) + //float resize_ratio = (float)n/sp->max_np; WARNING(( "Resizing local %s particle storage from %i to %i", sp->name, sp->max_np, n )); MALLOC_ALIGNED( new_p, n, 128 ); COPY( new_p, sp->p, sp->np ); FREE_ALIGNED( sp->p ); sp->p = new_p, sp->max_np = n; - } - - n = sp->nm + max_inj; - if( n>sp->max_nm ) { - n = n + (n>>2) + (n>>4); // See note above + + /*nm = sp->max_nm * resize_ratio; WARNING(( "Resizing local %s mover storage from %i to %i", - sp->name, sp->max_nm, n )); - MALLOC_ALIGNED( new_pm, n, 128 ); + sp->name, sp->max_nm, nm )); + MALLOC_ALIGNED( new_pm, nm, 128 ); COPY( new_pm, sp->pm, sp->nm ); FREE_ALIGNED( sp->pm ); sp->pm = new_pm; - sp->max_nm = n; + sp->max_nm = nm;*/ + } + else if(sp->max_np > MIN_NP && n < sp->max_np>>1) + { + n += 0.125*n; // Overallocate by less since this rank is decreasing + if (nmax_np; + WARNING(( "Resizing (shrinking) local %s particle storage from " + "%i to %i", sp->name, sp->max_np, n)); + MALLOC_ALIGNED( new_p, n, 128 ); + COPY( new_p, sp->p, sp->np ); + FREE_ALIGNED( sp->p ); + sp->p = new_p, sp->max_np = n; + + /*nm = sp->max_nm * resize_ratio; + WARNING(( "Resizing (shrinking) local %s mover storage from " + "%i to %i", sp->name, sp->max_nm, nm)); + MALLOC_ALIGNED( new_pm, nm, 128 ); + COPY( new_pm, sp->pm, sp->nm ); + FREE_ALIGNED( sp->pm ); + sp->pm = new_pm, sp->max_nm = nm;*/ + } + + // Feasibly, a vacuum-filled rank may receive a shock and need more movers + // than available from MIN_NP + nm = sp->nm + max_inj; + if( nm>sp->max_nm ) { + nm += 0.3125*nm; // See note above + //float resize_ratio = (float)nm/sp->max_nm; + WARNING(( "This happened. 
Resizing local %s mover storage from " + "%i to %i based on not enough movers", + sp->name, sp->max_nm, nm )); + MALLOC_ALIGNED( new_pm, nm, 128 ); + COPY( new_pm, sp->pm, sp->nm ); + FREE_ALIGNED( sp->pm ); + sp->pm = new_pm; + sp->max_nm = nm; + + /*n = sp->max_np * resize_ratio; + WARNING(( "Resizing local %s particle storage from %i to %i", + sp->name, sp->max_np, n )); + MALLOC_ALIGNED( new_p, n, 128 ); + COPY( new_p, sp->p, sp->np ); + FREE_ALIGNED( sp->p ); + sp->p = new_p, sp->max_np = n;*/ } } } while(0); @@ -362,7 +420,7 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, int sp_max_nm[64], n_dropped_movers[64]; # endif - if( num_species( sp_list ) > MAX_SP ) + if( num_species( sp_list ) > MAX_SP ) ERROR(( "Update this to support more species" )); LIST_FOR_EACH( sp, sp_list ) { sp_p[ sp->id ] = sp->p; @@ -385,7 +443,7 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, /**/ particle_mover_t * RESTRICT ALIGNED(16) pm; const particle_injector_t * RESTRICT ALIGNED(16) pi; int np, nm, n, id; - + face++; if( face==7 ) face = 0; if( face==6 ) pi = ci, n = n_ci; else if( shared[face] ) { @@ -394,7 +452,7 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, (((char *)mp_recv_buffer(mp,f2b[face]))+16); n = n_recv[face]; } else continue; - + // Reverse order injection is done to reduce thrashing of the // particle list (particles are removed reverse order so the // overall impact of removal + injection is to keep injected @@ -402,7 +460,7 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, // // WARNING: THIS TRUSTS THAT THE INJECTORS (INCLUDING THOSE // RECEIVED FROM OTHER NODES) HAVE VALID PARTICLE IDS. - + pi += n-1; for( ; n; pi--, n-- ) { id = pi->sp_id; @@ -434,7 +492,7 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, sp_nm[id] = nm + move_p( p, pm+nm, a0, g, sp_q[id] ); } } while(face!=5); - + LIST_FOR_EACH( sp, sp_list ) { # ifdef DISABLE_DYNAMIC_RESIZING if( n_dropped_particles[sp->id] ) @@ -454,7 +512,7 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, } } while(0); - + for( face=0; face<6; face++ ) if( shared[face] ) mp_end_send(mp,f2b[face]); } diff --git a/src/collision/binary.c b/src/collision/binary.c index 830f0748..2fb761fc 100644 --- a/src/collision/binary.c +++ b/src/collision/binary.c @@ -1,187 +1,44 @@ #define IN_collision -/*#define HAS_V4_PIPELINE*/ -#include "collision_private.h" + +#include "binary.h" /* FIXME: ADD SAMPLE TO UNARY */ /* Private interface *********************************************************/ -typedef struct binary_collision_model { - char * name; - binary_rate_constant_func_t rate_constant; - binary_collision_func_t collision; - void * params; - species_t * spi; - species_t * spj; - rng_pool_t * rp; - double sample; - int interval; - int n_large_pr[ MAX_PIPELINE ]; -} binary_collision_model_t; +//----------------------------------------------------------------------------// +// Top level function to select and call proper apply_binary_collision_model +// function. 
+//----------------------------------------------------------------------------// void -binary_pipeline( binary_collision_model_t * RESTRICT cm, - int pipeline_rank, - int n_pipeline ) { - if( pipeline_rank==n_pipeline ) return; /* No host straggler cleanup */ - - binary_rate_constant_func_t rate_constant = cm->rate_constant; - binary_collision_func_t collision = cm->collision; - - /**/ void * RESTRICT params = cm->params; - /**/ species_t * RESTRICT spi = cm->spi; - /**/ species_t * RESTRICT spj = cm->spj; - /**/ rng_t * RESTRICT rng = cm->rp->rng[ pipeline_rank ]; - - /**/ particle_t * RESTRICT spi_p = spi->p; - const int * RESTRICT spi_partition = spi->partition; - const grid_t * RESTRICT g = spi->g; - - /**/ particle_t * RESTRICT spj_p = spj->p; - const int * RESTRICT spj_partition = spj->partition; - - const double sample = (spi_p==spj_p ? 0.5 : 1)*cm->sample; - const float dtinterval_dV = ( g->dt * (float)cm->interval ) / g->dV; - - float pr_norm, pr_coll, wk, wl, w_max, w_min; - int v, v1, k, k0, nk, rk, l, l0, nl, rl, np, nc, type, n_large_pr = 0; - - /* Stripe the (mostly non-ghost) voxels over threads for load balance */ - - v = VOXEL( 0,0,0, g->nx,g->ny,g->nz ) + pipeline_rank; - v1 = VOXEL( g->nx,g->ny,g->nz, g->nx,g->ny,g->nz ) + 1; - for( ; v> 1; - nc = (int)( 0.5 + sample*(double)nk ); - - } else { - - /* For interspecies collisions: - np = nk nl - and: - nc = round( sample max( nk, nl ) ) - such that, for sample==1, on average every particle is thest - for collision at least once. */ - - l0 = spj_partition[v ]; - nl = spj_partition[v+1] - l0; - if( !nl ) continue; /* Nothing to do */ - rl = UINT_MAX / (unsigned)nl; - np = nk*nl; - nc = (int)( 0.5 + sample*(double)(nk>nl ? nk : nl) ); - - } - - /* Determine the collision rate to probability normalization: - pr_norm = ( dt interval np ) / ( dV nc ) */ - - pr_norm = dtinterval_dV*((float)np / (float)nc); - - /* For each candidate pair */ - - for( ; nc; nc-- ) { - - /* Pick a pair of computational particles uniformly at random - from all pairs of particles in the voxel. Note that the - while test virtually always fails (this manner of - splitting up the nk, nl guarantees a uniform prob - of getting k on 0:nk-1 and l on 0:nl-1 and uses the - preferred high order randgen bits). */ - - do { k = (int)(uirand(rng)/rk); } while( k==nk ); k += k0; - do { l = (int)(uirand(rng)/rl); } while( l==nl ); l += l0; - - /* Compute the probability that a physical particle in the - species whose candidate computational particle has the least - weight (and comoving with this computational particle) will - collide off a beam of physical particles of the density and - momentum of the computational particle in the other species. - If this probability is bigger than one, make a note for - diagnostic use. */ - - wk = spi_p[k].w; - wl = spj_p[l].w; - w_max = (wk>wl) ? wk : wl; - pr_coll = w_max * pr_norm * - rate_constant( params, spi, spj, &spi_p[k], &spj_p[l] ); - if( pr_coll>1 ) n_large_pr++; - - /* Yes, >= so that 0 rate constants guarantee no collision and - yes, _c0, so that 1 probabilities guarantee a collision */ - if( frand_c0(rng)>=pr_coll ) continue; /* Didn't collide */ - - /* k and l had a collision. Determine which computational - particles should be updated by the collision process. - We should always update the particle of least weight. - The other particle should be updated with probability of - w_min / w_max, such that, on average, detailed balance is - preserved. */ - - w_min = (wk>wl) ? 
wl : wk; - type = 1; if( wl==w_min ) type++; - if( w_max==w_min || w_max*frand_c0(rng)interval < 1 || ( cm->spi->g->step % cm->interval ) ) + { + return; + } - } + if ( cm->spi->last_sorted != cm->spi->g->step ) + { + sort_p( cm->spi ); } - cm->n_large_pr[pipeline_rank] = n_large_pr; -} + if ( cm->spj->last_sorted != cm->spi->g->step ) + { + sort_p( cm->spj ); + } -void -apply_binary_collision_model( binary_collision_model_t * cm ) { - int p, n_large_pr = 0; - if( cm->interval<1 || (cm->spi->g->step % cm->interval) ) return; - if( cm->spi->last_sorted!=cm->spi->g->step ) sort_p( cm->spi ); - if( cm->spj->last_sorted!=cm->spi->g->step ) sort_p( cm->spj ); - EXEC_PIPELINES( binary, cm, 0 ); - WAIT_PIPELINES(); - for( p=0; pn_large_pr[p]; - if( n_large_pr ) - WARNING(( "%i computational particle pairs between species \"%s\" and " - "species \"%s\" encountered a large collision probability in " - "collision model \"%s\". The collision rate for such pairs " - "will be lower than it should be physically. Consider lowering " - "the collision operator interval, increasing the sampling or " - "reducing the timestep.", - n_large_pr, cm->spi->name, cm->spj->name, cm->name )); + // Conditionally execute this when more abstractions are available. + apply_binary_collision_model_pipeline( cm ); } void -checkpt_binary_collision_model( const collision_op_t * cop ) { +checkpt_binary_collision_model( const collision_op_t * cop ) +{ const binary_collision_model_t * cm = - (const binary_collision_model_t *)cop->params; + ( const binary_collision_model_t * ) cop->params; + CHECKPT( cm, 1 ); CHECKPT_STR( cm->name ); CHECKPT_SYM( cm->rate_constant ); @@ -190,12 +47,15 @@ checkpt_binary_collision_model( const collision_op_t * cop ) { CHECKPT_PTR( cm->spi ); CHECKPT_PTR( cm->spj ); CHECKPT_PTR( cm->rp ); + checkpt_collision_op_internal( cop ); } collision_op_t * -restore_binary_collision_model( void ) { +restore_binary_collision_model( void ) +{ binary_collision_model_t * cm; + RESTORE( cm ); RESTORE_STR( cm->name ); RESTORE_SYM( cm->rate_constant ); @@ -204,41 +64,66 @@ restore_binary_collision_model( void ) { RESTORE_PTR( cm->spi ); RESTORE_PTR( cm->spj ); RESTORE_PTR( cm->rp ); + return restore_collision_op_internal( cm ); } void -delete_binary_collision_model( collision_op_t * cop ) { - binary_collision_model_t * cm = (binary_collision_model_t *)cop->params; +delete_binary_collision_model( collision_op_t * cop ) +{ + binary_collision_model_t * cm + = (binary_collision_model_t *) cop->params; + FREE( cm->name ); FREE( cm ); + delete_collision_op_internal( cop ); } /* Public interface **********************************************************/ collision_op_t * -binary_collision_model( const char * RESTRICT name, +binary_collision_model( const char * RESTRICT name, binary_rate_constant_func_t rate_constant, - binary_collision_func_t collision, - /**/ void * RESTRICT params, - /**/ species_t * RESTRICT spi, - /**/ species_t * RESTRICT spj, - /**/ rng_pool_t * RESTRICT rp, - double sample, - int interval ) { + binary_collision_func_t collision, + void * RESTRICT params, + species_t * RESTRICT spi, + species_t * RESTRICT spj, + rng_pool_t * RESTRICT rp, + double sample, + int interval ) +{ binary_collision_model_t * cm; + size_t len = name ? 
strlen(name) : 0; - if( !rate_constant || !collision || !spi || !spj || spi->g!=spj->g || - !rp || rp->n_rngg != spj->g || + !rp || + rp->n_rng < N_PIPELINE ) + { + ERROR( ( "Bad args" ) ); + } + + if ( len == 0 ) + { + ERROR( ( "Cannot specify a nameless collision model" ) ); + } + + if ( params && + !object_id( params ) ) + { + ERROR( ( "collision model parameters must be checkpoint registered" ) ); + } MALLOC( cm, 1 ); MALLOC( cm->name, len+1 ); + strcpy( cm->name, name ); + cm->rate_constant = rate_constant; cm->collision = collision; cm->params = params; @@ -247,11 +132,11 @@ binary_collision_model( const char * RESTRICT name, cm->rp = rp; cm->sample = sample; cm->interval = interval; + return new_collision_op_internal( cm, - (collision_op_func_t)apply_binary_collision_model, + ( collision_op_func_t ) apply_binary_collision_model, delete_binary_collision_model, - (checkpt_func_t)checkpt_binary_collision_model, - (restore_func_t)restore_binary_collision_model, + ( checkpt_func_t ) checkpt_binary_collision_model, + ( restore_func_t ) restore_binary_collision_model, NULL ); } - diff --git a/src/collision/binary.h b/src/collision/binary.h new file mode 100644 index 00000000..aa16dd37 --- /dev/null +++ b/src/collision/binary.h @@ -0,0 +1,23 @@ +#ifndef _binary_h_ +#define _binary_h_ + +#include "collision_private.h" + +typedef struct binary_collision_model +{ + char * name; + binary_rate_constant_func_t rate_constant; + binary_collision_func_t collision; + void * params; + species_t * spi; + species_t * spj; + rng_pool_t * rp; + double sample; + int interval; + int n_large_pr[ MAX_PIPELINE ]; +} binary_collision_model_t; + +void +apply_binary_collision_model_pipeline( binary_collision_model_t * cm ); + +#endif /* _binary_h_ */ diff --git a/src/collision/collision_private.h b/src/collision/collision_private.h index ea7dfd7e..52d6c8e4 100644 --- a/src/collision/collision_private.h +++ b/src/collision/collision_private.h @@ -53,6 +53,6 @@ typedef struct langevin_pipeline_args { PAD_STRUCT( (1+MAX_PIPELINE)*SIZEOF_MEM_PTR+2*sizeof(float)+sizeof(int) ) } langevin_pipeline_args_t; -PROTOTYPE_PIPELINE( langevin, langevin_pipeline_args_t ); +// PROTOTYPE_PIPELINE( langevin, langevin_pipeline_args_t ); #endif /* _collision_h_ */ diff --git a/src/collision/langevin.c b/src/collision/langevin.c index c9d2a755..f7bf2a18 100644 --- a/src/collision/langevin.c +++ b/src/collision/langevin.c @@ -1,150 +1,89 @@ #define IN_collision -#include "collision_private.h" + +#include "langevin.h" /* Private interface *********************************************************/ -typedef struct langevin { - species_t * sp; - rng_pool_t * rp; - float kT; - float nu; - int interval; -} langevin_t; +//----------------------------------------------------------------------------// +// Top level function to select and call the proper apply_langevin function. 
+//----------------------------------------------------------------------------// void -langevin_pipeline( langevin_pipeline_args_t * RESTRICT args, - int pipeline_rank, - int n_pipeline ) { - if( pipeline_rank==n_pipeline ) return; /* No host straggler cleanup */ - - particle_t * RESTRICT p = args->p; - rng_t * RESTRICT rng = args->rng[ pipeline_rank ]; - float decay = args->decay; - float drive = args->drive; - - double n_target = (double)args->np / (double)n_pipeline; - /**/ int i = (int)( 0.5 + n_target*(double) pipeline_rank ); - const int i1 = (int)( 0.5 + n_target*(double)(pipeline_rank+1) ); - - for( ; iinterval < 1 || + ( l->sp->g->step % l->interval ) ) + { + return; } -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) - -#error "V4 pipeline not implemented" -#endif - -void -apply_langevin( langevin_t * l ) { - if( l->interval<1 || (l->sp->g->step % l->interval) ) return; - - /* Decay and drive have a fun derivation. We want to integrate the - stochastic equation: - du = -nu u dt + sqrt( 2 kT / mc ) dW - For small dt, this is: - u_1 = u_0 ( 1- nu dt ) + RANDN( 2 kT nu dt / mc ) - where RANDN( var ) is a normal random number with _variance_ var. - Let: - a = nu dt - b = 2 kT nu dt / mc - Then: - u_1 = (1-a) u_0 + RANDN( b ) - We can get more accurate by making N substeps of length dt/N. - Then: - u_{n+1} = (1-a/N) u_n + RANDN( b/N ) - such that: - u_N = (1-a/N)^N u_0 + sum_{n=0:N-1} (1-a/N)^n RANDN( b/N ) - Noting that sum of N normal random numbers is a normal random - number whose variance is the sum of the N variances, we have: - u_N = (1-a/N)^N u_0 + RANDN( sum_{n=0:N-1} (1-a/N)^{2n} b/N ) - Analytically summing the variances yields: - u_N = (1-a/N)^N u_0 + RANDN( [1-(1-a/N)^{2N} / ( 1-(1-a/N)^2 )] b/N ) - In the continuum limit (N goes to infinity): - u_N = decay u_0 + RANDN( drive^2 ) - or: - u_N = decay u_0 + drive RANDN( 1 ) - where: - decay = lim (1-a/N)^N = exp(-a) - drive^2 = lim variance sum - = [(1-exp(-2a) b] / [(1 - 1 + 2a/N) N] - = ( 1-exp(-2a) ) b / (2a) - subtituting a and b into decay and drive yields: - decay = exp(-nu dt) - drive = sqrt( (1-exp(-2 nu dt)) kT / mc ) - In the limit nu dt small: - decay -> 1 - nu dt - drive -> sqrt( 2 nu dt kT / mc ) - reproducing the infinitesimal stochastic differential equation. - In the limit nu dt large: - decay -> 0 - drive -> sqrt( kT / mc ) - which is equivalent to resampling the momentum with the - desired temperature. */ - - float nudt = l->nu * (float)l->interval * l->sp->g->dt; - DECLARE_ALIGNED_ARRAY( langevin_pipeline_args_t, 128, args, 1 ); - args->p = l->sp->p; - COPY( args->rng, l->rp->rng, N_PIPELINE ); - args->decay = exp( -nudt ); - args->drive = sqrt(( -expm1(-2*nudt)*l->kT )/( l->sp->m*l->sp->g->cvac )); - args->np = l->sp->np; - EXEC_PIPELINES( langevin, args, 0 ); - WAIT_PIPELINES(); + // Conditionally execute this when more abstractions are available. 
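For reference, the decay/drive result derived in the comment above can be stated compactly. With \Delta t = \mathrm{interval} \cdot dt, each normalized momentum component is updated per application as

    u \;\leftarrow\; e^{-\nu \Delta t}\, u \;+\; \sqrt{ \frac{ \left( 1 - e^{-2\nu \Delta t} \right) kT }{ m c } }\; \mathcal{N}(0,1),

which reduces to u \leftarrow (1 - \nu \Delta t)\, u + \sqrt{ 2 \nu \Delta t\, kT/(mc) }\; \mathcal{N}(0,1) for \nu \Delta t \ll 1 and to a fresh thermal sample \sqrt{ kT/(mc) }\; \mathcal{N}(0,1) for \nu \Delta t \gg 1; these are exactly the decay and drive factors assembled for the pipelines.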
+ apply_langevin_pipeline( l ); } void -checkpt_langevin( const collision_op_t * cop ) { - const langevin_t * l = (const langevin_t *)cop->params; +checkpt_langevin( const collision_op_t * cop ) +{ + const langevin_t * l = ( const langevin_t * ) cop->params; + CHECKPT( l, 1 ); CHECKPT_PTR( l->sp ); CHECKPT_PTR( l->rp ); + checkpt_collision_op_internal( cop ); } collision_op_t * -restore_langevin( void ) { +restore_langevin( void ) +{ langevin_t * l; + RESTORE( l ); RESTORE_PTR( l->sp ); RESTORE_PTR( l->rp ); + return restore_collision_op_internal( l ); } void -delete_langevin( collision_op_t * cop ) { +delete_langevin( collision_op_t * cop ) +{ FREE( cop->params ); + delete_collision_op_internal( cop ); } /* Public interface **********************************************************/ collision_op_t * -langevin( float kT, - float nu, - species_t * RESTRICT sp, +langevin( float kT, + float nu, + species_t * RESTRICT sp, rng_pool_t * RESTRICT rp, - int interval ) { + int interval ) +{ langevin_t * l; - if( !sp || !rp || rp->n_rngsp = sp; l->rp = rp; l->kT = kT; l->nu = nu; l->interval = interval; + return new_collision_op_internal( l, - (collision_op_func_t)apply_langevin, + ( collision_op_func_t ) apply_langevin, delete_langevin, - (checkpt_func_t)checkpt_langevin, - (restore_func_t)restore_langevin, + ( checkpt_func_t ) checkpt_langevin, + ( restore_func_t ) restore_langevin, NULL ); } - diff --git a/src/collision/langevin.h b/src/collision/langevin.h new file mode 100644 index 00000000..c8dbbaf2 --- /dev/null +++ b/src/collision/langevin.h @@ -0,0 +1,18 @@ +#ifndef _langevin_h_ +#define _langevin_h_ + +#include "collision_private.h" + +typedef struct langevin +{ + species_t * sp; + rng_pool_t * rp; + float kT; + float nu; + int interval; +} langevin_t; + +void +apply_langevin_pipeline( langevin_t * l ); + +#endif /* _langevin_h_ */ diff --git a/src/collision/pipeline/binary_pipeline.c b/src/collision/pipeline/binary_pipeline.c new file mode 100644 index 00000000..e3595309 --- /dev/null +++ b/src/collision/pipeline/binary_pipeline.c @@ -0,0 +1,198 @@ +#define IN_collision + +/* #define HAS_V4_PIPELINE */ + +#include "collision_pipeline.h" + +#include "../binary.h" + +#include "../../util/pipelines/pipelines_exec.h" + +/* FIXME: ADD SAMPLE TO UNARY */ + +/* Private interface *********************************************************/ + +void +binary_pipeline_scalar( binary_collision_model_t * RESTRICT cm, + int pipeline_rank, + int n_pipeline ) +{ + if ( pipeline_rank == n_pipeline ) + { + return; /* No host straggler cleanup */ + } + + binary_rate_constant_func_t rate_constant = cm->rate_constant; + binary_collision_func_t collision = cm->collision; + + /**/ void * RESTRICT params = cm->params; + /**/ species_t * RESTRICT spi = cm->spi; + /**/ species_t * RESTRICT spj = cm->spj; + /**/ rng_t * RESTRICT rng = cm->rp->rng[ pipeline_rank ]; + + /**/ particle_t * RESTRICT spi_p = spi->p; + const int * RESTRICT spi_partition = spi->partition; + const grid_t * RESTRICT g = spi->g; + + /**/ particle_t * RESTRICT spj_p = spj->p; + const int * RESTRICT spj_partition = spj->partition; + + const double sample = (spi_p==spj_p ? 
0.5 : 1)*cm->sample; + const float dtinterval_dV = ( g->dt * (float)cm->interval ) / g->dV; + + float pr_norm, pr_coll, wk, wl, w_max, w_min; + int v, v1, k, k0, nk, rk, l, l0, nl, rl, np, nc, type, n_large_pr = 0; + + /* Stripe the (mostly non-ghost) voxels over threads for load balance */ + + v = VOXEL( 0,0,0, g->nx,g->ny,g->nz ) + pipeline_rank; + v1 = VOXEL( g->nx,g->ny,g->nz, g->nx,g->ny,g->nz ) + 1; + + for( ; v> 1; + nc = (int)( 0.5 + sample*(double)nk ); + } + + else + { + /* For interspecies collisions: + np = nk nl + and: + nc = round( sample max( nk, nl ) ) + such that, for sample==1, on average every particle is tested + for collision at least once. */ + + l0 = spj_partition[v ]; + nl = spj_partition[v+1] - l0; + if( !nl ) continue; /* Nothing to do */ + rl = UINT_MAX / (unsigned)nl; + np = nk*nl; + nc = (int)( 0.5 + sample*(double)(nk>nl ? nk : nl) ); + } + + /* Determine the collision rate to probability normalization: + pr_norm = ( dt interval np ) / ( dV nc ) */ + + pr_norm = dtinterval_dV*((float)np / (float)nc); + + /* For each candidate pair */ + + for( ; nc; nc-- ) + { + /* Pick a pair of computational particles uniformly at random + from all pairs of particles in the voxel. Note that the + while test virtually always fails (this manner of + splitting up the nk, nl guarantees a uniform prob + of getting k on 0:nk-1 and l on 0:nl-1 and uses the + preferred high order randgen bits). */ + + do { k = (int)(uirand(rng)/rk); } while( k==nk ); k += k0; + do { l = (int)(uirand(rng)/rl); } while( l==nl ); l += l0; + + /* Compute the probability that a physical particle in the + species whose candidate computational particle has the least + weight (and comoving with this computational particle) will + collide off a beam of physical particles of the density and + momentum of the computational particle in the other species. + If this probability is bigger than one, make a note for + diagnostic use. */ + + wk = spi_p[k].w; + wl = spj_p[l].w; + w_max = (wk>wl) ? wk : wl; + pr_coll = w_max * pr_norm * + rate_constant( params, spi, spj, &spi_p[k], &spj_p[l] ); + if( pr_coll>1 ) n_large_pr++; + + /* Yes, >= so that 0 rate constants guarantee no collision and + yes, _c0, so that 1 probabilities guarantee a collision */ + if( frand_c0(rng)>=pr_coll ) continue; /* Didn't collide */ + + /* k and l had a collision. Determine which computational + particles should be updated by the collision process. + We should always update the particle of least weight. + The other particle should be updated with probability of + w_min / w_max, such that, on average, detailed balance is + preserved. */ + + w_min = (wk>wl) ? wl : wk; + type = 1; if( wl==w_min ) type++; + if( w_max==w_min || w_max*frand_c0(rng)n_large_pr[pipeline_rank] = n_large_pr; +} + +void +apply_binary_collision_model_pipeline( binary_collision_model_t * cm ) +{ + int p, n_large_pr = 0; + + if ( cm->interval < 1 || ( cm->spi->g->step % cm->interval ) ) + { + return; + } + + if ( cm->spi->last_sorted != cm->spi->g->step ) + { + sort_p( cm->spi ); + } + + if ( cm->spj->last_sorted != cm->spi->g->step ) + { + sort_p( cm->spj ); + } + + EXEC_PIPELINES( binary, cm, 0 ); + + WAIT_PIPELINES(); + + for( p = 0; p < N_PIPELINE; p++ ) + { + n_large_pr += cm->n_large_pr[p]; + } + + if ( n_large_pr ) + { + WARNING( ( "%i computational particle pairs between species \"%s\" and " + "species \"%s\" encountered a large collision probability in " + "collision model \"%s\". The collision rate for such pairs " + "will be lower than it should be physically. 
Consider lowering " + "the collision operator interval, increasing the sampling or " + "reducing the timestep.", + n_large_pr, cm->spi->name, cm->spj->name, cm->name ) ); + } +} diff --git a/src/collision/pipeline/collision_pipeline.h b/src/collision/pipeline/collision_pipeline.h new file mode 100644 index 00000000..591e020b --- /dev/null +++ b/src/collision/pipeline/collision_pipeline.h @@ -0,0 +1,23 @@ +#ifndef _collision_pipeline_h_ +#define _collision_pipeline_h_ + +#include "../binary.h" +#include "../langevin.h" +#include "../unary.h" + +void +binary_pipeline_scalar( binary_collision_model_t * RESTRICT cm, + int pipeline_rank, + int n_pipeline ); + +void +langevin_pipeline_scalar( langevin_pipeline_args_t * RESTRICT args, + int pipeline_rank, + int n_pipeline ); + +void +unary_pipeline_scalar( unary_collision_model_t * RESTRICT cm, + int pipeline_rank, + int n_pipeline ); + +#endif /* _collision_pipeline_h_ */ diff --git a/src/collision/pipeline/langevin_pipeline.c b/src/collision/pipeline/langevin_pipeline.c new file mode 100644 index 00000000..a32748ca --- /dev/null +++ b/src/collision/pipeline/langevin_pipeline.c @@ -0,0 +1,112 @@ +#define IN_collision + +#include "collision_pipeline.h" + +#include "../langevin.h" + +#include "../../util/pipelines/pipelines_exec.h" + +/* Private interface *********************************************************/ + +void +langevin_pipeline_scalar( langevin_pipeline_args_t * RESTRICT args, + int pipeline_rank, + int n_pipeline ) +{ + if ( pipeline_rank == n_pipeline ) + { + return; /* No host straggler cleanup */ + } + + particle_t * RESTRICT p = args->p; + rng_t * RESTRICT rng = args->rng[ pipeline_rank ]; + float decay = args->decay; + float drive = args->drive; + + double n_target = (double)args->np / (double)n_pipeline; + + /**/ int i = (int)( 0.5 + n_target * (double) pipeline_rank ); + const int i1 = (int)( 0.5 + n_target * (double) (pipeline_rank+1) ); + + for( ; i < i1; i++ ) + { + p[i].ux = decay * p[i].ux + drive * frandn(rng); + p[i].uy = decay * p[i].uy + drive * frandn(rng); + p[i].uz = decay * p[i].uz + drive * frandn(rng); + } +} + +#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) + +#error "V4 pipeline not implemented" + +#endif + +void +apply_langevin_pipeline( langevin_t * l ) +{ + if ( l->interval < 1 || + ( l->sp->g->step % l->interval ) ) + { + return; + } + + /* Decay and drive have a fun derivation. We want to integrate the + stochastic equation: + du = -nu u dt + sqrt( 2 kT / mc ) dW + For small dt, this is: + u_1 = u_0 ( 1- nu dt ) + RANDN( 2 kT nu dt / mc ) + where RANDN( var ) is a normal random number with _variance_ var. + Let: + a = nu dt + b = 2 kT nu dt / mc + Then: + u_1 = (1-a) u_0 + RANDN( b ) + We can get more accurate by making N substeps of length dt/N. 
+ Then: + u_{n+1} = (1-a/N) u_n + RANDN( b/N ) + such that: + u_N = (1-a/N)^N u_0 + sum_{n=0:N-1} (1-a/N)^n RANDN( b/N ) + Noting that sum of N normal random numbers is a normal random + number whose variance is the sum of the N variances, we have: + u_N = (1-a/N)^N u_0 + RANDN( sum_{n=0:N-1} (1-a/N)^{2n} b/N ) + Analytically summing the variances yields: + u_N = (1-a/N)^N u_0 + RANDN( [1-(1-a/N)^{2N} / ( 1-(1-a/N)^2 )] b/N ) + In the continuum limit (N goes to infinity): + u_N = decay u_0 + RANDN( drive^2 ) + or: + u_N = decay u_0 + drive RANDN( 1 ) + where: + decay = lim (1-a/N)^N = exp(-a) + drive^2 = lim variance sum + = [(1-exp(-2a) b] / [(1 - 1 + 2a/N) N] + = ( 1-exp(-2a) ) b / (2a) + subtituting a and b into decay and drive yields: + decay = exp(-nu dt) + drive = sqrt( (1-exp(-2 nu dt)) kT / mc ) + In the limit nu dt small: + decay -> 1 - nu dt + drive -> sqrt( 2 nu dt kT / mc ) + reproducing the infinitesimal stochastic differential equation. + In the limit nu dt large: + decay -> 0 + drive -> sqrt( kT / mc ) + which is equivalent to resampling the momentum with the + desired temperature. */ + + float nudt = l->nu * (float) l->interval * l->sp->g->dt; + + DECLARE_ALIGNED_ARRAY( langevin_pipeline_args_t, 128, args, 1 ); + + args->p = l->sp->p; + + COPY( args->rng, l->rp->rng, N_PIPELINE ); + + args->decay = exp( -nudt ); + args->drive = sqrt( ( -expm1( -2 * nudt ) * l->kT ) / ( l->sp->m * l->sp->g->cvac ) ); + args->np = l->sp->np; + + EXEC_PIPELINES( langevin, args, 0 ); + + WAIT_PIPELINES(); +} diff --git a/src/collision/pipeline/unary_pipeline.c b/src/collision/pipeline/unary_pipeline.c new file mode 100644 index 00000000..f2a7ced4 --- /dev/null +++ b/src/collision/pipeline/unary_pipeline.c @@ -0,0 +1,95 @@ +#define IN_collision + +/* #define HAS_V4_PIPELINE */ + +#include "collision_pipeline.h" + +#include "../unary.h" + +#include "../../util/pipelines/pipelines_exec.h" + +/* Private interface *********************************************************/ + +void +unary_pipeline_scalar( unary_collision_model_t * RESTRICT cm, + int pipeline_rank, + int n_pipeline ) +{ + if ( pipeline_rank == n_pipeline ) + { + return; /* No host straggler cleanup */ + } + + unary_rate_constant_func_t rate_constant = cm->rate_constant; + unary_collision_func_t collision = cm->collision; + + /**/ void * RESTRICT params = cm->params; + const species_t * RESTRICT sp = cm->sp; + /**/ particle_t * RESTRICT p = cm->sp->p; + /**/ rng_t * RESTRICT rng = cm->rp->rng[ pipeline_rank ]; + + const float dt = sp->g->dt * (float) cm->interval; + + double n_target = (double) sp->np / (double) n_pipeline; + + /**/ int i = (int) ( 0.5 + n_target * (double) pipeline_rank ); + const int i1 = (int) ( 0.5 + n_target * (double) (pipeline_rank+1) ); + + float pr_coll; + int n_large_pr = 0; + + /* For each computational particle assigned to this pipeline, compute + the probability a comoving physical particle had collision with + the background. If this "probability" is greater than one, make + a note for diagnostic purposes. Then flip a bias coin of that + probability to decide if this particle should undergo a collision. 
*/ + + for( ; i < i1; i++ ) + { + pr_coll = dt * rate_constant( params, sp, &p[i] ); + + if ( pr_coll > 1 ) + { + n_large_pr++; + } + + /* Yes, strictly < (so that 0 rate constants guarantee no collision, + and, yes, _c0, so that 1 probabilities guarantee a collision */ + if ( frand_c0( rng ) < pr_coll ) + { + collision( params, sp, &p[i], rng ); + } + } + + cm->n_large_pr[ pipeline_rank ] = n_large_pr; +} + +void +apply_unary_collision_model_pipeline( unary_collision_model_t * cm ) +{ + int p, n_large_pr = 0; + + if ( cm->interval < 1 || + ( cm->sp->g->step % cm->interval ) ) + { + return; + } + + EXEC_PIPELINES( unary, cm, 0 ); + + WAIT_PIPELINES(); + + for( p = 0; p < N_PIPELINE; p++ ) + { + n_large_pr += cm->n_large_pr[p]; + } + + if ( n_large_pr ) + { + WARNING( ( "%i particles in species \"%s\" encountered a large collision " + "probability in collision model \"%s\". The collision rate for " + "such particles will be lower than it should be physically. " + "Consider lowering the collision operator interval or reducing " + "the timestep.", n_large_pr, cm->sp->name, cm->name ) ); + } +} diff --git a/src/collision/unary.c b/src/collision/unary.c index 05be0834..b0964a79 100644 --- a/src/collision/unary.c +++ b/src/collision/unary.c @@ -1,79 +1,33 @@ #define IN_collision -/*#define HAS_V4_PIPELINE*/ -#include "collision_private.h" + +#include "unary.h" /* Private interface *********************************************************/ -typedef struct unary_collision_model { - char * name; - unary_rate_constant_func_t rate_constant; - unary_collision_func_t collision; - void * params; - species_t * sp; - rng_pool_t * rp; - int interval; - int n_large_pr[ MAX_PIPELINE ]; -} unary_collision_model_t; +//----------------------------------------------------------------------------// +// Top level function to select and call proper apply_unary_collision_model +// function. +//----------------------------------------------------------------------------// void -unary_pipeline( unary_collision_model_t * RESTRICT cm, - int pipeline_rank, - int n_pipeline ) { - if( pipeline_rank==n_pipeline ) return; /* No host straggler cleanup */ - - unary_rate_constant_func_t rate_constant = cm->rate_constant; - unary_collision_func_t collision = cm->collision; - - /**/ void * RESTRICT params = cm->params; - const species_t * RESTRICT sp = cm->sp; - /**/ particle_t * RESTRICT p = cm->sp->p; - /**/ rng_t * RESTRICT rng = cm->rp->rng[ pipeline_rank ]; - const float dt = sp->g->dt * (float)cm->interval; - - double n_target = (double)sp->np / (double)n_pipeline; - /**/ int i = (int)( 0.5 + n_target*(double) pipeline_rank ); - const int i1 = (int)( 0.5 + n_target*(double)(pipeline_rank+1) ); - - float pr_coll; - int n_large_pr = 0; - - /* For each computational particle assigned to this pipeline, compute - the probability a comoving physical particle had collision with - the background. If this "probability" is greater than one, make - a note for diagnostic purposes. Then flip a bias coin of that - probability to decide if this particle should undergo a collision. 
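Written out, the acceptance tests used by the binary and unary pipelines above are the following, with \Delta t_c = \mathrm{interval} \cdot dt the collision-operator time step. In the binary model, a voxel of volume dV holding n_k particles of species i and n_l of species j contains n_p = n_k n_l physical pairs between species (n_k (n_k - 1)/2 within a species, where the sample factor is also halved), and n_c = \mathrm{round}( \mathrm{sample} \cdot \max(n_k, n_l) ) candidate pairs are drawn (\mathrm{round}( \mathrm{sample} \cdot n_k ) intraspecies). A candidate pair (k,l) collides with probability

    P_{kl} \;=\; \max( w_k, w_l )\; K_{kl}\; \frac{ \Delta t_c\, n_p }{ dV\, n_c },

where K_{kl} is the value returned by rate_constant (effectively a volumetric \langle \sigma v \rangle-type rate, so P_{kl} is dimensionless) and w is the computational weight. The pair itself is drawn uniformly by mapping 32-bit deviates through r_k = \mathrm{UINT\_MAX}/n_k and rejecting the rare outcome k = n_k. The unary test is simply P_i = \Delta t_c\, K(p_i) against a uniform deviate on [0,1), so a zero rate never collides and P_i \ge 1 always does; in both models any P > 1 is tallied per pipeline and reported through the WARNINGs above.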
*/ - - for( ; i1 ) n_large_pr++; - - /* Yes, strictly < (so that 0 rate constants guarantee no collision, - and, yes, _c0, so that 1 probabilities guarantee a collision */ - if( frand_c0(rng) < pr_coll ) collision( params, sp, &p[i], rng ); +apply_unary_collision_model( unary_collision_model_t * cm ) +{ + if ( cm->interval < 1 || + ( cm->sp->g->step % cm->interval ) ) + { + return; } - cm->n_large_pr[pipeline_rank] = n_large_pr; + // Conditionally execute this when more abstractions are available. + apply_unary_collision_model_pipeline( cm ); } void -apply_unary_collision_model( unary_collision_model_t * cm ) { - int p, n_large_pr = 0; - if( cm->interval<1 || (cm->sp->g->step % cm->interval) ) return; - EXEC_PIPELINES( unary, cm, 0 ); - WAIT_PIPELINES(); - for( p=0; pn_large_pr[p]; - if( n_large_pr ) - WARNING(( "%i particles in species \"%s\" encountered a large collision " - "probability in collision model \"%s\". The collision rate for " - "such particles will be lower than it should be physically. " - "Consider lowering the collision operator interval or reducing " - "the timestep.", n_large_pr, cm->sp->name, cm->name )); -} - -void -checkpt_unary_collision_model( const collision_op_t * cop ) { +checkpt_unary_collision_model( const collision_op_t * cop ) +{ const unary_collision_model_t * cm = - (const unary_collision_model_t *)cop->params; + ( const unary_collision_model_t * ) cop->params; + CHECKPT( cm, 1 ); CHECKPT_STR( cm->name ); CHECKPT_SYM( cm->rate_constant ); @@ -81,12 +35,15 @@ checkpt_unary_collision_model( const collision_op_t * cop ) { CHECKPT_PTR( cm->params ); CHECKPT_PTR( cm->sp ); CHECKPT_PTR( cm->rp ); + checkpt_collision_op_internal( cop ); } collision_op_t * -restore_unary_collision_model( void ) { +restore_unary_collision_model( void ) +{ unary_collision_model_t * cm; + RESTORE( cm ); RESTORE_STR( cm->name ); RESTORE_SYM( cm->rate_constant ); @@ -94,50 +51,72 @@ restore_unary_collision_model( void ) { RESTORE_PTR( cm->params ); RESTORE_PTR( cm->sp ); RESTORE_PTR( cm->rp ); + return restore_collision_op_internal( cm ); } void -delete_unary_collision_model( collision_op_t * cop ) { - unary_collision_model_t * cm = (unary_collision_model_t *)cop->params; +delete_unary_collision_model( collision_op_t * cop ) +{ + unary_collision_model_t * cm = ( unary_collision_model_t * ) cop->params; + FREE( cm->name ); FREE( cm ); + delete_collision_op_internal( cop ); } /* Public interface **********************************************************/ collision_op_t * -unary_collision_model( const char * RESTRICT name, - unary_rate_constant_func_t rate_constant, - unary_collision_func_t collision, - /**/ void * RESTRICT params, - /**/ species_t * RESTRICT sp, - /**/ rng_pool_t * RESTRICT rp, - int interval ) { +unary_collision_model( const char * RESTRICT name, + unary_rate_constant_func_t rate_constant, + unary_collision_func_t collision, + void * RESTRICT params, + species_t * RESTRICT sp, + rng_pool_t * RESTRICT rp, + int interval ) +{ unary_collision_model_t * cm; + size_t len = name ? strlen(name) : 0; - if( !rate_constant || !collision || !sp || !rp || rp->n_rngn_rng < N_PIPELINE ) + { + ERROR( ( "Bad args" ) ); + } + + if ( !len ) + { + ERROR( ( "Cannot specify a nameless collision model." ) ); + } + + if ( params && + !object_id( params ) ) + { + ERROR( ( "collision model parameters must be checkpoint registered." 
) ); + } MALLOC( cm, 1 ); MALLOC( cm->name, len+1 ); + strcpy( cm->name, name ); + cm->rate_constant = rate_constant; cm->collision = collision; cm->params = params; cm->sp = sp; cm->rp = rp; cm->interval = interval; + return new_collision_op_internal( cm, - (collision_op_func_t)apply_unary_collision_model, + ( collision_op_func_t ) apply_unary_collision_model, delete_unary_collision_model, - (checkpt_func_t)checkpt_unary_collision_model, - (restore_func_t)restore_unary_collision_model, + ( checkpt_func_t ) checkpt_unary_collision_model, + ( restore_func_t ) restore_unary_collision_model, NULL ); } - diff --git a/src/collision/unary.h b/src/collision/unary.h new file mode 100644 index 00000000..c14b92b9 --- /dev/null +++ b/src/collision/unary.h @@ -0,0 +1,21 @@ +#ifndef _unary_h_ +#define _unary_h_ + +#include "collision_private.h" + +typedef struct unary_collision_model +{ + char * name; + unary_rate_constant_func_t rate_constant; + unary_collision_func_t collision; + void * params; + species_t * sp; + rng_pool_t * rp; + int interval; + int n_large_pr[ MAX_PIPELINE ]; +} unary_collision_model_t; + +void +apply_unary_collision_model_pipeline( unary_collision_model_t * cm ); + +#endif /* _unary_h_ */ diff --git a/src/field_advance/field_advance.c b/src/field_advance/field_advance.c index 40f7e57d..019e16cd 100644 --- a/src/field_advance/field_advance.c +++ b/src/field_advance/field_advance.c @@ -1,14 +1,21 @@ #define IN_field_advance + #include "field_advance_private.h" void -delete_field_array( field_array_t * fa ) { - if( !fa ) return; +delete_field_array( field_array_t * fa ) +{ + if ( !fa ) + { + return; + } + fa->kernel->delete_fa( fa ); } void -checkpt_field_advance_kernels( const field_advance_kernels_t * kernel ) { +checkpt_field_advance_kernels( const field_advance_kernels_t * kernel ) +{ CHECKPT_SYM( kernel->delete_fa ); CHECKPT_SYM( kernel->advance_b ); CHECKPT_SYM( kernel->advance_e ); @@ -29,7 +36,8 @@ checkpt_field_advance_kernels( const field_advance_kernels_t * kernel ) { } void -restore_field_advance_kernels( field_advance_kernels_t * kernel ) { +restore_field_advance_kernels( field_advance_kernels_t * kernel ) +{ RESTORE_SYM( kernel->delete_fa ); RESTORE_SYM( kernel->advance_b ); RESTORE_SYM( kernel->advance_e ); @@ -48,4 +56,3 @@ restore_field_advance_kernels( field_advance_kernels_t * kernel ) { RESTORE_SYM( kernel->compute_rms_div_b_err ); RESTORE_SYM( kernel->clean_div_b ); } - diff --git a/src/field_advance/field_advance.h b/src/field_advance/field_advance.h index a2f66156..d1cee710 100644 --- a/src/field_advance/field_advance.h +++ b/src/field_advance/field_advance.h @@ -149,7 +149,8 @@ // FIXME: SHOULD HAVE DIFFERENT FIELD_T FOR CELL BUILDS AND USE NEW // INFRASTRUCTURE -typedef struct field { +typedef struct field +{ float ex, ey, ez, div_e_err; // Electric field and div E error float cbx, cby, cbz, div_b_err; // Magnetic field and div B error float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density @@ -166,8 +167,8 @@ typedef struct field { struct field_array; -typedef struct field_advance_kernels { - +typedef struct field_advance_kernels +{ // FIXME: DUMP.CXX SHOULD BE DECENTRALIZED AND DIAGNOSTIC DUMP // FOR FIELDS SHOULD BE ADDED TO THIS // FIXME: FOR SYSTEMS WITH MAGNETIC CURRENTS (E.G. PML LAYERS) @@ -219,7 +220,8 @@ typedef struct field_advance_kernels { // A field_array holds all the field quanties and pointers to // kernels used to advance them. 
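For reference, registering one of the collision models defined above might look like the hypothetical sketch below. The names my_rate, my_collide, electron, and entropy are placeholders rather than symbols introduced by this change; the callbacks must match the unary_rate_constant_func_t / unary_collision_func_t typedefs in collision_private.h, and a non-NULL params object must be checkpoint registered.

/* Hypothetical usage sketch; my_rate, my_collide, electron and entropy are placeholders. */
collision_op_t * scatter =
  unary_collision_model( "e_background_scatter", /* non-empty name is required           */
                         my_rate,                /* per-particle collision rate callback */
                         my_collide,             /* momentum-updating collision callback */
                         NULL,                   /* params; checkpoint register if used  */
                         electron,               /* species_t * to scatter               */
                         entropy,                /* rng_pool_t * with >= N_PIPELINE rngs */
                         10 );                   /* apply every 10 steps                 */

binary_collision_model follows the same pattern with two species, pairwise callbacks, and a sampling factor.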
-typedef struct field_array { +typedef struct field_array +{ field_t * ALIGNED(128) f; // Local field data grid_t * g; // Underlying grid void * params; // Field advance specific parameters diff --git a/src/field_advance/standard/advance_b.cc b/src/field_advance/standard/advance_b.cc index a673f276..148c85d2 100644 --- a/src/field_advance/standard/advance_b.cc +++ b/src/field_advance/standard/advance_b.cc @@ -1,189 +1,20 @@ #define IN_sfa -#define HAS_V4_PIPELINE -#include "sfa_private.h" - -typedef struct pipeline_args { - field_t * ALIGNED(128) f; - const grid_t * g; - float frac; -} pipeline_args_t; - -#define DECLARE_STENCIL() \ - /**/ field_t * ALIGNED(128) f = args->f; \ - const grid_t * g = args->g; \ - \ - const int nx = g->nx; \ - const int ny = g->ny; \ - const int nz = g->nz; \ - \ - const float frac = args->frac; \ - const float px = (nx>1) ? frac*g->cvac*g->dt*g->rdx : 0; \ - const float py = (ny>1) ? frac*g->cvac*g->dt*g->rdy : 0; \ - const float pz = (nz>1) ? frac*g->cvac*g->dt*g->rdz : 0; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z - -#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] - -#define INIT_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x+1,y, z ); \ - fy = &f(x, y+1,z ); \ - fz = &f(x, y, z+1) - -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if( x>nx ) { \ - /**/ y++; x = 1; \ - if( y>ny ) z++; if( y>ny ) y = 1; \ - INIT_STENCIL(); \ - } - -// WTF! Under -ffast-math, gcc-4.1.1 thinks it is okay to treat the -// below as -// f0->cbx = ( f0->cbx + py*( blah ) ) - pz*( blah ) -// even with explicit parenthesis are in there! Oh my ... -// -fno-unsafe-math-optimizations must be used - -#define UPDATE_CBX() f0->cbx -= ( py*( fy->ez-f0->ez ) - pz*( fz->ey-f0->ey ) ) -#define UPDATE_CBY() f0->cby -= ( pz*( fz->ex-f0->ex ) - px*( fx->ez-f0->ez ) ) -#define UPDATE_CBZ() f0->cbz -= ( px*( fx->ey-f0->ey ) - py*( fy->ex-f0->ex ) ) - -void -advance_b_pipeline( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - DECLARE_STENCIL(); - - int n_voxel; - DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - INIT_STENCIL(); - for( ; n_voxel; n_voxel-- ) { - UPDATE_CBX(); UPDATE_CBY(); UPDATE_CBZ(); - NEXT_STENCIL(); - } - -# undef LOAD_STENCIL - -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) - -using namespace v4; - -void -advance_b_pipeline_v4( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - DECLARE_STENCIL(); - - int n_voxel; - DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - const v4float vpx( px ); - const v4float vpy( py ); - const v4float vpz( pz ); - - v4float f0_ex, f0_ey, f0_ez; // Voxel quad electric fields - v4float f0_cbx, f0_cby, f0_cbz; // Voxel quad magnetic fields - v4float fx_ey, fx_ez; // Voxel quad +x neighbor fields - v4float fy_ez, fy_ex; // Voxel quad +y neighbor fields - v4float fz_ex, fz_ey; // Voxel quad +z neighbor fields - v4float dummy; - - field_t * ALIGNED(16) f00, * ALIGNED(16) f01, * ALIGNED(16) f02, * ALIGNED(16) f03; // Voxel quad - field_t * ALIGNED(16) fx0, * ALIGNED(16) fx1, * ALIGNED(16) fx2, * ALIGNED(16) fx3; // Voxel quad +x neighbors - field_t * ALIGNED(16) fy0, * ALIGNED(16) fy1, * ALIGNED(16) fy2, * ALIGNED(16) fy3; // Voxel quad +y neighbors - field_t * ALIGNED(16) fz0, * ALIGNED(16) fz1, * ALIGNED(16) fz2, * ALIGNED(16) fz3; // Voxel quad +z neighbors - - // Process the bulk of the voxels 4 at a time - INIT_STENCIL(); - 
for( ; n_voxel>3; n_voxel-=4 ) { - f00 = f0; fx0 = fx; fy0 = fy; fz0 = fz; NEXT_STENCIL(); - f01 = f0; fx1 = fx; fy1 = fy; fz1 = fz; NEXT_STENCIL(); - f02 = f0; fx2 = fx; fy2 = fy; fz2 = fz; NEXT_STENCIL(); - f03 = f0; fx3 = fx; fy3 = fy; fz3 = fz; NEXT_STENCIL(); - - load_4x3_tr( &f00->ex, &f01->ex, &f02->ex, &f03->ex, f0_ex, f0_ey, f0_ez ); - load_4x3_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, f0_cbx, f0_cby, f0_cbz ); - - load_4x3_tr( &fx0->ex, &fx1->ex, &fx2->ex, &fx3->ex, dummy, fx_ey, fx_ez ); - load_4x3_tr( &fy0->ex, &fy1->ex, &fy2->ex, &fy3->ex, fy_ex, dummy, fy_ez ); - load_4x2_tr( &fz0->ex, &fz1->ex, &fz2->ex, &fz3->ex, fz_ex, fz_ey /**/ ); - - f0_cbx += fnms( vpy,( fy_ez-f0_ez ), vpz*( fz_ey-f0_ey ) ); - f0_cby += fnms( vpz,( fz_ex-f0_ex ), vpx*( fx_ez-f0_ez ) ); - f0_cbz += fnms( vpx,( fx_ey-f0_ey ), vpy*( fy_ex-f0_ex ) ); - - store_4x3_tr( f0_cbx, f0_cby, f0_cbz, &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx ); - } -} +#include "sfa_private.h" -#endif +//----------------------------------------------------------------------------// +// Top level function to select and call the proper advance_b function. +//----------------------------------------------------------------------------// void advance_b( field_array_t * RESTRICT fa, - float _frac ) { - if( !fa ) ERROR(( "Bad args" )); - - // Do the bulk of the magnetic fields in the pipelines. The host - // handles stragglers. - - pipeline_args_t args[1]; - args->f = fa->f; - args->g = fa->g; - args->frac = _frac; - EXEC_PIPELINES( advance_b, args, 0 ); - - // While the pipelines are busy, do surface fields - - DECLARE_STENCIL(); - - // Do left over bx - for( z=1; z<=nz; z++ ) { - for( y=1; y<=ny; y++ ) { - f0 = &f(nx+1,y, z); - fy = &f(nx+1,y+1,z); - fz = &f(nx+1,y, z+1); - UPDATE_CBX(); - } + float _frac ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); } - // Do left over by - for( z=1; z<=nz; z++ ) { - f0 = &f(1,ny+1,z); - fx = &f(2,ny+1,z); - fz = &f(1,ny+1,z+1); - for( x=1; x<=nx; x++ ) { - UPDATE_CBY(); - f0++; - fx++; - fz++; - } - } - - // Do left over bz - for( y=1; y<=ny; y++ ) { - f0 = &f(1,y, nz+1); - fx = &f(2,y, nz+1); - fy = &f(1,y+1,nz+1); - for( x=1; x<=nx; x++ ) { - UPDATE_CBZ(); - f0++; - fx++; - fy++; - } - } - - WAIT_PIPELINES(); - - local_adjust_norm_b( f, g ); + // Conditionally execute this when more abstractions are available. + advance_b_pipeline( fa, _frac ); } diff --git a/src/field_advance/standard/advance_e.cc b/src/field_advance/standard/advance_e.cc index ec216815..e326d00f 100644 --- a/src/field_advance/standard/advance_e.cc +++ b/src/field_advance/standard/advance_e.cc @@ -1,404 +1,25 @@ -// Note: This is similar to compute_curl_b - #define IN_sfa -#define HAS_V4_PIPELINE -#include "sfa_private.h" - -typedef struct pipeline_args { - field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; -} pipeline_args_t; - -#define DECLARE_STENCIL() \ - /**/ field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float damp = args->p->damp; \ - const float px = (nx>1) ? (1+damp)*g->cvac*g->dt*g->rdx : 0; \ - const float py = (ny>1) ? (1+damp)*g->cvac*g->dt*g->rdy : 0; \ - const float pz = (nz>1) ? 
(1+damp)*g->cvac*g->dt*g->rdz : 0; \ - const float cj = g->dt/g->eps0; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z - -#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] - -#define INIT_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x-1,y, z ); \ - fy = &f(x, y-1,z ); \ - fz = &f(x, y, z-1) - -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if( x>nx ) { \ - /**/ y++; x = 2; \ - if( y>ny ) z++; if( y>ny ) y = 2; \ - INIT_STENCIL(); \ - } - -#define UPDATE_EX() \ - f0->tcax = ( py*(f0->cbz*m[f0->fmatz].rmuz-fy->cbz*m[fy->fmatz].rmuz) - \ - pz*(f0->cby*m[f0->fmaty].rmuy-fz->cby*m[fz->fmaty].rmuy) ) - \ - damp*f0->tcax; \ - f0->ex = m[f0->ematx].decayx*f0->ex + \ - m[f0->ematx].drivex*( f0->tcax - cj*f0->jfx ) -#define UPDATE_EY() \ - f0->tcay = ( pz*(f0->cbx*m[f0->fmatx].rmux-fz->cbx*m[fz->fmatx].rmux) - \ - px*(f0->cbz*m[f0->fmatz].rmuz-fx->cbz*m[fx->fmatz].rmuz) ) - \ - damp*f0->tcay; \ - f0->ey = m[f0->ematy].decayy*f0->ey + \ - m[f0->ematy].drivey*( f0->tcay - cj*f0->jfy ) -#define UPDATE_EZ() \ - f0->tcaz = ( px*(f0->cby*m[f0->fmaty].rmuy-fx->cby*m[fx->fmaty].rmuy) - \ - py*(f0->cbx*m[f0->fmatx].rmux-fy->cbx*m[fy->fmatx].rmux) ) - \ - damp*f0->tcaz; \ - f0->ez = m[f0->ematz].decayz*f0->ez + \ - m[f0->ematz].drivez*( f0->tcaz - cj*f0->jfz ) - -void -advance_e_pipeline( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - DECLARE_STENCIL(); - - int n_voxel; - DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - INIT_STENCIL(); - for( ; n_voxel; n_voxel-- ) { - UPDATE_EX(); UPDATE_EY(); UPDATE_EZ(); - NEXT_STENCIL(); - } -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) - -using namespace v4; - -void -advance_e_pipeline_v4( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - DECLARE_STENCIL(); - - int n_voxel; - DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - const v4float vdamp( damp ); - const v4float vpx( px ); - const v4float vpy( py ); - const v4float vpz( pz ); - const v4float vcj( cj ); - - v4float save0, save1, dummy; - - v4float f0_ex, f0_ey, f0_ez; - v4float f0_cbx, f0_cby, f0_cbz; - v4float f0_tcax, f0_tcay, f0_tcaz; - v4float f0_jfx, f0_jfy, f0_jfz; - v4float fx_cby, fx_cbz; - v4float fy_cbx, fy_cbz; - v4float fz_cbx, fz_cby; - v4float m_f0_rmux, m_f0_rmuy, m_f0_rmuz; - v4float m_fx_rmuy, m_fx_rmuz; - v4float m_fy_rmux, m_fy_rmuz; - v4float m_fz_rmux, m_fz_rmuy; - v4float m_f0_decayx, m_f0_drivex; - v4float m_f0_decayy, m_f0_drivey; - v4float m_f0_decayz, m_f0_drivez; - - v4float f0_cbx_rmux, f0_cby_rmuy, f0_cbz_rmuz; - - field_t * ALIGNED(16) f00, * ALIGNED(16) f01, * ALIGNED(16) f02, * ALIGNED(16) f03; // Voxel quad - field_t * ALIGNED(16) fx0, * ALIGNED(16) fx1, * ALIGNED(16) fx2, * ALIGNED(16) fx3; // Voxel quad +x neighbors - field_t * ALIGNED(16) fy0, * ALIGNED(16) fy1, * ALIGNED(16) fy2, * ALIGNED(16) fy3; // Voxel quad +y neighbors - field_t * ALIGNED(16) fz0, * ALIGNED(16) fz1, * ALIGNED(16) fz2, * ALIGNED(16) fz3; // Voxel quad +z neighbors - - // Process the bulk of the voxels 4 at a time - - INIT_STENCIL(); - for( ; n_voxel>3; n_voxel-=4 ) { - f00 = f0; fx0 = fx; fy0 = fy; fz0 = fz; NEXT_STENCIL(); - f01 = f0; fx1 = fx; fy1 = fy; fz1 = fz; NEXT_STENCIL(); - f02 = f0; fx2 = fx; fy2 = fy; fz2 = fz; NEXT_STENCIL(); - f03 = f0; fx3 = fx; fy3 = fy; fz3 = fz; NEXT_STENCIL(); - - load_4x4_tr( &f00->ex, &f01->ex, &f02->ex, &f03->ex, f0_ex, f0_ey, f0_ez, save0 ); - 
load_4x3_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, f0_cbx, f0_cby, f0_cbz ); - load_4x4_tr( &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax, f0_tcax, f0_tcay, f0_tcaz, save1 ); - load_4x3_tr( &f00->jfx, &f01->jfx, &f02->jfx, &f03->jfx, f0_jfx, f0_jfy, f0_jfz ); - - load_4x3_tr( &fx0->cbx, &fx1->cbx, &fx2->cbx, &fx3->cbx, dummy, fx_cby, fx_cbz ); - load_4x3_tr( &fy0->cbx, &fy1->cbx, &fy2->cbx, &fy3->cbx, fy_cbx, dummy, fy_cbz ); - load_4x2_tr( &fz0->cbx, &fz1->cbx, &fz2->cbx, &fz3->cbx, fz_cbx, fz_cby /**/ ); - -# define LOAD_RMU(V,D) m_f##V##_rmu##D=v4float( m[f##V##0->fmat##D].rmu##D, \ - m[f##V##1->fmat##D].rmu##D, \ - m[f##V##2->fmat##D].rmu##D, \ - m[f##V##3->fmat##D].rmu##D ) - - LOAD_RMU(0,x); LOAD_RMU(0,y); LOAD_RMU(0,z); - /**/ LOAD_RMU(x,y); LOAD_RMU(x,z); - LOAD_RMU(y,x); LOAD_RMU(y,z); - LOAD_RMU(z,x); LOAD_RMU(z,y); - - load_4x2_tr( &m[f00->ematx].decayx, &m[f01->ematx].decayx, - &m[f02->ematx].decayx, &m[f03->ematx].decayx, - m_f0_decayx, m_f0_drivex ); - load_4x2_tr( &m[f00->ematy].decayy, &m[f01->ematy].decayy, - &m[f02->ematy].decayy, &m[f03->ematy].decayy, - m_f0_decayy, m_f0_drivey ); - load_4x2_tr( &m[f00->ematz].decayz, &m[f01->ematz].decayz, - &m[f02->ematz].decayz, &m[f03->ematz].decayz, - m_f0_decayz, m_f0_drivez ); - -# undef LOAD_RMU - - f0_cbx_rmux = f0_cbx * m_f0_rmux; - f0_cby_rmuy = f0_cby * m_f0_rmuy; - f0_cbz_rmuz = f0_cbz * m_f0_rmuz; - - f0_tcax = fnms( vdamp,f0_tcax, - fms( vpy,fnms( fy_cbz,m_fy_rmuz, f0_cbz_rmuz ), - vpz*fnms( fz_cby,m_fz_rmuy, f0_cby_rmuy ) ) ); - f0_tcay = fnms( vdamp,f0_tcay, - fms( vpz,fnms( fz_cbx,m_fz_rmux, f0_cbx_rmux ), - vpx*fnms( fx_cbz,m_fx_rmuz, f0_cbz_rmuz ) ) ); - - f0_tcaz = fnms( vdamp,f0_tcaz, - fms( vpx,fnms( fx_cby,m_fx_rmuy, f0_cby_rmuy ), - vpy*fnms( fy_cbx,m_fy_rmux, f0_cbx_rmux ) ) ); - - f0_ex = fma( m_f0_decayx,f0_ex, m_f0_drivex*fnms( vcj,f0_jfx, f0_tcax )); - f0_ey = fma( m_f0_decayy,f0_ey, m_f0_drivey*fnms( vcj,f0_jfy, f0_tcay )); - f0_ez = fma( m_f0_decayz,f0_ez, m_f0_drivez*fnms( vcj,f0_jfz, f0_tcaz )); - - // Note: Unlike load_4x3 versus load_4x4, store_4x4 is much more efficient than store_4x3! - - store_4x4_tr( f0_ex, f0_ey, f0_ez, save0, &f00->ex, &f01->ex, &f02->ex, &f03->ex ); - store_4x4_tr( f0_tcax, f0_tcay, f0_tcaz, save1, &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax ); - } -} +#include "sfa_private.h" -#endif +//----------------------------------------------------------------------------// +// Top level function to select and call the proper advance_e function. +//----------------------------------------------------------------------------// void advance_e( field_array_t * RESTRICT fa, - float frac ) { - if( !fa ) ERROR(( "Bad args" )); - if( frac!=1 ) ERROR(( "standard advance_e does not support frac!=1 yet" )); - - /*************************************************************************** - * Begin tangential B ghost setup - ***************************************************************************/ - - begin_remote_ghost_tang_b( fa->f, fa->g ); - local_ghost_tang_b( fa->f, fa->g ); - - /*************************************************************************** - * Update interior fields - * Note: ex all (1:nx, 1:ny+1,1,nz+1) interior (1:nx,2:ny,2:nz) - * Note: ey all (1:nx+1,1:ny, 1:nz+1) interior (2:nx,1:ny,2:nz) - * Note: ez all (1:nx+1,1:ny+1,1:nz ) interior (1:nx,1:ny,2:nz) - ***************************************************************************/ - - // Do majority interior in a single pass. The host handles - // stragglers. 
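For reference, the finite-difference updates that these advance_b / advance_e stencils implement are, with p_x = c \Delta t / \Delta x (and cyclic permutations for the y and z components):

    \mathbf{B} \;\leftarrow\; \mathbf{B} \;-\; \mathrm{frac}\; c \Delta t\, ( \nabla \times \mathbf{E} )_h            (advance_b, forward differences)

    \mathrm{tca}_x \;\leftarrow\; (1 + \mathrm{damp}) \left[ p_y \Delta_y^{-}( r_z\, cb_z ) - p_z \Delta_z^{-}( r_y\, cb_y ) \right] - \mathrm{damp}\; \mathrm{tca}_x

    e_x \;\leftarrow\; \mathrm{decay}_x\, e_x \;+\; \mathrm{drive}_x \left( \mathrm{tca}_x - \frac{ \Delta t }{ \epsilon_0 }\, jf_x \right)            (advance_e, backward differences)

where \Delta^{-} denotes the undivided backward difference, r = rmu is the per-material reciprocal permeability, decay/drive are the material coefficients, and jf is the free current accumulated on the mesh.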
- - pipeline_args_t args[1]; - args->f = fa->f; - args->p = (sfa_params_t *)fa->params; - args->g = fa->g; - EXEC_PIPELINES( advance_e, args, 0 ); - - // While the pipelines are busy, do non-bulk interior fields - - DECLARE_STENCIL(); - - // Do left over interior ex - for( z=2; z<=nz; z++ ) { - for( y=2; y<=ny; y++ ) { - f0 = &f(1,y, z); - fy = &f(1,y-1,z); - fz = &f(1,y, z-1); - UPDATE_EX(); - } - } - - // Do left over interior ey - for( z=2; z<=nz; z++ ) { - f0 = &f(2,1,z); - fx = &f(1,1,z); - fz = &f(2,1,z-1); - for( x=2; x<=nx; x++ ) { - UPDATE_EY(); - f0++; - fx++; - fz++; - } - } - - // Do left over interior ez - for( y=2; y<=ny; y++ ) { - f0 = &f(2,y, 1); - fx = &f(1,y, 1); - fy = &f(2,y-1,1); - for( x=2; x<=nx; x++ ) { - UPDATE_EZ(); - f0++; - fx++; - fy++; - } + float frac ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); } - WAIT_PIPELINES(); - - /*************************************************************************** - * Finish tangential B ghost setup - ***************************************************************************/ - - end_remote_ghost_tang_b( fa->f, fa->g ); - - /*************************************************************************** - * Update exterior fields - ***************************************************************************/ - - // Do exterior ex - for( y=1; y<=ny+1; y++ ) { - f0 = &f(1,y, 1); - fy = &f(1,y-1,1); - fz = &f(1,y, 0); - for( x=1; x<=nx; x++ ) { - UPDATE_EX(); - f0++; - fy++; - fz++; - } - } - for( y=1; y<=ny+1; y++ ) { - f0 = &f(1,y, nz+1); - fy = &f(1,y-1,nz+1); - fz = &f(1,y, nz); - for( x=1; x<=nx; x++ ) { - UPDATE_EX(); - f0++; - fy++; - fz++; - } - } - for( z=2; z<=nz; z++ ) { - f0 = &f(1,1,z); - fy = &f(1,0,z); - fz = &f(1,1,z-1); - for( x=1; x<=nx; x++ ) { - UPDATE_EX(); - f0++; - fy++; - fz++; - } - } - for( z=2; z<=nz; z++ ) { - f0 = &f(1,ny+1,z); - fy = &f(1,ny, z); - fz = &f(1,ny+1,z-1); - for( x=1; x<=nx; x++ ) { - UPDATE_EX(); - f0++; - fy++; - fz++; - } - } - - // Do exterior ey - for( z=1; z<=nz+1; z++ ) { - for( y=1; y<=ny; y++ ) { - f0 = &f(1,y,z); - fx = &f(0,y,z); - fz = &f(1,y,z-1); - UPDATE_EY(); - } - } - for( z=1; z<=nz+1; z++ ) { - for( y=1; y<=ny; y++ ) { - f0 = &f(nx+1,y,z); - fx = &f(nx, y,z); - fz = &f(nx+1,y,z-1); - UPDATE_EY(); - } - } - for( y=1; y<=ny; y++ ) { - f0 = &f(2,y,1); - fx = &f(1,y,1); - fz = &f(2,y,0); - for( x=2; x<=nx; x++ ) { - UPDATE_EY(); - f0++; - fx++; - fz++; - } - } - for( y=1; y<=ny; y++ ) { - f0 = &f(2,y,nz+1); - fx = &f(1,y,nz+1); - fz = &f(2,y,nz ); - for( x=2; x<=nx; x++ ) { - UPDATE_EY(); - f0++; - fx++; - fz++; - } - } - - // Do exterior ez - for( z=1; z<=nz; z++ ) { - f0 = &f(1,1,z); - fx = &f(0,1,z); - fy = &f(1,0,z); - for( x=1; x<=nx+1; x++ ) { - UPDATE_EZ(); - f0++; - fx++; - fy++; - } - } - for( z=1; z<=nz; z++ ) { - f0 = &f(1,ny+1,z); - fx = &f(0,ny+1,z); - fy = &f(1,ny, z); - for( x=1; x<=nx+1; x++ ) { - UPDATE_EZ(); - f0++; - fx++; - fy++; - } - } - for( z=1; z<=nz; z++ ) { - for( y=2; y<=ny; y++ ) { - f0 = &f(1,y, z); - fx = &f(0,y, z); - fy = &f(1,y-1,z); - UPDATE_EZ(); - } - } - for( z=1; z<=nz; z++ ) { - for( y=2; y<=ny; y++ ) { - f0 = &f(nx+1,y, z); - fx = &f(nx, y, z); - fy = &f(nx+1,y-1,z); - UPDATE_EZ(); - } + if ( frac != 1 ) + { + ERROR( ( "standard advance_e does not support frac != 1 yet" ) ); } - local_adjust_tang_e( fa->f, fa->g ); + // Conditionally execute this when more abstractions are available. 
+ advance_e_pipeline( fa, frac ); } diff --git a/src/field_advance/standard/clean_div_b.cc b/src/field_advance/standard/clean_div_b.cc index bd6ab6a6..f6ad3006 100644 --- a/src/field_advance/standard/clean_div_b.cc +++ b/src/field_advance/standard/clean_div_b.cc @@ -1,344 +1,19 @@ #define IN_sfa -#define HAS_V4_PIPELINE -#include "sfa_private.h" - -#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] - -#define MARDER_CBX() f0->cbx += px*( f0->div_b_err - fx->div_b_err ) -#define MARDER_CBY() f0->cby += py*( f0->div_b_err - fy->div_b_err ) -#define MARDER_CBZ() f0->cbz += pz*( f0->div_b_err - fz->div_b_err ) - -typedef struct pipeline_args { - field_t * ALIGNED(128) f; - const grid_t * g; -} pipeline_args_t; - -void -clean_div_b_pipeline( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - field_t * ALIGNED(128) f = args->f; - const grid_t * g = args->g; - - field_t * ALIGNED(16) f0; - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; - int x, y, z, n_voxel; - - const int nx = g->nx; - const int ny = g->ny; - const int nz = g->nz; - - float px, py, pz, alphadt; - - px = (nx>1) ? g->rdx : 0; - py = (ny>1) ? g->rdy : 0; - pz = (nz>1) ? g->rdz : 0; - alphadt = 0.3888889/( px*px + py*py + pz*pz ); - px *= alphadt; - py *= alphadt; - pz *= alphadt; - - // Process voxels assigned to this pipeline - - DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - -# define LOAD_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x-1,y, z ); \ - fy = &f(x, y-1,z ); \ - fz = &f(x, y, z-1) - - LOAD_STENCIL(); - - for( ; n_voxel; n_voxel-- ) { - MARDER_CBX(); - MARDER_CBY(); - MARDER_CBZ(); - f0++; fx++; fy++; fz++; - - x++; - if( x>nx ) { - x=2, y++; - if( y>ny ) y=2, z++; - LOAD_STENCIL(); - } - } - -# undef LOAD_STENCIL - -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) - -using namespace v4; - -void -clean_div_b_pipeline_v4( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - field_t * ALIGNED(128) f = args->f; - const grid_t * g = args->g; - - field_t * ALIGNED(16) f0; - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; - int x, y, z, n_voxel; - - const int nx = g->nx; - const int ny = g->ny; - const int nz = g->nz; - - float px, py, pz, alphadt; - - px = (nx>1) ? g->rdx : 0; - py = (ny>1) ? g->rdy : 0; - pz = (nz>1) ? 
g->rdz : 0; - alphadt = 0.3888889/( px*px + py*py + pz*pz ); - px *= alphadt; - py *= alphadt; - pz *= alphadt; - - const v4float vpx(px); - const v4float vpy(py); - const v4float vpz(pz); - - v4float f0_cbx, f0_cby, f0_cbz; // Voxel quad magnetic fields - v4float f0_div_b_err; // Voxel quad div b errs - v4float fx_div_b_err; // Voxel quad -x neighbor div b err - v4float fy_div_b_err; // Voxel quad -y neighbor div b err - v4float fz_div_b_err; // Voxel quad -z neighbor div b err - - field_t * ALIGNED(16) f00, * ALIGNED(16) f01, * ALIGNED(16) f02, * ALIGNED(16) f03; // Voxel quad - field_t * ALIGNED(16) fx0, * ALIGNED(16) fx1, * ALIGNED(16) fx2, * ALIGNED(16) fx3; // Voxel quad +x neighbors - field_t * ALIGNED(16) fy0, * ALIGNED(16) fy1, * ALIGNED(16) fy2, * ALIGNED(16) fy3; // Voxel quad +x neighbors - field_t * ALIGNED(16) fz0, * ALIGNED(16) fz1, * ALIGNED(16) fz2, * ALIGNED(16) fz3; // Voxel quad +x neighbors - - // Process voxels assigned to this pipeline - - DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - // Process bulk of voxels 4 at a time - -# define LOAD_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x-1,y, z ); \ - fy = &f(x, y-1,z ); \ - fz = &f(x, y, z-1) - -# define NEXT_STENCIL(n) \ - f0##n = f0++; \ - fx##n = fx++; \ - fy##n = fy++; \ - fz##n = fz++; \ - x++; \ - if( x>nx ) { \ - x=2, y++; \ - if( y>ny ) y=2, z++; \ - LOAD_STENCIL(); \ - } - - LOAD_STENCIL(); - - for( ; n_voxel>3; n_voxel-=4 ) { - NEXT_STENCIL(0); NEXT_STENCIL(1); NEXT_STENCIL(2); NEXT_STENCIL(3); - load_4x4_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, f0_cbx, f0_cby, f0_cbz, f0_div_b_err ); - - fx_div_b_err = v4float( fx0->div_b_err, fx1->div_b_err, fx2->div_b_err, fx3->div_b_err ); - fy_div_b_err = v4float( fy0->div_b_err, fy1->div_b_err, fy2->div_b_err, fy3->div_b_err ); - fz_div_b_err = v4float( fz0->div_b_err, fz1->div_b_err, fz2->div_b_err, fz3->div_b_err ); - - f0_cbx = fma( f0_div_b_err-fx_div_b_err, px, f0_cbx ); - f0_cby = fma( f0_div_b_err-fy_div_b_err, py, f0_cby ); - f0_cbz = fma( f0_div_b_err-fz_div_b_err, pz, f0_cbz ); - - store_4x4_tr( f0_cbx, f0_cby, f0_cbz, f0_div_b_err, &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx ); - } - -# undef NEXT_STENCIL -# undef LOAD_STENCIL - -} +#include "sfa_private.h" -#endif +//----------------------------------------------------------------------------// +// Top level function to select and call the proper clean_div_b function. +//----------------------------------------------------------------------------// void -clean_div_b( field_array_t * fa ) { - pipeline_args_t args[1]; - - field_t * f, * f0, * fx, * fy, * fz; - const grid_t * g; - float alphadt, px, py, pz; - int x, y, z, nx, ny, nz; - - if( !fa ) ERROR(( "Bad args" )); - f = fa->f; - g = fa->g; - - nx = g->nx; - ny = g->ny; - nz = g->nz; - px = (nx>1) ? g->rdx : 0; - py = (ny>1) ? g->rdy : 0; - pz = (nz>1) ? g->rdz : 0; - alphadt = 0.3888889/( px*px + py*py + pz*pz ); - px *= alphadt; - py *= alphadt; - pz *= alphadt; - - // Have pipelines do Marder pass in interior. The host handles - // stragglers. 
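The Marder pass encoded by MARDER_CBX/CBY/CBZ above is, in equation form,

    cb_x \;\leftarrow\; cb_x \;+\; \alpha\, \frac{ d - d|_{x-1} }{ \Delta x }, \qquad \alpha = \frac{ 0.3888889 }{ \Delta x^{-2} + \Delta y^{-2} + \Delta z^{-2} }, \qquad d = \mathrm{div\_b\_err},

i.e. \mathbf{B} \leftarrow \mathbf{B} + \alpha\, \nabla_h d, a diffusive relaxation of the accumulated \nabla \cdot \mathbf{B} error; local_adjust_norm_b then patches the normal components on the domain boundary.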
- -# if 0 // Original non-pipelined version - for( z=2; z<=nz; z++ ) { - for( y=2; y<=ny; y++ ) { - f0 = &f(2,y, z); - fx = &f(1,y, z); - fy = &f(2,y-1,z); - fz = &f(2,y, z-1); - for( x=2; x<=nx; x++ ) { - MARDER_CBX(); - MARDER_CBY(); - MARDER_CBZ(); - f0++; fx++; fy++; fz++; - } - } - } -# endif - - // Begin setting derr ghosts - begin_remote_ghost_div_b( f, g ); - local_ghost_div_b( f, g); - - // Have pipelines do interior of the local domain - args->f = f; - args->g = g; - EXEC_PIPELINES( clean_div_b, args, 0 ); - - // Do left over interior bx - for( y=1; y<=ny; y++ ) { - f0 = &f(2,y,1); - fx = &f(1,y,1); - for( x=2; x<=nx; x++ ) { - MARDER_CBX(); - f0++; - fx++; - } - } - for( z=2; z<=nz; z++ ) { - f0 = &f(2,1,z); - fx = &f(1,1,z); - for( x=2; x<=nx; x++ ) { - MARDER_CBX(); - f0++; - fx++; - } - } - - // Left over interior by - for( z=1; z<=nz; z++ ) { - for( y=2; y<=ny; y++ ) { - f0 = &f(1,y, z); - fy = &f(1,y-1,z); - MARDER_CBY(); - } - } - for( y=2; y<=ny; y++ ) { - f0 = &f(2,y, 1); - fy = &f(2,y-1,1); - for( x=2; x<=nx; x++ ) { - MARDER_CBY(); - f0++; - fy++; - } - } - - // Left over interior bz - for( z=2; z<=nz; z++ ) { - f0 = &f(1,1,z); - fz = &f(1,1,z-1); - for( x=1; x<=nx; x++ ) { - MARDER_CBZ(); - f0++; - fz++; - } - } - for( z=2; z<=nz; z++ ) { - for( y=2; y<=ny; y++ ) { - f0 = &f(1,y,z); - fz = &f(1,y,z-1); - MARDER_CBZ(); - } - } - - // Finish setting derr ghosts - - end_remote_ghost_div_b( f, g ); - - // Do Marder pass in exterior - - // Exterior bx - for( z=1; z<=nz; z++ ) { - for( y=1; y<=ny; y++ ) { - f0 = &f(1,y,z); - fx = &f(0,y,z); - MARDER_CBX(); - } - } - for( z=1; z<=nz; z++ ) { - for( y=1; y<=ny; y++ ) { - f0 = &f(nx+1,y,z); - fx = &f(nx, y,z); - MARDER_CBX(); - } - } - - // Exterior by - for( z=1; z<=nz; z++ ) { - f0 = &f(1,1,z); - fy = &f(1,0,z); - for( x=1; x<=nx; x++ ) { - MARDER_CBY(); - f0++; - fy++; - } - } - for( z=1; z<=nz; z++ ) { - f0 = &f(1,ny+1,z); - fy = &f(1,ny, z); - for( x=1; x<=nx; x++ ) { - MARDER_CBY(); - f0++; - fy++; - } - } - - // Exterior bz - for( y=1; y<=ny; y++ ) { - f0 = &f(1,y,1); - fz = &f(1,y,0); - for( x=1; x<=nx; x++ ) { - MARDER_CBZ(); - f0++; - fz++; - } - } - for( y=1; y<=ny; y++ ) { - f0 = &f(1,y,nz+1); - fz = &f(1,y,nz); - for( x=1; x<=nx; x++ ) { - MARDER_CBZ(); - f0++; - fz++; - } +clean_div_b( field_array_t * fa ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); } - // Wait for pipelines to finish up cleaning div_b in interior - - WAIT_PIPELINES(); - - local_adjust_norm_b(f,g); + // Conditionally execute this when more abstractions are available. + clean_div_b_pipeline( fa ); } diff --git a/src/field_advance/standard/clean_div_e.c b/src/field_advance/standard/clean_div_e.c index 91268847..c4156afd 100644 --- a/src/field_advance/standard/clean_div_e.c +++ b/src/field_advance/standard/clean_div_e.c @@ -1,150 +1,19 @@ #define IN_sfa -#include "sfa_private.h" - -typedef struct pipeline_args { - field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; -} pipeline_args_t; - -#define DECLARE_STENCIL() \ - field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float _rdx = (nx>1) ? g->rdx : 0; \ - const float _rdy = (ny>1) ? g->rdy : 0; \ - const float _rdz = (nz>1) ? 
g->rdz : 0; \ - const float alphadt = 0.3888889/( _rdx*_rdx + _rdy*_rdy + _rdz*_rdz ); \ - const float px = alphadt*_rdx; \ - const float py = alphadt*_rdy; \ - const float pz = alphadt*_rdz; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z - -#define f(x,y,z) f[ VOXEL(x,y,z,nx,ny,nz) ] - -#define INIT_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x+1,y, z ); \ - fy = &f(x, y+1,z ); \ - fz = &f(x, y, z+1) - -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if( x>nx ) { \ - /**/ y++; x = 1; \ - if( y>ny ) z++; if( y>ny ) y = 1; \ - INIT_STENCIL(); \ - } - -#define MARDER_EX() \ - f0->ex += m[f0->ematx].drivex*px*(fx->div_e_err-f0->div_e_err) -#define MARDER_EY() \ - f0->ey += m[f0->ematy].drivey*py*(fy->div_e_err-f0->div_e_err) -#define MARDER_EZ() \ - f0->ez += m[f0->ematz].drivez*pz*(fz->div_e_err-f0->div_e_err) - -static void -clean_div_e_pipeline( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - DECLARE_STENCIL(); - - int n_voxel; - DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - INIT_STENCIL(); - for( ; n_voxel; n_voxel-- ) { - MARDER_EX(); MARDER_EY(); MARDER_EZ(); - NEXT_STENCIL(); - } -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) -#error "Not implemented" +#include "sfa_private.h" -#endif +//----------------------------------------------------------------------------// +// Top level function to select and call the proper clean_div_e function. +//----------------------------------------------------------------------------// void -clean_div_e( field_array_t * fa ) { - if( !fa ) ERROR(( "Bad args" )); - - // Do majority of field components in single pass on the pipelines. - // The host handles stragglers. - - pipeline_args_t args[1]; - args->f = fa->f; - args->p = (sfa_params_t *)fa->params; - args->g = fa->g; - EXEC_PIPELINES( clean_div_e, args, 0 ); - - // While pipelines are busy, do left overs on the host - - do { - DECLARE_STENCIL(); - - // Do left over ex - for( y=1; y<=ny+1; y++ ) { - f0 = &f(1,y,nz+1); - fx = &f(2,y,nz+1); - for( x=1; x<=nx; x++ ) { - MARDER_EX(); - f0++; fx++; - } - } - for( z=1; z<=nz; z++ ) { - f0 = &f(1,ny+1,z); - fx = &f(2,ny+1,z); - for( x=1; x<=nx; x++ ) { - MARDER_EX(); - f0++; fx++; - } - } - - // Do left over ey - for( z=1; z<=nz+1; z++ ) { - for( y=1; y<=ny; y++ ) { - f0 = &f(nx+1,y, z); - fy = &f(nx+1,y+1,z); - MARDER_EY(); - } - } - for( y=1; y<=ny; y++ ) { - f0 = &f(1,y, nz+1); - fy = &f(1,y+1,nz+1); - for( x=1; x<=nx; x++ ) { - MARDER_EY(); - f0++; fy++; - } - } - - // Do left over ez - for( z=1; z<=nz; z++ ) { - f0 = &f(1,ny+1,z); - fz = &f(1,ny+1,z+1); - for( x=1; x<=nx+1; x++ ) { - MARDER_EZ(); - f0++; fz++; - } - } - for( z=1; z<=nz; z++ ) { - for( y=1; y<=ny; y++ ) { - f0 = &f(nx+1,y,z); - fz = &f(nx+1,y,z+1); - MARDER_EZ(); - } - } - } while(0); - - WAIT_PIPELINES(); +clean_div_e( field_array_t * fa ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); + } - local_adjust_tang_e( fa->f, fa->g ); + // Conditionally execute this when more abstractions are available. 
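clean_div_e applies the analogous Marder pass to the electric field, scaled by the per-material drive coefficient and using forward differences,

    e_x \;\leftarrow\; e_x \;+\; \mathrm{drive}_x\, \alpha\, \frac{ d|_{x+1} - d }{ \Delta x }, \qquad d = \mathrm{div\_e\_err},

with the same \alpha normalization as the div-B pass; local_adjust_tang_e then fixes up the tangential components on the domain boundary.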
+ clean_div_e_pipeline( fa ); } - diff --git a/src/field_advance/standard/compute_curl_b.cc b/src/field_advance/standard/compute_curl_b.cc index 7eaf76e2..cb6053b5 100644 --- a/src/field_advance/standard/compute_curl_b.cc +++ b/src/field_advance/standard/compute_curl_b.cc @@ -1,361 +1,19 @@ -// Note: This is similar to advance_e - #define IN_sfa -#define HAS_V4_PIPELINE -#include "sfa_private.h" - -typedef struct pipeline_args { - field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; -} pipeline_args_t; - -#define DECLARE_STENCIL() \ - /**/ field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float px = (nx>1) ? g->cvac*g->dt*g->rdx : 0; \ - const float py = (ny>1) ? g->cvac*g->dt*g->rdy : 0; \ - const float pz = (nz>1) ? g->cvac*g->dt*g->rdz : 0; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z - -#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] - -#define INIT_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x-1,y, z ); \ - fy = &f(x, y-1,z ); \ - fz = &f(x, y, z-1) - -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if( x>nx ) { \ - /**/ y++; x = 2; \ - if( y>ny ) z++; if( y>ny ) y = 2; \ - INIT_STENCIL(); \ - } - -#define UPDATE_EX() \ - f0->tcax = ( py*(f0->cbz*m[f0->fmatz].rmuz-fy->cbz*m[fy->fmatz].rmuz) - \ - pz*(f0->cby*m[f0->fmaty].rmuy-fz->cby*m[fz->fmaty].rmuy) ) -#define UPDATE_EY() \ - f0->tcay = ( pz*(f0->cbx*m[f0->fmatx].rmux-fz->cbx*m[fz->fmatx].rmux) - \ - px*(f0->cbz*m[f0->fmatz].rmuz-fx->cbz*m[fx->fmatz].rmuz) ) -#define UPDATE_EZ() \ - f0->tcaz = ( px*(f0->cby*m[f0->fmaty].rmuy-fx->cby*m[fx->fmaty].rmuy) - \ - py*(f0->cbx*m[f0->fmatx].rmux-fy->cbx*m[fy->fmatx].rmux) ) - -void -compute_curl_b_pipeline( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - DECLARE_STENCIL(); - - int n_voxel; - DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - INIT_STENCIL(); - for( ; n_voxel; n_voxel-- ) { - UPDATE_EX(); UPDATE_EY(); UPDATE_EZ(); - NEXT_STENCIL(); - } -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) - -using namespace v4; - -void -compute_curl_b_pipeline_v4( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - DECLARE_STENCIL(); - - int n_voxel; - DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - const v4float vpx( px ); - const v4float vpy( py ); - const v4float vpz( pz ); - - v4float save1, dummy; - - v4float f0_cbx, f0_cby, f0_cbz; - v4float f0_tcax, f0_tcay, f0_tcaz; - v4float fx_cby, fx_cbz; - v4float fy_cbx, fy_cbz; - v4float fz_cbx, fz_cby; - v4float m_f0_rmux, m_f0_rmuy, m_f0_rmuz; - v4float m_fx_rmuy, m_fx_rmuz; - v4float m_fy_rmux, m_fy_rmuz; - v4float m_fz_rmux, m_fz_rmuy; - - v4float f0_cbx_rmux, f0_cby_rmuy, f0_cbz_rmuz; - - field_t * ALIGNED(16) f00, * ALIGNED(16) f01, * ALIGNED(16) f02, * ALIGNED(16) f03; // Voxel quad - field_t * ALIGNED(16) fx0, * ALIGNED(16) fx1, * ALIGNED(16) fx2, * ALIGNED(16) fx3; // Voxel quad +x neighbors - field_t * ALIGNED(16) fy0, * ALIGNED(16) fy1, * ALIGNED(16) fy2, * ALIGNED(16) fy3; // Voxel quad +y neighbors - field_t * ALIGNED(16) fz0, * ALIGNED(16) fz1, * ALIGNED(16) fz2, * ALIGNED(16) fz3; // Voxel quad +z neighbors - - // Process the bulk of the voxels 4 at a time - - INIT_STENCIL(); - for( ; n_voxel>3; n_voxel-=4 ) { - f00 = f0; fx0 = fx; fy0 = fy; fz0 = fz; 
NEXT_STENCIL(); - f01 = f0; fx1 = fx; fy1 = fy; fz1 = fz; NEXT_STENCIL(); - f02 = f0; fx2 = fx; fy2 = fy; fz2 = fz; NEXT_STENCIL(); - f03 = f0; fx3 = fx; fy3 = fy; fz3 = fz; NEXT_STENCIL(); - - load_4x3_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, f0_cbx, f0_cby, f0_cbz ); - load_4x4_tr( &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax, f0_tcax, f0_tcay, f0_tcaz, save1 ); - - load_4x3_tr( &fx0->cbx, &fx1->cbx, &fx2->cbx, &fx3->cbx, dummy, fx_cby, fx_cbz ); - load_4x3_tr( &fy0->cbx, &fy1->cbx, &fy2->cbx, &fy3->cbx, fy_cbx, dummy, fy_cbz ); - load_4x2_tr( &fz0->cbx, &fz1->cbx, &fz2->cbx, &fz3->cbx, fz_cbx, fz_cby /**/ ); - -# define LOAD_RMU(V,D) m_f##V##_rmu##D=v4float( m[f##V##0->fmat##D].rmu##D, \ - m[f##V##1->fmat##D].rmu##D, \ - m[f##V##2->fmat##D].rmu##D, \ - m[f##V##3->fmat##D].rmu##D ) - - LOAD_RMU(0,x); LOAD_RMU(0,y); LOAD_RMU(0,z); - /**/ LOAD_RMU(x,y); LOAD_RMU(x,z); - LOAD_RMU(y,x); LOAD_RMU(y,z); - LOAD_RMU(z,x); LOAD_RMU(z,y); - -# undef LOAD_RMU - - f0_cbx_rmux = f0_cbx * m_f0_rmux; - f0_cby_rmuy = f0_cby * m_f0_rmuy; - f0_cbz_rmuz = f0_cbz * m_f0_rmuz; - - f0_tcax = fms( vpy,fnms( fy_cbz,m_fy_rmuz, f0_cbz_rmuz ), - vpz*fnms( fz_cby,m_fz_rmuy, f0_cby_rmuy ) ); - f0_tcay = fms( vpz,fnms( fz_cbx,m_fz_rmux, f0_cbx_rmux ), - vpx*fnms( fx_cbz,m_fx_rmuz, f0_cbz_rmuz ) ); - f0_tcaz = fms( vpx,fnms( fx_cby,m_fx_rmuy, f0_cby_rmuy ), - vpy*fnms( fy_cbx,m_fy_rmux, f0_cbx_rmux ) ); - // Note: Unlike load_4x3 versus load_4x4, store_4x4 is much more efficient than store_4x3! - store_4x4_tr( f0_tcax, f0_tcay, f0_tcaz, save1, &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax ); - } -} +#include "sfa_private.h" -#endif +//----------------------------------------------------------------------------// +// Top level function to select and call the proper compute_curl_b function. +//----------------------------------------------------------------------------// void -compute_curl_b( field_array_t * RESTRICT fa ) { - if( !fa ) ERROR(( "Bad args" )); - - /*************************************************************************** - * Begin tangential B ghost setup - ***************************************************************************/ - - begin_remote_ghost_tang_b( fa->f, fa->g ); - local_ghost_tang_b( fa->f, fa->g ); - - /*************************************************************************** - * Update interior fields - * Note: ex all (1:nx, 1:ny+1,1,nz+1) interior (1:nx,2:ny,2:nz) - * Note: ey all (1:nx+1,1:ny, 1:nz+1) interior (2:nx,1:ny,2:nz) - * Note: ez all (1:nx+1,1:ny+1,1:nz ) interior (1:nx,1:ny,2:nz) - ***************************************************************************/ - - // Do majority interior in a single pass. The host handles - // stragglers. 
- - pipeline_args_t args[1]; - args->f = fa->f; - args->p = (sfa_params_t *)fa->params; - args->g = fa->g; - EXEC_PIPELINES( compute_curl_b, args, 0 ); - - // While the pipelines are busy, do non-bulk interior fields - - DECLARE_STENCIL(); - - // Do left over interior ex - for( z=2; z<=nz; z++ ) { - for( y=2; y<=ny; y++ ) { - f0 = &f(1,y, z); - fy = &f(1,y-1,z); - fz = &f(1,y, z-1); - UPDATE_EX(); - } - } - - // Do left over interior ey - for( z=2; z<=nz; z++ ) { - f0 = &f(2,1,z); - fx = &f(1,1,z); - fz = &f(2,1,z-1); - for( x=2; x<=nx; x++ ) { - UPDATE_EY(); - f0++; - fx++; - fz++; - } - } - - // Do left over interior ez - for( y=2; y<=ny; y++ ) { - f0 = &f(2,y, 1); - fx = &f(1,y, 1); - fy = &f(2,y-1,1); - for( x=2; x<=nx; x++ ) { - UPDATE_EZ(); - f0++; - fx++; - fy++; - } - } - - WAIT_PIPELINES(); - - /*************************************************************************** - * Finish tangential B ghost setup - ***************************************************************************/ - - end_remote_ghost_tang_b( fa->f, fa->g ); - - /*************************************************************************** - * Update exterior fields - ***************************************************************************/ - - // Do exterior ex - for( y=1; y<=ny+1; y++ ) { - f0 = &f(1,y, 1); - fy = &f(1,y-1,1); - fz = &f(1,y, 0); - for( x=1; x<=nx; x++ ) { - UPDATE_EX(); - f0++; - fy++; - fz++; - } - } - for( y=1; y<=ny+1; y++ ) { - f0 = &f(1,y, nz+1); - fy = &f(1,y-1,nz+1); - fz = &f(1,y, nz); - for( x=1; x<=nx; x++ ) { - UPDATE_EX(); - f0++; - fy++; - fz++; - } - } - for( z=2; z<=nz; z++ ) { - f0 = &f(1,1,z); - fy = &f(1,0,z); - fz = &f(1,1,z-1); - for( x=1; x<=nx; x++ ) { - UPDATE_EX(); - f0++; - fy++; - fz++; - } - } - for( z=2; z<=nz; z++ ) { - f0 = &f(1,ny+1,z); - fy = &f(1,ny, z); - fz = &f(1,ny+1,z-1); - for( x=1; x<=nx; x++ ) { - UPDATE_EX(); - f0++; - fy++; - fz++; - } - } - - // Do exterior ey - for( z=1; z<=nz+1; z++ ) { - for( y=1; y<=ny; y++ ) { - f0 = &f(1,y,z); - fx = &f(0,y,z); - fz = &f(1,y,z-1); - UPDATE_EY(); - } - } - for( z=1; z<=nz+1; z++ ) { - for( y=1; y<=ny; y++ ) { - f0 = &f(nx+1,y,z); - fx = &f(nx, y,z); - fz = &f(nx+1,y,z-1); - UPDATE_EY(); - } - } - for( y=1; y<=ny; y++ ) { - f0 = &f(2,y,1); - fx = &f(1,y,1); - fz = &f(2,y,0); - for( x=2; x<=nx; x++ ) { - UPDATE_EY(); - f0++; - fx++; - fz++; - } - } - for( y=1; y<=ny; y++ ) { - f0 = &f(2,y,nz+1); - fx = &f(1,y,nz+1); - fz = &f(2,y,nz ); - for( x=2; x<=nx; x++ ) { - UPDATE_EY(); - f0++; - fx++; - fz++; - } - } - - // Do exterior ez - for( z=1; z<=nz; z++ ) { - f0 = &f(1,1,z); - fx = &f(0,1,z); - fy = &f(1,0,z); - for( x=1; x<=nx+1; x++ ) { - UPDATE_EZ(); - f0++; - fx++; - fy++; - } - } - for( z=1; z<=nz; z++ ) { - f0 = &f(1,ny+1,z); - fx = &f(0,ny+1,z); - fy = &f(1,ny, z); - for( x=1; x<=nx+1; x++ ) { - UPDATE_EZ(); - f0++; - fx++; - fy++; - } - } - for( z=1; z<=nz; z++ ) { - for( y=2; y<=ny; y++ ) { - f0 = &f(1,y, z); - fx = &f(0,y, z); - fy = &f(1,y-1,z); - UPDATE_EZ(); - } - } - for( z=1; z<=nz; z++ ) { - for( y=2; y<=ny; y++ ) { - f0 = &f(nx+1,y, z); - fx = &f(nx, y, z); - fy = &f(nx+1,y-1,z); - UPDATE_EZ(); - } +compute_curl_b( field_array_t * RESTRICT fa ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); } - local_adjust_tang_e( fa->f, fa->g ); + // Conditionally execute this when more abstractions are available. 
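+  // compute_curl_b_pipeline() is assumed to preserve the structure removed
+  // above: tangential-B ghost exchange, pipelined interior update, host
+  // handling of the exterior, and a final local_adjust_tang_e().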
+ compute_curl_b_pipeline( fa ); } diff --git a/src/field_advance/standard/compute_div_b_err.cc b/src/field_advance/standard/compute_div_b_err.cc index f83e5fa0..ad74c86a 100644 --- a/src/field_advance/standard/compute_div_b_err.cc +++ b/src/field_advance/standard/compute_div_b_err.cc @@ -1,176 +1,19 @@ #define IN_sfa -#include "sfa_private.h" - -#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] - -typedef struct pipeline_args { - field_t * ALIGNED(128) f; - const grid_t * g; -} pipeline_args_t; - -void -compute_div_b_err_pipeline( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - field_t * ALIGNED(128) f = args->f; - const grid_t * g = args->g; - - field_t * ALIGNED(16) f0; - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; - int x, y, z, n_voxel; - - const int nx = g->nx; - const int ny = g->ny; - const int nz = g->nz; - - const float px = (nx>1) ? g->rdx : 0; - const float py = (ny>1) ? g->rdy : 0; - const float pz = (nz>1) ? g->rdz : 0; - - // Process the voxels assigned to this pipeline - - DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - -# define LOAD_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x+1,y, z ); \ - fy = &f(x, y+1,z ); \ - fz = &f(x, y, z+1) - - LOAD_STENCIL(); - - for( ; n_voxel; n_voxel-- ) { - f0->div_b_err = px*( fx->cbx - f0->cbx ) + - py*( fy->cby - f0->cby ) + - pz*( fz->cbz - f0->cbz ); - f0++; fx++; fy++; fz++; - - x++; - if( x>nx ) { - x=1, y++; - if( y>ny ) y=1, z++; - LOAD_STENCIL(); - } - } - -# undef LOAD_STENCIL - -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) - -using namespace v4; - -static void -compute_div_b_err_pipeline_v4( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - field_t * ALIGNED(128) f = args->f; - const grid_t * g = args->g; - - field_t * ALIGNED(16) f0; - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; - int x, y, z, n_voxel; - - const int nx = g->nx; - const int ny = g->ny; - const int nz = g->nz; - const float px = (nx>1) ? g->rdx : 0; - const float py = (ny>1) ? g->rdy : 0; - const float pz = (nz>1) ? 
g->rdz : 0; - - const v4float vpx(px); - const v4float vpy(py); - const v4float vpz(pz); - - v4float f0_cbx, f0_cby, f0_cbz; // Voxel quad magnetic fields - v4float f0_div_b_err; // Voxel quad div b errs - v4float fx_cbx; // Voxel quad +x neighbor x magnetic fields - v4float fy_cby; // Voxel quad +y neighbor y magnetic fields - v4float fz_cbz; // Voxel quad +z neighbor z magnetic fields - - field_t * ALIGNED(16) f00, * ALIGNED(16) f01, * ALIGNED(16) f02, * ALIGNED(16) f03; // Voxel quad - field_t * ALIGNED(16) fx0, * ALIGNED(16) fx1, * ALIGNED(16) fx2, * ALIGNED(16) fx3; // Voxel quad +x neighbors - field_t * ALIGNED(16) fy0, * ALIGNED(16) fy1, * ALIGNED(16) fy2, * ALIGNED(16) fy3; // Voxel quad +x neighbors - field_t * ALIGNED(16) fz0, * ALIGNED(16) fz1, * ALIGNED(16) fz2, * ALIGNED(16) fz3; // Voxel quad +x neighbors - - // Process the voxels assigned to this pipeline - - DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - // Process bulk of voxels 4 at a time - -# define LOAD_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x+1,y, z ); \ - fy = &f(x, y+1,z ); \ - fz = &f(x, y, z+1) - -# define NEXT_STENCIL(n) \ - f0##n = f0++; \ - fx##n = fx++; \ - fy##n = fy++; \ - fz##n = fz++; \ - x++; \ - if( x>nx ) { \ - x=1, y++; \ - if( y>ny ) y=1, z++; \ - LOAD_STENCIL(); \ - } - - LOAD_STENCIL(); - - for( ; n_voxel>3; n_voxel-=4 ) { - NEXT_STENCIL(0); NEXT_STENCIL(1); NEXT_STENCIL(2); NEXT_STENCIL(3); - - load_4x3_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, f0_cbx, f0_cby, f0_cbz ); - - fx_cbx = v4float( fx0->cbx, fx1->cbx, fx2->cbx, fx3->cbx ); - fy_cby = v4float( fy0->cby, fy1->cby, fy2->cby, fy3->cby ); - fz_cbz = v4float( fz0->cbz, fz1->cbz, fz2->cbz, fz3->cbz ); - - f0_div_b_err = fma( vpx,fx_cbx-f0_cbx, fma( vpy,fy_cby-f0_cby, vpz*(fz_cbz-f0_cbz) ) ); - - store_4x1_tr( f0_div_b_err, &f00->div_b_err, &f01->div_b_err, &f02->div_b_err, &f03->div_b_err ); - } - -# undef NEXT_STENCIL -# undef LOAD_STENCIL - -} +#include "sfa_private.h" -#endif +//----------------------------------------------------------------------------// +// Top level function to select and call the proper compute_div_b_err function. +//----------------------------------------------------------------------------// void -compute_div_b_err( field_array_t * RESTRICT fa ) { - pipeline_args_t args[1]; - - if( !fa ) ERROR(( "Bad args" )); - -# if 0 // Original non-pipelined version - for( z=1; z<=nz; z++ ) { - for( y=1; y<=ny; y++ ) { - f0 = &f(1,y,z); - fx = &f(2,y,z); - fy = &f(1,y+1,z); - fz = &f(1,y,z+1); - for( x=1; x<=nx; x++ ) { - f0->div_b_err = px*( fx->cbx - f0->cbx ) + - py*( fy->cby - f0->cby ) + - pz*( fz->cbz - f0->cbz ); - f0++; fx++; fy++; fz++; - } - } +compute_div_b_err( field_array_t * RESTRICT fa ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); } -# endif - args->f = fa->f; - args->g = fa->g; - EXEC_PIPELINES( compute_div_b_err, args, 0 ); - WAIT_PIPELINES(); + // Conditionally execute this when more abstractions are available. 
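+  // compute_div_b_err_pipeline() is expected to compute, per voxel,
+  //   div_b_err = rdx*( cbx(x+1) - cbx )
+  //             + rdy*( cby(y+1) - cby )
+  //             + rdz*( cbz(z+1) - cbz )
+  // (with the term for any collapsed dimension zeroed), exactly as the loop
+  // removed above did.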
+ compute_div_b_err_pipeline( fa ); } diff --git a/src/field_advance/standard/compute_div_e_err.c b/src/field_advance/standard/compute_div_e_err.c index 898351a6..73d8c509 100644 --- a/src/field_advance/standard/compute_div_e_err.c +++ b/src/field_advance/standard/compute_div_e_err.c @@ -1,177 +1,21 @@ // Note: This is virtually identical to compute_rhob -#define IN_sfa -#include "sfa_private.h" - -typedef struct pipeline_args { - /**/ field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; -} pipeline_args_t; - -#define DECLARE_STENCIL() \ - /**/ field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float px = (nx>1) ? g->rdx : 0; \ - const float py = (ny>1) ? g->rdy : 0; \ - const float pz = (nz>1) ? g->rdz : 0; \ - const float cj = 1./g->eps0; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z - -#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] - -#define INIT_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x-1,y, z ); \ - fy = &f(x, y-1,z ); \ - fz = &f(x, y, z-1) - -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if( x>nx ) { \ - /**/ y++; x = 2; \ - if( y>ny ) z++; if( y>ny ) y = 2; \ - INIT_STENCIL(); \ - } - -#define UPDATE_DERR_E() f0->div_e_err = m[f0->nmat].nonconductive * \ - ( px*( m[f0->ematx].epsx*f0->ex - m[fx->ematx].epsx*fx->ex ) + \ - py*( m[f0->ematy].epsy*f0->ey - m[fy->ematy].epsy*fy->ey ) + \ - pz*( m[f0->ematz].epsz*f0->ez - m[fz->ematz].epsz*fz->ez ) - \ - cj*( f0->rhof + f0->rhob ) ) -void -compute_div_e_err_pipeline( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - DECLARE_STENCIL(); - - int n_voxel; - DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - INIT_STENCIL(); - for( ; n_voxel; n_voxel-- ) { - UPDATE_DERR_E(); - NEXT_STENCIL(); - } -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) +#define IN_sfa -#error "Not implemented" +#include "sfa_private.h" -#endif +//----------------------------------------------------------------------------// +// Top level function to select and call the proper compute_div_e_err function. 
+//----------------------------------------------------------------------------// void -compute_div_e_err( field_array_t * RESTRICT fa ) { - if( !fa ) ERROR(( "Bad args" )); - - // Have pipelines compute the interior of local domain (the host - // handles stragglers in the interior) - - // Begin setting normal e ghosts - - begin_remote_ghost_norm_e( fa->f, fa->g ); - local_ghost_norm_e( fa->f, fa->g ); - - // Have pipelines compute interior of local domain - - pipeline_args_t args[1]; - args->f = fa->f; - args->p = (sfa_params_t *)fa->params; - args->g = fa->g; - EXEC_PIPELINES( compute_div_e_err, args, 0 ); - - // While pipelines are busy, have host compute the exterior - // of the local domain - - DECLARE_STENCIL(); - - // Finish setting normal e ghosts - end_remote_ghost_norm_e( fa->f, fa->g ); - - // z faces, x edges, y edges and all corners - for( y=1; y<=ny+1; y++ ) { - f0 = &f(1,y, 1); - fx = &f(0,y, 1); - fy = &f(1,y-1,1); - fz = &f(1,y, 0); - for( x=1; x<=nx+1; x++ ) { - UPDATE_DERR_E(); - f0++; - fx++; - fy++; - fz++; - } +compute_div_e_err( field_array_t * RESTRICT fa ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); } - for( y=1; y<=ny+1; y++ ) { - f0 = &f(1,y, nz+1); - fx = &f(0,y, nz+1); - fy = &f(1,y-1,nz+1); - fz = &f(1,y, nz); - for( x=1; x<=nx+1; x++ ) { - UPDATE_DERR_E(); - f0++; - fx++; - fy++; - fz++; - } - } - - // y faces, z edges - for( z=2; z<=nz; z++ ) { - f0 = &f(1,1,z); - fx = &f(0,1,z); - fy = &f(1,0,z); - fz = &f(1,1,z-1); - for( x=1; x<=nx+1; x++ ) { - UPDATE_DERR_E(); - f0++; - fx++; - fy++; - fz++; - } - } - for( z=2; z<=nz; z++ ) { - f0 = &f(1,ny+1,z); - fx = &f(0,ny+1,z); - fy = &f(1,ny, z); - fz = &f(1,ny+1,z-1); - for( x=1; x<=nx+1; x++ ) { - UPDATE_DERR_E(); - f0++; - fx++; - fy++; - fz++; - } - } - - // x faces - for( z=2; z<=nz; z++ ) { - for( y=2; y<=ny; y++ ) { - f0 = &f(1,y, z); - fx = &f(0,y, z); - fy = &f(1,y-1,z); - fz = &f(1,y, z-1); - UPDATE_DERR_E(); - f0 = &f(nx+1,y, z); - fx = &f(nx, y, z); - fy = &f(nx+1,y-1,z); - fz = &f(nx+1,y, z-1); - UPDATE_DERR_E(); - } - } - - // Finish up setting interior - - WAIT_PIPELINES(); - local_adjust_div_e( fa->f, fa->g ); + // Conditionally execute this when more abstractions are available. + compute_div_e_err_pipeline( fa ); } diff --git a/src/field_advance/standard/compute_rhob.c b/src/field_advance/standard/compute_rhob.c index 3667702f..c1e5df80 100644 --- a/src/field_advance/standard/compute_rhob.c +++ b/src/field_advance/standard/compute_rhob.c @@ -1,176 +1,21 @@ // Note: This is virtually identical to compute_div_e_err -#define IN_sfa -#include "sfa_private.h" - -typedef struct pipeline_args { - /**/ field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; -} pipeline_args_t; - -#define DECLARE_STENCIL() \ - /**/ field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float px = (nx>1) ? g->eps0*g->rdx : 0; \ - const float py = (ny>1) ? g->eps0*g->rdy : 0; \ - const float pz = (nz>1) ? 
g->eps0*g->rdz : 0; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z - -#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] - -#define INIT_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x-1,y, z ); \ - fy = &f(x, y-1,z ); \ - fz = &f(x, y, z-1) - -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if( x>nx ) { \ - /**/ y++; x = 2; \ - if( y>ny ) z++; if( y>ny ) y = 2; \ - INIT_STENCIL(); \ - } - -#define UPDATE_DERR_E() f0->rhob = m[f0->nmat].nonconductive * \ - ( px*( m[f0->ematx].epsx*f0->ex - m[fx->ematx].epsx*fx->ex ) + \ - py*( m[f0->ematy].epsy*f0->ey - m[fy->ematy].epsy*fy->ey ) + \ - pz*( m[f0->ematz].epsz*f0->ez - m[fz->ematz].epsz*fz->ez ) - \ - f0->rhof ) -void -compute_rhob_pipeline( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - DECLARE_STENCIL(); - - int n_voxel; - DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - INIT_STENCIL(); - for( ; n_voxel; n_voxel-- ) { - UPDATE_DERR_E(); - NEXT_STENCIL(); - } -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) +#define IN_sfa -#error "Not implemented" +#include "sfa_private.h" -#endif +//----------------------------------------------------------------------------// +// Top level function to select and call the proper compute_rhob function. +//----------------------------------------------------------------------------// void -compute_rhob( field_array_t * RESTRICT fa ) { - if( !fa ) ERROR(( "Bad args" )); - - // Have pipelines compute the interior of local domain (the host - // handles stragglers in the interior) - - // Begin setting normal e ghosts - - begin_remote_ghost_norm_e( fa->f, fa->g ); - local_ghost_norm_e( fa->f, fa->g ); - - // Have pipelines compute interior of local domain - - pipeline_args_t args[1]; - args->f = fa->f; - args->p = (sfa_params_t *)fa->params; - args->g = fa->g; - EXEC_PIPELINES( compute_rhob, args, 0 ); - - // While pipelines are busy, have host compute the exterior - // of the local domain - - DECLARE_STENCIL(); - - // Finish setting normal e ghosts - end_remote_ghost_norm_e( fa->f, fa->g ); - - // z faces, x edges, y edges and all corners - for( y=1; y<=ny+1; y++ ) { - f0 = &f(1,y, 1); - fx = &f(0,y, 1); - fy = &f(1,y-1,1); - fz = &f(1,y, 0); - for( x=1; x<=nx+1; x++ ) { - UPDATE_DERR_E(); - f0++; - fx++; - fy++; - fz++; - } +compute_rhob( field_array_t * RESTRICT fa ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); } - for( y=1; y<=ny+1; y++ ) { - f0 = &f(1,y, nz+1); - fx = &f(0,y, nz+1); - fy = &f(1,y-1,nz+1); - fz = &f(1,y, nz); - for( x=1; x<=nx+1; x++ ) { - UPDATE_DERR_E(); - f0++; - fx++; - fy++; - fz++; - } - } - - // y faces, z edges - for( z=2; z<=nz; z++ ) { - f0 = &f(1,1,z); - fx = &f(0,1,z); - fy = &f(1,0,z); - fz = &f(1,1,z-1); - for( x=1; x<=nx+1; x++ ) { - UPDATE_DERR_E(); - f0++; - fx++; - fy++; - fz++; - } - } - for( z=2; z<=nz; z++ ) { - f0 = &f(1,ny+1,z); - fx = &f(0,ny+1,z); - fy = &f(1,ny, z); - fz = &f(1,ny+1,z-1); - for( x=1; x<=nx+1; x++ ) { - UPDATE_DERR_E(); - f0++; - fx++; - fy++; - fz++; - } - } - - // x faces - for( z=2; z<=nz; z++ ) { - for( y=2; y<=ny; y++ ) { - f0 = &f(1,y, z); - fx = &f(0,y, z); - fy = &f(1,y-1,z); - fz = &f(1,y, z-1); - UPDATE_DERR_E(); - f0 = &f(nx+1,y, z); - fx = &f(nx, y, z); - fy = &f(nx+1,y-1,z); - fz = &f(nx+1,y, z-1); - UPDATE_DERR_E(); - } - } - - // Finish up setting interior - - WAIT_PIPELINES(); - local_adjust_rhob( fa->f, fa->g ); + // Conditionally execute this when more abstractions are 
available. + compute_rhob_pipeline( fa ); } diff --git a/src/field_advance/standard/compute_rms_div_b_err.c b/src/field_advance/standard/compute_rms_div_b_err.c index 4b569573..d7604c39 100644 --- a/src/field_advance/standard/compute_rms_div_b_err.c +++ b/src/field_advance/standard/compute_rms_div_b_err.c @@ -1,93 +1,24 @@ #define IN_sfa -#include "sfa_private.h" - -#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] - -typedef struct pipeline_args { - const field_t * ALIGNED(128) f; - const grid_t * g; - double err[MAX_PIPELINE+1]; -} pipeline_args_t; - -static void -compute_rms_div_b_err_pipeline( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - const field_t * ALIGNED(128) f = args->f; - const grid_t * g = args->g; - - const field_t * ALIGNED(16) f0; - int x, y, z, n_voxel; - - const int nx = g->nx; - const int ny = g->ny; - const int nz = g->nz; - - double err; - - // Process voxels assigned to this pipeline - DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - f0 = &f(x,y,z); +#include "sfa_private.h" - err = 0; - for( ; n_voxel; n_voxel-- ) { - err += f0->div_b_err*f0->div_b_err; - f0++; - - x++; - if( x>nx ) { - x=1, y++; - if( y>ny ) y=1, z++; - f0 = &f(x,y,z); - } - } - - args->err[pipeline_rank] = err; -} +//----------------------------------------------------------------------------// +// Top level function to select and call the proper compute_rms_div_b_err +// function. +//----------------------------------------------------------------------------// double -compute_rms_div_b_err( const field_array_t * fa ) { - pipeline_args_t args[1]; - int p; - - double err = 0, local[2], global[2]; - - if( !fa ) ERROR(( "Bad args")); +compute_rms_div_b_err( const field_array_t * fa ) +{ + double rms_div_b_err; -# if 0 // Original non-pipelined version - field_t * ALIGNED(16) f0; - int z, y, x; - int nx = g->nx; - int ny = g->ny; - int nz = g->nz; - - err = 0; - for( z=1; z<=nz; z++ ) { - for( y=1; y<=ny; y++ ) { - f0 = &f(1,y,z); - for( x=1; x<=nx; x++ ) { - err += f0->div_b_err*f0->div_b_err; - f0++; - } - } + if ( !fa ) + { + ERROR( ( "Bad args") ); } -# endif - - args->f = fa->f; - args->g = fa->g; - - EXEC_PIPELINES( compute_rms_div_b_err, args, 0 ); - WAIT_PIPELINES(); - err = 0; - for( p=0; p<=N_PIPELINE; p++ ) err += args->err[p]; + // Conditionally execute this when more abstractions are available. 
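+  // compute_rms_div_b_err_pipeline() is assumed to reproduce the reduction
+  // removed above: accumulate div_b_err^2 over the local voxels, mp_allsum
+  // the volume-weighted partial sums across ranks, and return
+  // eps0*sqrt( global_sum / global_volume ).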
+ rms_div_b_err = compute_rms_div_b_err_pipeline( fa ); - local[0] = err*fa->g->dV; - local[1] = (fa->g->nx*fa->g->ny*fa->g->nz)*fa->g->dV; - mp_allsum_d( local, global, 2 ); - return fa->g->eps0*sqrt(global[0]/global[1]); + return rms_div_b_err; } diff --git a/src/field_advance/standard/compute_rms_div_e_err.c b/src/field_advance/standard/compute_rms_div_e_err.c index 6899411e..ec1706dc 100644 --- a/src/field_advance/standard/compute_rms_div_e_err.c +++ b/src/field_advance/standard/compute_rms_div_e_err.c @@ -1,157 +1,24 @@ #define IN_sfa -#include "sfa_private.h" - -#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] - -typedef struct pipeline_args { - const field_t * ALIGNED(128) f; - const grid_t * g; - double err[MAX_PIPELINE+1]; -} pipeline_args_t; - -static void -compute_rms_div_e_err_pipeline( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - const field_t * ALIGNED(128) f = args->f; - const grid_t * g = args->g; - - const field_t * ALIGNED(16) f0; - int x, y, z, n_voxel; - - const int nx = g->nx; - const int ny = g->ny; - const int nz = g->nz; - - double err; - - // Process voxels assigned to this pipeline - - DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - f0 = &f(x,y,z); - err = 0; - for( ; n_voxel; n_voxel-- ) { - err += f0->div_e_err*f0->div_e_err; - f0++; +#include "sfa_private.h" - x++; - if( x>nx ) { - x=2, y++; - if( y>ny ) y=2, z++; - f0 = &f(x,y,z); - } - } - - args->err[pipeline_rank] = err; -} +//----------------------------------------------------------------------------// +// Top level function to select and call the proper compute_rms_div_e_err +// function. +//----------------------------------------------------------------------------// double -compute_rms_div_e_err( const field_array_t * RESTRICT fa ) { - pipeline_args_t args[1]; - const field_t * f, * f0; - const grid_t * RESTRICT g; - double err = 0, local[2], global[2]; - int x, y, z, nx, ny, nz, p; - - if( !fa ) ERROR(( "Bad args" )); - f = fa->f; - g = fa->g; +compute_rms_div_e_err( const field_array_t * RESTRICT fa ) +{ + double rms_div_e_err; -#if 0 // Original non-pipelined version - for( z=2; z<=nz; z++ ) { - for( y=2; y<=ny; y++ ) { - for( x=2; x<=nx; x++ ) { - err += f0->div_e_err*f0->div_e_err; - f0++; - } - } + if ( !fa ) + { + ERROR( ( "Bad args" ) ); } -# endif - - // Have the pipelines accumulate the interior of the local domain - // (the host handled stragglers in the interior). 
- - args->f = f; - args->g = g; - EXEC_PIPELINES( compute_rms_div_e_err, args, 0 ); - - // Have the host accumulate the exterior of the local domain - - nx = g->nx; - ny = g->ny; - nz = g->nz; - - // Do exterior faces - - for( y=2; y<=ny; y++ ) { - for( z=2; z<=nz; z++ ) { - f0 = &f( 1, y, z); err += 0.5*(double)f0->div_e_err*(double)f0->div_e_err; - f0 = &f(nx+1, y, z); err += 0.5*(double)f0->div_e_err*(double)f0->div_e_err; - } - } - - for( z=2; z<=nz; z++ ) { - for( x=2; x<=nx; x++ ) { - f0 = &f( x, 1, z); err += 0.5*(double)f0->div_e_err*(double)f0->div_e_err; - f0 = &f( x,ny+1, z); err += 0.5*(double)f0->div_e_err*(double)f0->div_e_err; - } - } - - for( x=2; x<=nx; x++ ) { - for( y=2; y<=ny; y++ ) { - f0 = &f( x, y, 1); err += 0.5*(double)f0->div_e_err*(double)f0->div_e_err; - f0 = &f( x, y,nz+1); err += 0.5*(double)f0->div_e_err*(double)f0->div_e_err; - } - } - - // Do exterior edges - - for( x=2; x<=nx; x++ ) { - f0 = &f( x, 1, 1); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; - f0 = &f( x,ny+1, 1); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; - f0 = &f( x, 1,nz+1); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; - f0 = &f( x,ny+1,nz+1); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; - } - - for( y=2; y<=ny; y++ ) { - f0 = &f( 1, y, 1); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; - f0 = &f( 1, y,nz+1); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; - f0 = &f(nx+1, y, 1); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; - f0 = &f(nx+1, y,nz+1); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; - } - - for( z=2; z<=nz; z++ ) { - f0 = &f( 1, 1, z); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; - f0 = &f(nx+1, 1, z); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; - f0 = &f( 1,ny+1, z); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; - f0 = &f(nx+1,ny+1, z); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; - } - - // Do exterior corners - - f0 = &f( 1, 1, 1); err += 0.125*(double)f0->div_e_err*(double)f0->div_e_err; - f0 = &f(nx+1, 1, 1); err += 0.125*(double)f0->div_e_err*(double)f0->div_e_err; - f0 = &f( 1,ny+1, 1); err += 0.125*(double)f0->div_e_err*(double)f0->div_e_err; - f0 = &f(nx+1,ny+1, 1); err += 0.125*(double)f0->div_e_err*(double)f0->div_e_err; - f0 = &f( 1, 1,nz+1); err += 0.125*(double)f0->div_e_err*(double)f0->div_e_err; - f0 = &f(nx+1, 1,nz+1); err += 0.125*(double)f0->div_e_err*(double)f0->div_e_err; - f0 = &f( 1,ny+1,nz+1); err += 0.125*(double)f0->div_e_err*(double)f0->div_e_err; - f0 = &f(nx+1,ny+1,nz+1); err += 0.125*(double)f0->div_e_err*(double)f0->div_e_err; - - // Reduce the results from the host and pipelines - - WAIT_PIPELINES(); - - for( p=0; p<=N_PIPELINE; p++ ) err += args->err[p]; - // Reduce the results from all nodes + // Conditionally execute this when more abstractions are available. 
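+  // As with the div_b version, the reduction now lives in the pipeline
+  // function; the exterior face, edge and corner contributions (weighted
+  // 1/2, 1/4 and 1/8 in the host code removed above) are assumed to be
+  // accumulated there as well.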
+ rms_div_e_err = compute_rms_div_e_err_pipeline( fa ); - local[0] = err*g->dV; - local[1] = (g->nx*g->ny*g->nz)*g->dV; - mp_allsum_d( local, global, 2 ); - return g->eps0*sqrt(global[0]/global[1]); + return rms_div_e_err; } diff --git a/src/field_advance/standard/energy_f.c b/src/field_advance/standard/energy_f.c index 370a18d2..61cd7766 100644 --- a/src/field_advance/standard/energy_f.c +++ b/src/field_advance/standard/energy_f.c @@ -1,130 +1,23 @@ // FIXME: USE THE DISCRETIZED VARIATIONAL PRINCIPLE DEFINITION OF ENERGY #define IN_sfa -#include "sfa_private.h" - -typedef struct pipeline_args { - const field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; - double en[MAX_PIPELINE+1][6]; -} pipeline_args_t; - -#define DECLARE_STENCIL() \ - const field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const field_t * ALIGNED(16) f0; \ - const field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - const field_t * ALIGNED(16) fyz, * ALIGNED(16) fzx, * ALIGNED(16) fxy; \ - double en_ex = 0, en_ey = 0, en_ez = 0, en_bx = 0, en_by = 0, en_bz = 0; \ - int x, y, z - -#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] - -#define INIT_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x+1,y, z ); \ - fy = &f(x, y+1,z ); \ - fz = &f(x, y, z+1); \ - fyz = &f(x, y+1,z+1); \ - fzx = &f(x+1,y, z+1); \ - fxy = &f(x+1,y+1,z ) -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; fyz++; fzx++; fxy++; x++; \ - if( x>nx ) { \ - /**/ y++; x = 1; \ - if( y>ny ) z++; if( y>ny ) y = 1; \ - INIT_STENCIL(); \ - } - -#define REDUCE_EN() \ - en_ex += 0.25*( m[ f0->ematx].epsx* f0->ex * f0->ex + \ - m[ fy->ematx].epsx* fy->ex * fy->ex + \ - m[ fz->ematx].epsx* fz->ex * fz->ex + \ - m[fyz->ematx].epsx*fyz->ex *fyz->ex ); \ - en_ey += 0.25*( m[ f0->ematy].epsy* f0->ey * f0->ey + \ - m[ fz->ematy].epsy* fz->ey * fz->ey + \ - m[ fx->ematy].epsy* fx->ey * fx->ey + \ - m[fzx->ematy].epsy*fzx->ey *fzx->ey ); \ - en_ez += 0.25*( m[ f0->ematz].epsz* f0->ez * f0->ez + \ - m[ fx->ematz].epsz* fx->ez * fx->ez + \ - m[ fy->ematz].epsz* fy->ez * fy->ez + \ - m[fxy->ematz].epsz*fxy->ez *fxy->ez ); \ - en_bx += 0.5 *( m[ f0->fmatx].rmux* f0->cbx* f0->cbx + \ - m[ fx->fmatx].rmux* fx->cbx* fx->cbx ); \ - en_by += 0.5 *( m[ f0->fmaty].rmuy* f0->cby* f0->cby + \ - m[ fy->fmaty].rmuy* fy->cby* fy->cby ); \ - en_bz += 0.5 *( m[ f0->fmatz].rmuz* f0->cbz* f0->cbz + \ - m[ fz->fmatz].rmuz* fz->cbz* fz->cbz ) - -void -energy_f_pipeline( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - DECLARE_STENCIL(); - - int n_voxel; - DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - INIT_STENCIL(); - for( ; n_voxel; n_voxel-- ) { - REDUCE_EN(); - NEXT_STENCIL(); - } - - args->en[pipeline_rank][0] = en_ex; - args->en[pipeline_rank][1] = en_ey; - args->en[pipeline_rank][2] = en_ez; - args->en[pipeline_rank][3] = en_bx; - args->en[pipeline_rank][4] = en_by; - args->en[pipeline_rank][5] = en_bz; -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) - -#error "Not implemented" +#include "sfa_private.h" -#endif +//----------------------------------------------------------------------------// +// Top level function to select and call the proper energy_f function. 
+//----------------------------------------------------------------------------// void -energy_f( double * global, - const field_array_t * RESTRICT fa ) { - if( !global || !fa ) ERROR(( "Bad args" )); - - // Have each pipeline and the host handle a portion of the - // local voxels - - pipeline_args_t args[1]; - args->f = fa->f; - args->p = (sfa_params_t *)fa->params; - args->g = fa->g; - EXEC_PIPELINES( energy_f, args, 0 ); - WAIT_PIPELINES(); - - // Reduce results from each pipelines - - int p; - for( p=1; p<=N_PIPELINE; p++ ) { - args->en[0][0] += args->en[p][0]; args->en[0][1] += args->en[p][1]; - args->en[0][2] += args->en[p][2]; args->en[0][3] += args->en[p][3]; - args->en[0][4] += args->en[p][4]; args->en[0][5] += args->en[p][5]; +energy_f( double * global, + const field_array_t * RESTRICT fa ) +{ + if ( !global || !fa ) + { + ERROR( ( "Bad args" ) ); } - - // Convert to physical units and reduce results between nodes - - double v0 = 0.5*fa->g->eps0*fa->g->dV; - args->en[0][0] *= v0; args->en[0][1] *= v0; - args->en[0][2] *= v0; args->en[0][3] *= v0; - args->en[0][4] *= v0; args->en[0][5] *= v0; - - // Reduce results between nodes - mp_allsum_d( args->en[0], global, 6 ); + // Conditionally execute this when more abstractions are available. + energy_f_pipeline( global, fa ); } diff --git a/src/field_advance/standard/local.c b/src/field_advance/standard/local.c index 66b4bd71..01b6c8fe 100644 --- a/src/field_advance/standard/local.c +++ b/src/field_advance/standard/local.c @@ -113,9 +113,9 @@ local_ghost_tang_b( field_t * ALIGNED(128) f, } \ } while(0) - APPLY_LOCAL_TANG_B(-1, 0, 0,x,y,z); - APPLY_LOCAL_TANG_B( 0,-1, 0,y,z,x); - APPLY_LOCAL_TANG_B( 0, 0,-1,z,x,y); + APPLY_LOCAL_TANG_B((-1), 0, 0,x,y,z); + APPLY_LOCAL_TANG_B( 0,(-1), 0,y,z,x); + APPLY_LOCAL_TANG_B( 0, 0,(-1),z,x,y); APPLY_LOCAL_TANG_B( 1, 0, 0,x,y,z); APPLY_LOCAL_TANG_B( 0, 1, 0,y,z,x); APPLY_LOCAL_TANG_B( 0, 0, 1,z,x,y); @@ -170,9 +170,9 @@ local_ghost_norm_e( field_t * ALIGNED(128) f, } \ } while(0) - APPLY_LOCAL_NORM_E(-1, 0, 0,x,y,z); - APPLY_LOCAL_NORM_E( 0,-1, 0,y,z,x); - APPLY_LOCAL_NORM_E( 0, 0,-1,z,x,y); + APPLY_LOCAL_NORM_E((-1), 0, 0,x,y,z); + APPLY_LOCAL_NORM_E( 0,(-1), 0,y,z,x); + APPLY_LOCAL_NORM_E( 0, 0,(-1),z,x,y); APPLY_LOCAL_NORM_E( 1, 0, 0,x,y,z); APPLY_LOCAL_NORM_E( 0, 1, 0,y,z,x); APPLY_LOCAL_NORM_E( 0, 0, 1,z,x,y); @@ -206,9 +206,9 @@ local_ghost_div_b( field_t * ALIGNED(128) f, } \ } while(0) - APPLY_LOCAL_DIV_B(-1, 0, 0,x,y,z); - APPLY_LOCAL_DIV_B( 0,-1, 0,y,z,x); - APPLY_LOCAL_DIV_B( 0, 0,-1,z,x,y); + APPLY_LOCAL_DIV_B((-1), 0, 0,x,y,z); + APPLY_LOCAL_DIV_B( 0,(-1), 0,y,z,x); + APPLY_LOCAL_DIV_B( 0, 0,(-1),z,x,y); APPLY_LOCAL_DIV_B( 1, 0, 0,x,y,z); APPLY_LOCAL_DIV_B( 0, 1, 0,y,z,x); APPLY_LOCAL_DIV_B( 0, 0, 1,z,x,y); @@ -255,9 +255,9 @@ local_adjust_tang_e( field_t * ALIGNED(128) f, } \ } while(0) - ADJUST_TANG_E(-1, 0, 0,x,y,z); - ADJUST_TANG_E( 0,-1, 0,y,z,x); - ADJUST_TANG_E( 0, 0,-1,z,x,y); + ADJUST_TANG_E((-1), 0, 0,x,y,z); + ADJUST_TANG_E( 0,(-1), 0,y,z,x); + ADJUST_TANG_E( 0, 0,(-1),z,x,y); ADJUST_TANG_E( 1, 0, 0,x,y,z); ADJUST_TANG_E( 0, 1, 0,y,z,x); ADJUST_TANG_E( 0, 0, 1,z,x,y); @@ -287,9 +287,9 @@ local_adjust_norm_b( field_t * ALIGNED(128) f, } \ } while(0) - ADJUST_NORM_B(-1, 0, 0,x,y,z); - ADJUST_NORM_B( 0,-1, 0,y,z,x); - ADJUST_NORM_B( 0, 0,-1,z,x,y); + ADJUST_NORM_B((-1), 0, 0,x,y,z); + ADJUST_NORM_B( 0,(-1), 0,y,z,x); + ADJUST_NORM_B( 0, 0,(-1),z,x,y); ADJUST_NORM_B( 1, 0, 0,x,y,z); ADJUST_NORM_B( 0, 1, 0,y,z,x); ADJUST_NORM_B( 0, 0, 1,z,x,y); @@ -319,9 +319,9 @@ 
local_adjust_div_e( field_t * ALIGNED(128) f, } \ } while(0) - ADJUST_DIV_E_ERR(-1, 0, 0,x,y,z); - ADJUST_DIV_E_ERR( 0,-1, 0,y,z,x); - ADJUST_DIV_E_ERR( 0, 0,-1,z,x,y); + ADJUST_DIV_E_ERR((-1), 0, 0,x,y,z); + ADJUST_DIV_E_ERR( 0,(-1), 0,y,z,x); + ADJUST_DIV_E_ERR( 0, 0,(-1),z,x,y); ADJUST_DIV_E_ERR( 1, 0, 0,x,y,z); ADJUST_DIV_E_ERR( 0, 1, 0,y,z,x); ADJUST_DIV_E_ERR( 0, 0, 1,z,x,y); @@ -359,9 +359,9 @@ local_adjust_jf( field_t * ALIGNED(128) f, } \ } while(0) - ADJUST_JF(-1, 0, 0,x,y,z); - ADJUST_JF( 0,-1, 0,y,z,x); - ADJUST_JF( 0, 0,-1,z,x,y); + ADJUST_JF((-1), 0, 0,x,y,z); + ADJUST_JF( 0,(-1), 0,y,z,x); + ADJUST_JF( 0, 0,(-1),z,x,y); ADJUST_JF( 1, 0, 0,x,y,z); ADJUST_JF( 0, 1, 0,y,z,x); ADJUST_JF( 0, 0, 1,z,x,y); @@ -398,9 +398,9 @@ local_adjust_rhof( field_t * ALIGNED(128) f, } \ } while(0) - ADJUST_RHOF(-1, 0, 0,x,y,z); - ADJUST_RHOF( 0,-1, 0,y,z,x); - ADJUST_RHOF( 0, 0,-1,z,x,y); + ADJUST_RHOF((-1), 0, 0,x,y,z); + ADJUST_RHOF( 0,(-1), 0,y,z,x); + ADJUST_RHOF( 0, 0,(-1),z,x,y); ADJUST_RHOF( 1, 0, 0,x,y,z); ADJUST_RHOF( 0, 1, 0,y,z,x); ADJUST_RHOF( 0, 0, 1,z,x,y); @@ -435,9 +435,9 @@ local_adjust_rhob( field_t * ALIGNED(128) f, } \ } while(0) - ADJUST_RHOB(-1, 0, 0,x,y,z); - ADJUST_RHOB( 0,-1, 0,y,z,x); - ADJUST_RHOB( 0, 0,-1,z,x,y); + ADJUST_RHOB((-1), 0, 0,x,y,z); + ADJUST_RHOB( 0,(-1), 0,y,z,x); + ADJUST_RHOB( 0, 0,(-1),z,x,y); ADJUST_RHOB( 1, 0, 0,x,y,z); ADJUST_RHOB( 0, 1, 0,y,z,x); ADJUST_RHOB( 0, 0, 1,z,x,y); diff --git a/src/field_advance/standard/pipeline/advance_b_pipeline.cc b/src/field_advance/standard/pipeline/advance_b_pipeline.cc new file mode 100644 index 00000000..ebab8d1e --- /dev/null +++ b/src/field_advance/standard/pipeline/advance_b_pipeline.cc @@ -0,0 +1,125 @@ +#define IN_sfa +#define IN_advance_b_pipeline + +#define HAS_V4_PIPELINE +#define HAS_V8_PIPELINE +#define HAS_V16_PIPELINE + +#include "advance_b_pipeline.h" + +#include "../sfa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +//----------------------------------------------------------------------------// +// Reference implementation for an advance_b pipeline function which does not +// make use of explicit calls to vector intrinsic functions. +//----------------------------------------------------------------------------// + +void +advance_b_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + INIT_STENCIL(); + + for( ; n_voxel; n_voxel-- ) + { + UPDATE_CBX(); + UPDATE_CBY(); + UPDATE_CBZ(); + + NEXT_STENCIL(); + } + +# undef LOAD_STENCIL +} + +//----------------------------------------------------------------------------// +// Top level function to select and call the proper advance_b pipeline +// function. +//----------------------------------------------------------------------------// + +void +advance_b_pipeline( field_array_t * RESTRICT fa, + float _frac ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); + } + + // Do the bulk of the magnetic fields in the pipelines. The host + // handles stragglers. 
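+  // EXEC_PIPELINES() below is expected to dispatch the v16, v8, v4 or scalar
+  // variant of advance_b_pipeline according to which V*_ACCELERATION option
+  // the build enables (see the HAS_V*_PIPELINE defines at the top of this
+  // file).  The strip loops that follow update the x = nx+1, y = ny+1 and
+  // z = nz+1 surface fields that the pipelined voxel distribution does not
+  // cover.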
+ + pipeline_args_t args[1]; + + args->f = fa->f; + args->g = fa->g; + args->frac = _frac; + + EXEC_PIPELINES( advance_b, args, 0 ); + + // While the pipelines are busy, do surface fields + + DECLARE_STENCIL(); + + // Do left over bx + for( z = 1; z <= nz; z++ ) + { + for( y = 1; y <= ny; y++ ) + { + f0 = &f( nx+1, y, z ); + fy = &f( nx+1, y+1, z ); + fz = &f( nx+1, y, z+1 ); + + UPDATE_CBX(); + } + } + + // Do left over by + for( z = 1; z <= nz; z++ ) + { + f0 = &f( 1, ny+1, z ); + fx = &f( 2, ny+1, z ); + fz = &f( 1, ny+1, z+1 ); + + for( x = 1; x <= nx; x++ ) + { + UPDATE_CBY(); + + f0++; + fx++; + fz++; + } + } + + // Do left over bz + for( y = 1; y <= ny; y++ ) + { + f0 = &f( 1, y, nz+1 ); + fx = &f( 2, y, nz+1 ); + fy = &f( 1, y+1, nz+1 ); + + for( x = 1; x <= nx; x++ ) + { + UPDATE_CBZ(); + + f0++; + fx++; + fy++; + } + } + + WAIT_PIPELINES(); + + local_adjust_norm_b( f, g ); +} diff --git a/src/field_advance/standard/pipeline/advance_b_pipeline.h b/src/field_advance/standard/pipeline/advance_b_pipeline.h new file mode 100644 index 00000000..132d8419 --- /dev/null +++ b/src/field_advance/standard/pipeline/advance_b_pipeline.h @@ -0,0 +1,81 @@ +#ifndef _advance_b_pipeline_h_ +#define _advance_b_pipeline_h_ + +#ifndef IN_advance_b_pipeline +#error "Only include advance_b_pipeline.h in advance_b_pipeline source files." +#endif + +#include "../../field_advance.h" + +typedef struct pipeline_args +{ + field_t * ALIGNED(128) f; + const grid_t * g; + float frac; +} pipeline_args_t; + +#define DECLARE_STENCIL() \ + field_t * ALIGNED(128) f = args->f; \ + const grid_t * g = args->g; \ + \ + const int nx = g->nx; \ + const int ny = g->ny; \ + const int nz = g->nz; \ + \ + const float frac = args->frac; \ + const float px = (nx>1) ? frac*g->cvac*g->dt*g->rdx : 0; \ + const float py = (ny>1) ? frac*g->cvac*g->dt*g->rdy : 0; \ + const float pz = (nz>1) ? frac*g->cvac*g->dt*g->rdz : 0; \ + \ + field_t * ALIGNED(16) f0; \ + field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ + int x, y, z + +#define f(x,y,z) f[ VOXEL( x, y, z, nx, ny, nz ) ] + +#define INIT_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x+1, y, z ); \ + fy = &f( x, y+1, z ); \ + fz = &f( x, y, z+1 ) + +#define NEXT_STENCIL() \ + f0++; fx++; fy++; fz++; x++; \ + if ( x > nx ) \ + { \ + y++; x = 1; \ + if ( y > ny ) z++; if ( y > ny ) y = 1; \ + INIT_STENCIL(); \ + } + +// WTF! Under -ffast-math, gcc-4.1.1 thinks it is okay to treat the +// below as +// f0->cbx = ( f0->cbx + py*( blah ) ) - pz*( blah ) +// even with explicit parenthesis are in there! Oh my ... 
+// -fno-unsafe-math-optimizations must be used + +#define UPDATE_CBX() f0->cbx -= ( py*( fy->ez-f0->ez ) - pz*( fz->ey-f0->ey ) ) +#define UPDATE_CBY() f0->cby -= ( pz*( fz->ex-f0->ex ) - px*( fx->ez-f0->ez ) ) +#define UPDATE_CBZ() f0->cbz -= ( px*( fx->ey-f0->ey ) - py*( fy->ex-f0->ex ) ) + +void +advance_b_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +advance_b_pipeline_v4( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +advance_b_pipeline_v8( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +advance_b_pipeline_v16( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +#endif // _advance_b_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/advance_b_pipeline_v16.cc b/src/field_advance/standard/pipeline/advance_b_pipeline_v16.cc new file mode 100644 index 00000000..c0198dfa --- /dev/null +++ b/src/field_advance/standard/pipeline/advance_b_pipeline_v16.cc @@ -0,0 +1,132 @@ +#define IN_sfa +#define IN_advance_b_pipeline + +#include "advance_b_pipeline.h" + +#include "../sfa_private.h" + +#if defined(V16_ACCELERATION) + +using namespace v16; + +void +advance_b_pipeline_v16( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + const v16float vpx( px ); + const v16float vpy( py ); + const v16float vpz( pz ); + + v16float f0_ex, f0_ey, f0_ez; // Voxel block electric fields + v16float f0_cbx, f0_cby, f0_cbz; // Voxel block magnetic fields + v16float fx_ey, fx_ez; // Voxel block +x neighbor fields + v16float fy_ez, fy_ex; // Voxel block +y neighbor fields + v16float fz_ex, fz_ey; // Voxel block +z neighbor fields + v16float dummy; + + field_t * ALIGNED(16) f000, * ALIGNED(16) f001, * ALIGNED(16) f002, * ALIGNED(16) f003; // Voxel block + field_t * ALIGNED(16) f004, * ALIGNED(16) f005, * ALIGNED(16) f006, * ALIGNED(16) f007; // Voxel block + field_t * ALIGNED(16) f008, * ALIGNED(16) f009, * ALIGNED(16) f010, * ALIGNED(16) f011; // Voxel block + field_t * ALIGNED(16) f012, * ALIGNED(16) f013, * ALIGNED(16) f014, * ALIGNED(16) f015; // Voxel block + + field_t * ALIGNED(16) fx00, * ALIGNED(16) fx01, * ALIGNED(16) fx02, * ALIGNED(16) fx03; // Voxel block +x neighbors + field_t * ALIGNED(16) fx04, * ALIGNED(16) fx05, * ALIGNED(16) fx06, * ALIGNED(16) fx07; // Voxel block +x neighbors + field_t * ALIGNED(16) fx08, * ALIGNED(16) fx09, * ALIGNED(16) fx10, * ALIGNED(16) fx11; // Voxel block +x neighbors + field_t * ALIGNED(16) fx12, * ALIGNED(16) fx13, * ALIGNED(16) fx14, * ALIGNED(16) fx15; // Voxel block +x neighbors + + field_t * ALIGNED(16) fy00, * ALIGNED(16) fy01, * ALIGNED(16) fy02, * ALIGNED(16) fy03; // Voxel block +y neighbors + field_t * ALIGNED(16) fy04, * ALIGNED(16) fy05, * ALIGNED(16) fy06, * ALIGNED(16) fy07; // Voxel block +y neighbors + field_t * ALIGNED(16) fy08, * ALIGNED(16) fy09, * ALIGNED(16) fy10, * ALIGNED(16) fy11; // Voxel block +y neighbors + field_t * ALIGNED(16) fy12, * ALIGNED(16) fy13, * ALIGNED(16) fy14, * ALIGNED(16) fy15; // Voxel block +y neighbors + + field_t * ALIGNED(16) fz00, * ALIGNED(16) fz01, * ALIGNED(16) fz02, * ALIGNED(16) fz03; // Voxel block +z neighbors + field_t * ALIGNED(16) fz04, * ALIGNED(16) fz05, * ALIGNED(16) fz06, * ALIGNED(16) fz07; // Voxel block +z neighbors + field_t * ALIGNED(16) fz08, * ALIGNED(16) fz09, * ALIGNED(16) fz10, * ALIGNED(16) fz11; // Voxel block +z neighbors + 
field_t * ALIGNED(16) fz12, * ALIGNED(16) fz13, * ALIGNED(16) fz14, * ALIGNED(16) fz15; // Voxel block +z neighbors + + // Process the bulk of the voxels 16 at a time + + INIT_STENCIL(); + + for( ; n_voxel > 15; n_voxel -= 16 ) + { + f000 = f0; fx00 = fx; fy00 = fy; fz00 = fz; NEXT_STENCIL(); + f001 = f0; fx01 = fx; fy01 = fy; fz01 = fz; NEXT_STENCIL(); + f002 = f0; fx02 = fx; fy02 = fy; fz02 = fz; NEXT_STENCIL(); + f003 = f0; fx03 = fx; fy03 = fy; fz03 = fz; NEXT_STENCIL(); + f004 = f0; fx04 = fx; fy04 = fy; fz04 = fz; NEXT_STENCIL(); + f005 = f0; fx05 = fx; fy05 = fy; fz05 = fz; NEXT_STENCIL(); + f006 = f0; fx06 = fx; fy06 = fy; fz06 = fz; NEXT_STENCIL(); + f007 = f0; fx07 = fx; fy07 = fy; fz07 = fz; NEXT_STENCIL(); + f008 = f0; fx08 = fx; fy08 = fy; fz08 = fz; NEXT_STENCIL(); + f009 = f0; fx09 = fx; fy09 = fy; fz09 = fz; NEXT_STENCIL(); + f010 = f0; fx10 = fx; fy10 = fy; fz10 = fz; NEXT_STENCIL(); + f011 = f0; fx11 = fx; fy11 = fy; fz11 = fz; NEXT_STENCIL(); + f012 = f0; fx12 = fx; fy12 = fy; fz12 = fz; NEXT_STENCIL(); + f013 = f0; fx13 = fx; fy13 = fy; fz13 = fz; NEXT_STENCIL(); + f014 = f0; fx14 = fx; fy14 = fy; fz14 = fz; NEXT_STENCIL(); + f015 = f0; fx15 = fx; fy15 = fy; fz15 = fz; NEXT_STENCIL(); + + load_16x3_tr( &f000->ex, &f001->ex, &f002->ex, &f003->ex, + &f004->ex, &f005->ex, &f006->ex, &f007->ex, + &f008->ex, &f009->ex, &f010->ex, &f011->ex, + &f012->ex, &f013->ex, &f014->ex, &f015->ex, + f0_ex, f0_ey, f0_ez ); + + load_16x3_tr( &f000->cbx, &f001->cbx, &f002->cbx, &f003->cbx, + &f004->cbx, &f005->cbx, &f006->cbx, &f007->cbx, + &f008->cbx, &f009->cbx, &f010->cbx, &f011->cbx, + &f012->cbx, &f013->cbx, &f014->cbx, &f015->cbx, + f0_cbx, f0_cby, f0_cbz ); + + load_16x3_tr( &fx00->ex, &fx01->ex, &fx02->ex, &fx03->ex, + &fx04->ex, &fx05->ex, &fx06->ex, &fx07->ex, + &fx08->ex, &fx09->ex, &fx10->ex, &fx11->ex, + &fx12->ex, &fx13->ex, &fx14->ex, &fx15->ex, + dummy, fx_ey, fx_ez ); + + load_16x3_tr( &fy00->ex, &fy01->ex, &fy02->ex, &fy03->ex, + &fy04->ex, &fy05->ex, &fy06->ex, &fy07->ex, + &fy08->ex, &fy09->ex, &fy10->ex, &fy11->ex, + &fy12->ex, &fy13->ex, &fy14->ex, &fy15->ex, + fy_ex, dummy, fy_ez ); + + load_16x2_tr( &fz00->ex, &fz01->ex, &fz02->ex, &fz03->ex, + &fz04->ex, &fz05->ex, &fz06->ex, &fz07->ex, + &fz08->ex, &fz09->ex, &fz10->ex, &fz11->ex, + &fz12->ex, &fz13->ex, &fz14->ex, &fz15->ex, + fz_ex, fz_ey ); + + f0_cbx += fnms( vpy, ( fy_ez - f0_ez ), vpz*( fz_ey - f0_ey ) ); + f0_cby += fnms( vpz, ( fz_ex - f0_ex ), vpx*( fx_ez - f0_ez ) ); + f0_cbz += fnms( vpx, ( fx_ey - f0_ey ), vpy*( fy_ex - f0_ex ) ); + + store_16x3_tr( f0_cbx, f0_cby, f0_cbz, + &f000->cbx, &f001->cbx, &f002->cbx, &f003->cbx, + &f004->cbx, &f005->cbx, &f006->cbx, &f007->cbx, + &f008->cbx, &f009->cbx, &f010->cbx, &f011->cbx, + &f012->cbx, &f013->cbx, &f014->cbx, &f015->cbx ); + } +} + +#else + +void +advance_b_pipeline_v16( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v16 implementation. + ERROR( ( "No advance_b_pipeline_v16 implementation." 
) ); +} + +#endif diff --git a/src/field_advance/standard/pipeline/advance_b_pipeline_v4.cc b/src/field_advance/standard/pipeline/advance_b_pipeline_v4.cc new file mode 100644 index 00000000..5bef260a --- /dev/null +++ b/src/field_advance/standard/pipeline/advance_b_pipeline_v4.cc @@ -0,0 +1,90 @@ +#define IN_sfa +#define IN_advance_b_pipeline + +#include "advance_b_pipeline.h" + +#include "../sfa_private.h" + +#if defined(V4_ACCELERATION) + +using namespace v4; + +void +advance_b_pipeline_v4( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + const v4float vpx( px ); + const v4float vpy( py ); + const v4float vpz( pz ); + + v4float f0_ex, f0_ey, f0_ez; // Voxel quad electric fields + v4float f0_cbx, f0_cby, f0_cbz; // Voxel quad magnetic fields + v4float fx_ey, fx_ez; // Voxel quad +x neighbor fields + v4float fy_ez, fy_ex; // Voxel quad +y neighbor fields + v4float fz_ex, fz_ey; // Voxel quad +z neighbor fields + v4float dummy; + + field_t * ALIGNED(16) f00, * ALIGNED(16) f01, * ALIGNED(16) f02, * ALIGNED(16) f03; // Voxel quad + + field_t * ALIGNED(16) fx0, * ALIGNED(16) fx1, * ALIGNED(16) fx2, * ALIGNED(16) fx3; // Voxel quad +x neighbors + + field_t * ALIGNED(16) fy0, * ALIGNED(16) fy1, * ALIGNED(16) fy2, * ALIGNED(16) fy3; // Voxel quad +y neighbors + + field_t * ALIGNED(16) fz0, * ALIGNED(16) fz1, * ALIGNED(16) fz2, * ALIGNED(16) fz3; // Voxel quad +z neighbors + + // Process the bulk of the voxels 4 at a time + + INIT_STENCIL(); + + for( ; n_voxel > 3; n_voxel -= 4 ) + { + f00 = f0; fx0 = fx; fy0 = fy; fz0 = fz; NEXT_STENCIL(); + f01 = f0; fx1 = fx; fy1 = fy; fz1 = fz; NEXT_STENCIL(); + f02 = f0; fx2 = fx; fy2 = fy; fz2 = fz; NEXT_STENCIL(); + f03 = f0; fx3 = fx; fy3 = fy; fz3 = fz; NEXT_STENCIL(); + + load_4x3_tr( &f00->ex, &f01->ex, &f02->ex, &f03->ex, + f0_ex, f0_ey, f0_ez ); + + load_4x3_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, + f0_cbx, f0_cby, f0_cbz ); + + load_4x3_tr( &fx0->ex, &fx1->ex, &fx2->ex, &fx3->ex, + dummy, fx_ey, fx_ez ); + + load_4x3_tr( &fy0->ex, &fy1->ex, &fy2->ex, &fy3->ex, + fy_ex, dummy, fy_ez ); + + load_4x2_tr( &fz0->ex, &fz1->ex, &fz2->ex, &fz3->ex, + fz_ex, fz_ey ); + + f0_cbx += fnms( vpy, ( fy_ez - f0_ez ), vpz*( fz_ey - f0_ey ) ); + f0_cby += fnms( vpz, ( fz_ex - f0_ex ), vpx*( fx_ez - f0_ez ) ); + f0_cbz += fnms( vpx, ( fx_ey - f0_ey ), vpy*( fy_ex - f0_ex ) ); + + store_4x3_tr( f0_cbx, f0_cby, f0_cbz, + &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx ); + } +} + +#else + +void +advance_b_pipeline_v4( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v4 implementation. + ERROR( ( "No advance_b_pipeline_v4 implementation." 
) ); +} + +#endif diff --git a/src/field_advance/standard/pipeline/advance_b_pipeline_v8.cc b/src/field_advance/standard/pipeline/advance_b_pipeline_v8.cc new file mode 100644 index 00000000..a00bbcac --- /dev/null +++ b/src/field_advance/standard/pipeline/advance_b_pipeline_v8.cc @@ -0,0 +1,104 @@ +#define IN_sfa +#define IN_advance_b_pipeline + +#include "advance_b_pipeline.h" + +#include "../sfa_private.h" + +#if defined(V8_ACCELERATION) + +using namespace v8; + +void +advance_b_pipeline_v8( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + const v8float vpx( px ); + const v8float vpy( py ); + const v8float vpz( pz ); + + v8float f0_ex, f0_ey, f0_ez; // Voxel block electric fields + v8float f0_cbx, f0_cby, f0_cbz; // Voxel block magnetic fields + v8float fx_ey, fx_ez; // Voxel block +x neighbor fields + v8float fy_ez, fy_ex; // Voxel block +y neighbor fields + v8float fz_ex, fz_ey; // Voxel block +z neighbor fields + v8float dummy; + + field_t * ALIGNED(32) f00, * ALIGNED(32) f01, * ALIGNED(32) f02, * ALIGNED(32) f03; // Voxel block + field_t * ALIGNED(32) f04, * ALIGNED(32) f05, * ALIGNED(32) f06, * ALIGNED(32) f07; // Voxel block + + field_t * ALIGNED(32) fx0, * ALIGNED(32) fx1, * ALIGNED(32) fx2, * ALIGNED(32) fx3; // Voxel block +x neighbors + field_t * ALIGNED(32) fx4, * ALIGNED(32) fx5, * ALIGNED(32) fx6, * ALIGNED(32) fx7; // Voxel block +x neighbors + + field_t * ALIGNED(32) fy0, * ALIGNED(32) fy1, * ALIGNED(32) fy2, * ALIGNED(32) fy3; // Voxel block +y neighbors + field_t * ALIGNED(32) fy4, * ALIGNED(32) fy5, * ALIGNED(32) fy6, * ALIGNED(32) fy7; // Voxel block +y neighbors + + field_t * ALIGNED(32) fz0, * ALIGNED(32) fz1, * ALIGNED(32) fz2, * ALIGNED(32) fz3; // Voxel block +z neighbors + field_t * ALIGNED(32) fz4, * ALIGNED(32) fz5, * ALIGNED(32) fz6, * ALIGNED(32) fz7; // Voxel block +z neighbors + + // Process the bulk of the voxels 8 at a time + + INIT_STENCIL(); + + for( ; n_voxel > 7; n_voxel -= 8 ) + { + f00 = f0; fx0 = fx; fy0 = fy; fz0 = fz; NEXT_STENCIL(); + f01 = f0; fx1 = fx; fy1 = fy; fz1 = fz; NEXT_STENCIL(); + f02 = f0; fx2 = fx; fy2 = fy; fz2 = fz; NEXT_STENCIL(); + f03 = f0; fx3 = fx; fy3 = fy; fz3 = fz; NEXT_STENCIL(); + f04 = f0; fx4 = fx; fy4 = fy; fz4 = fz; NEXT_STENCIL(); + f05 = f0; fx5 = fx; fy5 = fy; fz5 = fz; NEXT_STENCIL(); + f06 = f0; fx6 = fx; fy6 = fy; fz6 = fz; NEXT_STENCIL(); + f07 = f0; fx7 = fx; fy7 = fy; fz7 = fz; NEXT_STENCIL(); + + load_8x3_tr( &f00->ex, &f01->ex, &f02->ex, &f03->ex, + &f04->ex, &f05->ex, &f06->ex, &f07->ex, + f0_ex, f0_ey, f0_ez ); + + load_8x3_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, + &f04->cbx, &f05->cbx, &f06->cbx, &f07->cbx, + f0_cbx, f0_cby, f0_cbz ); + + load_8x3_tr( &fx0->ex, &fx1->ex, &fx2->ex, &fx3->ex, + &fx4->ex, &fx5->ex, &fx6->ex, &fx7->ex, + dummy, fx_ey, fx_ez ); + + load_8x3_tr( &fy0->ex, &fy1->ex, &fy2->ex, &fy3->ex, + &fy4->ex, &fy5->ex, &fy6->ex, &fy7->ex, + fy_ex, dummy, fy_ez ); + + load_8x2_tr( &fz0->ex, &fz1->ex, &fz2->ex, &fz3->ex, + &fz4->ex, &fz5->ex, &fz6->ex, &fz7->ex, + fz_ex, fz_ey ); + + f0_cbx += fnms( vpy, ( fy_ez - f0_ez ), vpz*( fz_ey - f0_ey ) ); + f0_cby += fnms( vpz, ( fz_ex - f0_ex ), vpx*( fx_ez - f0_ez ) ); + f0_cbz += fnms( vpx, ( fx_ey - f0_ey ), vpy*( fy_ex - f0_ex ) ); + + store_8x3_tr( f0_cbx, f0_cby, f0_cbz, + &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, + &f04->cbx, &f05->cbx, &f06->cbx, &f07->cbx ); + } +} + +#else 
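+// Build-time fallback: when V8_ACCELERATION is not enabled, this stub is
+// compiled instead and calling it is reported as an error.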
+ +void +advance_b_pipeline_v8( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v8 implementation. + ERROR( ( "No advance_b_pipeline_v8 implementation." ) ); +} + +#endif diff --git a/src/field_advance/standard/pipeline/advance_e_pipeline.cc b/src/field_advance/standard/pipeline/advance_e_pipeline.cc new file mode 100644 index 00000000..8f79c264 --- /dev/null +++ b/src/field_advance/standard/pipeline/advance_e_pipeline.cc @@ -0,0 +1,331 @@ +#define IN_sfa +#define IN_advance_e_pipeline + +#define HAS_V4_PIPELINE +#define HAS_V8_PIPELINE +#define HAS_V16_PIPELINE + +#include "advance_e_pipeline.h" + +#include "../sfa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +//----------------------------------------------------------------------------// +// Reference implementation for an advance_e pipeline function which does not +// make use of explicit calls to vector intrinsic functions. +//----------------------------------------------------------------------------// + +void +advance_e_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + INIT_STENCIL(); + + for( ; n_voxel; n_voxel-- ) + { + UPDATE_EX(); + UPDATE_EY(); + UPDATE_EZ(); + + NEXT_STENCIL(); + } +} + +//----------------------------------------------------------------------------// +// Top level function to select and call the proper advance_e pipeline +// function. +//----------------------------------------------------------------------------// + +void +advance_e_pipeline( field_array_t * RESTRICT fa, + float frac ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); + } + + if ( frac != 1 ) + { + ERROR( ( "standard advance_e does not support frac != 1 yet" ) ); + } + + /*************************************************************************** + * Begin tangential B ghost setup + ***************************************************************************/ + + begin_remote_ghost_tang_b( fa->f, fa->g ); + + local_ghost_tang_b( fa->f, fa->g ); + + /*************************************************************************** + * Update interior fields + * Note: ex all (1:nx, 1:ny+1,1,nz+1) interior (1:nx,2:ny,2:nz) + * Note: ey all (1:nx+1,1:ny, 1:nz+1) interior (2:nx,1:ny,2:nz) + * Note: ez all (1:nx+1,1:ny+1,1:nz ) interior (1:nx,1:ny,2:nz) + ***************************************************************************/ + + // Do majority interior in a single pass. The host handles + // stragglers. 
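+  //
+  // Rough execution pattern of the rest of this function:
+  //
+  //   EXEC_PIPELINES( advance_e, args, 0 );  // bulk interior on the pipelines
+  //   ...update the thin straggler interior strips on the host...
+  //   WAIT_PIPELINES();                      // join before finishing ghosts
+  //   end_remote_ghost_tang_b( ... );
+  //   ...update the exterior faces on the host...
+  //
+  // so the straggler updates are effectively hidden behind the pipeline work.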
+ + pipeline_args_t args[1]; + args->f = fa->f; + args->p = (sfa_params_t *)fa->params; + args->g = fa->g; + + EXEC_PIPELINES( advance_e, args, 0 ); + + // While the pipelines are busy, do non-bulk interior fields + + DECLARE_STENCIL(); + + // Do left over interior ex + for( z = 2; z <= nz; z++ ) + { + for( y = 2; y <= ny; y++ ) + { + f0 = &f( 1, y, z ); + fy = &f( 1, y-1, z ); + fz = &f( 1, y, z-1 ); + + UPDATE_EX(); + } + } + + // Do left over interior ey + for( z = 2; z <= nz; z++ ) + { + f0 = &f( 2, 1, z ); + fx = &f( 1, 1, z ); + fz = &f( 2, 1, z-1 ); + + for( x = 2; x <= nx; x++ ) + { + UPDATE_EY(); + + f0++; + fx++; + fz++; + } + } + + // Do left over interior ez + for( y = 2; y <= ny; y++ ) + { + f0 = &f( 2, y, 1 ); + fx = &f( 1, y, 1 ); + fy = &f( 2, y-1, 1 ); + + for( x = 2; x <= nx; x++ ) + { + UPDATE_EZ(); + + f0++; + fx++; + fy++; + } + } + + WAIT_PIPELINES(); + + /*************************************************************************** + * Finish tangential B ghost setup + ***************************************************************************/ + + end_remote_ghost_tang_b( fa->f, fa->g ); + + /*************************************************************************** + * Update exterior fields + ***************************************************************************/ + + // Do exterior ex + for( y = 1; y <= ny+1; y++ ) + { + f0 = &f( 1, y, 1 ); + fy = &f( 1, y-1, 1 ); + fz = &f( 1, y, 0 ); + + for( x = 1; x <= nx; x++ ) + { + UPDATE_EX(); + + f0++; + fy++; + fz++; + } + } + + for( y = 1; y <= ny+1; y++ ) + { + f0 = &f( 1, y, nz+1 ); + fy = &f( 1, y-1, nz+1 ); + fz = &f( 1, y, nz ); + + for( x = 1; x <= nx; x++ ) + { + UPDATE_EX(); + + f0++; + fy++; + fz++; + } + } + + for( z = 2; z <= nz; z++ ) + { + f0 = &f( 1, 1, z ); + fy = &f( 1, 0, z ); + fz = &f( 1, 1, z-1 ); + + for( x = 1; x <= nx; x++ ) + { + UPDATE_EX(); + + f0++; + fy++; + fz++; + } + } + + for( z = 2; z <= nz; z++ ) + { + f0 = &f( 1, ny+1, z ); + fy = &f( 1, ny, z ); + fz = &f( 1, ny+1, z-1 ); + + for( x = 1; x <= nx; x++ ) + { + UPDATE_EX(); + + f0++; + fy++; + fz++; + } + } + + // Do exterior ey + for( z = 1; z <= nz+1; z++ ) + { + for( y = 1; y <= ny; y++ ) + { + f0 = &f( 1, y, z ); + fx = &f( 0, y, z ); + fz = &f( 1, y, z-1 ); + + UPDATE_EY(); + } + } + + for( z = 1; z <= nz+1; z++ ) + { + for( y = 1; y <= ny; y++ ) + { + f0 = &f( nx+1, y, z ); + fx = &f( nx, y, z ); + fz = &f( nx+1, y, z-1 ); + + UPDATE_EY(); + } + } + + for( y = 1; y <= ny; y++ ) + { + f0 = &f( 2, y, 1 ); + fx = &f( 1, y, 1 ); + fz = &f( 2, y, 0 ); + + for( x = 2; x <= nx; x++ ) + { + UPDATE_EY(); + + f0++; + fx++; + fz++; + } + } + + for( y = 1; y <= ny; y++ ) + { + f0 = &f( 2, y, nz+1 ); + fx = &f( 1, y, nz+1 ); + fz = &f( 2, y, nz ); + + for( x = 2; x <= nx; x++ ) + { + UPDATE_EY(); + + f0++; + fx++; + fz++; + } + } + + // Do exterior ez + for( z = 1; z <= nz; z++ ) + { + f0 = &f( 1, 1, z ); + fx = &f( 0, 1, z ); + fy = &f( 1, 0, z ); + + for( x = 1; x <= nx+1; x++ ) + { + UPDATE_EZ(); + + f0++; + fx++; + fy++; + } + } + + for( z = 1; z <= nz; z++ ) + { + f0 = &f( 1, ny+1, z ); + fx = &f( 0, ny+1, z ); + fy = &f( 1, ny, z ); + + for( x = 1; x <= nx+1; x++ ) + { + UPDATE_EZ(); + + f0++; + fx++; + fy++; + } + } + + for( z = 1; z <= nz; z++ ) + { + for( y = 2; y <= ny; y++ ) + { + f0 = &f( 1, y, z ); + fx = &f( 0, y, z ); + fy = &f( 1, y-1, z ); + + UPDATE_EZ(); + } + } + + for( z = 1; z <= nz; z++ ) + { + for( y = 2; y <= ny; y++ ) + { + f0 = &f( nx+1, y, z ); + fx = &f( nx, y, z ); + fy = &f( nx+1, y-1, z ); + + UPDATE_EZ(); + 
} + } + + local_adjust_tang_e( fa->f, fa->g ); +} diff --git a/src/field_advance/standard/pipeline/advance_e_pipeline.h b/src/field_advance/standard/pipeline/advance_e_pipeline.h new file mode 100644 index 00000000..df15c180 --- /dev/null +++ b/src/field_advance/standard/pipeline/advance_e_pipeline.h @@ -0,0 +1,97 @@ +#ifndef _advance_e_pipeline_h_ +#define _advance_e_pipeline_h_ + +#ifndef IN_advance_e_pipeline +#error "Only include advance_e_pipeline.h in advance_e_pipeline source files." +#endif + +#include "../sfa_private.h" + +typedef struct pipeline_args +{ + field_t * ALIGNED(128) f; + const sfa_params_t * p; + const grid_t * g; +} pipeline_args_t; + +#define DECLARE_STENCIL() \ + field_t * ALIGNED(128) f = args->f; \ + const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ + const grid_t * g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float damp = args->p->damp; \ + const float px = (nx>1) ? (1+damp)*g->cvac*g->dt*g->rdx : 0; \ + const float py = (ny>1) ? (1+damp)*g->cvac*g->dt*g->rdy : 0; \ + const float pz = (nz>1) ? (1+damp)*g->cvac*g->dt*g->rdz : 0; \ + const float cj = g->dt/g->eps0; \ + \ + field_t * ALIGNED(16) f0; \ + field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ + int x, y, z + +#define f(x,y,z) f[ VOXEL( x, y, z, nx, ny, nz ) ] + +#define INIT_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x-1, y, z ); \ + fy = &f( x, y-1, z ); \ + fz = &f( x, y, z-1 ) + +#define NEXT_STENCIL() \ + f0++; fx++; fy++; fz++; x++; \ + if ( x > nx ) \ + { \ + y++; x = 2; \ + if ( y > ny ) z++; if ( y > ny ) y = 2; \ + INIT_STENCIL(); \ + } + +#define UPDATE_EX() \ + f0->tcax = ( py * ( f0->cbz * m[f0->fmatz].rmuz - \ + fy->cbz * m[fy->fmatz].rmuz ) - \ + pz * ( f0->cby * m[f0->fmaty].rmuy - \ + fz->cby * m[fz->fmaty].rmuy ) ) - \ + damp * f0->tcax; \ + f0->ex = m[f0->ematx].decayx * f0->ex + \ + m[f0->ematx].drivex * ( f0->tcax - cj * f0->jfx ) + +#define UPDATE_EY() \ + f0->tcay = ( pz * ( f0->cbx * m[f0->fmatx].rmux - \ + fz->cbx * m[fz->fmatx].rmux ) - \ + px * ( f0->cbz * m[f0->fmatz].rmuz - \ + fx->cbz * m[fx->fmatz].rmuz ) ) - \ + damp * f0->tcay; \ + f0->ey = m[f0->ematy].decayy * f0->ey + \ + m[f0->ematy].drivey * ( f0->tcay - cj * f0->jfy ) + +#define UPDATE_EZ() \ + f0->tcaz = ( px * ( f0->cby * m[f0->fmaty].rmuy - \ + fx->cby * m[fx->fmaty].rmuy) - \ + py * ( f0->cbx * m[f0->fmatx].rmux - \ + fy->cbx * m[fy->fmatx].rmux ) ) - \ + damp * f0->tcaz; \ + f0->ez = m[f0->ematz].decayz * f0->ez + \ + m[f0->ematz].drivez * ( f0->tcaz - cj * f0->jfz ) + +void +advance_e_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +advance_e_pipeline_v4( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +advance_e_pipeline_v8( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +advance_e_pipeline_v16( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +#endif // _advance_e_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/advance_e_pipeline_v16.cc b/src/field_advance/standard/pipeline/advance_e_pipeline_v16.cc new file mode 100644 index 00000000..7f14eb6e --- /dev/null +++ b/src/field_advance/standard/pipeline/advance_e_pipeline_v16.cc @@ -0,0 +1,251 @@ +#define IN_sfa +#define IN_advance_e_pipeline + +#include "advance_e_pipeline.h" + +#include "../sfa_private.h" + +#if defined(V16_ACCELERATION) + +using namespace v16; + +void +advance_e_pipeline_v16( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + 
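+  // This kernel is the SoA form of the scalar UPDATE_E{X,Y,Z}() macros in
+  // advance_e_pipeline.h: 16 consecutive voxels are transposed into v16float
+  // registers with load_16x*_tr(), updated with fused-multiply primitives
+  // (fma/fms/fnms), and transposed back with store_16x4_tr().  Per voxel the
+  // x component update is
+  //
+  //   tcax <- py*( cbz*rmuz - cbz[y-1]*rmuz[y-1] )
+  //         - pz*( cby*rmuy - cby[z-1]*rmuy[z-1] ) - damp*tcax
+  //   ex   <- decayx*ex + drivex*( tcax - cj*jfx )
+  //
+  // with the y and z components following by cyclic permutation.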
DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + const v16float vdamp( damp ); + const v16float vpx( px ); + const v16float vpy( py ); + const v16float vpz( pz ); + const v16float vcj( cj ); + + v16float save0, save1, dummy; + + v16float f0_ex, f0_ey, f0_ez; + v16float f0_cbx, f0_cby, f0_cbz; + v16float f0_tcax, f0_tcay, f0_tcaz; + v16float f0_jfx, f0_jfy, f0_jfz; + v16float fx_cby, fx_cbz; + v16float fy_cbx, fy_cbz; + v16float fz_cbx, fz_cby; + v16float m_f0_rmux, m_f0_rmuy, m_f0_rmuz; + v16float m_fx_rmuy, m_fx_rmuz; + v16float m_fy_rmux, m_fy_rmuz; + v16float m_fz_rmux, m_fz_rmuy; + v16float m_f0_decayx, m_f0_drivex; + v16float m_f0_decayy, m_f0_drivey; + v16float m_f0_decayz, m_f0_drivez; + + v16float f0_cbx_rmux, f0_cby_rmuy, f0_cbz_rmuz; + + field_t * ALIGNED(16) f000, * ALIGNED(16) f001, * ALIGNED(16) f002, * ALIGNED(16) f003; // Voxel block + field_t * ALIGNED(16) f004, * ALIGNED(16) f005, * ALIGNED(16) f006, * ALIGNED(16) f007; // Voxel block + field_t * ALIGNED(16) f008, * ALIGNED(16) f009, * ALIGNED(16) f010, * ALIGNED(16) f011; // Voxel block + field_t * ALIGNED(16) f012, * ALIGNED(16) f013, * ALIGNED(16) f014, * ALIGNED(16) f015; // Voxel block + + field_t * ALIGNED(16) fx00, * ALIGNED(16) fx01, * ALIGNED(16) fx02, * ALIGNED(16) fx03; // Voxel block +x neighbors + field_t * ALIGNED(16) fx04, * ALIGNED(16) fx05, * ALIGNED(16) fx06, * ALIGNED(16) fx07; // Voxel block +x neighbors + field_t * ALIGNED(16) fx08, * ALIGNED(16) fx09, * ALIGNED(16) fx10, * ALIGNED(16) fx11; // Voxel block +x neighbors + field_t * ALIGNED(16) fx12, * ALIGNED(16) fx13, * ALIGNED(16) fx14, * ALIGNED(16) fx15; // Voxel block +x neighbors + + field_t * ALIGNED(16) fy00, * ALIGNED(16) fy01, * ALIGNED(16) fy02, * ALIGNED(16) fy03; // Voxel block +y neighbors + field_t * ALIGNED(16) fy04, * ALIGNED(16) fy05, * ALIGNED(16) fy06, * ALIGNED(16) fy07; // Voxel block +y neighbors + field_t * ALIGNED(16) fy08, * ALIGNED(16) fy09, * ALIGNED(16) fy10, * ALIGNED(16) fy11; // Voxel block +y neighbors + field_t * ALIGNED(16) fy12, * ALIGNED(16) fy13, * ALIGNED(16) fy14, * ALIGNED(16) fy15; // Voxel block +y neighbors + + field_t * ALIGNED(16) fz00, * ALIGNED(16) fz01, * ALIGNED(16) fz02, * ALIGNED(16) fz03; // Voxel block +z neighbors + field_t * ALIGNED(16) fz04, * ALIGNED(16) fz05, * ALIGNED(16) fz06, * ALIGNED(16) fz07; // Voxel block +z neighbors + field_t * ALIGNED(16) fz08, * ALIGNED(16) fz09, * ALIGNED(16) fz10, * ALIGNED(16) fz11; // Voxel block +z neighbors + field_t * ALIGNED(16) fz12, * ALIGNED(16) fz13, * ALIGNED(16) fz14, * ALIGNED(16) fz15; // Voxel block +z neighbors + + // Process the bulk of the voxels 16 at a time + + INIT_STENCIL(); + + for( ; n_voxel > 15; n_voxel -= 16 ) + { + f000 = f0; fx00 = fx; fy00 = fy; fz00 = fz; NEXT_STENCIL(); + f001 = f0; fx01 = fx; fy01 = fy; fz01 = fz; NEXT_STENCIL(); + f002 = f0; fx02 = fx; fy02 = fy; fz02 = fz; NEXT_STENCIL(); + f003 = f0; fx03 = fx; fy03 = fy; fz03 = fz; NEXT_STENCIL(); + f004 = f0; fx04 = fx; fy04 = fy; fz04 = fz; NEXT_STENCIL(); + f005 = f0; fx05 = fx; fy05 = fy; fz05 = fz; NEXT_STENCIL(); + f006 = f0; fx06 = fx; fy06 = fy; fz06 = fz; NEXT_STENCIL(); + f007 = f0; fx07 = fx; fy07 = fy; fz07 = fz; NEXT_STENCIL(); + f008 = f0; fx08 = fx; fy08 = fy; fz08 = fz; NEXT_STENCIL(); + f009 = f0; fx09 = fx; fy09 = fy; fz09 = fz; NEXT_STENCIL(); + f010 = f0; fx10 = fx; fy10 = fy; fz10 = fz; NEXT_STENCIL(); + f011 = f0; fx11 = fx; fy11 = fy; fz11 = fz; NEXT_STENCIL(); + f012 = 
f0; fx12 = fx; fy12 = fy; fz12 = fz; NEXT_STENCIL(); + f013 = f0; fx13 = fx; fy13 = fy; fz13 = fz; NEXT_STENCIL(); + f014 = f0; fx14 = fx; fy14 = fy; fz14 = fz; NEXT_STENCIL(); + f015 = f0; fx15 = fx; fy15 = fy; fz15 = fz; NEXT_STENCIL(); + + //------------------------------------------------------------------------// + // Load field data. + //------------------------------------------------------------------------// + + load_16x4_tr( &f000->ex, &f001->ex, &f002->ex, &f003->ex, + &f004->ex, &f005->ex, &f006->ex, &f007->ex, + &f008->ex, &f009->ex, &f010->ex, &f011->ex, + &f012->ex, &f013->ex, &f014->ex, &f015->ex, + f0_ex, f0_ey, f0_ez, save0 ); + + load_16x3_tr( &f000->cbx, &f001->cbx, &f002->cbx, &f003->cbx, + &f004->cbx, &f005->cbx, &f006->cbx, &f007->cbx, + &f008->cbx, &f009->cbx, &f010->cbx, &f011->cbx, + &f012->cbx, &f013->cbx, &f014->cbx, &f015->cbx, + f0_cbx, f0_cby, f0_cbz ); + + load_16x4_tr( &f000->tcax, &f001->tcax, &f002->tcax, &f003->tcax, + &f004->tcax, &f005->tcax, &f006->tcax, &f007->tcax, + &f008->tcax, &f009->tcax, &f010->tcax, &f011->tcax, + &f012->tcax, &f013->tcax, &f014->tcax, &f015->tcax, + f0_tcax, f0_tcay, f0_tcaz, save1 ); + + load_16x3_tr( &f000->jfx, &f001->jfx, &f002->jfx, &f003->jfx, + &f004->jfx, &f005->jfx, &f006->jfx, &f007->jfx, + &f008->jfx, &f009->jfx, &f010->jfx, &f011->jfx, + &f012->jfx, &f013->jfx, &f014->jfx, &f015->jfx, + f0_jfx, f0_jfy, f0_jfz ); + + load_16x3_tr( &fx00->cbx, &fx01->cbx, &fx02->cbx, &fx03->cbx, + &fx04->cbx, &fx05->cbx, &fx06->cbx, &fx07->cbx, + &fx08->cbx, &fx09->cbx, &fx10->cbx, &fx11->cbx, + &fx12->cbx, &fx13->cbx, &fx14->cbx, &fx15->cbx, + dummy, fx_cby, fx_cbz ); + + load_16x3_tr( &fy00->cbx, &fy01->cbx, &fy02->cbx, &fy03->cbx, + &fy04->cbx, &fy05->cbx, &fy06->cbx, &fy07->cbx, + &fy08->cbx, &fy09->cbx, &fy10->cbx, &fy11->cbx, + &fy12->cbx, &fy13->cbx, &fy14->cbx, &fy15->cbx, + fy_cbx, dummy, fy_cbz ); + + load_16x2_tr( &fz00->cbx, &fz01->cbx, &fz02->cbx, &fz03->cbx, + &fz04->cbx, &fz05->cbx, &fz06->cbx, &fz07->cbx, + &fz08->cbx, &fz09->cbx, &fz10->cbx, &fz11->cbx, + &fz12->cbx, &fz13->cbx, &fz14->cbx, &fz15->cbx, + fz_cbx, fz_cby ); + +# define LOAD_RMU(V,D) m_f##V##_rmu##D=v16float( m[f##V##00->fmat##D].rmu##D, \ + m[f##V##01->fmat##D].rmu##D, \ + m[f##V##02->fmat##D].rmu##D, \ + m[f##V##03->fmat##D].rmu##D, \ + m[f##V##04->fmat##D].rmu##D, \ + m[f##V##05->fmat##D].rmu##D, \ + m[f##V##06->fmat##D].rmu##D, \ + m[f##V##07->fmat##D].rmu##D, \ + m[f##V##08->fmat##D].rmu##D, \ + m[f##V##09->fmat##D].rmu##D, \ + m[f##V##10->fmat##D].rmu##D, \ + m[f##V##11->fmat##D].rmu##D, \ + m[f##V##12->fmat##D].rmu##D, \ + m[f##V##13->fmat##D].rmu##D, \ + m[f##V##14->fmat##D].rmu##D, \ + m[f##V##15->fmat##D].rmu##D ) + + LOAD_RMU(0,x); LOAD_RMU(0,y); LOAD_RMU(0,z); + LOAD_RMU(x,y); LOAD_RMU(x,z); + LOAD_RMU(y,x); LOAD_RMU(y,z); + LOAD_RMU(z,x); LOAD_RMU(z,y); + + load_16x2_tr( &m[f000->ematx].decayx, &m[f001->ematx].decayx, + &m[f002->ematx].decayx, &m[f003->ematx].decayx, + &m[f004->ematx].decayx, &m[f005->ematx].decayx, + &m[f006->ematx].decayx, &m[f007->ematx].decayx, + &m[f008->ematx].decayx, &m[f009->ematx].decayx, + &m[f010->ematx].decayx, &m[f011->ematx].decayx, + &m[f012->ematx].decayx, &m[f013->ematx].decayx, + &m[f014->ematx].decayx, &m[f015->ematx].decayx, + m_f0_decayx, m_f0_drivex ); + + load_16x2_tr( &m[f000->ematy].decayy, &m[f001->ematy].decayy, + &m[f002->ematy].decayy, &m[f003->ematy].decayy, + &m[f004->ematy].decayy, &m[f005->ematy].decayy, + &m[f006->ematy].decayy, &m[f007->ematy].decayy, + &m[f008->ematy].decayy, 
&m[f009->ematy].decayy, + &m[f010->ematy].decayy, &m[f011->ematy].decayy, + &m[f012->ematy].decayy, &m[f013->ematy].decayy, + &m[f014->ematy].decayy, &m[f015->ematy].decayy, + m_f0_decayy, m_f0_drivey ); + + load_16x2_tr( &m[f000->ematz].decayz, &m[f001->ematz].decayz, + &m[f002->ematz].decayz, &m[f003->ematz].decayz, + &m[f004->ematz].decayz, &m[f005->ematz].decayz, + &m[f006->ematz].decayz, &m[f007->ematz].decayz, + &m[f008->ematz].decayz, &m[f009->ematz].decayz, + &m[f010->ematz].decayz, &m[f011->ematz].decayz, + &m[f012->ematz].decayz, &m[f013->ematz].decayz, + &m[f014->ematz].decayz, &m[f015->ematz].decayz, + m_f0_decayz, m_f0_drivez ); + +# undef LOAD_RMU + + f0_cbx_rmux = f0_cbx * m_f0_rmux; + f0_cby_rmuy = f0_cby * m_f0_rmuy; + f0_cbz_rmuz = f0_cbz * m_f0_rmuz; + + f0_tcax = fnms( vdamp, + f0_tcax, + fms( vpy, + fnms( fy_cbz, m_fy_rmuz, f0_cbz_rmuz ), + vpz * fnms( fz_cby, m_fz_rmuy, f0_cby_rmuy ) ) ); + + f0_tcay = fnms( vdamp, + f0_tcay, + fms( vpz, + fnms( fz_cbx, m_fz_rmux, f0_cbx_rmux ), + vpx * fnms( fx_cbz, m_fx_rmuz, f0_cbz_rmuz ) ) ); + + f0_tcaz = fnms( vdamp, + f0_tcaz, + fms( vpx, + fnms( fx_cby, m_fx_rmuy, f0_cby_rmuy ), + vpy * fnms( fy_cbx, m_fy_rmux, f0_cbx_rmux ) ) ); + + f0_ex = fma( m_f0_decayx, f0_ex, m_f0_drivex * fnms( vcj, f0_jfx, f0_tcax ) ); + f0_ey = fma( m_f0_decayy, f0_ey, m_f0_drivey * fnms( vcj, f0_jfy, f0_tcay ) ); + f0_ez = fma( m_f0_decayz, f0_ez, m_f0_drivez * fnms( vcj, f0_jfz, f0_tcaz ) ); + + //------------------------------------------------------------------------// + // Note: + //------------------------------------------------------------------------// + // Unlike load_16x3 versus load_16x4, store_16x4 is much more efficient + // than store_16x3. + //------------------------------------------------------------------------// + + store_16x4_tr( f0_ex, f0_ey, f0_ez, save0, + &f000->ex, &f001->ex, &f002->ex, &f003->ex, + &f004->ex, &f005->ex, &f006->ex, &f007->ex, + &f008->ex, &f009->ex, &f010->ex, &f011->ex, + &f012->ex, &f013->ex, &f014->ex, &f015->ex ); + + store_16x4_tr( f0_tcax, f0_tcay, f0_tcaz, save1, + &f000->tcax, &f001->tcax, &f002->tcax, &f003->tcax, + &f004->tcax, &f005->tcax, &f006->tcax, &f007->tcax, + &f008->tcax, &f009->tcax, &f010->tcax, &f011->tcax, + &f012->tcax, &f013->tcax, &f014->tcax, &f015->tcax ); + } +} + +#else + +void +advance_e_pipeline_v16( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v16 implementation. + ERROR( ( "No advance_e_pipeline_v16 implementation." 
) ); +} + +#endif diff --git a/src/field_advance/standard/pipeline/advance_e_pipeline_v4.cc b/src/field_advance/standard/pipeline/advance_e_pipeline_v4.cc new file mode 100644 index 00000000..9d5b0d53 --- /dev/null +++ b/src/field_advance/standard/pipeline/advance_e_pipeline_v4.cc @@ -0,0 +1,170 @@ +#define IN_sfa +#define IN_advance_e_pipeline + +#include "advance_e_pipeline.h" + +#include "../sfa_private.h" + +#if defined(V4_ACCELERATION) + +using namespace v4; + +void +advance_e_pipeline_v4( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + const v4float vdamp( damp ); + const v4float vpx( px ); + const v4float vpy( py ); + const v4float vpz( pz ); + const v4float vcj( cj ); + + v4float save0, save1, dummy; + + v4float f0_ex, f0_ey, f0_ez; + v4float f0_cbx, f0_cby, f0_cbz; + v4float f0_tcax, f0_tcay, f0_tcaz; + v4float f0_jfx, f0_jfy, f0_jfz; + v4float fx_cby, fx_cbz; + v4float fy_cbx, fy_cbz; + v4float fz_cbx, fz_cby; + v4float m_f0_rmux, m_f0_rmuy, m_f0_rmuz; + v4float m_fx_rmuy, m_fx_rmuz; + v4float m_fy_rmux, m_fy_rmuz; + v4float m_fz_rmux, m_fz_rmuy; + v4float m_f0_decayx, m_f0_drivex; + v4float m_f0_decayy, m_f0_drivey; + v4float m_f0_decayz, m_f0_drivez; + + v4float f0_cbx_rmux, f0_cby_rmuy, f0_cbz_rmuz; + + field_t * ALIGNED(16) f00, * ALIGNED(16) f01, * ALIGNED(16) f02, * ALIGNED(16) f03; // Voxel quad + + field_t * ALIGNED(16) fx0, * ALIGNED(16) fx1, * ALIGNED(16) fx2, * ALIGNED(16) fx3; // Voxel quad +x neighbors + + field_t * ALIGNED(16) fy0, * ALIGNED(16) fy1, * ALIGNED(16) fy2, * ALIGNED(16) fy3; // Voxel quad +y neighbors + + field_t * ALIGNED(16) fz0, * ALIGNED(16) fz1, * ALIGNED(16) fz2, * ALIGNED(16) fz3; // Voxel quad +z neighbors + + // Process the bulk of the voxels 4 at a time + + INIT_STENCIL(); + + for( ; n_voxel > 3; n_voxel -= 4 ) + { + f00 = f0; fx0 = fx; fy0 = fy; fz0 = fz; NEXT_STENCIL(); + f01 = f0; fx1 = fx; fy1 = fy; fz1 = fz; NEXT_STENCIL(); + f02 = f0; fx2 = fx; fy2 = fy; fz2 = fz; NEXT_STENCIL(); + f03 = f0; fx3 = fx; fy3 = fy; fz3 = fz; NEXT_STENCIL(); + + //------------------------------------------------------------------------// + // Load field data. 
+ //------------------------------------------------------------------------// + + load_4x4_tr( &f00->ex, &f01->ex, &f02->ex, &f03->ex, + f0_ex, f0_ey, f0_ez, save0 ); + + load_4x3_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, + f0_cbx, f0_cby, f0_cbz ); + + load_4x4_tr( &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax, + f0_tcax, f0_tcay, f0_tcaz, save1 ); + + load_4x3_tr( &f00->jfx, &f01->jfx, &f02->jfx, &f03->jfx, + f0_jfx, f0_jfy, f0_jfz ); + + load_4x3_tr( &fx0->cbx, &fx1->cbx, &fx2->cbx, &fx3->cbx, + dummy, fx_cby, fx_cbz ); + + load_4x3_tr( &fy0->cbx, &fy1->cbx, &fy2->cbx, &fy3->cbx, + fy_cbx, dummy, fy_cbz ); + + load_4x2_tr( &fz0->cbx, &fz1->cbx, &fz2->cbx, &fz3->cbx, + fz_cbx, fz_cby ); + +# define LOAD_RMU(V,D) m_f##V##_rmu##D=v4float( m[f##V##0->fmat##D].rmu##D, \ + m[f##V##1->fmat##D].rmu##D, \ + m[f##V##2->fmat##D].rmu##D, \ + m[f##V##3->fmat##D].rmu##D ) + + LOAD_RMU(0,x); LOAD_RMU(0,y); LOAD_RMU(0,z); + LOAD_RMU(x,y); LOAD_RMU(x,z); + LOAD_RMU(y,x); LOAD_RMU(y,z); + LOAD_RMU(z,x); LOAD_RMU(z,y); + + load_4x2_tr( &m[f00->ematx].decayx, &m[f01->ematx].decayx, + &m[f02->ematx].decayx, &m[f03->ematx].decayx, + m_f0_decayx, m_f0_drivex ); + + load_4x2_tr( &m[f00->ematy].decayy, &m[f01->ematy].decayy, + &m[f02->ematy].decayy, &m[f03->ematy].decayy, + m_f0_decayy, m_f0_drivey ); + + load_4x2_tr( &m[f00->ematz].decayz, &m[f01->ematz].decayz, + &m[f02->ematz].decayz, &m[f03->ematz].decayz, + m_f0_decayz, m_f0_drivez ); + +# undef LOAD_RMU + + f0_cbx_rmux = f0_cbx * m_f0_rmux; + f0_cby_rmuy = f0_cby * m_f0_rmuy; + f0_cbz_rmuz = f0_cbz * m_f0_rmuz; + + f0_tcax = fnms( vdamp, + f0_tcax, + fms( vpy, + fnms( fy_cbz, m_fy_rmuz, f0_cbz_rmuz ), + vpz * fnms( fz_cby, m_fz_rmuy, f0_cby_rmuy ) ) ); + + f0_tcay = fnms( vdamp, + f0_tcay, + fms( vpz, + fnms( fz_cbx, m_fz_rmux, f0_cbx_rmux ), + vpx * fnms( fx_cbz, m_fx_rmuz, f0_cbz_rmuz ) ) ); + + f0_tcaz = fnms( vdamp, + f0_tcaz, + fms( vpx, + fnms( fx_cby, m_fx_rmuy, f0_cby_rmuy ), + vpy * fnms( fy_cbx, m_fy_rmux, f0_cbx_rmux ) ) ); + + f0_ex = fma( m_f0_decayx, f0_ex, m_f0_drivex * fnms( vcj, f0_jfx, f0_tcax ) ); + f0_ey = fma( m_f0_decayy, f0_ey, m_f0_drivey * fnms( vcj, f0_jfy, f0_tcay ) ); + f0_ez = fma( m_f0_decayz, f0_ez, m_f0_drivez * fnms( vcj, f0_jfz, f0_tcaz ) ); + + //------------------------------------------------------------------------// + // Note: + //------------------------------------------------------------------------// + // Unlike load_4x3 versus load_4x4, store_4x4 is much more efficient than + // store_4x3. + //------------------------------------------------------------------------// + + store_4x4_tr( f0_ex, f0_ey, f0_ez, save0, + &f00->ex, &f01->ex, &f02->ex, &f03->ex ); + + store_4x4_tr( f0_tcax, f0_tcay, f0_tcaz, save1, + &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax ); + } +} + +#else + +void +advance_e_pipeline_v4( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v4 implementation. + ERROR( ( "No advance_e_pipeline_v4 implementation." 
) ); +} + +#endif diff --git a/src/field_advance/standard/pipeline/advance_e_pipeline_v8.cc b/src/field_advance/standard/pipeline/advance_e_pipeline_v8.cc new file mode 100644 index 00000000..dba3b752 --- /dev/null +++ b/src/field_advance/standard/pipeline/advance_e_pipeline_v8.cc @@ -0,0 +1,197 @@ +#define IN_sfa +#define IN_advance_e_pipeline + +#include "advance_e_pipeline.h" + +#include "../sfa_private.h" + +#if defined(V8_ACCELERATION) + +using namespace v8; + +void +advance_e_pipeline_v8( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + const v8float vdamp( damp ); + const v8float vpx( px ); + const v8float vpy( py ); + const v8float vpz( pz ); + const v8float vcj( cj ); + + v8float save0, save1, dummy; + + v8float f0_ex, f0_ey, f0_ez; + v8float f0_cbx, f0_cby, f0_cbz; + v8float f0_tcax, f0_tcay, f0_tcaz; + v8float f0_jfx, f0_jfy, f0_jfz; + v8float fx_cby, fx_cbz; + v8float fy_cbx, fy_cbz; + v8float fz_cbx, fz_cby; + v8float m_f0_rmux, m_f0_rmuy, m_f0_rmuz; + v8float m_fx_rmuy, m_fx_rmuz; + v8float m_fy_rmux, m_fy_rmuz; + v8float m_fz_rmux, m_fz_rmuy; + v8float m_f0_decayx, m_f0_drivex; + v8float m_f0_decayy, m_f0_drivey; + v8float m_f0_decayz, m_f0_drivez; + + v8float f0_cbx_rmux, f0_cby_rmuy, f0_cbz_rmuz; + + field_t * ALIGNED(32) f00, * ALIGNED(32) f01, * ALIGNED(32) f02, * ALIGNED(32) f03; // Voxel quad + field_t * ALIGNED(32) f04, * ALIGNED(32) f05, * ALIGNED(32) f06, * ALIGNED(32) f07; // Voxel quad + + field_t * ALIGNED(32) fx0, * ALIGNED(32) fx1, * ALIGNED(32) fx2, * ALIGNED(32) fx3; // Voxel quad +x neighbors + field_t * ALIGNED(32) fx4, * ALIGNED(32) fx5, * ALIGNED(32) fx6, * ALIGNED(32) fx7; // Voxel quad +x neighbors + + field_t * ALIGNED(32) fy0, * ALIGNED(32) fy1, * ALIGNED(32) fy2, * ALIGNED(32) fy3; // Voxel quad +y neighbors + field_t * ALIGNED(32) fy4, * ALIGNED(32) fy5, * ALIGNED(32) fy6, * ALIGNED(32) fy7; // Voxel quad +y neighbors + + field_t * ALIGNED(32) fz0, * ALIGNED(32) fz1, * ALIGNED(32) fz2, * ALIGNED(32) fz3; // Voxel quad +z neighbors + field_t * ALIGNED(32) fz4, * ALIGNED(32) fz5, * ALIGNED(32) fz6, * ALIGNED(32) fz7; // Voxel quad +z neighbors + + // Process the bulk of the voxels 8 at a time + + INIT_STENCIL(); + + for( ; n_voxel > 7; n_voxel -= 8 ) + { + f00 = f0; fx0 = fx; fy0 = fy; fz0 = fz; NEXT_STENCIL(); + f01 = f0; fx1 = fx; fy1 = fy; fz1 = fz; NEXT_STENCIL(); + f02 = f0; fx2 = fx; fy2 = fy; fz2 = fz; NEXT_STENCIL(); + f03 = f0; fx3 = fx; fy3 = fy; fz3 = fz; NEXT_STENCIL(); + f04 = f0; fx4 = fx; fy4 = fy; fz4 = fz; NEXT_STENCIL(); + f05 = f0; fx5 = fx; fy5 = fy; fz5 = fz; NEXT_STENCIL(); + f06 = f0; fx6 = fx; fy6 = fy; fz6 = fz; NEXT_STENCIL(); + f07 = f0; fx7 = fx; fy7 = fy; fz7 = fz; NEXT_STENCIL(); + + //------------------------------------------------------------------------// + // Load field data. 
+ //------------------------------------------------------------------------// + + load_8x4_tr( &f00->ex, &f01->ex, &f02->ex, &f03->ex, + &f04->ex, &f05->ex, &f06->ex, &f07->ex, + f0_ex, f0_ey, f0_ez, save0 ); + + load_8x3_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, + &f04->cbx, &f05->cbx, &f06->cbx, &f07->cbx, + f0_cbx, f0_cby, f0_cbz ); + + load_8x4_tr( &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax, + &f04->tcax, &f05->tcax, &f06->tcax, &f07->tcax, + f0_tcax, f0_tcay, f0_tcaz, save1 ); + + load_8x3_tr( &f00->jfx, &f01->jfx, &f02->jfx, &f03->jfx, + &f04->jfx, &f05->jfx, &f06->jfx, &f07->jfx, + f0_jfx, f0_jfy, f0_jfz ); + + load_8x3_tr( &fx0->cbx, &fx1->cbx, &fx2->cbx, &fx3->cbx, + &fx4->cbx, &fx5->cbx, &fx6->cbx, &fx7->cbx, + dummy, fx_cby, fx_cbz ); + + load_8x3_tr( &fy0->cbx, &fy1->cbx, &fy2->cbx, &fy3->cbx, + &fy4->cbx, &fy5->cbx, &fy6->cbx, &fy7->cbx, + fy_cbx, dummy, fy_cbz ); + + load_8x2_tr( &fz0->cbx, &fz1->cbx, &fz2->cbx, &fz3->cbx, + &fz4->cbx, &fz5->cbx, &fz6->cbx, &fz7->cbx, + fz_cbx, fz_cby ); + +# define LOAD_RMU(V,D) m_f##V##_rmu##D=v8float( m[f##V##0->fmat##D].rmu##D, \ + m[f##V##1->fmat##D].rmu##D, \ + m[f##V##2->fmat##D].rmu##D, \ + m[f##V##3->fmat##D].rmu##D, \ + m[f##V##4->fmat##D].rmu##D, \ + m[f##V##5->fmat##D].rmu##D, \ + m[f##V##6->fmat##D].rmu##D, \ + m[f##V##7->fmat##D].rmu##D ) + + LOAD_RMU(0,x); LOAD_RMU(0,y); LOAD_RMU(0,z); + LOAD_RMU(x,y); LOAD_RMU(x,z); + LOAD_RMU(y,x); LOAD_RMU(y,z); + LOAD_RMU(z,x); LOAD_RMU(z,y); + + load_8x2_tr( &m[f00->ematx].decayx, &m[f01->ematx].decayx, + &m[f02->ematx].decayx, &m[f03->ematx].decayx, + &m[f04->ematx].decayx, &m[f05->ematx].decayx, + &m[f06->ematx].decayx, &m[f07->ematx].decayx, + m_f0_decayx, m_f0_drivex ); + + load_8x2_tr( &m[f00->ematy].decayy, &m[f01->ematy].decayy, + &m[f02->ematy].decayy, &m[f03->ematy].decayy, + &m[f04->ematy].decayy, &m[f05->ematy].decayy, + &m[f06->ematy].decayy, &m[f07->ematy].decayy, + m_f0_decayy, m_f0_drivey ); + + load_8x2_tr( &m[f00->ematz].decayz, &m[f01->ematz].decayz, + &m[f02->ematz].decayz, &m[f03->ematz].decayz, + &m[f04->ematz].decayz, &m[f05->ematz].decayz, + &m[f06->ematz].decayz, &m[f07->ematz].decayz, + m_f0_decayz, m_f0_drivez ); + +# undef LOAD_RMU + + f0_cbx_rmux = f0_cbx * m_f0_rmux; + f0_cby_rmuy = f0_cby * m_f0_rmuy; + f0_cbz_rmuz = f0_cbz * m_f0_rmuz; + + f0_tcax = fnms( vdamp, + f0_tcax, + fms( vpy, + fnms( fy_cbz, m_fy_rmuz, f0_cbz_rmuz ), + vpz * fnms( fz_cby, m_fz_rmuy, f0_cby_rmuy ) ) ); + + f0_tcay = fnms( vdamp, + f0_tcay, + fms( vpz, + fnms( fz_cbx, m_fz_rmux, f0_cbx_rmux ), + vpx * fnms( fx_cbz, m_fx_rmuz, f0_cbz_rmuz ) ) ); + + f0_tcaz = fnms( vdamp, + f0_tcaz, + fms( vpx, + fnms( fx_cby, m_fx_rmuy, f0_cby_rmuy ), + vpy * fnms( fy_cbx, m_fy_rmux, f0_cbx_rmux ) ) ); + + f0_ex = fma( m_f0_decayx, f0_ex, m_f0_drivex * fnms( vcj, f0_jfx, f0_tcax ) ); + f0_ey = fma( m_f0_decayy, f0_ey, m_f0_drivey * fnms( vcj, f0_jfy, f0_tcay ) ); + f0_ez = fma( m_f0_decayz, f0_ez, m_f0_drivez * fnms( vcj, f0_jfz, f0_tcaz ) ); + + //------------------------------------------------------------------------// + // Note: + //------------------------------------------------------------------------// + // Unlike load_8x3 versus load_8x4, store_8x4 is much more efficient than + // store_8x3. 
+ //------------------------------------------------------------------------// + + store_8x4_tr( f0_ex, f0_ey, f0_ez, save0, + &f00->ex, &f01->ex, &f02->ex, &f03->ex, + &f04->ex, &f05->ex, &f06->ex, &f07->ex ); + + store_8x4_tr( f0_tcax, f0_tcay, f0_tcaz, save1, + &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax, + &f04->tcax, &f05->tcax, &f06->tcax, &f07->tcax ); + } +} + +#else + +void +advance_e_pipeline_v8( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v8 implementation. + ERROR( ( "No advance_e_pipeline_v8 implementation." ) ); +} + +#endif diff --git a/src/field_advance/standard/pipeline/clean_div_b_pipeline.cc b/src/field_advance/standard/pipeline/clean_div_b_pipeline.cc new file mode 100644 index 00000000..f8e00d8f --- /dev/null +++ b/src/field_advance/standard/pipeline/clean_div_b_pipeline.cc @@ -0,0 +1,328 @@ +#define IN_sfa +#define IN_clean_div_b_pipeline + +#define HAS_V4_PIPELINE +#define HAS_V8_PIPELINE +#define HAS_V16_PIPELINE + +#include "clean_div_b_pipeline.h" + +#include "../sfa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +//----------------------------------------------------------------------------// +// Reference implementation for a clean_div_b pipeline function which does not +// make use of explicit calls to vector intrinsic functions. +//----------------------------------------------------------------------------// + +void +clean_div_b_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + field_t * ALIGNED(128) f = args->f; + const grid_t * g = args->g; + + field_t * ALIGNED(16) f0; + field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; + int x, y, z, n_voxel; + + const int nx = g->nx; + const int ny = g->ny; + const int nz = g->nz; + + float px, py, pz, alphadt; + + px = ( nx > 1 ) ? g->rdx : 0; + py = ( ny > 1 ) ? g->rdy : 0; + pz = ( nz > 1 ) ? g->rdz : 0; + + alphadt = 0.3888889/( px*px + py*py + pz*pz ); + + px *= alphadt; + py *= alphadt; + pz *= alphadt; + + // Process voxels assigned to this pipeline + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + +# define LOAD_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x-1, y, z ); \ + fy = &f( x, y-1, z ); \ + fz = &f( x, y, z-1 ) + + LOAD_STENCIL(); + + for( ; n_voxel; n_voxel-- ) + { + MARDER_CBX(); + MARDER_CBY(); + MARDER_CBZ(); + + f0++; fx++; fy++; fz++; + + x++; + if ( x > nx ) + { + x = 2, y++; + if ( y > ny ) y = 2, z++; + + LOAD_STENCIL(); + } + } + +# undef LOAD_STENCIL +} + +//----------------------------------------------------------------------------// +// Top level function to select and call the proper clean_div_b pipeline +// function. +//----------------------------------------------------------------------------// + +void +clean_div_b_pipeline( field_array_t * fa ) +{ + pipeline_args_t args[1]; + + field_t *f, *f0, *fx, *fy, *fz; + const grid_t *g; + float alphadt, px, py, pz; + int x, y, z, nx, ny, nz; + + if ( !fa ) + { + ERROR( ( "Bad args" ) ); + } + + f = fa->f; + g = fa->g; + + nx = g->nx; + ny = g->ny; + nz = g->nz; + + px = ( nx > 1 ) ? g->rdx : 0; + py = ( ny > 1 ) ? g->rdy : 0; + pz = ( nz > 1 ) ? g->rdz : 0; + + alphadt = 0.3888889/( px*px + py*py + pz*pz ); + + px *= alphadt; + py *= alphadt; + pz *= alphadt; + + // Have pipelines do Marder pass in interior. The host handles + // stragglers. 
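+  //
+  // The Marder correction applied below and in the pipelines (MARDER_CB*) is
+  //
+  //   cbx(i,j,k) += alphadt*rdx*( div_b_err(i,j,k) - div_b_err(i-1,j,k) )
+  //
+  // and cyclically for cby and cbz: each component of B is relaxed along the
+  // backward-difference gradient of the accumulated div(B) error.  Here
+  // alphadt = 0.3888889/( rdx^2 + rdy^2 + rdz^2 ), with the term for any
+  // degenerate dimension dropped, presumably to keep this diffusive cleaning
+  // step stable.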
+ +# if 0 // Original non-pipelined version + for( z = 2; z <= nz; z++ ) + { + for( y = 2; y <= ny; y++ ) + { + f0 = &f( 2, y, z ); + fx = &f( 1, y, z ); + fy = &f( 2, y-1, z ); + fz = &f( 2, y, z-1 ); + + for( x = 2; x <= nx; x++ ) + { + MARDER_CBX(); + MARDER_CBY(); + MARDER_CBZ(); + + f0++; fx++; fy++; fz++; + } + } + } +# endif + + // Begin setting derr ghosts + begin_remote_ghost_div_b( f, g ); + + local_ghost_div_b( f, g); + + // Have pipelines do interior of the local domain + args->f = f; + args->g = g; + + EXEC_PIPELINES( clean_div_b, args, 0 ); + + // Do left over interior bx + for( y = 1; y <= ny; y++ ) + { + f0 = &f( 2, y, 1 ); + fx = &f( 1, y, 1 ); + + for( x = 2; x <= nx; x++ ) + { + MARDER_CBX(); + + f0++; + fx++; + } + } + + for( z = 2; z <= nz; z++ ) + { + f0 = &f( 2, 1, z ); + fx = &f( 1, 1, z ); + + for( x = 2; x <= nx; x++ ) + { + MARDER_CBX(); + + f0++; + fx++; + } + } + + // Left over interior by + for( z = 1; z <= nz; z++ ) + { + for( y = 2; y <= ny; y++ ) + { + f0 = &f( 1, y, z ); + fy = &f( 1, y-1, z ); + + MARDER_CBY(); + } + } + + for( y = 2; y <= ny; y++ ) + { + f0 = &f( 2, y, 1 ); + fy = &f( 2, y-1, 1 ); + + for( x = 2; x <= nx; x++ ) + { + MARDER_CBY(); + + f0++; + fy++; + } + } + + // Left over interior bz + for( z = 2; z <= nz; z++ ) + { + f0 = &f( 1, 1, z ); + fz = &f( 1, 1, z-1 ); + + for( x = 1; x <= nx; x++ ) + { + MARDER_CBZ(); + + f0++; + fz++; + } + } + + for( z = 2; z <= nz; z++ ) + { + for( y = 2; y <= ny; y++ ) + { + f0 = &f( 1, y, z ); + fz = &f( 1, y, z-1 ); + + MARDER_CBZ(); + } + } + + // Finish setting derr ghosts + + end_remote_ghost_div_b( f, g ); + + // Do Marder pass in exterior + + // Exterior bx + for( z = 1; z <= nz; z++ ) + { + for( y = 1; y <= ny; y++ ) + { + f0 = &f( 1, y, z ); + fx = &f( 0, y, z ); + + MARDER_CBX(); + } + } + + for( z = 1; z <= nz; z++ ) + { + for( y = 1; y <= ny; y++ ) + { + f0 = &f( nx+1, y, z ); + fx = &f( nx, y, z ); + + MARDER_CBX(); + } + } + + // Exterior by + for( z = 1; z <= nz; z++ ) + { + f0 = &f( 1, 1, z ); + fy = &f( 1, 0, z ); + + for( x = 1; x <= nx; x++ ) + { + MARDER_CBY(); + + f0++; + fy++; + } + } + + for( z = 1; z <= nz; z++ ) + { + f0 = &f( 1, ny+1, z ); + fy = &f( 1, ny, z ); + + for( x = 1; x <= nx; x++ ) + { + MARDER_CBY(); + + f0++; + fy++; + } + } + + // Exterior bz + for( y = 1; y <= ny; y++ ) + { + f0 = &f( 1, y, 1 ); + fz = &f( 1, y, 0 ); + + for( x = 1; x <= nx; x++ ) + { + MARDER_CBZ(); + + f0++; + fz++; + } + } + + for( y = 1; y <= ny; y++ ) + { + f0 = &f( 1, y, nz+1 ); + fz = &f( 1, y, nz ); + + for( x = 1; x <= nx; x++ ) + { + MARDER_CBZ(); + + f0++; + fz++; + } + } + + // Wait for pipelines to finish up cleaning div_b in interior + + WAIT_PIPELINES(); + + local_adjust_norm_b(f,g); +} diff --git a/src/field_advance/standard/pipeline/clean_div_b_pipeline.h b/src/field_advance/standard/pipeline/clean_div_b_pipeline.h new file mode 100644 index 00000000..4b68c433 --- /dev/null +++ b/src/field_advance/standard/pipeline/clean_div_b_pipeline.h @@ -0,0 +1,42 @@ +#ifndef _clean_div_b_pipeline_h_ +#define _clean_div_b_pipeline_h_ + +#ifndef IN_clean_div_b_pipeline +#error "Only include clean_div_b_pipeline.h in clean_div_b_pipeline source files." 
+#endif + +#include "../../field_advance.h" + +typedef struct pipeline_args +{ + field_t * ALIGNED(128) f; + const grid_t * g; +} pipeline_args_t; + +#define f(x,y,z) f[ VOXEL( x, y, z, nx, ny, nz ) ] + +#define MARDER_CBX() f0->cbx += px*( f0->div_b_err - fx->div_b_err ) +#define MARDER_CBY() f0->cby += py*( f0->div_b_err - fy->div_b_err ) +#define MARDER_CBZ() f0->cbz += pz*( f0->div_b_err - fz->div_b_err ) + +void +clean_div_b_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +clean_div_b_pipeline_v4( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +clean_div_b_pipeline_v8( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +clean_div_b_pipeline_v16( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +#endif // _clean_div_b_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/clean_div_b_pipeline_v16.cc b/src/field_advance/standard/pipeline/clean_div_b_pipeline_v16.cc new file mode 100644 index 00000000..098ab959 --- /dev/null +++ b/src/field_advance/standard/pipeline/clean_div_b_pipeline_v16.cc @@ -0,0 +1,165 @@ +#define IN_sfa +#define IN_clean_div_b_pipeline + +#include "clean_div_b_pipeline.h" + +#include "../sfa_private.h" + +#if defined(V16_ACCELERATION) + +using namespace v16; + +void +clean_div_b_pipeline_v16( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + field_t * ALIGNED(128) f = args->f; + const grid_t * g = args->g; + + field_t * ALIGNED(16) f0; + field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; + int x, y, z, n_voxel; + + const int nx = g->nx; + const int ny = g->ny; + const int nz = g->nz; + + float px, py, pz, alphadt; + + px = ( nx > 1 ) ? g->rdx : 0; + py = ( ny > 1 ) ? g->rdy : 0; + pz = ( nz > 1 ) ? 
g->rdz : 0; + + alphadt = 0.3888889/( px*px + py*py + pz*pz ); + + px *= alphadt; + py *= alphadt; + pz *= alphadt; + + const v16float vpx(px); + const v16float vpy(py); + const v16float vpz(pz); + + v16float f0_cbx, f0_cby, f0_cbz; // Voxel block magnetic fields + v16float f0_div_b_err; // Voxel block div b errs + v16float fx_div_b_err; // Voxel block -x neighbor div b err + v16float fy_div_b_err; // Voxel block -y neighbor div b err + v16float fz_div_b_err; // Voxel block -z neighbor div b err + + field_t * ALIGNED(16) f000, * ALIGNED(16) f001, * ALIGNED(16) f002, * ALIGNED(16) f003; // Voxel block + field_t * ALIGNED(16) f004, * ALIGNED(16) f005, * ALIGNED(16) f006, * ALIGNED(16) f007; // Voxel block + field_t * ALIGNED(16) f008, * ALIGNED(16) f009, * ALIGNED(16) f010, * ALIGNED(16) f011; // Voxel block + field_t * ALIGNED(16) f012, * ALIGNED(16) f013, * ALIGNED(16) f014, * ALIGNED(16) f015; // Voxel block + + field_t * ALIGNED(16) fx00, * ALIGNED(16) fx01, * ALIGNED(16) fx02, * ALIGNED(16) fx03; // Voxel block +x neighbors + field_t * ALIGNED(16) fx04, * ALIGNED(16) fx05, * ALIGNED(16) fx06, * ALIGNED(16) fx07; // Voxel block +x neighbors + field_t * ALIGNED(16) fx08, * ALIGNED(16) fx09, * ALIGNED(16) fx10, * ALIGNED(16) fx11; // Voxel block +x neighbors + field_t * ALIGNED(16) fx12, * ALIGNED(16) fx13, * ALIGNED(16) fx14, * ALIGNED(16) fx15; // Voxel block +x neighbors + + field_t * ALIGNED(16) fy00, * ALIGNED(16) fy01, * ALIGNED(16) fy02, * ALIGNED(16) fy03; // Voxel block +y neighbors + field_t * ALIGNED(16) fy04, * ALIGNED(16) fy05, * ALIGNED(16) fy06, * ALIGNED(16) fy07; // Voxel block +y neighbors + field_t * ALIGNED(16) fy08, * ALIGNED(16) fy09, * ALIGNED(16) fy10, * ALIGNED(16) fy11; // Voxel block +y neighbors + field_t * ALIGNED(16) fy12, * ALIGNED(16) fy13, * ALIGNED(16) fy14, * ALIGNED(16) fy15; // Voxel block +y neighbors + + field_t * ALIGNED(16) fz00, * ALIGNED(16) fz01, * ALIGNED(16) fz02, * ALIGNED(16) fz03; // Voxel block +z neighbors + field_t * ALIGNED(16) fz04, * ALIGNED(16) fz05, * ALIGNED(16) fz06, * ALIGNED(16) fz07; // Voxel block +z neighbors + field_t * ALIGNED(16) fz08, * ALIGNED(16) fz09, * ALIGNED(16) fz10, * ALIGNED(16) fz11; // Voxel block +z neighbors + field_t * ALIGNED(16) fz12, * ALIGNED(16) fz13, * ALIGNED(16) fz14, * ALIGNED(16) fz15; // Voxel block +z neighbors + + // Process voxels assigned to this pipeline + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + // Process bulk of voxels 16 at a time + +# define LOAD_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x-1, y, z ); \ + fy = &f( x, y-1, z ); \ + fz = &f( x, y, z-1 ) + +# define NEXT_STENCIL(n) \ + f0##n = f0++; \ + fx##n = fx++; \ + fy##n = fy++; \ + fz##n = fz++; \ + x++; \ + if ( x > nx ) \ + { \ + x = 2, y++; \ + if ( y > ny ) y = 2, z++; \ + LOAD_STENCIL(); \ + } + + LOAD_STENCIL(); + + for( ; n_voxel > 15; n_voxel -= 16 ) + { + NEXT_STENCIL(00); + NEXT_STENCIL(01); + NEXT_STENCIL(02); + NEXT_STENCIL(03); + NEXT_STENCIL(04); + NEXT_STENCIL(05); + NEXT_STENCIL(06); + NEXT_STENCIL(07); + NEXT_STENCIL(08); + NEXT_STENCIL(09); + NEXT_STENCIL(10); + NEXT_STENCIL(11); + NEXT_STENCIL(12); + NEXT_STENCIL(13); + NEXT_STENCIL(14); + NEXT_STENCIL(15); + + load_16x4_tr( &f000->cbx, &f001->cbx, &f002->cbx, &f003->cbx, + &f004->cbx, &f005->cbx, &f006->cbx, &f007->cbx, + &f008->cbx, &f009->cbx, &f010->cbx, &f011->cbx, + &f012->cbx, &f013->cbx, &f014->cbx, &f015->cbx, + f0_cbx, f0_cby, f0_cbz, f0_div_b_err ); + + fx_div_b_err = v16float( fx00->div_b_err, 
fx01->div_b_err, fx02->div_b_err, fx03->div_b_err, + fx04->div_b_err, fx05->div_b_err, fx06->div_b_err, fx07->div_b_err, + fx08->div_b_err, fx09->div_b_err, fx10->div_b_err, fx11->div_b_err, + fx12->div_b_err, fx13->div_b_err, fx14->div_b_err, fx15->div_b_err ); + + fy_div_b_err = v16float( fy00->div_b_err, fy01->div_b_err, fy02->div_b_err, fy03->div_b_err, + fy04->div_b_err, fy05->div_b_err, fy06->div_b_err, fy07->div_b_err, + fy08->div_b_err, fy09->div_b_err, fy10->div_b_err, fy11->div_b_err, + fy12->div_b_err, fy13->div_b_err, fy14->div_b_err, fy15->div_b_err ); + + fz_div_b_err = v16float( fz00->div_b_err, fz01->div_b_err, fz02->div_b_err, fz03->div_b_err, + fz04->div_b_err, fz05->div_b_err, fz06->div_b_err, fz07->div_b_err, + fz08->div_b_err, fz09->div_b_err, fz10->div_b_err, fz11->div_b_err, + fz12->div_b_err, fz13->div_b_err, fz14->div_b_err, fz15->div_b_err ); + + f0_cbx = fma( f0_div_b_err - fx_div_b_err, px, f0_cbx ); + f0_cby = fma( f0_div_b_err - fy_div_b_err, py, f0_cby ); + f0_cbz = fma( f0_div_b_err - fz_div_b_err, pz, f0_cbz ); + + store_16x4_tr( f0_cbx, f0_cby, f0_cbz, f0_div_b_err, + &f000->cbx, &f001->cbx, &f002->cbx, &f003->cbx, + &f004->cbx, &f005->cbx, &f006->cbx, &f007->cbx, + &f008->cbx, &f009->cbx, &f010->cbx, &f011->cbx, + &f012->cbx, &f013->cbx, &f014->cbx, &f015->cbx ); + } + +# undef NEXT_STENCIL +# undef LOAD_STENCIL +} + +#else + +void +clean_div_b_pipeline_v16( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v16 implementation. + ERROR( ( "No clean_div_b_pipeline_v16 implementation." ) ); +} + +#endif diff --git a/src/field_advance/standard/pipeline/clean_div_b_pipeline_v4.cc b/src/field_advance/standard/pipeline/clean_div_b_pipeline_v4.cc new file mode 100644 index 00000000..058dc9a3 --- /dev/null +++ b/src/field_advance/standard/pipeline/clean_div_b_pipeline_v4.cc @@ -0,0 +1,126 @@ +#define IN_sfa +#define IN_clean_div_b_pipeline + +#include "clean_div_b_pipeline.h" + +#include "../sfa_private.h" + +#if defined(V4_ACCELERATION) + +using namespace v4; + +void +clean_div_b_pipeline_v4( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + field_t * ALIGNED(128) f = args->f; + const grid_t * g = args->g; + + field_t * ALIGNED(16) f0; + field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; + int x, y, z, n_voxel; + + const int nx = g->nx; + const int ny = g->ny; + const int nz = g->nz; + + float px, py, pz, alphadt; + + px = ( nx > 1 ) ? g->rdx : 0; + py = ( ny > 1 ) ? g->rdy : 0; + pz = ( nz > 1 ) ? 
g->rdz : 0; + + alphadt = 0.3888889/( px*px + py*py + pz*pz ); + + px *= alphadt; + py *= alphadt; + pz *= alphadt; + + const v4float vpx(px); + const v4float vpy(py); + const v4float vpz(pz); + + v4float f0_cbx, f0_cby, f0_cbz; // Voxel block magnetic fields + v4float f0_div_b_err; // Voxel block div b errs + v4float fx_div_b_err; // Voxel block -x neighbor div b err + v4float fy_div_b_err; // Voxel block -y neighbor div b err + v4float fz_div_b_err; // Voxel block -z neighbor div b err + + field_t * ALIGNED(16) f00, * ALIGNED(16) f01, * ALIGNED(16) f02, * ALIGNED(16) f03; // Voxel block + + field_t * ALIGNED(16) fx0, * ALIGNED(16) fx1, * ALIGNED(16) fx2, * ALIGNED(16) fx3; // Voxel block +x neighbors + + field_t * ALIGNED(16) fy0, * ALIGNED(16) fy1, * ALIGNED(16) fy2, * ALIGNED(16) fy3; // Voxel block +y neighbors + + field_t * ALIGNED(16) fz0, * ALIGNED(16) fz1, * ALIGNED(16) fz2, * ALIGNED(16) fz3; // Voxel block +z neighbors + + // Process voxels assigned to this pipeline + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + // Process bulk of voxels 4 at a time + +# define LOAD_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x-1, y, z ); \ + fy = &f( x, y-1, z ); \ + fz = &f( x, y, z-1 ) + +# define NEXT_STENCIL(n) \ + f0##n = f0++; \ + fx##n = fx++; \ + fy##n = fy++; \ + fz##n = fz++; \ + x++; \ + if ( x > nx ) \ + { \ + x = 2, y++; \ + if ( y > ny ) y = 2, z++; \ + LOAD_STENCIL(); \ + } + + LOAD_STENCIL(); + + for( ; n_voxel > 3; n_voxel -= 4 ) + { + NEXT_STENCIL(0); + NEXT_STENCIL(1); + NEXT_STENCIL(2); + NEXT_STENCIL(3); + + load_4x4_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, + f0_cbx, f0_cby, f0_cbz, f0_div_b_err ); + + fx_div_b_err = v4float( fx0->div_b_err, fx1->div_b_err, fx2->div_b_err, fx3->div_b_err ); + + fy_div_b_err = v4float( fy0->div_b_err, fy1->div_b_err, fy2->div_b_err, fy3->div_b_err ); + + fz_div_b_err = v4float( fz0->div_b_err, fz1->div_b_err, fz2->div_b_err, fz3->div_b_err ); + + f0_cbx = fma( f0_div_b_err - fx_div_b_err, px, f0_cbx ); + f0_cby = fma( f0_div_b_err - fy_div_b_err, py, f0_cby ); + f0_cbz = fma( f0_div_b_err - fz_div_b_err, pz, f0_cbz ); + + store_4x4_tr( f0_cbx, f0_cby, f0_cbz, f0_div_b_err, + &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx ); + } + +# undef NEXT_STENCIL +# undef LOAD_STENCIL +} + +#else + +void +clean_div_b_pipeline_v4( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v4 implementation. + ERROR( ( "No clean_div_b_pipeline_v4 implementation." ) ); +} + +#endif diff --git a/src/field_advance/standard/pipeline/clean_div_b_pipeline_v8.cc b/src/field_advance/standard/pipeline/clean_div_b_pipeline_v8.cc new file mode 100644 index 00000000..66ea3181 --- /dev/null +++ b/src/field_advance/standard/pipeline/clean_div_b_pipeline_v8.cc @@ -0,0 +1,139 @@ +#define IN_sfa +#define IN_clean_div_b_pipeline + +#include "clean_div_b_pipeline.h" + +#include "../sfa_private.h" + +#if defined(V8_ACCELERATION) + +using namespace v8; + +void +clean_div_b_pipeline_v8( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + field_t * ALIGNED(128) f = args->f; + const grid_t * g = args->g; + + field_t * ALIGNED(32) f0; + field_t * ALIGNED(32) fx, * ALIGNED(32) fy, * ALIGNED(32) fz; + int x, y, z, n_voxel; + + const int nx = g->nx; + const int ny = g->ny; + const int nz = g->nz; + + float px, py, pz, alphadt; + + px = ( nx > 1 ) ? g->rdx : 0; + py = ( ny > 1 ) ? g->rdy : 0; + pz = ( nz > 1 ) ? 
g->rdz : 0; + + alphadt = 0.3888889/( px*px + py*py + pz*pz ); + + px *= alphadt; + py *= alphadt; + pz *= alphadt; + + const v8float vpx(px); + const v8float vpy(py); + const v8float vpz(pz); + + v8float f0_cbx, f0_cby, f0_cbz; // Voxel block magnetic fields + v8float f0_div_b_err; // Voxel block div b errs + v8float fx_div_b_err; // Voxel block -x neighbor div b err + v8float fy_div_b_err; // Voxel block -y neighbor div b err + v8float fz_div_b_err; // Voxel block -z neighbor div b err + + field_t * ALIGNED(32) f00, * ALIGNED(32) f01, * ALIGNED(32) f02, * ALIGNED(32) f03; // Voxel block + field_t * ALIGNED(32) f04, * ALIGNED(32) f05, * ALIGNED(32) f06, * ALIGNED(32) f07; // Voxel block + + field_t * ALIGNED(32) fx0, * ALIGNED(32) fx1, * ALIGNED(32) fx2, * ALIGNED(32) fx3; // Voxel block +x neighbors + field_t * ALIGNED(32) fx4, * ALIGNED(32) fx5, * ALIGNED(32) fx6, * ALIGNED(32) fx7; // Voxel block +x neighbors + + field_t * ALIGNED(32) fy0, * ALIGNED(32) fy1, * ALIGNED(32) fy2, * ALIGNED(32) fy3; // Voxel block +y neighbors + field_t * ALIGNED(32) fy4, * ALIGNED(32) fy5, * ALIGNED(32) fy6, * ALIGNED(32) fy7; // Voxel block +y neighbors + + field_t * ALIGNED(32) fz0, * ALIGNED(32) fz1, * ALIGNED(32) fz2, * ALIGNED(32) fz3; // Voxel block +z neighbors + field_t * ALIGNED(32) fz4, * ALIGNED(32) fz5, * ALIGNED(32) fz6, * ALIGNED(32) fz7; // Voxel block +z neighbors + + // Process voxels assigned to this pipeline + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + // Process bulk of voxels 8 at a time + +# define LOAD_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x-1, y, z ); \ + fy = &f( x, y-1, z ); \ + fz = &f( x, y, z-1 ) + +# define NEXT_STENCIL(n) \ + f0##n = f0++; \ + fx##n = fx++; \ + fy##n = fy++; \ + fz##n = fz++; \ + x++; \ + if ( x > nx ) \ + { \ + x = 2, y++; \ + if ( y > ny ) y = 2, z++; \ + LOAD_STENCIL(); \ + } + + LOAD_STENCIL(); + + for( ; n_voxel > 7; n_voxel -= 8 ) + { + NEXT_STENCIL(0); + NEXT_STENCIL(1); + NEXT_STENCIL(2); + NEXT_STENCIL(3); + NEXT_STENCIL(4); + NEXT_STENCIL(5); + NEXT_STENCIL(6); + NEXT_STENCIL(7); + + load_8x4_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, + &f04->cbx, &f05->cbx, &f06->cbx, &f07->cbx, + f0_cbx, f0_cby, f0_cbz, f0_div_b_err ); + + fx_div_b_err = v8float( fx0->div_b_err, fx1->div_b_err, fx2->div_b_err, fx3->div_b_err, + fx4->div_b_err, fx5->div_b_err, fx6->div_b_err, fx7->div_b_err ); + + fy_div_b_err = v8float( fy0->div_b_err, fy1->div_b_err, fy2->div_b_err, fy3->div_b_err, + fy4->div_b_err, fy5->div_b_err, fy6->div_b_err, fy7->div_b_err ); + + fz_div_b_err = v8float( fz0->div_b_err, fz1->div_b_err, fz2->div_b_err, fz3->div_b_err, + fz4->div_b_err, fz5->div_b_err, fz6->div_b_err, fz7->div_b_err ); + + f0_cbx = fma( f0_div_b_err - fx_div_b_err, px, f0_cbx ); + f0_cby = fma( f0_div_b_err - fy_div_b_err, py, f0_cby ); + f0_cbz = fma( f0_div_b_err - fz_div_b_err, pz, f0_cbz ); + + store_8x4_tr( f0_cbx, f0_cby, f0_cbz, f0_div_b_err, + &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, + &f04->cbx, &f05->cbx, &f06->cbx, &f07->cbx ); + } + +# undef NEXT_STENCIL +# undef LOAD_STENCIL +} + +#else + +void +clean_div_b_pipeline_v8( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v8 implementation. + ERROR( ( "No clean_div_b_pipeline_v8 implementation." 
) ); +} + +#endif diff --git a/src/field_advance/standard/pipeline/clean_div_e_pipeline.c b/src/field_advance/standard/pipeline/clean_div_e_pipeline.c new file mode 100644 index 00000000..05da2718 --- /dev/null +++ b/src/field_advance/standard/pipeline/clean_div_e_pipeline.c @@ -0,0 +1,116 @@ +#define IN_sfa +#define IN_clean_div_e_pipeline + +#include "clean_div_e_pipeline.h" + +#include "../sfa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +static void +clean_div_e_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + INIT_STENCIL(); + for( ; n_voxel; n_voxel-- ) { + MARDER_EX(); MARDER_EY(); MARDER_EZ(); + NEXT_STENCIL(); + } +} + +#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) + +#error "Not implemented" + +#endif + +void +clean_div_e_pipeline( field_array_t * fa ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); + } + + // Do majority of field components in single pass on the pipelines. + // The host handles stragglers. + + pipeline_args_t args[1]; + + args->f = fa->f; + args->p = (sfa_params_t *)fa->params; + args->g = fa->g; + + EXEC_PIPELINES( clean_div_e, args, 0 ); + + // While pipelines are busy, do left overs on the host + + do { + DECLARE_STENCIL(); + + // Do left over ex + for( y=1; y<=ny+1; y++ ) { + f0 = &f(1,y,nz+1); + fx = &f(2,y,nz+1); + for( x=1; x<=nx; x++ ) { + MARDER_EX(); + f0++; fx++; + } + } + for( z=1; z<=nz; z++ ) { + f0 = &f(1,ny+1,z); + fx = &f(2,ny+1,z); + for( x=1; x<=nx; x++ ) { + MARDER_EX(); + f0++; fx++; + } + } + + // Do left over ey + for( z=1; z<=nz+1; z++ ) { + for( y=1; y<=ny; y++ ) { + f0 = &f(nx+1,y, z); + fy = &f(nx+1,y+1,z); + MARDER_EY(); + } + } + for( y=1; y<=ny; y++ ) { + f0 = &f(1,y, nz+1); + fy = &f(1,y+1,nz+1); + for( x=1; x<=nx; x++ ) { + MARDER_EY(); + f0++; fy++; + } + } + + // Do left over ez + for( z=1; z<=nz; z++ ) { + f0 = &f(1,ny+1,z); + fz = &f(1,ny+1,z+1); + for( x=1; x<=nx+1; x++ ) { + MARDER_EZ(); + f0++; fz++; + } + } + for( z=1; z<=nz; z++ ) { + for( y=1; y<=ny; y++ ) { + f0 = &f(nx+1,y,z); + fz = &f(nx+1,y,z+1); + MARDER_EZ(); + } + } + } while(0); + + WAIT_PIPELINES(); + + local_adjust_tang_e( fa->f, fa->g ); +} + diff --git a/src/field_advance/standard/pipeline/clean_div_e_pipeline.h b/src/field_advance/standard/pipeline/clean_div_e_pipeline.h new file mode 100644 index 00000000..bfd7b154 --- /dev/null +++ b/src/field_advance/standard/pipeline/clean_div_e_pipeline.h @@ -0,0 +1,63 @@ +#ifndef _clean_div_e_pipeline_h_ +#define _clean_div_e_pipeline_h_ + +#ifndef IN_clean_div_e_pipeline +#error "Only include clean_div_e_pipeline.h in clean_div_e_pipeline source files." +#endif + +#include "../sfa_private.h" + +typedef struct pipeline_args +{ + field_t * ALIGNED(128) f; + const sfa_params_t * p; + const grid_t * g; +} pipeline_args_t; + +#define DECLARE_STENCIL() \ + field_t * ALIGNED(128) f = args->f; \ + const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ + const grid_t * g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float _rdx = (nx>1) ? g->rdx : 0; \ + const float _rdy = (ny>1) ? g->rdy : 0; \ + const float _rdz = (nz>1) ? 
g->rdz : 0; \ + const float alphadt = 0.3888889/( _rdx*_rdx + _rdy*_rdy + _rdz*_rdz ); \ + const float px = alphadt*_rdx; \ + const float py = alphadt*_rdy; \ + const float pz = alphadt*_rdz; \ + \ + field_t * ALIGNED(16) f0; \ + field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ + int x, y, z + +#define f(x,y,z) f[ VOXEL(x,y,z,nx,ny,nz) ] + +#define INIT_STENCIL() \ + f0 = &f(x, y, z ); \ + fx = &f(x+1,y, z ); \ + fy = &f(x, y+1,z ); \ + fz = &f(x, y, z+1) + +#define NEXT_STENCIL() \ + f0++; fx++; fy++; fz++; x++; \ + if( x>nx ) { \ + /**/ y++; x = 1; \ + if( y>ny ) z++; if( y>ny ) y = 1; \ + INIT_STENCIL(); \ + } + +#define MARDER_EX() \ + f0->ex += m[f0->ematx].drivex*px*(fx->div_e_err-f0->div_e_err) +#define MARDER_EY() \ + f0->ey += m[f0->ematy].drivey*py*(fy->div_e_err-f0->div_e_err) +#define MARDER_EZ() \ + f0->ez += m[f0->ematz].drivez*pz*(fz->div_e_err-f0->div_e_err) + +static void +clean_div_e_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +#endif // _clean_div_e_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/compute_curl_b_pipeline.cc b/src/field_advance/standard/pipeline/compute_curl_b_pipeline.cc new file mode 100644 index 00000000..322b9b82 --- /dev/null +++ b/src/field_advance/standard/pipeline/compute_curl_b_pipeline.cc @@ -0,0 +1,326 @@ +#define IN_sfa +#define IN_compute_curl_b_pipeline + +#define HAS_V4_PIPELINE +#define HAS_V8_PIPELINE +#define HAS_V16_PIPELINE + +#include "compute_curl_b_pipeline.h" + +#include "../sfa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +//----------------------------------------------------------------------------// +// Reference implementation for a compute_curl_b pipeline function which does +// not make use of explicit calls to vector intrinsic functions. +//----------------------------------------------------------------------------// + +void +compute_curl_b_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + INIT_STENCIL(); + + for( ; n_voxel; n_voxel-- ) + { + UPDATE_EX(); + UPDATE_EY(); + UPDATE_EZ(); + + NEXT_STENCIL(); + } +} + +//----------------------------------------------------------------------------// +// Top level function to select and call the proper compute_curl_b pipeline +// function. +//----------------------------------------------------------------------------// + +void +compute_curl_b_pipeline( field_array_t * RESTRICT fa ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); + } + + //--------------------------------------------------------------------------// + // Begin tangential B ghost setup + //--------------------------------------------------------------------------// + + begin_remote_ghost_tang_b( fa->f, fa->g ); + + local_ghost_tang_b( fa->f, fa->g ); + + //--------------------------------------------------------------------------// + // Update interior fields + //--------------------------------------------------------------------------// + // Note: ex all (1:nx, 1:ny+1,1,nz+1) interior (1:nx,2:ny,2:nz) + // Note: ey all (1:nx+1,1:ny, 1:nz+1) interior (2:nx,1:ny,2:nz) + // Note: ez all (1:nx+1,1:ny+1,1:nz ) interior (1:nx,1:ny,2:nz) + //--------------------------------------------------------------------------// + + // Do majority interior in a single pass. The host handles + // stragglers. 
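+  //
+  // The control flow below mirrors advance_e_pipeline() above: the pipelines
+  // sweep the bulk interior while the host updates the thin straggler strips,
+  // WAIT_PIPELINES() joins, the tangential B ghost exchange is completed with
+  // end_remote_ghost_tang_b(), and the host then updates the exterior faces.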
+ + pipeline_args_t args[1]; + args->f = fa->f; + args->p = (sfa_params_t *)fa->params; + args->g = fa->g; + + EXEC_PIPELINES( compute_curl_b, args, 0 ); + + // While the pipelines are busy, do non-bulk interior fields + + DECLARE_STENCIL(); + + // Do left over interior ex + for( z = 2; z <= nz; z++ ) + { + for( y = 2; y <= ny; y++ ) + { + f0 = &f( 1, y, z ); + fy = &f( 1, y-1, z ); + fz = &f( 1, y, z-1 ); + + UPDATE_EX(); + } + } + + // Do left over interior ey + for( z = 2; z <= nz; z++ ) + { + f0 = &f( 2, 1, z ); + fx = &f( 1, 1, z ); + fz = &f( 2, 1, z-1 ); + + for( x = 2; x <= nx; x++ ) + { + UPDATE_EY(); + + f0++; + fx++; + fz++; + } + } + + // Do left over interior ez + for( y = 2; y <= ny; y++ ) + { + f0 = &f( 2, y, 1 ); + fx = &f( 1, y, 1 ); + fy = &f( 2, y-1, 1 ); + + for( x = 2; x <= nx; x++ ) + { + UPDATE_EZ(); + + f0++; + fx++; + fy++; + } + } + + WAIT_PIPELINES(); + + //--------------------------------------------------------------------------// + // Finish tangential B ghost setup + //--------------------------------------------------------------------------// + + end_remote_ghost_tang_b( fa->f, fa->g ); + + //--------------------------------------------------------------------------// + // Update exterior fields + //--------------------------------------------------------------------------// + + // Do exterior ex + for( y = 1; y <= ny+1; y++ ) + { + f0 = &f( 1, y, 1 ); + fy = &f( 1, y-1, 1 ); + fz = &f( 1, y, 0 ); + + for( x = 1; x <= nx; x++ ) + { + UPDATE_EX(); + + f0++; + fy++; + fz++; + } + } + + for( y = 1; y <= ny+1; y++ ) + { + f0 = &f( 1, y, nz+1 ); + fy = &f( 1, y-1, nz+1 ); + fz = &f( 1, y, nz ); + + for( x = 1; x <= nx; x++ ) + { + UPDATE_EX(); + + f0++; + fy++; + fz++; + } + } + + for( z = 2; z <= nz; z++ ) + { + f0 = &f( 1, 1, z ); + fy = &f( 1, 0, z ); + fz = &f( 1, 1, z-1 ); + + for( x = 1; x <= nx; x++ ) + { + UPDATE_EX(); + + f0++; + fy++; + fz++; + } + } + + for( z = 2; z <= nz; z++ ) + { + f0 = &f( 1, ny+1, z ); + fy = &f( 1, ny, z ); + fz = &f( 1, ny+1, z-1 ); + + for( x = 1; x <= nx; x++ ) + { + UPDATE_EX(); + + f0++; + fy++; + fz++; + } + } + + // Do exterior ey + for( z = 1; z <= nz+1; z++ ) + { + for( y = 1; y <= ny; y++ ) + { + f0 = &f( 1, y, z ); + fx = &f( 0, y, z ); + fz = &f( 1, y, z-1 ); + + UPDATE_EY(); + } + } + + for( z = 1; z <= nz+1; z++ ) + { + for( y = 1; y <= ny; y++ ) + { + f0 = &f( nx+1, y, z ); + fx = &f( nx, y, z ); + fz = &f( nx+1, y, z-1 ); + + UPDATE_EY(); + } + } + + for( y = 1; y <= ny; y++ ) + { + f0 = &f( 2, y, 1 ); + fx = &f( 1, y, 1 ); + fz = &f( 2, y, 0 ); + + for( x = 2; x <= nx; x++ ) + { + UPDATE_EY(); + + f0++; + fx++; + fz++; + } + } + + for( y = 1; y <= ny; y++ ) + { + f0 = &f( 2, y, nz+1 ); + fx = &f( 1, y, nz+1 ); + fz = &f( 2, y, nz ); + + for( x = 2; x <= nx; x++ ) + { + UPDATE_EY(); + + f0++; + fx++; + fz++; + } + } + + // Do exterior ez + for( z = 1; z <= nz; z++ ) + { + f0 = &f( 1, 1, z ); + fx = &f( 0, 1, z ); + fy = &f( 1, 0, z ); + + for( x = 1; x <= nx+1; x++ ) + { + UPDATE_EZ(); + + f0++; + fx++; + fy++; + } + } + + for( z = 1; z <= nz; z++ ) + { + f0 = &f( 1, ny+1, z ); + fx = &f( 0, ny+1, z ); + fy = &f( 1, ny, z ); + + for( x = 1; x <= nx+1; x++ ) + { + UPDATE_EZ(); + + f0++; + fx++; + fy++; + } + } + + for( z = 1; z <= nz; z++ ) + { + for( y = 2; y <= ny; y++ ) + { + f0 = &f( 1, y, z ); + fx = &f( 0, y, z ); + fy = &f( 1, y-1, z ); + + UPDATE_EZ(); + } + } + + for( z = 1; z <= nz; z++ ) + { + for( y = 2; y <= ny; y++ ) + { + f0 = &f( nx+1, y, z ); + fx = &f( nx, y, z ); + fy = &f( nx+1, y-1, z ); + + 
UPDATE_EZ(); + } + } + + local_adjust_tang_e( fa->f, fa->g ); +} diff --git a/src/field_advance/standard/pipeline/compute_curl_b_pipeline.h b/src/field_advance/standard/pipeline/compute_curl_b_pipeline.h new file mode 100644 index 00000000..e55827d2 --- /dev/null +++ b/src/field_advance/standard/pipeline/compute_curl_b_pipeline.h @@ -0,0 +1,86 @@ +#ifndef _compute_curl_b_pipeline_h_ +#define _compute_curl_b_pipeline_h_ + +#ifndef IN_compute_curl_b_pipeline +#error "Only include compute_curl_b_pipeline.h in compute_curl_b_pipeline source files." +#endif + +#include "../sfa_private.h" + +typedef struct pipeline_args +{ + field_t * ALIGNED(128) f; + const sfa_params_t * p; + const grid_t * g; +} pipeline_args_t; + +#define DECLARE_STENCIL() \ + field_t * ALIGNED(128) f = args->f; \ + const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ + const grid_t * g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float px = (nx>1) ? g->cvac*g->dt*g->rdx : 0; \ + const float py = (ny>1) ? g->cvac*g->dt*g->rdy : 0; \ + const float pz = (nz>1) ? g->cvac*g->dt*g->rdz : 0; \ + \ + field_t * ALIGNED(16) f0; \ + field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ + int x, y, z + +#define f(x,y,z) f[ VOXEL( x, y, z, nx, ny, nz ) ] + +#define INIT_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x-1, y, z ); \ + fy = &f( x, y-1, z ); \ + fz = &f( x, y, z-1 ) + +#define NEXT_STENCIL() \ + f0++; fx++; fy++; fz++; x++; \ + if ( x > nx ) \ + { \ + y++; x = 2; \ + if ( y > ny ) z++; if ( y > ny ) y = 2; \ + INIT_STENCIL(); \ + } + +#define UPDATE_EX() \ + f0->tcax = ( py * ( f0->cbz * m[f0->fmatz].rmuz - \ + fy->cbz * m[fy->fmatz].rmuz) - \ + pz * ( f0->cby * m[f0->fmaty].rmuy - \ + fz->cby * m[fz->fmaty].rmuy ) ) + +#define UPDATE_EY() \ + f0->tcay = ( pz * ( f0->cbx * m[f0->fmatx].rmux - \ + fz->cbx * m[fz->fmatx].rmux) - \ + px * ( f0->cbz * m[f0->fmatz].rmuz - \ + fx->cbz * m[fx->fmatz].rmuz ) ) + +#define UPDATE_EZ() \ + f0->tcaz = ( px * ( f0->cby * m[f0->fmaty].rmuy - \ + fx->cby * m[fx->fmaty].rmuy) - \ + py * ( f0->cbx * m[f0->fmatx].rmux - \ + fy->cbx * m[fy->fmatx].rmux ) ) + +void +compute_curl_b_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +compute_curl_b_pipeline_v4( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +compute_curl_b_pipeline_v8( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +compute_curl_b_pipeline_v16( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +#endif // _compute_curl_b_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/compute_curl_b_pipeline_v16.cc b/src/field_advance/standard/pipeline/compute_curl_b_pipeline_v16.cc new file mode 100644 index 00000000..0da2042d --- /dev/null +++ b/src/field_advance/standard/pipeline/compute_curl_b_pipeline_v16.cc @@ -0,0 +1,186 @@ +#define IN_sfa +#define IN_compute_curl_b_pipeline + +#include "compute_curl_b_pipeline.h" + +#include "../sfa_private.h" + +#if defined(V16_ACCELERATION) + +using namespace v16; + +void +compute_curl_b_pipeline_v16( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + const v16float vpx( px ); + const v16float vpy( py ); + const v16float vpz( pz ); + + v16float save1, dummy; + + v16float f0_cbx, f0_cby, f0_cbz; + v16float f0_tcax, f0_tcay, f0_tcaz; + v16float fx_cby, fx_cbz; + v16float fy_cbx, fy_cbz; 
+ v16float fz_cbx, fz_cby; + v16float m_f0_rmux, m_f0_rmuy, m_f0_rmuz; + v16float m_fx_rmuy, m_fx_rmuz; + v16float m_fy_rmux, m_fy_rmuz; + v16float m_fz_rmux, m_fz_rmuy; + + v16float f0_cbx_rmux, f0_cby_rmuy, f0_cbz_rmuz; + + field_t * ALIGNED(16) f000, * ALIGNED(16) f001, * ALIGNED(16) f002, * ALIGNED(16) f003; // Voxel block + field_t * ALIGNED(16) f004, * ALIGNED(16) f005, * ALIGNED(16) f006, * ALIGNED(16) f007; // Voxel block + field_t * ALIGNED(16) f008, * ALIGNED(16) f009, * ALIGNED(16) f010, * ALIGNED(16) f011; // Voxel block + field_t * ALIGNED(16) f012, * ALIGNED(16) f013, * ALIGNED(16) f014, * ALIGNED(16) f015; // Voxel block + + field_t * ALIGNED(16) fx00, * ALIGNED(16) fx01, * ALIGNED(16) fx02, * ALIGNED(16) fx03; // Voxel block +x neighbors + field_t * ALIGNED(16) fx04, * ALIGNED(16) fx05, * ALIGNED(16) fx06, * ALIGNED(16) fx07; // Voxel block +x neighbors + field_t * ALIGNED(16) fx08, * ALIGNED(16) fx09, * ALIGNED(16) fx10, * ALIGNED(16) fx11; // Voxel block +x neighbors + field_t * ALIGNED(16) fx12, * ALIGNED(16) fx13, * ALIGNED(16) fx14, * ALIGNED(16) fx15; // Voxel block +x neighbors + + field_t * ALIGNED(16) fy00, * ALIGNED(16) fy01, * ALIGNED(16) fy02, * ALIGNED(16) fy03; // Voxel block +y neighbors + field_t * ALIGNED(16) fy04, * ALIGNED(16) fy05, * ALIGNED(16) fy06, * ALIGNED(16) fy07; // Voxel block +y neighbors + field_t * ALIGNED(16) fy08, * ALIGNED(16) fy09, * ALIGNED(16) fy10, * ALIGNED(16) fy11; // Voxel block +y neighbors + field_t * ALIGNED(16) fy12, * ALIGNED(16) fy13, * ALIGNED(16) fy14, * ALIGNED(16) fy15; // Voxel block +y neighbors + + field_t * ALIGNED(16) fz00, * ALIGNED(16) fz01, * ALIGNED(16) fz02, * ALIGNED(16) fz03; // Voxel block +z neighbors + field_t * ALIGNED(16) fz04, * ALIGNED(16) fz05, * ALIGNED(16) fz06, * ALIGNED(16) fz07; // Voxel block +z neighbors + field_t * ALIGNED(16) fz08, * ALIGNED(16) fz09, * ALIGNED(16) fz10, * ALIGNED(16) fz11; // Voxel block +z neighbors + field_t * ALIGNED(16) fz12, * ALIGNED(16) fz13, * ALIGNED(16) fz14, * ALIGNED(16) fz15; // Voxel block +z neighbors + + // Process the bulk of the voxels 16 at a time + + INIT_STENCIL(); + + for( ; n_voxel > 15; n_voxel -= 16 ) + { + f000 = f0; fx00 = fx; fy00 = fy; fz00 = fz; NEXT_STENCIL(); + f001 = f0; fx01 = fx; fy01 = fy; fz01 = fz; NEXT_STENCIL(); + f002 = f0; fx02 = fx; fy02 = fy; fz02 = fz; NEXT_STENCIL(); + f003 = f0; fx03 = fx; fy03 = fy; fz03 = fz; NEXT_STENCIL(); + f004 = f0; fx04 = fx; fy04 = fy; fz04 = fz; NEXT_STENCIL(); + f005 = f0; fx05 = fx; fy05 = fy; fz05 = fz; NEXT_STENCIL(); + f006 = f0; fx06 = fx; fy06 = fy; fz06 = fz; NEXT_STENCIL(); + f007 = f0; fx07 = fx; fy07 = fy; fz07 = fz; NEXT_STENCIL(); + f008 = f0; fx08 = fx; fy08 = fy; fz08 = fz; NEXT_STENCIL(); + f009 = f0; fx09 = fx; fy09 = fy; fz09 = fz; NEXT_STENCIL(); + f010 = f0; fx10 = fx; fy10 = fy; fz10 = fz; NEXT_STENCIL(); + f011 = f0; fx11 = fx; fy11 = fy; fz11 = fz; NEXT_STENCIL(); + f012 = f0; fx12 = fx; fy12 = fy; fz12 = fz; NEXT_STENCIL(); + f013 = f0; fx13 = fx; fy13 = fy; fz13 = fz; NEXT_STENCIL(); + f014 = f0; fx14 = fx; fy14 = fy; fz14 = fz; NEXT_STENCIL(); + f015 = f0; fx15 = fx; fy15 = fy; fz15 = fz; NEXT_STENCIL(); + + //------------------------------------------------------------------------// + // Load field data. 
+ //------------------------------------------------------------------------// + + load_16x3_tr( &f000->cbx, &f001->cbx, &f002->cbx, &f003->cbx, + &f004->cbx, &f005->cbx, &f006->cbx, &f007->cbx, + &f008->cbx, &f009->cbx, &f010->cbx, &f011->cbx, + &f012->cbx, &f013->cbx, &f014->cbx, &f015->cbx, + f0_cbx, f0_cby, f0_cbz ); + + load_16x4_tr( &f000->tcax, &f001->tcax, &f002->tcax, &f003->tcax, + &f004->tcax, &f005->tcax, &f006->tcax, &f007->tcax, + &f008->tcax, &f009->tcax, &f010->tcax, &f011->tcax, + &f012->tcax, &f013->tcax, &f014->tcax, &f015->tcax, + f0_tcax, f0_tcay, f0_tcaz, save1 ); + + load_16x3_tr( &fx00->cbx, &fx01->cbx, &fx02->cbx, &fx03->cbx, + &fx04->cbx, &fx05->cbx, &fx06->cbx, &fx07->cbx, + &fx08->cbx, &fx09->cbx, &fx10->cbx, &fx11->cbx, + &fx12->cbx, &fx13->cbx, &fx14->cbx, &fx15->cbx, + dummy, fx_cby, fx_cbz ); + + load_16x3_tr( &fy00->cbx, &fy01->cbx, &fy02->cbx, &fy03->cbx, + &fy04->cbx, &fy05->cbx, &fy06->cbx, &fy07->cbx, + &fy08->cbx, &fy09->cbx, &fy10->cbx, &fy11->cbx, + &fy12->cbx, &fy13->cbx, &fy14->cbx, &fy15->cbx, + fy_cbx, dummy, fy_cbz ); + + load_16x2_tr( &fz00->cbx, &fz01->cbx, &fz02->cbx, &fz03->cbx, + &fz04->cbx, &fz05->cbx, &fz06->cbx, &fz07->cbx, + &fz08->cbx, &fz09->cbx, &fz10->cbx, &fz11->cbx, + &fz12->cbx, &fz13->cbx, &fz14->cbx, &fz15->cbx, + fz_cbx, fz_cby ); + +# define LOAD_RMU(V,D) m_f##V##_rmu##D=v16float( m[f##V##00->fmat##D].rmu##D, \ + m[f##V##01->fmat##D].rmu##D, \ + m[f##V##02->fmat##D].rmu##D, \ + m[f##V##03->fmat##D].rmu##D, \ + m[f##V##04->fmat##D].rmu##D, \ + m[f##V##05->fmat##D].rmu##D, \ + m[f##V##06->fmat##D].rmu##D, \ + m[f##V##07->fmat##D].rmu##D, \ + m[f##V##08->fmat##D].rmu##D, \ + m[f##V##09->fmat##D].rmu##D, \ + m[f##V##10->fmat##D].rmu##D, \ + m[f##V##11->fmat##D].rmu##D, \ + m[f##V##12->fmat##D].rmu##D, \ + m[f##V##13->fmat##D].rmu##D, \ + m[f##V##14->fmat##D].rmu##D, \ + m[f##V##15->fmat##D].rmu##D ) + + LOAD_RMU(0,x); LOAD_RMU(0,y); LOAD_RMU(0,z); + LOAD_RMU(x,y); LOAD_RMU(x,z); + LOAD_RMU(y,x); LOAD_RMU(y,z); + LOAD_RMU(z,x); LOAD_RMU(z,y); + +# undef LOAD_RMU + + f0_cbx_rmux = f0_cbx * m_f0_rmux; + f0_cby_rmuy = f0_cby * m_f0_rmuy; + f0_cbz_rmuz = f0_cbz * m_f0_rmuz; + + f0_tcax = fms( vpy, + fnms( fy_cbz, m_fy_rmuz, f0_cbz_rmuz ), + vpz * fnms( fz_cby, m_fz_rmuy, f0_cby_rmuy ) ); + + f0_tcay = fms( vpz, + fnms( fz_cbx, m_fz_rmux, f0_cbx_rmux ), + vpx * fnms( fx_cbz, m_fx_rmuz, f0_cbz_rmuz ) ); + + f0_tcaz = fms( vpx, + fnms( fx_cby, m_fx_rmuy, f0_cby_rmuy ), + vpy * fnms( fy_cbx, m_fy_rmux, f0_cbx_rmux ) ); + + //------------------------------------------------------------------------// + // Note: + //------------------------------------------------------------------------// + // Unlike load_16x3 versus load_16x4, store_16x4 is much more efficient + // than store_16x3. + //------------------------------------------------------------------------// + + store_16x4_tr( f0_tcax, f0_tcay, f0_tcaz, save1, + &f000->tcax, &f001->tcax, &f002->tcax, &f003->tcax, + &f004->tcax, &f005->tcax, &f006->tcax, &f007->tcax, + &f008->tcax, &f009->tcax, &f010->tcax, &f011->tcax, + &f012->tcax, &f013->tcax, &f014->tcax, &f015->tcax ); + } +} + +#else + +void +compute_curl_b_pipeline_v16( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v16 implementation. + ERROR( ( "No compute_curl_b_pipeline_v16 implementation." 
) ); +} + +#endif diff --git a/src/field_advance/standard/pipeline/compute_curl_b_pipeline_v4.cc b/src/field_advance/standard/pipeline/compute_curl_b_pipeline_v4.cc new file mode 100644 index 00000000..c90812ca --- /dev/null +++ b/src/field_advance/standard/pipeline/compute_curl_b_pipeline_v4.cc @@ -0,0 +1,132 @@ +#define IN_sfa +#define IN_compute_curl_b_pipeline + +#include "compute_curl_b_pipeline.h" + +#include "../sfa_private.h" + +#if defined(V4_ACCELERATION) + +using namespace v4; + +void +compute_curl_b_pipeline_v4( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + const v4float vpx( px ); + const v4float vpy( py ); + const v4float vpz( pz ); + + v4float save1, dummy; + + v4float f0_cbx, f0_cby, f0_cbz; + v4float f0_tcax, f0_tcay, f0_tcaz; + v4float fx_cby, fx_cbz; + v4float fy_cbx, fy_cbz; + v4float fz_cbx, fz_cby; + v4float m_f0_rmux, m_f0_rmuy, m_f0_rmuz; + v4float m_fx_rmuy, m_fx_rmuz; + v4float m_fy_rmux, m_fy_rmuz; + v4float m_fz_rmux, m_fz_rmuy; + + v4float f0_cbx_rmux, f0_cby_rmuy, f0_cbz_rmuz; + + field_t * ALIGNED(16) f00, * ALIGNED(16) f01, * ALIGNED(16) f02, * ALIGNED(16) f03; // Voxel block + + field_t * ALIGNED(16) fx0, * ALIGNED(16) fx1, * ALIGNED(16) fx2, * ALIGNED(16) fx3; // Voxel block +x neighbors + + field_t * ALIGNED(16) fy0, * ALIGNED(16) fy1, * ALIGNED(16) fy2, * ALIGNED(16) fy3; // Voxel block +y neighbors + + field_t * ALIGNED(16) fz0, * ALIGNED(16) fz1, * ALIGNED(16) fz2, * ALIGNED(16) fz3; // Voxel block +z neighbors + + // Process the bulk of the voxels 4 at a time + + INIT_STENCIL(); + + for( ; n_voxel > 3; n_voxel -= 4 ) + { + f00 = f0; fx0 = fx; fy0 = fy; fz0 = fz; NEXT_STENCIL(); + f01 = f0; fx1 = fx; fy1 = fy; fz1 = fz; NEXT_STENCIL(); + f02 = f0; fx2 = fx; fy2 = fy; fz2 = fz; NEXT_STENCIL(); + f03 = f0; fx3 = fx; fy3 = fy; fz3 = fz; NEXT_STENCIL(); + + //------------------------------------------------------------------------// + // Load field data. 
+ //------------------------------------------------------------------------// + + load_4x3_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, + f0_cbx, f0_cby, f0_cbz ); + + load_4x4_tr( &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax, + f0_tcax, f0_tcay, f0_tcaz, save1 ); + + load_4x3_tr( &fx0->cbx, &fx1->cbx, &fx2->cbx, &fx3->cbx, + dummy, fx_cby, fx_cbz ); + + load_4x3_tr( &fy0->cbx, &fy1->cbx, &fy2->cbx, &fy3->cbx, + fy_cbx, dummy, fy_cbz ); + + load_4x2_tr( &fz0->cbx, &fz1->cbx, &fz2->cbx, &fz3->cbx, + fz_cbx, fz_cby ); + +# define LOAD_RMU(V,D) m_f##V##_rmu##D=v4float( m[f##V##0->fmat##D].rmu##D, \ + m[f##V##1->fmat##D].rmu##D, \ + m[f##V##2->fmat##D].rmu##D, \ + m[f##V##3->fmat##D].rmu##D ) + + LOAD_RMU(0,x); LOAD_RMU(0,y); LOAD_RMU(0,z); + LOAD_RMU(x,y); LOAD_RMU(x,z); + LOAD_RMU(y,x); LOAD_RMU(y,z); + LOAD_RMU(z,x); LOAD_RMU(z,y); + +# undef LOAD_RMU + + f0_cbx_rmux = f0_cbx * m_f0_rmux; + f0_cby_rmuy = f0_cby * m_f0_rmuy; + f0_cbz_rmuz = f0_cbz * m_f0_rmuz; + + f0_tcax = fms( vpy, + fnms( fy_cbz, m_fy_rmuz, f0_cbz_rmuz ), + vpz * fnms( fz_cby, m_fz_rmuy, f0_cby_rmuy ) ); + + f0_tcay = fms( vpz, + fnms( fz_cbx, m_fz_rmux, f0_cbx_rmux ), + vpx * fnms( fx_cbz, m_fx_rmuz, f0_cbz_rmuz ) ); + + f0_tcaz = fms( vpx, + fnms( fx_cby, m_fx_rmuy, f0_cby_rmuy ), + vpy * fnms( fy_cbx, m_fy_rmux, f0_cbx_rmux ) ); + + //------------------------------------------------------------------------// + // Note: + //------------------------------------------------------------------------// + // Unlike load_4x3 versus load_4x4, store_4x4 is much more efficient + // than store_4x3. + //------------------------------------------------------------------------// + + store_4x4_tr( f0_tcax, f0_tcay, f0_tcaz, save1, + &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax ); + } +} + +#else + +void +compute_curl_b_pipeline_v4( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v4 implementation. + ERROR( ( "No compute_curl_b_pipeline_v4 implementation." 
) ); +} + +#endif diff --git a/src/field_advance/standard/pipeline/compute_curl_b_pipeline_v8.cc b/src/field_advance/standard/pipeline/compute_curl_b_pipeline_v8.cc new file mode 100644 index 00000000..4f723628 --- /dev/null +++ b/src/field_advance/standard/pipeline/compute_curl_b_pipeline_v8.cc @@ -0,0 +1,150 @@ +#define IN_sfa +#define IN_compute_curl_b_pipeline + +#include "compute_curl_b_pipeline.h" + +#include "../sfa_private.h" + +#if defined(V8_ACCELERATION) + +using namespace v8; + +void +compute_curl_b_pipeline_v8( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + const v8float vpx( px ); + const v8float vpy( py ); + const v8float vpz( pz ); + + v8float save1, dummy; + + v8float f0_cbx, f0_cby, f0_cbz; + v8float f0_tcax, f0_tcay, f0_tcaz; + v8float fx_cby, fx_cbz; + v8float fy_cbx, fy_cbz; + v8float fz_cbx, fz_cby; + v8float m_f0_rmux, m_f0_rmuy, m_f0_rmuz; + v8float m_fx_rmuy, m_fx_rmuz; + v8float m_fy_rmux, m_fy_rmuz; + v8float m_fz_rmux, m_fz_rmuy; + + v8float f0_cbx_rmux, f0_cby_rmuy, f0_cbz_rmuz; + + field_t * ALIGNED(32) f00, * ALIGNED(32) f01, * ALIGNED(32) f02, * ALIGNED(32) f03; // Voxel block + field_t * ALIGNED(32) f04, * ALIGNED(32) f05, * ALIGNED(32) f06, * ALIGNED(32) f07; // Voxel block + + field_t * ALIGNED(32) fx0, * ALIGNED(32) fx1, * ALIGNED(32) fx2, * ALIGNED(32) fx3; // Voxel block +x neighbors + field_t * ALIGNED(32) fx4, * ALIGNED(32) fx5, * ALIGNED(32) fx6, * ALIGNED(32) fx7; // Voxel block +x neighbors + + field_t * ALIGNED(32) fy0, * ALIGNED(32) fy1, * ALIGNED(32) fy2, * ALIGNED(32) fy3; // Voxel block +y neighbors + field_t * ALIGNED(32) fy4, * ALIGNED(32) fy5, * ALIGNED(32) fy6, * ALIGNED(32) fy7; // Voxel block +y neighbors + + field_t * ALIGNED(32) fz0, * ALIGNED(32) fz1, * ALIGNED(32) fz2, * ALIGNED(32) fz3; // Voxel block +z neighbors + field_t * ALIGNED(32) fz4, * ALIGNED(32) fz5, * ALIGNED(32) fz6, * ALIGNED(32) fz7; // Voxel block +z neighbors + + // Process the bulk of the voxels 8 at a time + + INIT_STENCIL(); + + for( ; n_voxel > 7; n_voxel -= 8 ) + { + f00 = f0; fx0 = fx; fy0 = fy; fz0 = fz; NEXT_STENCIL(); + f01 = f0; fx1 = fx; fy1 = fy; fz1 = fz; NEXT_STENCIL(); + f02 = f0; fx2 = fx; fy2 = fy; fz2 = fz; NEXT_STENCIL(); + f03 = f0; fx3 = fx; fy3 = fy; fz3 = fz; NEXT_STENCIL(); + f04 = f0; fx4 = fx; fy4 = fy; fz4 = fz; NEXT_STENCIL(); + f05 = f0; fx5 = fx; fy5 = fy; fz5 = fz; NEXT_STENCIL(); + f06 = f0; fx6 = fx; fy6 = fy; fz6 = fz; NEXT_STENCIL(); + f07 = f0; fx7 = fx; fy7 = fy; fz7 = fz; NEXT_STENCIL(); + + //------------------------------------------------------------------------// + // Load field data. 
+ //------------------------------------------------------------------------// + + load_8x3_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, + &f04->cbx, &f05->cbx, &f06->cbx, &f07->cbx, + f0_cbx, f0_cby, f0_cbz ); + + load_8x4_tr( &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax, + &f04->tcax, &f05->tcax, &f06->tcax, &f07->tcax, + f0_tcax, f0_tcay, f0_tcaz, save1 ); + + load_8x3_tr( &fx0->cbx, &fx1->cbx, &fx2->cbx, &fx3->cbx, + &fx4->cbx, &fx5->cbx, &fx6->cbx, &fx7->cbx, + dummy, fx_cby, fx_cbz ); + + load_8x3_tr( &fy0->cbx, &fy1->cbx, &fy2->cbx, &fy3->cbx, + &fy4->cbx, &fy5->cbx, &fy6->cbx, &fy7->cbx, + fy_cbx, dummy, fy_cbz ); + + load_8x2_tr( &fz0->cbx, &fz1->cbx, &fz2->cbx, &fz3->cbx, + &fz4->cbx, &fz5->cbx, &fz6->cbx, &fz7->cbx, + fz_cbx, fz_cby ); + +# define LOAD_RMU(V,D) m_f##V##_rmu##D=v8float( m[f##V##0->fmat##D].rmu##D, \ + m[f##V##1->fmat##D].rmu##D, \ + m[f##V##2->fmat##D].rmu##D, \ + m[f##V##3->fmat##D].rmu##D, \ + m[f##V##4->fmat##D].rmu##D, \ + m[f##V##5->fmat##D].rmu##D, \ + m[f##V##6->fmat##D].rmu##D, \ + m[f##V##7->fmat##D].rmu##D ) + + LOAD_RMU(0,x); LOAD_RMU(0,y); LOAD_RMU(0,z); + LOAD_RMU(x,y); LOAD_RMU(x,z); + LOAD_RMU(y,x); LOAD_RMU(y,z); + LOAD_RMU(z,x); LOAD_RMU(z,y); + +# undef LOAD_RMU + + f0_cbx_rmux = f0_cbx * m_f0_rmux; + f0_cby_rmuy = f0_cby * m_f0_rmuy; + f0_cbz_rmuz = f0_cbz * m_f0_rmuz; + + f0_tcax = fms( vpy, + fnms( fy_cbz, m_fy_rmuz, f0_cbz_rmuz ), + vpz * fnms( fz_cby, m_fz_rmuy, f0_cby_rmuy ) ); + + f0_tcay = fms( vpz, + fnms( fz_cbx, m_fz_rmux, f0_cbx_rmux ), + vpx * fnms( fx_cbz, m_fx_rmuz, f0_cbz_rmuz ) ); + + f0_tcaz = fms( vpx, + fnms( fx_cby, m_fx_rmuy, f0_cby_rmuy ), + vpy * fnms( fy_cbx, m_fy_rmux, f0_cbx_rmux ) ); + + //------------------------------------------------------------------------// + // Note: + //------------------------------------------------------------------------// + // Unlike load_8x3 versus load_8x4, store_8x4 is much more efficient + // than store_8x3. + //------------------------------------------------------------------------// + + store_8x4_tr( f0_tcax, f0_tcay, f0_tcaz, save1, + &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax, + &f04->tcax, &f05->tcax, &f06->tcax, &f07->tcax ); + } +} + +#else + +void +compute_curl_b_pipeline_v8( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v8 implementation. + ERROR( ( "No compute_curl_b_pipeline_v8 implementation." ) ); +} + +#endif diff --git a/src/field_advance/standard/pipeline/compute_div_b_err_pipeline.cc b/src/field_advance/standard/pipeline/compute_div_b_err_pipeline.cc new file mode 100644 index 00000000..43d476cc --- /dev/null +++ b/src/field_advance/standard/pipeline/compute_div_b_err_pipeline.cc @@ -0,0 +1,296 @@ +#define IN_sfa +#define IN_compute_div_b_err_pipeline + +#include "compute_div_b_err_pipeline.h" + +#include "../sfa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +void +compute_div_b_err_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + field_t * ALIGNED(128) f = args->f; + const grid_t * g = args->g; + + field_t * ALIGNED(16) f0; + field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; + int x, y, z, n_voxel; + + const int nx = g->nx; + const int ny = g->ny; + const int nz = g->nz; + + const float px = (nx>1) ? g->rdx : 0; + const float py = (ny>1) ? g->rdy : 0; + const float pz = (nz>1) ? 
g->rdz : 0; + + // Process the voxels assigned to this pipeline + + DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + +# define LOAD_STENCIL() \ + f0 = &f(x, y, z ); \ + fx = &f(x+1,y, z ); \ + fy = &f(x, y+1,z ); \ + fz = &f(x, y, z+1) + + LOAD_STENCIL(); + + for( ; n_voxel; n_voxel-- ) { + f0->div_b_err = px*( fx->cbx - f0->cbx ) + + py*( fy->cby - f0->cby ) + + pz*( fz->cbz - f0->cbz ); + f0++; fx++; fy++; fz++; + + x++; + if( x>nx ) { + x=1, y++; + if( y>ny ) y=1, z++; + LOAD_STENCIL(); + } + } + +# undef LOAD_STENCIL + +} + +#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) + +using namespace v4; + +static void +compute_div_b_err_pipeline_v4( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + field_t * ALIGNED(128) f = args->f; + const grid_t * g = args->g; + + field_t * ALIGNED(16) f0; + field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; + int x, y, z, n_voxel; + + const int nx = g->nx; + const int ny = g->ny; + const int nz = g->nz; + + const float px = (nx>1) ? g->rdx : 0; + const float py = (ny>1) ? g->rdy : 0; + const float pz = (nz>1) ? g->rdz : 0; + + const v4float vpx(px); + const v4float vpy(py); + const v4float vpz(pz); + + v4float f0_cbx, f0_cby, f0_cbz; // Voxel quad magnetic fields + v4float f0_div_b_err; // Voxel quad div b errs + v4float fx_cbx; // Voxel quad +x neighbor x magnetic fields + v4float fy_cby; // Voxel quad +y neighbor y magnetic fields + v4float fz_cbz; // Voxel quad +z neighbor z magnetic fields + + field_t * ALIGNED(16) f00, * ALIGNED(16) f01, * ALIGNED(16) f02, * ALIGNED(16) f03; // Voxel quad + field_t * ALIGNED(16) fx0, * ALIGNED(16) fx1, * ALIGNED(16) fx2, * ALIGNED(16) fx3; // Voxel quad +x neighbors + field_t * ALIGNED(16) fy0, * ALIGNED(16) fy1, * ALIGNED(16) fy2, * ALIGNED(16) fy3; // Voxel quad +x neighbors + field_t * ALIGNED(16) fz0, * ALIGNED(16) fz1, * ALIGNED(16) fz2, * ALIGNED(16) fz3; // Voxel quad +x neighbors + + // Process the voxels assigned to this pipeline + + DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + // Process bulk of voxels 4 at a time + +# define LOAD_STENCIL() \ + f0 = &f(x, y, z ); \ + fx = &f(x+1,y, z ); \ + fy = &f(x, y+1,z ); \ + fz = &f(x, y, z+1) + +# define NEXT_STENCIL(n) \ + f0##n = f0++; \ + fx##n = fx++; \ + fy##n = fy++; \ + fz##n = fz++; \ + x++; \ + if( x>nx ) { \ + x=1, y++; \ + if( y>ny ) y=1, z++; \ + LOAD_STENCIL(); \ + } + + LOAD_STENCIL(); + + for( ; n_voxel>3; n_voxel-=4 ) { + NEXT_STENCIL(0); NEXT_STENCIL(1); NEXT_STENCIL(2); NEXT_STENCIL(3); + + load_4x3_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, f0_cbx, f0_cby, f0_cbz ); + + fx_cbx = v4float( fx0->cbx, fx1->cbx, fx2->cbx, fx3->cbx ); + fy_cby = v4float( fy0->cby, fy1->cby, fy2->cby, fy3->cby ); + fz_cbz = v4float( fz0->cbz, fz1->cbz, fz2->cbz, fz3->cbz ); + + f0_div_b_err = fma( vpx,fx_cbx-f0_cbx, fma( vpy,fy_cby-f0_cby, vpz*(fz_cbz-f0_cbz) ) ); + + store_4x1_tr( f0_div_b_err, &f00->div_b_err, &f01->div_b_err, &f02->div_b_err, &f03->div_b_err ); + } + +# undef NEXT_STENCIL +# undef LOAD_STENCIL + +} + +#endif + +#if defined(V8_ACCELERATION) && defined(HAS_V8_PIPELINE) + +using namespace v8; + +static void +compute_div_b_err_pipeline_v8( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + field_t * ALIGNED(128) f = args->f; + const grid_t * g = args->g; + + field_t * ALIGNED(16) f0; + field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; + int x, y, z, n_voxel; + + const int nx = g->nx; + const 
int ny = g->ny; + const int nz = g->nz; + + const float px = (nx>1) ? g->rdx : 0; + const float py = (ny>1) ? g->rdy : 0; + const float pz = (nz>1) ? g->rdz : 0; + + const v8float vpx(px); + const v8float vpy(py); + const v8float vpz(pz); + + v8float f0_cbx, f0_cby, f0_cbz; // Voxel quad magnetic fields + v8float f0_div_b_err; // Voxel quad div b errs + v8float fx_cbx; // Voxel quad +x neighbor x magnetic fields + v8float fy_cby; // Voxel quad +y neighbor y magnetic fields + v8float fz_cbz; // Voxel quad +z neighbor z magnetic fields + + field_t * ALIGNED(16) f00, * ALIGNED(16) f01, * ALIGNED(16) f02, * ALIGNED(16) f03; // Voxel quad + field_t * ALIGNED(16) f04, * ALIGNED(16) f05, * ALIGNED(16) f06, * ALIGNED(16) f07; // Voxel quad + + field_t * ALIGNED(16) fx0, * ALIGNED(16) fx1, * ALIGNED(16) fx2, * ALIGNED(16) fx3; // Voxel quad +x neighbors + field_t * ALIGNED(16) fx4, * ALIGNED(16) fx5, * ALIGNED(16) fx6, * ALIGNED(16) fx7; // Voxel quad +x neighbors + + field_t * ALIGNED(16) fy0, * ALIGNED(16) fy1, * ALIGNED(16) fy2, * ALIGNED(16) fy3; // Voxel quad +y neighbors + field_t * ALIGNED(16) fy4, * ALIGNED(16) fy5, * ALIGNED(16) fy6, * ALIGNED(16) fy7; // Voxel quad +y neighbors + + field_t * ALIGNED(16) fz0, * ALIGNED(16) fz1, * ALIGNED(16) fz2, * ALIGNED(16) fz3; // Voxel quad +z neighbors + field_t * ALIGNED(16) fz4, * ALIGNED(16) fz5, * ALIGNED(16) fz6, * ALIGNED(16) fz7; // Voxel quad +z neighbors + + // Process the voxels assigned to this pipeline + + DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + // Process bulk of voxels 8 at a time + +# define LOAD_STENCIL() \ + f0 = &f(x, y, z ); \ + fx = &f(x+1,y, z ); \ + fy = &f(x, y+1,z ); \ + fz = &f(x, y, z+1) + +# define NEXT_STENCIL(n) \ + f0##n = f0++; \ + fx##n = fx++; \ + fy##n = fy++; \ + fz##n = fz++; \ + x++; \ + if( x>nx ) { \ + x=1, y++; \ + if( y>ny ) y=1, z++; \ + LOAD_STENCIL(); \ + } + + LOAD_STENCIL(); + + for( ; n_voxel>3; n_voxel-=8 ) { + NEXT_STENCIL(0); NEXT_STENCIL(1); NEXT_STENCIL(2); NEXT_STENCIL(3); + NEXT_STENCIL(4); NEXT_STENCIL(5); NEXT_STENCIL(6); NEXT_STENCIL(7); + + load_8x3_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, + &f04->cbx, &f05->cbx, &f06->cbx, &f07->cbx, + f0_cbx, f0_cby, f0_cbz ); + + fx_cbx = v8float( fx0->cbx, fx1->cbx, fx2->cbx, fx3->cbx, + fx4->cbx, fx5->cbx, fx6->cbx, fx7->cbx ); + + fy_cby = v8float( fy0->cby, fy1->cby, fy2->cby, fy3->cby, + fy4->cby, fy5->cby, fy6->cby, fy7->cby ); + + fz_cbz = v8float( fz0->cbz, fz1->cbz, fz2->cbz, fz3->cbz, + fz4->cbz, fz5->cbz, fz6->cbz, fz7->cbz ); + + f0_div_b_err = fma( vpx,fx_cbx-f0_cbx, fma( vpy,fy_cby-f0_cby, vpz*(fz_cbz-f0_cbz) ) ); + + store_8x1_tr( f0_div_b_err, + &f00->div_b_err, &f01->div_b_err, &f02->div_b_err, &f03->div_b_err, + &f04->div_b_err, &f05->div_b_err, &f06->div_b_err, &f07->div_b_err ); + } + +# undef NEXT_STENCIL +# undef LOAD_STENCIL + +} + +#endif + +void +compute_div_b_err_pipeline( field_array_t * RESTRICT fa ) +{ + pipeline_args_t args[1]; + + if ( !fa ) + { + ERROR( ( "Bad args" ) ); + } + +# if 0 // Original non-pipelined version + for( z = 1; z <= nz; z++ ) + { + for( y = 1; y <= ny; y++ ) + { + f0 = &f( 1, y, z ); + fx = &f( 2, y, z ); + fy = &f( 1, y+1, z ); + fz = &f( 1, y, z+1 ); + + for( x = 1; x <= nx; x++ ) + { + f0->div_b_err = px*( fx->cbx - f0->cbx ) + + py*( fy->cby - f0->cby ) + + pz*( fz->cbz - f0->cbz ); + + f0++; + fx++; + fy++; + fz++; + } + } + } +# endif + + args->f = fa->f; + args->g = fa->g; + + EXEC_PIPELINES( compute_div_b_err, args, 0 ); + + 
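Note that, unlike the curl-B and div-E routines above, there is no host-side exterior pass between the launch and the join here: div_b_err is cell centered and only needs each cell's +x/+y/+z face fields, which (as in the original non-pipelined version kept under #if 0) are available locally over the full 1..nx by 1..ny by 1..nz range, so no ghost exchange or straggler loops are required. A standalone sketch of the per-voxel estimate both the scalar and vector pipelines compute, using made-up illustration values:

```c
/* Sketch of the per-voxel divergence estimate:
   div_b_err = rdx*(cbx(+x)-cbx) + rdy*(cby(+y)-cby) + rdz*(cbz(+z)-cbz).
   All numbers below are made-up illustration values. */
#include <stdio.h>

int main( void )
{
  const float rdx = 2.f, rdy = 2.f, rdz = 2.f;   /* 1/dx, 1/dy, 1/dz       */
  const float cbx0 = 0.10f, cbx_px = 0.12f;      /* Bx on -x and +x faces  */
  const float cby0 = 0.05f, cby_py = 0.03f;      /* By on -y and +y faces  */
  const float cbz0 = 0.00f, cbz_pz = 0.02f;      /* Bz on -z and +z faces  */

  const float div_b_err = rdx*( cbx_px - cbx0 )
                        + rdy*( cby_py - cby0 )
                        + rdz*( cbz_pz - cbz0 );

  /* 2*0.02 - 2*0.02 + 2*0.02 = 0.04 */
  printf( "div_b_err = %g\n", div_b_err );

  return 0;
}
```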
WAIT_PIPELINES(); +} diff --git a/src/field_advance/standard/pipeline/compute_div_b_err_pipeline.h b/src/field_advance/standard/pipeline/compute_div_b_err_pipeline.h new file mode 100644 index 00000000..6f1cddd7 --- /dev/null +++ b/src/field_advance/standard/pipeline/compute_div_b_err_pipeline.h @@ -0,0 +1,23 @@ +#ifndef _compute_div_b_err_pipeline_h_ +#define _compute_div_b_err_pipeline_h_ + +#ifndef IN_compute_div_b_err_pipeline +#error "Only include compute_div_b_err_pipeline.h in compute_div_b_err_pipeline source files." +#endif + +#include "../../field_advance.h" + +typedef struct pipeline_args +{ + field_t * ALIGNED(128) f; + const grid_t * g; +} pipeline_args_t; + +#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] + +void +compute_div_b_err_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +#endif // _compute_div_b_err_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/compute_div_e_err_pipeline.c b/src/field_advance/standard/pipeline/compute_div_e_err_pipeline.c new file mode 100644 index 00000000..3bdf3c58 --- /dev/null +++ b/src/field_advance/standard/pipeline/compute_div_e_err_pipeline.c @@ -0,0 +1,173 @@ +// Note: This is virtually identical to compute_rhob + +#define IN_sfa +#define IN_compute_div_e_err_pipeline + +#include "compute_div_e_err_pipeline.h" + +#include "../sfa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +void +compute_div_e_err_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + INIT_STENCIL(); + for( ; n_voxel; n_voxel-- ) + { + UPDATE_DERR_E(); + NEXT_STENCIL(); + } +} + +#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) + +#error "Not implemented" + +#endif + +void +compute_div_e_err_pipeline( field_array_t * RESTRICT fa ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); + } + + // Have pipelines compute the interior of local domain (the host + // handles stragglers in the interior) + + // Begin setting normal e ghosts + + begin_remote_ghost_norm_e( fa->f, fa->g ); + + local_ghost_norm_e( fa->f, fa->g ); + + // Have pipelines compute interior of local domain + + pipeline_args_t args[1]; + + args->f = fa->f; + args->p = (sfa_params_t *) fa->params; + args->g = fa->g; + + EXEC_PIPELINES( compute_div_e_err, args, 0 ); + + // While pipelines are busy, have host compute the exterior + // of the local domain + + DECLARE_STENCIL(); + + // Finish setting normal e ghosts + end_remote_ghost_norm_e( fa->f, fa->g ); + + // z faces, x edges, y edges and all corners + for( y = 1; y <= ny+1; y++ ) + { + f0 = &f( 1, y, 1 ); + fx = &f( 0, y, 1 ); + fy = &f( 1, y-1, 1 ); + fz = &f( 1, y, 0 ); + + for( x = 1; x <= nx+1; x++ ) + { + UPDATE_DERR_E(); + + f0++; + fx++; + fy++; + fz++; + } + } + + for( y = 1; y <= ny+1; y++ ) + { + f0 = &f( 1, y, nz+1 ); + fx = &f( 0, y, nz+1 ); + fy = &f( 1, y-1, nz+1 ); + fz = &f( 1, y, nz ); + + for( x = 1; x <= nx+1; x++ ) + { + UPDATE_DERR_E(); + + f0++; + fx++; + fy++; + fz++; + } + } + + // y faces, z edges + for( z = 2; z <= nz; z++ ) + { + f0 = &f( 1, 1, z ); + fx = &f( 0, 1, z ); + fy = &f( 1, 0, z ); + fz = &f( 1, 1, z-1 ); + + for( x = 1; x <= nx+1; x++ ) + { + UPDATE_DERR_E(); + + f0++; + fx++; + fy++; + fz++; + } + } + + for( z = 2; z <= nz; z++ ) + { + f0 = &f( 1, ny+1, z ); + fx = &f( 0, ny+1, z ); + fy = &f( 1, ny, z ); + fz = &f( 1, ny+1, z-1 ); + + for( x = 1; x <= nx+1; x++ ) + { + UPDATE_DERR_E(); 
+ + f0++; + fx++; + fy++; + fz++; + } + } + + // x faces + for( z = 2; z <= nz; z++ ) + { + for( y = 2; y <= ny; y++ ) + { + f0 = &f( 1, y, z ); + fx = &f( 0, y, z ); + fy = &f( 1, y-1, z ); + fz = &f( 1, y, z-1 ); + + UPDATE_DERR_E(); + + f0 = &f( nx+1, y, z ); + fx = &f( nx, y, z ); + fy = &f( nx+1, y-1, z ); + fz = &f( nx+1, y, z-1 ); + + UPDATE_DERR_E(); + } + } + + // Finish up setting interior + + WAIT_PIPELINES(); + + local_adjust_div_e( fa->f, fa->g ); +} diff --git a/src/field_advance/standard/pipeline/compute_div_e_err_pipeline.h b/src/field_advance/standard/pipeline/compute_div_e_err_pipeline.h new file mode 100644 index 00000000..9a0c1472 --- /dev/null +++ b/src/field_advance/standard/pipeline/compute_div_e_err_pipeline.h @@ -0,0 +1,59 @@ +#ifndef _compute_div_e_err_pipeline_h_ +#define _compute_div_e_err_pipeline_h_ + +#ifndef IN_compute_div_e_err_pipeline +#error "Only include compute_div_e_err_pipeline.h in compute_div_e_err_pipeline source files." +#endif + +#include "../sfa_private.h" + +typedef struct pipeline_args +{ + /**/ field_t * ALIGNED(128) f; + const sfa_params_t * p; + const grid_t * g; +} pipeline_args_t; + +#define DECLARE_STENCIL() \ + /**/ field_t * ALIGNED(128) f = args->f; \ + const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ + const grid_t * g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float px = (nx>1) ? g->rdx : 0; \ + const float py = (ny>1) ? g->rdy : 0; \ + const float pz = (nz>1) ? g->rdz : 0; \ + const float cj = 1./g->eps0; \ + \ + field_t * ALIGNED(16) f0; \ + field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ + int x, y, z + +#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] + +#define INIT_STENCIL() \ + f0 = &f(x, y, z ); \ + fx = &f(x-1,y, z ); \ + fy = &f(x, y-1,z ); \ + fz = &f(x, y, z-1) + +#define NEXT_STENCIL() \ + f0++; fx++; fy++; fz++; x++; \ + if( x>nx ) { \ + /**/ y++; x = 2; \ + if( y>ny ) z++; if( y>ny ) y = 2; \ + INIT_STENCIL(); \ + } + +#define UPDATE_DERR_E() f0->div_e_err = m[f0->nmat].nonconductive * \ + ( px*( m[f0->ematx].epsx*f0->ex - m[fx->ematx].epsx*fx->ex ) + \ + py*( m[f0->ematy].epsy*f0->ey - m[fy->ematy].epsy*fy->ey ) + \ + pz*( m[f0->ematz].epsz*f0->ez - m[fz->ematz].epsz*fz->ez ) - \ + cj*( f0->rhof + f0->rhob ) ) + +void +compute_div_e_err_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +#endif // _compute_div_e_err_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/compute_rhob_pipeline.c b/src/field_advance/standard/pipeline/compute_rhob_pipeline.c new file mode 100644 index 00000000..3f9e977d --- /dev/null +++ b/src/field_advance/standard/pipeline/compute_rhob_pipeline.c @@ -0,0 +1,174 @@ +// Note: This is virtually identical to compute_div_e_err + +#define IN_sfa +#define IN_compute_rhob_pipeline + +#include "compute_rhob_pipeline.h" + +#include "../sfa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +void +compute_rhob_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + INIT_STENCIL(); + for( ; n_voxel; n_voxel-- ) + { + UPDATE_DERR_E(); + NEXT_STENCIL(); + } +} + +#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) + +#error "Not implemented" + +#endif + +void +compute_rhob_pipeline( field_array_t * RESTRICT fa ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); + } + + // Have pipelines compute the interior of local 
domain (the host + // handles stragglers in the interior) + + // Begin setting normal e ghosts + + begin_remote_ghost_norm_e( fa->f, fa->g ); + + local_ghost_norm_e( fa->f, fa->g ); + + // Have pipelines compute interior of local domain + + pipeline_args_t args[1]; + + args->f = fa->f; + args->p = (sfa_params_t *)fa->params; + args->g = fa->g; + + EXEC_PIPELINES( compute_rhob, args, 0 ); + + // While pipelines are busy, have host compute the exterior + // of the local domain + + DECLARE_STENCIL(); + + // Finish setting normal e ghosts + end_remote_ghost_norm_e( fa->f, fa->g ); + + // z faces, x edges, y edges and all corners + for( y = 1; y <= ny+1; y++ ) + { + f0 = &f( 1, y, 1 ); + fx = &f( 0, y, 1 ); + fy = &f( 1, y-1, 1 ); + fz = &f( 1, y, 0 ); + + for( x = 1; x <= nx+1; x++ ) + { + UPDATE_DERR_E(); + + f0++; + fx++; + fy++; + fz++; + } + } + + for( y = 1; y <= ny+1; y++ ) + { + f0 = &f( 1, y, nz+1 ); + fx = &f( 0, y, nz+1 ); + fy = &f( 1, y-1, nz+1 ); + fz = &f( 1, y, nz ); + + for( x = 1; x <= nx+1; x++ ) + { + UPDATE_DERR_E(); + + f0++; + fx++; + fy++; + fz++; + } + } + + // y faces, z edges + for( z = 2; z <= nz; z++ ) + { + f0 = &f( 1, 1, z ); + fx = &f( 0, 1, z ); + fy = &f( 1, 0, z ); + fz = &f( 1, 1, z-1 ); + + for( x = 1; x <= nx+1; x++ ) + { + UPDATE_DERR_E(); + + f0++; + fx++; + fy++; + fz++; + } + } + + for( z = 2; z <= nz; z++ ) + { + f0 = &f( 1, ny+1, z ); + fx = &f( 0, ny+1, z ); + fy = &f( 1, ny, z ); + fz = &f( 1, ny+1, z-1 ); + + for( x = 1; x <= nx+1; x++ ) + { + UPDATE_DERR_E(); + + f0++; + fx++; + fy++; + fz++; + } + } + + // x faces + for( z = 2; z <= nz; z++ ) + { + for( y = 2; y <= ny; y++ ) + { + f0 = &f( 1, y, z ); + fx = &f( 0, y, z ); + fy = &f( 1, y-1, z ); + fz = &f( 1, y, z-1 ); + + UPDATE_DERR_E(); + + f0 = &f( nx+1, y, z ); + fx = &f( nx, y, z ); + fy = &f( nx+1, y-1, z ); + fz = &f( nx+1, y, z-1 ); + + UPDATE_DERR_E(); + } + } + + // Finish up setting interior + + WAIT_PIPELINES(); + + local_adjust_rhob( fa->f, fa->g ); +} diff --git a/src/field_advance/standard/pipeline/compute_rhob_pipeline.h b/src/field_advance/standard/pipeline/compute_rhob_pipeline.h new file mode 100644 index 00000000..33f94308 --- /dev/null +++ b/src/field_advance/standard/pipeline/compute_rhob_pipeline.h @@ -0,0 +1,58 @@ +#ifndef _compute_rhob_pipeline_h_ +#define _compute_rhob_pipeline_h_ + +#ifndef IN_compute_rhob_pipeline +#error "Only include compute_rhob_pipeline.h in compute_rhob_pipeline source files." +#endif + +#include "../sfa_private.h" + +typedef struct pipeline_args +{ + /**/ field_t * ALIGNED(128) f; + const sfa_params_t * p; + const grid_t * g; +} pipeline_args_t; + +#define DECLARE_STENCIL() \ + /**/ field_t * ALIGNED(128) f = args->f; \ + const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ + const grid_t * g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float px = (nx>1) ? g->eps0*g->rdx : 0; \ + const float py = (ny>1) ? g->eps0*g->rdy : 0; \ + const float pz = (nz>1) ? 
g->eps0*g->rdz : 0; \ + \ + field_t * ALIGNED(16) f0; \ + field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ + int x, y, z + +#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] + +#define INIT_STENCIL() \ + f0 = &f(x, y, z ); \ + fx = &f(x-1,y, z ); \ + fy = &f(x, y-1,z ); \ + fz = &f(x, y, z-1) + +#define NEXT_STENCIL() \ + f0++; fx++; fy++; fz++; x++; \ + if( x>nx ) { \ + /**/ y++; x = 2; \ + if( y>ny ) z++; if( y>ny ) y = 2; \ + INIT_STENCIL(); \ + } + +#define UPDATE_DERR_E() f0->rhob = m[f0->nmat].nonconductive * \ + ( px*( m[f0->ematx].epsx*f0->ex - m[fx->ematx].epsx*fx->ex ) + \ + py*( m[f0->ematy].epsy*f0->ey - m[fy->ematy].epsy*fy->ey ) + \ + pz*( m[f0->ematz].epsz*f0->ez - m[fz->ematz].epsz*fz->ez ) - \ + f0->rhof ) + +void +compute_rhob_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +#endif // _compute_rhob_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/compute_rms_div_b_err_pipeline.c b/src/field_advance/standard/pipeline/compute_rms_div_b_err_pipeline.c new file mode 100644 index 00000000..dfcf8a05 --- /dev/null +++ b/src/field_advance/standard/pipeline/compute_rms_div_b_err_pipeline.c @@ -0,0 +1,111 @@ +#define IN_sfa +#define IN_compute_rms_div_b_err_pipeline + +#include "compute_rms_div_b_err_pipeline.h" + +#include "../sfa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +static void +compute_rms_div_b_err_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + const field_t * ALIGNED(128) f = args->f; + const grid_t * g = args->g; + + const field_t * ALIGNED(16) f0; + int x, y, z, n_voxel; + + const int nx = g->nx; + const int ny = g->ny; + const int nz = g->nz; + + double err; + + // Process voxels assigned to this pipeline + + DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + f0 = &f(x,y,z); + + err = 0; + for( ; n_voxel; n_voxel-- ) + { + err += f0->div_b_err*f0->div_b_err; + f0++; + + x++; + if ( x > nx ) + { + x=1, y++; + if( y>ny ) y=1, z++; + f0 = &f(x,y,z); + } + } + + args->err[pipeline_rank] = err; +} + +double +compute_rms_div_b_err_pipeline( const field_array_t * fa ) +{ + pipeline_args_t args[1]; + int p; + + double err = 0, local[2], global[2]; + + if ( !fa ) + { + ERROR( ( "Bad args") ); + } + +# if 0 // Original non-pipelined version + field_t * ALIGNED(16) f0; + + int z, y, x; + + int nx = g->nx; + int ny = g->ny; + int nz = g->nz; + + err = 0; + for( z = 1; z <= nz; z++ ) + { + for( y = 1; y <= ny; y++ ) + { + f0 = &f( 1, y, z ); + for( x = 1; x <= nx; x++ ) + { + err += f0->div_b_err * f0->div_b_err; + + f0++; + } + } + } +# endif + + args->f = fa->f; + args->g = fa->g; + + EXEC_PIPELINES( compute_rms_div_b_err, args, 0 ); + + WAIT_PIPELINES(); + + err = 0; + for( p = 0; p <= N_PIPELINE; p++ ) + { + err += args->err[p]; + } + + local[0] = err * fa->g->dV; + + local[1] = ( fa->g->nx * fa->g->ny * fa->g->nz ) * fa->g->dV; + + mp_allsum_d( local, global, 2 ); + + return fa->g->eps0 * sqrt( global[0] / global[1] ); +} diff --git a/src/field_advance/standard/pipeline/compute_rms_div_b_err_pipeline.h b/src/field_advance/standard/pipeline/compute_rms_div_b_err_pipeline.h new file mode 100644 index 00000000..9260ae2f --- /dev/null +++ b/src/field_advance/standard/pipeline/compute_rms_div_b_err_pipeline.h @@ -0,0 +1,24 @@ +#ifndef _compute_rms_div_b_err_pipeline_h_ +#define _compute_rms_div_b_err_pipeline_h_ + +#ifndef IN_compute_rms_div_b_err_pipeline +#error "Only include compute_rms_div_b_err_pipeline.h in 
compute_rms_div_b_err_pipeline source files." +#endif + +#include "../../field_advance.h" + +typedef struct pipeline_args +{ + const field_t * ALIGNED(128) f; + const grid_t * g; + double err[MAX_PIPELINE+1]; +} pipeline_args_t; + +#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] + +static void +compute_rms_div_b_err_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +#endif // _compute_rms_div_b_err_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/compute_rms_div_e_err_pipeline.c b/src/field_advance/standard/pipeline/compute_rms_div_e_err_pipeline.c new file mode 100644 index 00000000..682e2f8b --- /dev/null +++ b/src/field_advance/standard/pipeline/compute_rms_div_e_err_pipeline.c @@ -0,0 +1,184 @@ +#define IN_sfa +#define IN_compute_rms_div_e_err_pipeline + +#include "compute_rms_div_e_err_pipeline.h" + +#include "../sfa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +static void +compute_rms_div_e_err_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + const field_t * ALIGNED(128) f = args->f; + const grid_t * g = args->g; + + const field_t * ALIGNED(16) f0; + + int x, y, z, n_voxel; + + const int nx = g->nx; + const int ny = g->ny; + const int nz = g->nz; + + double err; + + // Process voxels assigned to this pipeline + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + f0 = &f(x,y,z); + + err = 0; + for( ; n_voxel; n_voxel-- ) + { + err += f0->div_e_err * f0->div_e_err; + + f0++; + + x++; + if ( x > nx ) + { + x=2, y++; + if ( y > ny ) y=2, z++; + f0 = &f(x,y,z); + } + } + + args->err[pipeline_rank] = err; +} + +double +compute_rms_div_e_err_pipeline( const field_array_t * RESTRICT fa ) +{ + pipeline_args_t args[1]; + const field_t * f, * f0; + const grid_t * RESTRICT g; + double err = 0, local[2], global[2]; + int x, y, z, nx, ny, nz, p; + + if ( !fa ) + { + ERROR( ( "Bad args" ) ); + } + + f = fa->f; + g = fa->g; + +#if 0 // Original non-pipelined version + for( z=2; z<=nz; z++ ) + { + for( y=2; y<=ny; y++ ) + { + for( x=2; x<=nx; x++ ) + { + err += f0->div_e_err*f0->div_e_err; + f0++; + } + } + } +# endif + + // Have the pipelines accumulate the interior of the local domain + // (the host handled stragglers in the interior). 
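The host-side accumulation that follows weights exterior nodes by 1/2 on domain faces, 1/4 on edges, and 1/8 at corners: those nodes are shared by 2, 4, or 8 neighboring ranks respectively, so the fractional weights keep each physical node from contributing more than once to the global sum. The returned value is the volume-weighted RMS, eps0 * sqrt( sum(w * err^2 * dV) / sum(dV) ). A standalone sketch of that final reduction step, with illustrative numbers and a single "rank" so the mp_allsum_d() exchange is a pass-through:

```c
/* Sketch of the RMS reduction at the end of this routine.  The weighted
   sum of squared errors and the voxel count are made-up values; with one
   rank, the global sums simply equal the local ones. */
#include <math.h>
#include <stdio.h>

int main( void )
{
  const double dV   = 0.001;               /* voxel volume (illustrative)  */
  const double eps0 = 1.0;                 /* code units                   */

  const double weighted_err_sq = 2.5e-6;   /* sum of w * div_e_err^2       */
  const double n_voxels        = 1000.0;   /* local nx*ny*nz               */

  double local[2], global[2];

  local[0] = weighted_err_sq * dV;
  local[1] = n_voxels * dV;

  global[0] = local[0];                    /* stands in for mp_allsum_d()  */
  global[1] = local[1];

  /* sqrt( 2.5e-6 / 1000 ) = 5e-5 */
  printf( "rms div E err = %g\n", eps0 * sqrt( global[0] / global[1] ) );

  return 0;
}
```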
+ + args->f = f; + args->g = g; + + EXEC_PIPELINES( compute_rms_div_e_err, args, 0 ); + + // Have the host accumulate the exterior of the local domain + + nx = g->nx; + ny = g->ny; + nz = g->nz; + + // Do exterior faces + + for( y=2; y<=ny; y++ ) + { + for( z=2; z<=nz; z++ ) + { + f0 = &f( 1, y, z); err += 0.5*(double)f0->div_e_err*(double)f0->div_e_err; + f0 = &f(nx+1, y, z); err += 0.5*(double)f0->div_e_err*(double)f0->div_e_err; + } + } + + for( z=2; z<=nz; z++ ) + { + for( x=2; x<=nx; x++ ) + { + f0 = &f( x, 1, z); err += 0.5*(double)f0->div_e_err*(double)f0->div_e_err; + f0 = &f( x,ny+1, z); err += 0.5*(double)f0->div_e_err*(double)f0->div_e_err; + } + } + + for( x=2; x<=nx; x++ ) + { + for( y=2; y<=ny; y++ ) + { + f0 = &f( x, y, 1); err += 0.5*(double)f0->div_e_err*(double)f0->div_e_err; + f0 = &f( x, y,nz+1); err += 0.5*(double)f0->div_e_err*(double)f0->div_e_err; + } + } + + // Do exterior edges + + for( x=2; x<=nx; x++ ) + { + f0 = &f( x, 1, 1); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; + f0 = &f( x,ny+1, 1); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; + f0 = &f( x, 1,nz+1); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; + f0 = &f( x,ny+1,nz+1); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; + } + + for( y=2; y<=ny; y++ ) + { + f0 = &f( 1, y, 1); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; + f0 = &f( 1, y,nz+1); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; + f0 = &f(nx+1, y, 1); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; + f0 = &f(nx+1, y,nz+1); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; + } + + for( z=2; z<=nz; z++ ) + { + f0 = &f( 1, 1, z); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; + f0 = &f(nx+1, 1, z); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; + f0 = &f( 1,ny+1, z); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; + f0 = &f(nx+1,ny+1, z); err += 0.25*(double)f0->div_e_err*(double)f0->div_e_err; + } + + // Do exterior corners + + f0 = &f( 1, 1, 1); err += 0.125*(double)f0->div_e_err*(double)f0->div_e_err; + f0 = &f(nx+1, 1, 1); err += 0.125*(double)f0->div_e_err*(double)f0->div_e_err; + f0 = &f( 1,ny+1, 1); err += 0.125*(double)f0->div_e_err*(double)f0->div_e_err; + f0 = &f(nx+1,ny+1, 1); err += 0.125*(double)f0->div_e_err*(double)f0->div_e_err; + f0 = &f( 1, 1,nz+1); err += 0.125*(double)f0->div_e_err*(double)f0->div_e_err; + f0 = &f(nx+1, 1,nz+1); err += 0.125*(double)f0->div_e_err*(double)f0->div_e_err; + f0 = &f( 1,ny+1,nz+1); err += 0.125*(double)f0->div_e_err*(double)f0->div_e_err; + f0 = &f(nx+1,ny+1,nz+1); err += 0.125*(double)f0->div_e_err*(double)f0->div_e_err; + + // Reduce the results from the host and pipelines + + WAIT_PIPELINES(); + + for( p = 0; p <= N_PIPELINE; p++ ) + { + err += args->err[p]; + } + + // Reduce the results from all nodes + + local[0] = err * g->dV; + + local[1] = ( g->nx * g->ny * g->nz ) * g->dV; + + mp_allsum_d( local, global, 2 ); + + return g->eps0 * sqrt( global[0] / global[1] ); +} diff --git a/src/field_advance/standard/pipeline/compute_rms_div_e_err_pipeline.h b/src/field_advance/standard/pipeline/compute_rms_div_e_err_pipeline.h new file mode 100644 index 00000000..bf3396df --- /dev/null +++ b/src/field_advance/standard/pipeline/compute_rms_div_e_err_pipeline.h @@ -0,0 +1,24 @@ +#ifndef _compute_rms_div_e_err_pipeline_h_ +#define _compute_rms_div_e_err_pipeline_h_ + +#ifndef IN_compute_rms_div_e_err_pipeline +#error "Only include compute_rms_div_e_err_pipeline.h in compute_rms_div_e_err_pipeline 
source files." +#endif + +#include "../../field_advance.h" + +typedef struct pipeline_args +{ + const field_t * ALIGNED(128) f; + const grid_t * g; + double err[MAX_PIPELINE+1]; +} pipeline_args_t; + +#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] + +static void +compute_rms_div_e_err_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +#endif // _compute_rms_div_e_err_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/energy_f_pipeline.c b/src/field_advance/standard/pipeline/energy_f_pipeline.c new file mode 100644 index 00000000..0db28cd6 --- /dev/null +++ b/src/field_advance/standard/pipeline/energy_f_pipeline.c @@ -0,0 +1,95 @@ +// FIXME: USE THE DISCRETIZED VARIATIONAL PRINCIPLE DEFINITION OF ENERGY + +#define IN_sfa +#define IN_energy_f_pipeline + +#include "energy_f_pipeline.h" + +#include "../sfa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +void +energy_f_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + INIT_STENCIL(); + for( ; n_voxel; n_voxel-- ) + { + REDUCE_EN(); + NEXT_STENCIL(); + } + + args->en[pipeline_rank][0] = en_ex; + args->en[pipeline_rank][1] = en_ey; + args->en[pipeline_rank][2] = en_ez; + args->en[pipeline_rank][3] = en_bx; + args->en[pipeline_rank][4] = en_by; + args->en[pipeline_rank][5] = en_bz; +} + +#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) + +#error "Not implemented" + +#endif + +void +energy_f_pipeline( double * global, + const field_array_t * RESTRICT fa ) +{ + if ( !global || !fa ) + { + ERROR( ( "Bad args" ) ); + } + + // Have each pipeline and the host handle a portion of the + // local voxels + + pipeline_args_t args[1]; + + args->f = fa->f; + args->p = (sfa_params_t *) fa->params; + args->g = fa->g; + + EXEC_PIPELINES( energy_f, args, 0 ); + + WAIT_PIPELINES(); + + // Reduce results from each pipelines + + int p; + for( p = 1; p <= N_PIPELINE; p++ ) + { + args->en[0][0] += args->en[p][0]; + args->en[0][1] += args->en[p][1]; + args->en[0][2] += args->en[p][2]; + args->en[0][3] += args->en[p][3]; + args->en[0][4] += args->en[p][4]; + args->en[0][5] += args->en[p][5]; + } + + // Convert to physical units and reduce results between nodes + + double v0 = 0.5 * fa->g->eps0 * fa->g->dV; + + args->en[0][0] *= v0; + args->en[0][1] *= v0; + args->en[0][2] *= v0; + args->en[0][3] *= v0; + args->en[0][4] *= v0; + args->en[0][5] *= v0; + + // Reduce results between nodes + + mp_allsum_d( args->en[0], global, 6 ); +} diff --git a/src/field_advance/standard/pipeline/energy_f_pipeline.h b/src/field_advance/standard/pipeline/energy_f_pipeline.h new file mode 100644 index 00000000..549e587d --- /dev/null +++ b/src/field_advance/standard/pipeline/energy_f_pipeline.h @@ -0,0 +1,74 @@ +#ifndef _energy_f_pipeline_h_ +#define _energy_f_pipeline_h_ + +#ifndef IN_energy_f_pipeline +#error "Only include energy_f_pipeline.h in energy_f_pipeline source files." 
+#endif + +#include "../sfa_private.h" + +typedef struct pipeline_args +{ + const field_t * ALIGNED(128) f; + const sfa_params_t * p; + const grid_t * g; + double en[ MAX_PIPELINE+1 ][ 6 ]; +} pipeline_args_t; + +#define DECLARE_STENCIL() \ + const field_t * ALIGNED(128) f = args->f; \ + const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ + const grid_t * g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const field_t * ALIGNED(16) f0; \ + const field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ + const field_t * ALIGNED(16) fyz, * ALIGNED(16) fzx, * ALIGNED(16) fxy; \ + double en_ex = 0, en_ey = 0, en_ez = 0, en_bx = 0, en_by = 0, en_bz = 0; \ + int x, y, z + +#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] + +#define INIT_STENCIL() \ + f0 = &f(x, y, z ); \ + fx = &f(x+1,y, z ); \ + fy = &f(x, y+1,z ); \ + fz = &f(x, y, z+1); \ + fyz = &f(x, y+1,z+1); \ + fzx = &f(x+1,y, z+1); \ + fxy = &f(x+1,y+1,z ) + +#define NEXT_STENCIL() \ + f0++; fx++; fy++; fz++; fyz++; fzx++; fxy++; x++; \ + if( x>nx ) { \ + /**/ y++; x = 1; \ + if( y>ny ) z++; if( y>ny ) y = 1; \ + INIT_STENCIL(); \ + } + +#define REDUCE_EN() \ + en_ex += 0.25*( m[ f0->ematx].epsx* f0->ex * f0->ex + \ + m[ fy->ematx].epsx* fy->ex * fy->ex + \ + m[ fz->ematx].epsx* fz->ex * fz->ex + \ + m[fyz->ematx].epsx*fyz->ex *fyz->ex ); \ + en_ey += 0.25*( m[ f0->ematy].epsy* f0->ey * f0->ey + \ + m[ fz->ematy].epsy* fz->ey * fz->ey + \ + m[ fx->ematy].epsy* fx->ey * fx->ey + \ + m[fzx->ematy].epsy*fzx->ey *fzx->ey ); \ + en_ez += 0.25*( m[ f0->ematz].epsz* f0->ez * f0->ez + \ + m[ fx->ematz].epsz* fx->ez * fx->ez + \ + m[ fy->ematz].epsz* fy->ez * fy->ez + \ + m[fxy->ematz].epsz*fxy->ez *fxy->ez ); \ + en_bx += 0.5 *( m[ f0->fmatx].rmux* f0->cbx* f0->cbx + \ + m[ fx->fmatx].rmux* fx->cbx* fx->cbx ); \ + en_by += 0.5 *( m[ f0->fmaty].rmuy* f0->cby* f0->cby + \ + m[ fy->fmaty].rmuy* fy->cby* fy->cby ); \ + en_bz += 0.5 *( m[ f0->fmatz].rmuz* f0->cbz* f0->cbz + \ + m[ fz->fmatz].rmuz* fz->cbz* fz->cbz ) + +void +energy_f_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +#endif // _energy_f_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/vacuum_advance_e_pipeline.cc b/src/field_advance/standard/pipeline/vacuum_advance_e_pipeline.cc new file mode 100644 index 00000000..d6f7c6ce --- /dev/null +++ b/src/field_advance/standard/pipeline/vacuum_advance_e_pipeline.cc @@ -0,0 +1,332 @@ +#define IN_sfa +#define IN_vacuum_advance_e_pipeline + +#define HAS_V4_PIPELINE +#define HAS_V8_PIPELINE +#define HAS_V16_PIPELINE + +#include "vacuum_advance_e_pipeline.h" + +#include "../sfa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +//----------------------------------------------------------------------------// +// Reference implementation for an vacuum_advance_e pipeline function which +// does not make use of explicit calls to vector intrinsic functions. 
+//----------------------------------------------------------------------------// + +void +vacuum_advance_e_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + INIT_STENCIL(); + + for( ; n_voxel; n_voxel-- ) + { + UPDATE_EX(); + UPDATE_EY(); + UPDATE_EZ(); + + NEXT_STENCIL(); + } +} + +//----------------------------------------------------------------------------// +// Top level function to select and call the proper vacuum_advance_e pipeline +// function. +//----------------------------------------------------------------------------// + +void +vacuum_advance_e_pipeline( field_array_t * RESTRICT fa, + float frac ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); + } + + if ( frac != 1 ) + { + ERROR( ( "standard advance_e does not support frac != 1 yet" ) ); + } + + //--------------------------------------------------------------------------// + // Begin tangential B ghost setup + //--------------------------------------------------------------------------// + + begin_remote_ghost_tang_b( fa->f, fa->g ); + + local_ghost_tang_b( fa->f, fa->g ); + + //--------------------------------------------------------------------------// + // Update interior fields + //--------------------------------------------------------------------------// + // Note: ex all (1:nx, 1:ny+1,1,nz+1) interior (1:nx,2:ny,2:nz) + // Note: ey all (1:nx+1,1:ny, 1:nz+1) interior (2:nx,1:ny,2:nz) + // Note: ez all (1:nx+1,1:ny+1,1:nz ) interior (1:nx,1:ny,2:nz) + //--------------------------------------------------------------------------// + + // Do majority of interior in a single pass. The host handles stragglers. + + pipeline_args_t args[1]; + + args->f = fa->f; + args->p = (sfa_params_t *)fa->params; + args->g = fa->g; + + EXEC_PIPELINES( vacuum_advance_e, args, 0 ); + + // While the pipelines are busy, do non-bulk interior fields + + DECLARE_STENCIL(); + + // Do left over interior ex + for( z = 2; z <= nz; z++ ) + { + for( y = 2; y <= ny; y++ ) + { + f0 = &f( 1, y, z ); + fy = &f( 1, y-1, z ); + fz = &f( 1, y, z-1 ); + + UPDATE_EX(); + } + } + + // Do left over interior ey + for( z = 2; z <= nz; z++ ) + { + f0 = &f( 2, 1, z ); + fx = &f( 1, 1, z ); + fz = &f( 2, 1, z-1 ); + + for( x = 2; x <= nx; x++ ) + { + UPDATE_EY(); + + f0++; + fx++; + fz++; + } + } + + // Do left over interior ez + for( y = 2; y <= ny; y++ ) + { + f0 = &f( 2, y, 1 ); + fx = &f( 1, y, 1 ); + fy = &f( 2, y-1, 1 ); + + for( x = 2; x <= nx; x++ ) + { + UPDATE_EZ(); + + f0++; + fx++; + fy++; + } + } + + WAIT_PIPELINES(); + + //--------------------------------------------------------------------------// + // Finish tangential B ghost setup + //--------------------------------------------------------------------------// + + end_remote_ghost_tang_b( fa->f, fa->g ); + + //--------------------------------------------------------------------------// + // Update exterior fields + //--------------------------------------------------------------------------// + + // Do exterior ex + for( y = 1; y <= ny+1; y++ ) + { + f0 = &f( 1, y, 1 ); + fy = &f( 1, y-1, 1 ); + fz = &f( 1, y, 0 ); + + for( x = 1; x <= nx; x++ ) + { + UPDATE_EX(); + + f0++; + fy++; + fz++; + } + } + + for( y = 1; y <= ny+1; y++ ) + { + f0 = &f( 1, y, nz+1 ); + fy = &f( 1, y-1, nz+1 ); + fz = &f( 1, y, nz ); + + for( x = 1; x <= nx; x++ ) + { + UPDATE_EX(); + + f0++; + fy++; + fz++; + } + } + + for( z = 2; z <= nz; z++ ) + { + f0 
= &f( 1, 1, z ); + fy = &f( 1, 0, z ); + fz = &f( 1, 1, z-1 ); + + for( x = 1; x <= nx; x++ ) + { + UPDATE_EX(); + + f0++; + fy++; + fz++; + } + } + + for( z = 2; z <= nz; z++ ) + { + f0 = &f( 1, ny+1, z ); + fy = &f( 1, ny, z ); + fz = &f( 1, ny+1, z-1 ); + + for( x = 1; x <= nx; x++ ) + { + UPDATE_EX(); + + f0++; + fy++; + fz++; + } + } + + // Do exterior ey + for( z = 1; z <= nz+1; z++ ) + { + for( y = 1; y <= ny; y++ ) + { + f0 = &f( 1, y, z ); + fx = &f( 0, y, z ); + fz = &f( 1, y, z-1 ); + + UPDATE_EY(); + } + } + + for( z = 1; z <= nz+1; z++ ) + { + for( y = 1; y <= ny; y++ ) + { + f0 = &f( nx+1, y, z ); + fx = &f( nx, y, z ); + fz = &f( nx+1, y, z-1 ); + + UPDATE_EY(); + } + } + + for( y = 1; y <= ny; y++ ) + { + f0 = &f( 2, y, 1 ); + fx = &f( 1, y, 1 ); + fz = &f( 2, y, 0 ); + + for( x = 2; x <= nx; x++ ) + { + UPDATE_EY(); + + f0++; + fx++; + fz++; + } + } + + for( y = 1; y <= ny; y++ ) + { + f0 = &f( 2, y, nz+1 ); + fx = &f( 1, y, nz+1 ); + fz = &f( 2, y, nz ); + + for( x = 2; x <= nx; x++ ) + { + UPDATE_EY(); + + f0++; + fx++; + fz++; + } + } + + // Do exterior ez + for( z = 1; z <= nz; z++ ) + { + f0 = &f( 1, 1, z ); + fx = &f( 0, 1, z ); + fy = &f( 1, 0, z ); + + for( x = 1; x <= nx+1; x++ ) + { + UPDATE_EZ(); + + f0++; + fx++; + fy++; + } + } + + for( z = 1; z <= nz; z++ ) + { + f0 = &f( 1, ny+1, z ); + fx = &f( 0, ny+1, z ); + fy = &f( 1, ny, z ); + + for( x = 1; x <= nx+1; x++ ) + { + UPDATE_EZ(); + + f0++; + fx++; + fy++; + } + } + + for( z = 1; z <= nz; z++ ) + { + for( y = 2; y <= ny; y++ ) + { + f0 = &f( 1, y, z ); + fx = &f( 0, y, z ); + fy = &f( 1, y-1, z ); + + UPDATE_EZ(); + } + } + + for( z = 1; z <= nz; z++ ) + { + for( y = 2; y <= ny; y++ ) + { + f0 = &f( nx+1, y, z ); + fx = &f( nx, y, z ); + fy = &f( nx+1, y-1, z ); + + UPDATE_EZ(); + } + } + + local_adjust_tang_e( fa->f, fa->g ); +} diff --git a/src/field_advance/standard/pipeline/vacuum_advance_e_pipeline.h b/src/field_advance/standard/pipeline/vacuum_advance_e_pipeline.h new file mode 100644 index 00000000..ee08b2e3 --- /dev/null +++ b/src/field_advance/standard/pipeline/vacuum_advance_e_pipeline.h @@ -0,0 +1,91 @@ +#ifndef _vacuum_advance_e_pipeline_h_ +#define _vacuum_advance_e_pipeline_h_ + +#ifndef IN_vacuum_advance_e_pipeline +#error "Only include vacuum_advance_e_pipeline.h in vacuum_advance_e_pipeline source files." +#endif + +#include "../sfa_private.h" + +typedef struct pipeline_args +{ + field_t * ALIGNED(128) f; + const sfa_params_t * p; + const grid_t * g; +} pipeline_args_t; + +#define DECLARE_STENCIL() \ + field_t * ALIGNED(128) f = args->f; \ + const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ + const grid_t * g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float decayx = m->decayx, drivex = m->drivex; \ + const float decayy = m->decayy, drivey = m->drivey; \ + const float decayz = m->decayz, drivez = m->drivez; \ + const float damp = args->p->damp; \ + const float px_muz = ((nx>1) ? (1+damp)*g->cvac*g->dt*g->rdx : 0)*m->rmuz; \ + const float px_muy = ((nx>1) ? (1+damp)*g->cvac*g->dt*g->rdx : 0)*m->rmuy; \ + const float py_mux = ((ny>1) ? (1+damp)*g->cvac*g->dt*g->rdy : 0)*m->rmux; \ + const float py_muz = ((ny>1) ? (1+damp)*g->cvac*g->dt*g->rdy : 0)*m->rmuz; \ + const float pz_muy = ((nz>1) ? (1+damp)*g->cvac*g->dt*g->rdz : 0)*m->rmuy; \ + const float pz_mux = ((nz>1) ? 
(1+damp)*g->cvac*g->dt*g->rdz : 0)*m->rmux; \ + const float cj = g->dt/g->eps0; \ + \ + field_t * ALIGNED(16) f0; \ + field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ + int x, y, z + +#define f(x,y,z) f[ VOXEL( x, y, z, nx, ny, nz ) ] + +#define INIT_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x-1, y, z ); \ + fy = &f( x, y-1, z ); \ + fz = &f( x, y, z-1 ) + +#define NEXT_STENCIL() \ + f0++; fx++; fy++; fz++; x++; \ + if ( x > nx ) \ + { \ + y++; x = 2; \ + if ( y > ny ) z++; if ( y > ny ) y = 2; \ + INIT_STENCIL(); \ + } + +#define UPDATE_EX() \ + f0->tcax = ( py_muz * ( f0->cbz - fy->cbz ) - \ + pz_muy * ( f0->cby - fz->cby ) ) - damp * f0->tcax; \ + f0->ex = decayx * f0->ex + drivex * ( f0->tcax - cj * f0->jfx ) + +#define UPDATE_EY() \ + f0->tcay = ( pz_mux * ( f0->cbx - fz->cbx ) - \ + px_muz * ( f0->cbz - fx->cbz ) ) - damp * f0->tcay; \ + f0->ey = decayy * f0->ey + drivey * ( f0->tcay - cj * f0->jfy ) + +#define UPDATE_EZ() \ + f0->tcaz = ( px_muy * ( f0->cby - fx->cby ) - \ + py_mux * ( f0->cbx - fy->cbx ) ) - damp * f0->tcaz; \ + f0->ez = decayz * f0->ez + drivez * ( f0->tcaz - cj * f0->jfz ) + +void +vacuum_advance_e_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +vacuum_advance_e_pipeline_v4( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +vacuum_advance_e_pipeline_v8( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +vacuum_advance_e_pipeline_v16( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +#endif // _vacuum_advance_e_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/vacuum_advance_e_pipeline_v16.cc b/src/field_advance/standard/pipeline/vacuum_advance_e_pipeline_v16.cc new file mode 100644 index 00000000..717125b5 --- /dev/null +++ b/src/field_advance/standard/pipeline/vacuum_advance_e_pipeline_v16.cc @@ -0,0 +1,189 @@ +#define IN_sfa +#define IN_vacuum_advance_e_pipeline + +#include "vacuum_advance_e_pipeline.h" + +#include "../sfa_private.h" + +#if defined(V16_ACCELERATION) + +using namespace v16; + +void +vacuum_advance_e_pipeline_v16( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + const v16float vdecayx( decayx ), vdrivex( drivex ); + const v16float vdecayy( decayy ), vdrivey( drivey ); + const v16float vdecayz( decayz ), vdrivez( drivez ); + const v16float vdamp( damp ); + const v16float vpx_muz( px_muz ), vpx_muy( px_muy ); + const v16float vpy_mux( py_mux ), vpy_muz( py_muz ); + const v16float vpz_muy( pz_muy ), vpz_mux( pz_mux ); + const v16float vcj( cj ); + + v16float save0, save1, dummy; + + v16float f0_ex, f0_ey, f0_ez; + v16float f0_cbx, f0_cby, f0_cbz; + v16float f0_tcax, f0_tcay, f0_tcaz; + v16float f0_jfx, f0_jfy, f0_jfz; + v16float fx_cby, fx_cbz; + v16float fy_cbx, fy_cbz; + v16float fz_cbx, fz_cby; + + field_t * ALIGNED(16) f000, * ALIGNED(16) f001, * ALIGNED(16) f002, * ALIGNED(16) f003; // Voxel block + field_t * ALIGNED(16) f004, * ALIGNED(16) f005, * ALIGNED(16) f006, * ALIGNED(16) f007; // Voxel block + field_t * ALIGNED(16) f008, * ALIGNED(16) f009, * ALIGNED(16) f010, * ALIGNED(16) f011; // Voxel block + field_t * ALIGNED(16) f012, * ALIGNED(16) f013, * ALIGNED(16) f014, * ALIGNED(16) f015; // Voxel block + + field_t * ALIGNED(16) fx00, * ALIGNED(16) fx01, * ALIGNED(16) fx02, * ALIGNED(16) fx03; // Voxel block +x neighbors + field_t * ALIGNED(16) fx04, * 
ALIGNED(16) fx05, * ALIGNED(16) fx06, * ALIGNED(16) fx07; // Voxel block +x neighbors + field_t * ALIGNED(16) fx08, * ALIGNED(16) fx09, * ALIGNED(16) fx10, * ALIGNED(16) fx11; // Voxel block +x neighbors + field_t * ALIGNED(16) fx12, * ALIGNED(16) fx13, * ALIGNED(16) fx14, * ALIGNED(16) fx15; // Voxel block +x neighbors + + field_t * ALIGNED(16) fy00, * ALIGNED(16) fy01, * ALIGNED(16) fy02, * ALIGNED(16) fy03; // Voxel block +y neighbors + field_t * ALIGNED(16) fy04, * ALIGNED(16) fy05, * ALIGNED(16) fy06, * ALIGNED(16) fy07; // Voxel block +y neighbors + field_t * ALIGNED(16) fy08, * ALIGNED(16) fy09, * ALIGNED(16) fy10, * ALIGNED(16) fy11; // Voxel block +y neighbors + field_t * ALIGNED(16) fy12, * ALIGNED(16) fy13, * ALIGNED(16) fy14, * ALIGNED(16) fy15; // Voxel block +y neighbors + + field_t * ALIGNED(16) fz00, * ALIGNED(16) fz01, * ALIGNED(16) fz02, * ALIGNED(16) fz03; // Voxel block +z neighbors + field_t * ALIGNED(16) fz04, * ALIGNED(16) fz05, * ALIGNED(16) fz06, * ALIGNED(16) fz07; // Voxel block +z neighbors + field_t * ALIGNED(16) fz08, * ALIGNED(16) fz09, * ALIGNED(16) fz10, * ALIGNED(16) fz11; // Voxel block +z neighbors + field_t * ALIGNED(16) fz12, * ALIGNED(16) fz13, * ALIGNED(16) fz14, * ALIGNED(16) fz15; // Voxel block +z neighbors + + // Process the bulk of the voxels 16 at a time + + INIT_STENCIL(); + + for( ; n_voxel > 15; n_voxel -= 16 ) + { + f000 = f0; fx00 = fx; fy00 = fy; fz00 = fz; NEXT_STENCIL(); + f001 = f0; fx01 = fx; fy01 = fy; fz01 = fz; NEXT_STENCIL(); + f002 = f0; fx02 = fx; fy02 = fy; fz02 = fz; NEXT_STENCIL(); + f003 = f0; fx03 = fx; fy03 = fy; fz03 = fz; NEXT_STENCIL(); + f004 = f0; fx04 = fx; fy04 = fy; fz04 = fz; NEXT_STENCIL(); + f005 = f0; fx05 = fx; fy05 = fy; fz05 = fz; NEXT_STENCIL(); + f006 = f0; fx06 = fx; fy06 = fy; fz06 = fz; NEXT_STENCIL(); + f007 = f0; fx07 = fx; fy07 = fy; fz07 = fz; NEXT_STENCIL(); + f008 = f0; fx08 = fx; fy08 = fy; fz08 = fz; NEXT_STENCIL(); + f009 = f0; fx09 = fx; fy09 = fy; fz09 = fz; NEXT_STENCIL(); + f010 = f0; fx10 = fx; fy10 = fy; fz10 = fz; NEXT_STENCIL(); + f011 = f0; fx11 = fx; fy11 = fy; fz11 = fz; NEXT_STENCIL(); + f012 = f0; fx12 = fx; fy12 = fy; fz12 = fz; NEXT_STENCIL(); + f013 = f0; fx13 = fx; fy13 = fy; fz13 = fz; NEXT_STENCIL(); + f014 = f0; fx14 = fx; fy14 = fy; fz14 = fz; NEXT_STENCIL(); + f015 = f0; fx15 = fx; fy15 = fy; fz15 = fz; NEXT_STENCIL(); + + //------------------------------------------------------------------------// + // Load field data. 
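+ // The *_tr loads transpose array-of-structs field data into lane-parallel + // vectors: lane n of f0_ex / f0_ey / ... holds the value from voxel f00n. + // save0 and save1 capture the field_t word just past ez and tcaz so the + // 4-wide transposed stores at the bottom of the loop can write it back + // unchanged (see the note on store_16x4_tr below).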
+ //------------------------------------------------------------------------// + + load_16x4_tr( &f000->ex, &f001->ex, &f002->ex, &f003->ex, + &f004->ex, &f005->ex, &f006->ex, &f007->ex, + &f008->ex, &f009->ex, &f010->ex, &f011->ex, + &f012->ex, &f013->ex, &f014->ex, &f015->ex, + f0_ex, f0_ey, f0_ez, save0 ); + + load_16x3_tr( &f000->cbx, &f001->cbx, &f002->cbx, &f003->cbx, + &f004->cbx, &f005->cbx, &f006->cbx, &f007->cbx, + &f008->cbx, &f009->cbx, &f010->cbx, &f011->cbx, + &f012->cbx, &f013->cbx, &f014->cbx, &f015->cbx, + f0_cbx, f0_cby, f0_cbz ); + + load_16x4_tr( &f000->tcax, &f001->tcax, &f002->tcax, &f003->tcax, + &f004->tcax, &f005->tcax, &f006->tcax, &f007->tcax, + &f008->tcax, &f009->tcax, &f010->tcax, &f011->tcax, + &f012->tcax, &f013->tcax, &f014->tcax, &f015->tcax, + f0_tcax, f0_tcay, f0_tcaz, save1 ); + + load_16x3_tr( &f000->jfx, &f001->jfx, &f002->jfx, &f003->jfx, + &f004->jfx, &f005->jfx, &f006->jfx, &f007->jfx, + &f008->jfx, &f009->jfx, &f010->jfx, &f011->jfx, + &f012->jfx, &f013->jfx, &f014->jfx, &f015->jfx, + f0_jfx, f0_jfy, f0_jfz ); + + load_16x3_tr( &fx00->cbx, &fx01->cbx, &fx02->cbx, &fx03->cbx, + &fx04->cbx, &fx05->cbx, &fx06->cbx, &fx07->cbx, + &fx08->cbx, &fx09->cbx, &fx10->cbx, &fx11->cbx, + &fx12->cbx, &fx13->cbx, &fx14->cbx, &fx15->cbx, + dummy, fx_cby, fx_cbz ); + + load_16x3_tr( &fy00->cbx, &fy01->cbx, &fy02->cbx, &fy03->cbx, + &fy04->cbx, &fy05->cbx, &fy06->cbx, &fy07->cbx, + &fy08->cbx, &fy09->cbx, &fy10->cbx, &fy11->cbx, + &fy12->cbx, &fy13->cbx, &fy14->cbx, &fy15->cbx, + fy_cbx, dummy, fy_cbz ); + + load_16x2_tr( &fz00->cbx, &fz01->cbx, &fz02->cbx, &fz03->cbx, + &fz04->cbx, &fz05->cbx, &fz06->cbx, &fz07->cbx, + &fz08->cbx, &fz09->cbx, &fz10->cbx, &fz11->cbx, + &fz12->cbx, &fz13->cbx, &fz14->cbx, &fz15->cbx, + fz_cbx, fz_cby ); + + f0_tcax = fnms( vdamp, + f0_tcax, + fms( vpy_muz, + ( f0_cbz - fy_cbz ), + vpz_muy * ( f0_cby - fz_cby ) ) ); + + f0_tcay = fnms( vdamp, + f0_tcay, + fms( vpz_mux, + ( f0_cbx - fz_cbx ), + vpx_muz * ( f0_cbz - fx_cbz ) ) ); + + f0_tcaz = fnms( vdamp, + f0_tcaz, + fms( vpx_muy, + ( f0_cby - fx_cby ), + vpy_mux * ( f0_cbx - fy_cbx ) ) ); + + f0_ex = fma( vdecayx, f0_ex, vdrivex * fnms( vcj, f0_jfx, f0_tcax ) ); + + f0_ey = fma( vdecayy, f0_ey, vdrivey * fnms( vcj, f0_jfy, f0_tcay ) ); + + f0_ez = fma( vdecayz, f0_ez, vdrivez * fnms( vcj, f0_jfz, f0_tcaz ) ); + + //------------------------------------------------------------------------// + // Note: + //------------------------------------------------------------------------// + // Unlike load_16x3 versus load_16x4, store_16x4 is much more efficient + // than store_16x3. + //------------------------------------------------------------------------// + + store_16x4_tr( f0_ex, f0_ey, f0_ez, save0, + &f000->ex, &f001->ex, &f002->ex, &f003->ex, + &f004->ex, &f005->ex, &f006->ex, &f007->ex, + &f008->ex, &f009->ex, &f010->ex, &f011->ex, + &f012->ex, &f013->ex, &f014->ex, &f015->ex ); + + store_16x4_tr( f0_tcax, f0_tcay, f0_tcaz, save1, + &f000->tcax, &f001->tcax, &f002->tcax, &f003->tcax, + &f004->tcax, &f005->tcax, &f006->tcax, &f007->tcax, + &f008->tcax, &f009->tcax, &f010->tcax, &f011->tcax, + &f012->tcax, &f013->tcax, &f014->tcax, &f015->tcax ); + } +} + +#else + +void +vacuum_advance_e_pipeline_v16( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v16 implementation. + ERROR( ( "No vacuum_advance_e_pipeline_v16 implementation." 
) ); +} + +#endif diff --git a/src/field_advance/standard/pipeline/vacuum_advance_e_pipeline_v4.cc b/src/field_advance/standard/pipeline/vacuum_advance_e_pipeline_v4.cc new file mode 100644 index 00000000..403369ef --- /dev/null +++ b/src/field_advance/standard/pipeline/vacuum_advance_e_pipeline_v4.cc @@ -0,0 +1,138 @@ +#define IN_sfa +#define IN_vacuum_advance_e_pipeline + +#include "vacuum_advance_e_pipeline.h" + +#include "../sfa_private.h" + +#if defined(V4_ACCELERATION) + +using namespace v4; + +void +vacuum_advance_e_pipeline_v4( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + const v4float vdecayx( decayx ), vdrivex( drivex ); + const v4float vdecayy( decayy ), vdrivey( drivey ); + const v4float vdecayz( decayz ), vdrivez( drivez ); + const v4float vdamp( damp ); + const v4float vpx_muz( px_muz ), vpx_muy( px_muy ); + const v4float vpy_mux( py_mux ), vpy_muz( py_muz ); + const v4float vpz_muy( pz_muy ), vpz_mux( pz_mux ); + const v4float vcj( cj ); + + v4float save0, save1, dummy; + + v4float f0_ex, f0_ey, f0_ez; + v4float f0_cbx, f0_cby, f0_cbz; + v4float f0_tcax, f0_tcay, f0_tcaz; + v4float f0_jfx, f0_jfy, f0_jfz; + v4float fx_cby, fx_cbz; + v4float fy_cbx, fy_cbz; + v4float fz_cbx, fz_cby; + + field_t * ALIGNED(16) f00, * ALIGNED(16) f01, * ALIGNED(16) f02, * ALIGNED(16) f03; // Voxel block + + field_t * ALIGNED(16) fx0, * ALIGNED(16) fx1, * ALIGNED(16) fx2, * ALIGNED(16) fx3; // Voxel block +x neighbors + + field_t * ALIGNED(16) fy0, * ALIGNED(16) fy1, * ALIGNED(16) fy2, * ALIGNED(16) fy3; // Voxel block +y neighbors + + field_t * ALIGNED(16) fz0, * ALIGNED(16) fz1, * ALIGNED(16) fz2, * ALIGNED(16) fz3; // Voxel block +z neighbors + + // Process the bulk of the voxels 4 at a time + + INIT_STENCIL(); + + for( ; n_voxel > 3; n_voxel -= 4 ) + { + f00 = f0; fx0 = fx; fy0 = fy; fz0 = fz; NEXT_STENCIL(); + f01 = f0; fx1 = fx; fy1 = fy; fz1 = fz; NEXT_STENCIL(); + f02 = f0; fx2 = fx; fy2 = fy; fz2 = fz; NEXT_STENCIL(); + f03 = f0; fx3 = fx; fy3 = fy; fz3 = fz; NEXT_STENCIL(); + + //------------------------------------------------------------------------// + // Load field data. 
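+ // Assuming the usual VPIC vector conventions fms(a,b,c) = a*b - c and + // fnms(a,b,c) = c - a*b, the expressions below reproduce the scalar + // UPDATE_EX/EY/EZ macros exactly, e.g. + //   tcax = ( py_muz*(cbz - fy->cbz) - pz_muy*(cby - fz->cby) ) - damp*tcax, + //   ex   = decayx*ex + drivex*( tcax - cj*jfx ).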
+ //------------------------------------------------------------------------// + + load_4x4_tr( &f00->ex, &f01->ex, &f02->ex, &f03->ex, + f0_ex, f0_ey, f0_ez, save0 ); + + load_4x3_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, + f0_cbx, f0_cby, f0_cbz ); + + load_4x4_tr( &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax, + f0_tcax, f0_tcay, f0_tcaz, save1 ); + + load_4x3_tr( &f00->jfx, &f01->jfx, &f02->jfx, &f03->jfx, + f0_jfx, f0_jfy, f0_jfz ); + + load_4x3_tr( &fx0->cbx, &fx1->cbx, &fx2->cbx, &fx3->cbx, + dummy, fx_cby, fx_cbz ); + + load_4x3_tr( &fy0->cbx, &fy1->cbx, &fy2->cbx, &fy3->cbx, + fy_cbx, dummy, fy_cbz ); + + load_4x2_tr( &fz0->cbx, &fz1->cbx, &fz2->cbx, &fz3->cbx, + fz_cbx, fz_cby ); + + f0_tcax = fnms( vdamp, + f0_tcax, + fms( vpy_muz, + ( f0_cbz - fy_cbz ), + vpz_muy * ( f0_cby - fz_cby ) ) ); + + f0_tcay = fnms( vdamp, + f0_tcay, + fms( vpz_mux, + ( f0_cbx - fz_cbx ), + vpx_muz * ( f0_cbz - fx_cbz ) ) ); + + f0_tcaz = fnms( vdamp, + f0_tcaz, + fms( vpx_muy, + ( f0_cby - fx_cby ), + vpy_mux * ( f0_cbx - fy_cbx ) ) ); + + f0_ex = fma( vdecayx, f0_ex, vdrivex * fnms( vcj, f0_jfx, f0_tcax ) ); + + f0_ey = fma( vdecayy, f0_ey, vdrivey * fnms( vcj, f0_jfy, f0_tcay ) ); + + f0_ez = fma( vdecayz, f0_ez, vdrivez * fnms( vcj, f0_jfz, f0_tcaz ) ); + + //------------------------------------------------------------------------// + // Note: + //------------------------------------------------------------------------// + // Unlike load_4x3 versus load_4x4, store_4x4 is much more efficient + // than store_4x3. + //------------------------------------------------------------------------// + + store_4x4_tr( f0_ex, f0_ey, f0_ez, save0, + &f00->ex, &f01->ex, &f02->ex, &f03->ex ); + + store_4x4_tr( f0_tcax, f0_tcay, f0_tcaz, save1, + &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax ); + } +} + +#else + +void +vacuum_advance_e_pipeline_v4( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v4 implementation. + ERROR( ( "No vacuum_advance_e_pipeline_v4 implementation." 
) ); +} + +#endif diff --git a/src/field_advance/standard/pipeline/vacuum_advance_e_pipeline_v8.cc b/src/field_advance/standard/pipeline/vacuum_advance_e_pipeline_v8.cc new file mode 100644 index 00000000..30e435ed --- /dev/null +++ b/src/field_advance/standard/pipeline/vacuum_advance_e_pipeline_v8.cc @@ -0,0 +1,155 @@ +#define IN_sfa +#define IN_vacuum_advance_e_pipeline + +#include "vacuum_advance_e_pipeline.h" + +#include "../sfa_private.h" + +#if defined(V8_ACCELERATION) + +using namespace v8; + +void +vacuum_advance_e_pipeline_v8( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + const v8float vdecayx( decayx ), vdrivex( drivex ); + const v8float vdecayy( decayy ), vdrivey( drivey ); + const v8float vdecayz( decayz ), vdrivez( drivez ); + const v8float vdamp( damp ); + const v8float vpx_muz( px_muz ), vpx_muy( px_muy ); + const v8float vpy_mux( py_mux ), vpy_muz( py_muz ); + const v8float vpz_muy( pz_muy ), vpz_mux( pz_mux ); + const v8float vcj( cj ); + + v8float save0, save1, dummy; + + v8float f0_ex, f0_ey, f0_ez; + v8float f0_cbx, f0_cby, f0_cbz; + v8float f0_tcax, f0_tcay, f0_tcaz; + v8float f0_jfx, f0_jfy, f0_jfz; + v8float fx_cby, fx_cbz; + v8float fy_cbx, fy_cbz; + v8float fz_cbx, fz_cby; + + field_t * ALIGNED(32) f00, * ALIGNED(32) f01, * ALIGNED(32) f02, * ALIGNED(32) f03; // Voxel block + field_t * ALIGNED(32) f04, * ALIGNED(32) f05, * ALIGNED(32) f06, * ALIGNED(32) f07; // Voxel block + + field_t * ALIGNED(32) fx0, * ALIGNED(32) fx1, * ALIGNED(32) fx2, * ALIGNED(32) fx3; // Voxel block +x neighbors + field_t * ALIGNED(32) fx4, * ALIGNED(32) fx5, * ALIGNED(32) fx6, * ALIGNED(32) fx7; // Voxel block +x neighbors + + field_t * ALIGNED(32) fy0, * ALIGNED(32) fy1, * ALIGNED(32) fy2, * ALIGNED(32) fy3; // Voxel block +y neighbors + field_t * ALIGNED(32) fy4, * ALIGNED(32) fy5, * ALIGNED(32) fy6, * ALIGNED(32) fy7; // Voxel block +y neighbors + + field_t * ALIGNED(32) fz0, * ALIGNED(32) fz1, * ALIGNED(32) fz2, * ALIGNED(32) fz3; // Voxel block +z neighbors + field_t * ALIGNED(32) fz4, * ALIGNED(32) fz5, * ALIGNED(32) fz6, * ALIGNED(32) fz7; // Voxel block +z neighbors + + // Process the bulk of the voxels 8 at a time + + INIT_STENCIL(); + + for( ; n_voxel > 7; n_voxel -= 8 ) + { + f00 = f0; fx0 = fx; fy0 = fy; fz0 = fz; NEXT_STENCIL(); + f01 = f0; fx1 = fx; fy1 = fy; fz1 = fz; NEXT_STENCIL(); + f02 = f0; fx2 = fx; fy2 = fy; fz2 = fz; NEXT_STENCIL(); + f03 = f0; fx3 = fx; fy3 = fy; fz3 = fz; NEXT_STENCIL(); + f04 = f0; fx4 = fx; fy4 = fy; fz4 = fz; NEXT_STENCIL(); + f05 = f0; fx5 = fx; fy5 = fy; fz5 = fz; NEXT_STENCIL(); + f06 = f0; fx6 = fx; fy6 = fy; fz6 = fz; NEXT_STENCIL(); + f07 = f0; fx7 = fx; fy7 = fy; fz7 = fz; NEXT_STENCIL(); + + //------------------------------------------------------------------------// + // Load field data. 
+ //------------------------------------------------------------------------// + + load_8x4_tr( &f00->ex, &f01->ex, &f02->ex, &f03->ex, + &f04->ex, &f05->ex, &f06->ex, &f07->ex, + f0_ex, f0_ey, f0_ez, save0 ); + + load_8x3_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, + &f04->cbx, &f05->cbx, &f06->cbx, &f07->cbx, + f0_cbx, f0_cby, f0_cbz ); + + load_8x4_tr( &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax, + &f04->tcax, &f05->tcax, &f06->tcax, &f07->tcax, + f0_tcax, f0_tcay, f0_tcaz, save1 ); + + load_8x3_tr( &f00->jfx, &f01->jfx, &f02->jfx, &f03->jfx, + &f04->jfx, &f05->jfx, &f06->jfx, &f07->jfx, + f0_jfx, f0_jfy, f0_jfz ); + + load_8x3_tr( &fx0->cbx, &fx1->cbx, &fx2->cbx, &fx3->cbx, + &fx4->cbx, &fx5->cbx, &fx6->cbx, &fx7->cbx, + dummy, fx_cby, fx_cbz ); + + load_8x3_tr( &fy0->cbx, &fy1->cbx, &fy2->cbx, &fy3->cbx, + &fy4->cbx, &fy5->cbx, &fy6->cbx, &fy7->cbx, + fy_cbx, dummy, fy_cbz ); + + load_8x2_tr( &fz0->cbx, &fz1->cbx, &fz2->cbx, &fz3->cbx, + &fz4->cbx, &fz5->cbx, &fz6->cbx, &fz7->cbx, + fz_cbx, fz_cby ); + + f0_tcax = fnms( vdamp, + f0_tcax, + fms( vpy_muz, + ( f0_cbz - fy_cbz ), + vpz_muy * ( f0_cby - fz_cby ) ) ); + + f0_tcay = fnms( vdamp, + f0_tcay, + fms( vpz_mux, + ( f0_cbx - fz_cbx ), + vpx_muz * ( f0_cbz - fx_cbz ) ) ); + + f0_tcaz = fnms( vdamp, + f0_tcaz, + fms( vpx_muy, + ( f0_cby - fx_cby ), + vpy_mux * ( f0_cbx - fy_cbx ) ) ); + + f0_ex = fma( vdecayx, f0_ex, vdrivex * fnms( vcj, f0_jfx, f0_tcax ) ); + + f0_ey = fma( vdecayy, f0_ey, vdrivey * fnms( vcj, f0_jfy, f0_tcay ) ); + + f0_ez = fma( vdecayz, f0_ez, vdrivez * fnms( vcj, f0_jfz, f0_tcaz ) ); + + //------------------------------------------------------------------------// + // Note: + //------------------------------------------------------------------------// + // Unlike load_8x3 versus load_8x4, store_8x4 is much more efficient + // than store_8x3. + //------------------------------------------------------------------------// + + store_8x4_tr( f0_ex, f0_ey, f0_ez, save0, + &f00->ex, &f01->ex, &f02->ex, &f03->ex, + &f04->ex, &f05->ex, &f06->ex, &f07->ex ); + + store_8x4_tr( f0_tcax, f0_tcay, f0_tcaz, save1, + &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax, + &f04->tcax, &f05->tcax, &f06->tcax, &f07->tcax ); + } +} + +#else + +void +vacuum_advance_e_pipeline_v8( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v8 implementation. + ERROR( ( "No vacuum_advance_e_pipeline_v8 implementation." 
) ); +} + +#endif diff --git a/src/field_advance/standard/pipeline/vacuum_clean_div_e_pipeline.c b/src/field_advance/standard/pipeline/vacuum_clean_div_e_pipeline.c new file mode 100644 index 00000000..c616dda9 --- /dev/null +++ b/src/field_advance/standard/pipeline/vacuum_clean_div_e_pipeline.c @@ -0,0 +1,146 @@ +#define IN_sfa +#define IN_vacuum_clean_div_e_pipeline + +#include "vacuum_clean_div_e_pipeline.h" + +#include "../sfa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +static void +vacuum_clean_div_e_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + INIT_STENCIL(); + for( ; n_voxel; n_voxel-- ) + { + MARDER_EX(); + MARDER_EY(); + MARDER_EZ(); + + NEXT_STENCIL(); + } +} + +#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) + +#error "Not implemented" + +#endif + +void +vacuum_clean_div_e_pipeline( field_array_t * fa ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); + } + + // Do majority of field components in single pass on the pipelines. + // The host handles stragglers. + + pipeline_args_t args[1]; + + args->f = fa->f; + args->p = (sfa_params_t *) fa->params; + args->g = fa->g; + + EXEC_PIPELINES( vacuum_clean_div_e, args, 0 ); + + // While pipelines are busy, do left overs on the host + + DECLARE_STENCIL(); + + // Do left over ex + for( y=1; y<=ny+1; y++ ) + { + f0 = &f(1,y,nz+1); + fx = &f(2,y,nz+1); + + for( x=1; x<=nx; x++ ) + { + MARDER_EX(); + + f0++; fx++; + } + } + + for( z=1; z<=nz; z++ ) + { + f0 = &f(1,ny+1,z); + fx = &f(2,ny+1,z); + + for( x=1; x<=nx; x++ ) + { + MARDER_EX(); + + f0++; + fx++; + } + } + + // Do left over ey + for( z=1; z<=nz+1; z++ ) + { + for( y=1; y<=ny; y++ ) + { + f0 = &f(nx+1,y, z); + fy = &f(nx+1,y+1,z); + + MARDER_EY(); + } + } + + for( y=1; y<=ny; y++ ) + { + f0 = &f(1,y, nz+1); + fy = &f(1,y+1,nz+1); + + for( x=1; x<=nx; x++ ) + { + MARDER_EY(); + + f0++; + fy++; + } + } + + // Do left over ez + for( z=1; z<=nz; z++ ) + { + f0 = &f(1,ny+1,z); + fz = &f(1,ny+1,z+1); + + for( x=1; x<=nx+1; x++ ) + { + MARDER_EZ(); + + f0++; + fz++; + } + } + + for( z=1; z<=nz; z++ ) + { + for( y=1; y<=ny; y++ ) + { + f0 = &f(nx+1,y,z); + fz = &f(nx+1,y,z+1); + + MARDER_EZ(); + } + } + + WAIT_PIPELINES(); + + local_adjust_tang_e( fa->f, fa->g ); +} diff --git a/src/field_advance/standard/pipeline/vacuum_clean_div_e_pipeline.h b/src/field_advance/standard/pipeline/vacuum_clean_div_e_pipeline.h new file mode 100644 index 00000000..bf031369 --- /dev/null +++ b/src/field_advance/standard/pipeline/vacuum_clean_div_e_pipeline.h @@ -0,0 +1,60 @@ +#ifndef _vacuum_clean_div_e_pipeline_h_ +#define _vacuum_clean_div_e_pipeline_h_ + +#ifndef IN_vacuum_clean_div_e_pipeline +#error "Only include vacuum_clean_div_e_pipeline.h in vacuum_clean_div_e_pipeline source files." +#endif + +#include "../sfa_private.h" + +typedef struct pipeline_args +{ + field_t * ALIGNED(128) f; + const sfa_params_t * p; + const grid_t * g; +} pipeline_args_t; + +#define DECLARE_STENCIL() \ + field_t * ALIGNED(128) f = args->f; \ + const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ + const grid_t * g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float _rdx = (nx>1) ? g->rdx : 0; \ + const float _rdy = (ny>1) ? g->rdy : 0; \ + const float _rdz = (nz>1) ? 
g->rdz : 0; \ + const float alphadt = 0.3888889/( _rdx*_rdx + _rdy*_rdy + _rdz*_rdz ); \ + const float px = (alphadt*_rdx)*m->drivex; \ + const float py = (alphadt*_rdy)*m->drivey; \ + const float pz = (alphadt*_rdz)*m->drivez; \ + \ + field_t * ALIGNED(16) f0; \ + field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ + int x, y, z + +#define f(x,y,z) f[ VOXEL(x,y,z,nx,ny,nz) ] + +#define INIT_STENCIL() \ + f0 = &f(x, y, z ); \ + fx = &f(x+1,y, z ); \ + fy = &f(x, y+1,z ); \ + fz = &f(x, y, z+1) + +#define NEXT_STENCIL() \ + f0++; fx++; fy++; fz++; x++; \ + if( x>nx ) { \ + /**/ y++; x = 1; \ + if( y>ny ) z++; if( y>ny ) y = 1; \ + INIT_STENCIL(); \ + } + +#define MARDER_EX() f0->ex += px*(fx->div_e_err-f0->div_e_err) +#define MARDER_EY() f0->ey += py*(fy->div_e_err-f0->div_e_err) +#define MARDER_EZ() f0->ez += pz*(fz->div_e_err-f0->div_e_err) + +static void +vacuum_clean_div_e_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +#endif // _vacuum_clean_div_e_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/vacuum_compute_curl_b_pipeline.cc b/src/field_advance/standard/pipeline/vacuum_compute_curl_b_pipeline.cc new file mode 100644 index 00000000..67878409 --- /dev/null +++ b/src/field_advance/standard/pipeline/vacuum_compute_curl_b_pipeline.cc @@ -0,0 +1,326 @@ +#define IN_sfa +#define IN_vacuum_compute_curl_b_pipeline + +#define HAS_V4_PIPELINE +#define HAS_V8_PIPELINE +#define HAS_V16_PIPELINE + +#include "vacuum_compute_curl_b_pipeline.h" + +#include "../sfa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +//----------------------------------------------------------------------------// +// Reference implementation for a vacuum_compute_curl_b pipeline function +// which does not make use of explicit calls to vector intrinsic functions. +//----------------------------------------------------------------------------// + +void +vacuum_compute_curl_b_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + INIT_STENCIL(); + + for( ; n_voxel; n_voxel-- ) + { + UPDATE_EX(); + UPDATE_EY(); + UPDATE_EZ(); + + NEXT_STENCIL(); + } +} + +//----------------------------------------------------------------------------// +// Top level function to select and call the proper vacuum_compute_curl_b +// pipeline function. +//----------------------------------------------------------------------------// + +void +vacuum_compute_curl_b_pipeline( field_array_t * RESTRICT fa ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); + } + + //--------------------------------------------------------------------------// + // Begin tangential B ghost setup + //--------------------------------------------------------------------------// + + begin_remote_ghost_tang_b( fa->f, fa->g ); + + local_ghost_tang_b( fa->f, fa->g ); + + //--------------------------------------------------------------------------// + // Update interior fields + //--------------------------------------------------------------------------// + // Note: ex all (1:nx, 1:ny+1,1,nz+1) interior (1:nx,2:ny,2:nz) + // Note: ey all (1:nx+1,1:ny, 1:nz+1) interior (2:nx,1:ny,2:nz) + // Note: ez all (1:nx+1,1:ny+1,1:nz ) interior (1:nx,1:ny,2:nz) + //--------------------------------------------------------------------------// + + // Do majority interior in a single pass. The host handles stragglers. 
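+ // Pipelines update the bulk interior voxels; while they run, the host does + // the thin leftover interior strips below. The exterior faces and edges are + // updated only after WAIT_PIPELINES() and end_remote_ghost_tang_b(), i.e. + // once the remote tangential-B ghost values have actually arrived.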
+ + pipeline_args_t args[1]; + + args->f = fa->f; + args->p = (sfa_params_t *) fa->params; + args->g = fa->g; + + EXEC_PIPELINES( vacuum_compute_curl_b, args, 0 ); + + // While the pipelines are busy, do non-bulk interior fields + + DECLARE_STENCIL(); + + // Do left over interior ex + for( z = 2; z <= nz; z++ ) + { + for( y = 2; y <= ny; y++ ) + { + f0 = &f( 1, y, z ); + fy = &f( 1, y-1, z ); + fz = &f( 1, y, z-1 ); + + UPDATE_EX(); + } + } + + // Do left over interior ey + for( z = 2; z <= nz; z++ ) + { + f0 = &f( 2, 1, z ); + fx = &f( 1, 1, z ); + fz = &f( 2, 1, z-1 ); + + for( x = 2; x <= nx; x++ ) + { + UPDATE_EY(); + + f0++; + fx++; + fz++; + } + } + + // Do left over interior ez + for( y = 2; y <= ny; y++ ) + { + f0 = &f( 2, y, 1 ); + fx = &f( 1, y, 1 ); + fy = &f( 2, y-1, 1 ); + + for( x = 2; x <= nx; x++ ) + { + UPDATE_EZ(); + + f0++; + fx++; + fy++; + } + } + + WAIT_PIPELINES(); + + //--------------------------------------------------------------------------// + // Finish tangential B ghost setup + //--------------------------------------------------------------------------// + + end_remote_ghost_tang_b( fa->f, fa->g ); + + //--------------------------------------------------------------------------// + // Update exterior fields + //--------------------------------------------------------------------------// + + // Do exterior ex + for( y = 1; y <= ny+1; y++ ) + { + f0 = &f( 1, y, 1 ); + fy = &f( 1, y-1, 1 ); + fz = &f( 1, y, 0 ); + + for( x = 1; x <= nx; x++ ) + { + UPDATE_EX(); + + f0++; + fy++; + fz++; + } + } + + for( y = 1; y <= ny+1; y++ ) + { + f0 = &f( 1, y, nz+1 ); + fy = &f( 1, y-1, nz+1 ); + fz = &f( 1, y, nz ); + + for( x = 1; x <= nx; x++ ) + { + UPDATE_EX(); + + f0++; + fy++; + fz++; + } + } + + for( z = 2; z <= nz; z++ ) + { + f0 = &f( 1, 1, z ); + fy = &f( 1, 0, z ); + fz = &f( 1, 1, z-1 ); + + for( x = 1; x <= nx; x++ ) + { + UPDATE_EX(); + + f0++; + fy++; + fz++; + } + } + + for( z = 2; z <= nz; z++ ) + { + f0 = &f( 1, ny+1, z ); + fy = &f( 1, ny, z ); + fz = &f( 1, ny+1, z-1 ); + + for( x = 1; x <= nx; x++ ) + { + UPDATE_EX(); + + f0++; + fy++; + fz++; + } + } + + // Do exterior ey + for( z = 1; z <= nz+1; z++ ) + { + for( y = 1; y <= ny; y++ ) + { + f0 = &f( 1, y, z ); + fx = &f( 0, y, z ); + fz = &f( 1, y, z-1 ); + + UPDATE_EY(); + } + } + + for( z = 1; z <= nz+1; z++ ) + { + for( y = 1; y <= ny; y++ ) + { + f0 = &f( nx+1, y, z ); + fx = &f( nx, y, z ); + fz = &f( nx+1, y, z-1 ); + + UPDATE_EY(); + } + } + + for( y = 1; y <= ny; y++ ) + { + f0 = &f( 2, y, 1 ); + fx = &f( 1, y, 1 ); + fz = &f( 2, y, 0 ); + + for( x = 2; x <= nx; x++ ) + { + UPDATE_EY(); + + f0++; + fx++; + fz++; + } + } + + for( y = 1; y <= ny; y++ ) + { + f0 = &f( 2, y, nz+1 ); + fx = &f( 1, y, nz+1 ); + fz = &f( 2, y, nz ); + + for( x = 2; x <= nx; x++ ) + { + UPDATE_EY(); + + f0++; + fx++; + fz++; + } + } + + // Do exterior ez + for( z = 1; z <= nz; z++ ) + { + f0 = &f( 1, 1, z ); + fx = &f( 0, 1, z ); + fy = &f( 1, 0, z ); + + for( x = 1; x <= nx+1; x++ ) + { + UPDATE_EZ(); + + f0++; + fx++; + fy++; + } + } + + for( z = 1; z <= nz; z++ ) + { + f0 = &f( 1, ny+1, z ); + fx = &f( 0, ny+1, z ); + fy = &f( 1, ny, z ); + + for( x = 1; x <= nx+1; x++ ) + { + UPDATE_EZ(); + + f0++; + fx++; + fy++; + } + } + + for( z = 1; z <= nz; z++ ) + { + for( y = 2; y <= ny; y++ ) + { + f0 = &f( 1, y, z ); + fx = &f( 0, y, z ); + fy = &f( 1, y-1, z ); + + UPDATE_EZ(); + } + } + + for( z = 1; z <= nz; z++ ) + { + for( y = 2; y <= ny; y++ ) + { + f0 = &f( nx+1, y, z ); + fx = &f( nx, y, z ); + fy = &f( nx+1, y-1, 
z ); + + UPDATE_EZ(); + } + } + + local_adjust_tang_e( fa->f, fa->g ); +} diff --git a/src/field_advance/standard/pipeline/vacuum_compute_curl_b_pipeline.h b/src/field_advance/standard/pipeline/vacuum_compute_curl_b_pipeline.h new file mode 100644 index 00000000..ca78b79d --- /dev/null +++ b/src/field_advance/standard/pipeline/vacuum_compute_curl_b_pipeline.h @@ -0,0 +1,80 @@ +#ifndef _vacuum_compute_curl_b_pipeline_h_ +#define _vacuum_compute_curl_b_pipeline_h_ + +#ifndef IN_vacuum_compute_curl_b_pipeline +#error "Only include vacuum_compute_curl_b_pipeline.h in vacuum_compute_curl_b_pipeline source files." +#endif + +#include "../sfa_private.h" + +typedef struct pipeline_args +{ + field_t * ALIGNED(128) f; + const sfa_params_t * p; + const grid_t * g; +} pipeline_args_t; + +#define DECLARE_STENCIL() \ + field_t * ALIGNED(128) f = args->f; \ + const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ + const grid_t * g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float px_muz = ((nx>1) ? g->cvac*g->dt*g->rdx : 0)*m->rmuz; \ + const float px_muy = ((nx>1) ? g->cvac*g->dt*g->rdx : 0)*m->rmuy; \ + const float py_mux = ((ny>1) ? g->cvac*g->dt*g->rdy : 0)*m->rmux; \ + const float py_muz = ((ny>1) ? g->cvac*g->dt*g->rdy : 0)*m->rmuz; \ + const float pz_muy = ((nz>1) ? g->cvac*g->dt*g->rdz : 0)*m->rmuy; \ + const float pz_mux = ((nz>1) ? g->cvac*g->dt*g->rdz : 0)*m->rmux; \ + \ + field_t * ALIGNED(16) f0; \ + field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ + int x, y, z + +#define f(x,y,z) f[ VOXEL( x, y, z, nx, ny, nz ) ] + +#define INIT_STENCIL() \ + f0 = &f( x, y, z ); \ + fx = &f( x-1, y, z ); \ + fy = &f( x, y-1, z ); \ + fz = &f( x, y, z-1 ) + +#define NEXT_STENCIL() \ + f0++; fx++; fy++; fz++; x++; \ + if ( x > nx ) \ + { \ + y++; x = 2; \ + if ( y > ny ) z++; if ( y > ny ) y = 2; \ + INIT_STENCIL(); \ + } + +#define UPDATE_EX() f0->tcax = ( py_muz * ( f0->cbz - fy->cbz ) - \ + pz_muy * ( f0->cby - fz->cby ) ) + +#define UPDATE_EY() f0->tcay = ( pz_mux * ( f0->cbx - fz->cbx ) - \ + px_muz * ( f0->cbz - fx->cbz ) ) + +#define UPDATE_EZ() f0->tcaz = ( px_muy * ( f0->cby - fx->cby ) - \ + py_mux * ( f0->cbx - fy->cbx ) ) + +void +vacuum_compute_curl_b_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +vacuum_compute_curl_b_pipeline_v4( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +vacuum_compute_curl_b_pipeline_v8( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +vacuum_compute_curl_b_pipeline_v16( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +#endif // _vacuum_compute_curl_b_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/vacuum_compute_curl_b_pipeline_v16.cc b/src/field_advance/standard/pipeline/vacuum_compute_curl_b_pipeline_v16.cc new file mode 100644 index 00000000..bac6d121 --- /dev/null +++ b/src/field_advance/standard/pipeline/vacuum_compute_curl_b_pipeline_v16.cc @@ -0,0 +1,146 @@ +#define IN_sfa +#define IN_vacuum_compute_curl_b_pipeline + +#include "vacuum_compute_curl_b_pipeline.h" + +#include "../sfa_private.h" + +#if defined(V16_ACCELERATION) + +using namespace v16; + +void +vacuum_compute_curl_b_pipeline_v16( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + const v16float vpx_muz( px_muz ), vpx_muy( px_muy ); + const v16float vpy_mux( py_mux ), 
vpy_muz( py_muz ); + const v16float vpz_muy( pz_muy ), vpz_mux( pz_mux ); + + v16float save1, dummy; + + v16float f0_cbx, f0_cby, f0_cbz; + v16float f0_tcax, f0_tcay, f0_tcaz; + v16float fx_cby, fx_cbz; + v16float fy_cbx, fy_cbz; + v16float fz_cbx, fz_cby; + + field_t * ALIGNED(16) f000, * ALIGNED(16) f001, * ALIGNED(16) f002, * ALIGNED(16) f003; // Voxel block + field_t * ALIGNED(16) f004, * ALIGNED(16) f005, * ALIGNED(16) f006, * ALIGNED(16) f007; // Voxel block + field_t * ALIGNED(16) f008, * ALIGNED(16) f009, * ALIGNED(16) f010, * ALIGNED(16) f011; // Voxel block + field_t * ALIGNED(16) f012, * ALIGNED(16) f013, * ALIGNED(16) f014, * ALIGNED(16) f015; // Voxel block + + field_t * ALIGNED(16) fx00, * ALIGNED(16) fx01, * ALIGNED(16) fx02, * ALIGNED(16) fx03; // Voxel block +x neighbors + field_t * ALIGNED(16) fx04, * ALIGNED(16) fx05, * ALIGNED(16) fx06, * ALIGNED(16) fx07; // Voxel block +x neighbors + field_t * ALIGNED(16) fx08, * ALIGNED(16) fx09, * ALIGNED(16) fx10, * ALIGNED(16) fx11; // Voxel block +x neighbors + field_t * ALIGNED(16) fx12, * ALIGNED(16) fx13, * ALIGNED(16) fx14, * ALIGNED(16) fx15; // Voxel block +x neighbors + + field_t * ALIGNED(16) fy00, * ALIGNED(16) fy01, * ALIGNED(16) fy02, * ALIGNED(16) fy03; // Voxel block +y neighbors + field_t * ALIGNED(16) fy04, * ALIGNED(16) fy05, * ALIGNED(16) fy06, * ALIGNED(16) fy07; // Voxel block +y neighbors + field_t * ALIGNED(16) fy08, * ALIGNED(16) fy09, * ALIGNED(16) fy10, * ALIGNED(16) fy11; // Voxel block +y neighbors + field_t * ALIGNED(16) fy12, * ALIGNED(16) fy13, * ALIGNED(16) fy14, * ALIGNED(16) fy15; // Voxel block +y neighbors + + field_t * ALIGNED(16) fz00, * ALIGNED(16) fz01, * ALIGNED(16) fz02, * ALIGNED(16) fz03; // Voxel block +z neighbors + field_t * ALIGNED(16) fz04, * ALIGNED(16) fz05, * ALIGNED(16) fz06, * ALIGNED(16) fz07; // Voxel block +z neighbors + field_t * ALIGNED(16) fz08, * ALIGNED(16) fz09, * ALIGNED(16) fz10, * ALIGNED(16) fz11; // Voxel block +z neighbors + field_t * ALIGNED(16) fz12, * ALIGNED(16) fz13, * ALIGNED(16) fz14, * ALIGNED(16) fz15; // Voxel block +z neighbors + + // Process the bulk of the voxels 16 at a time + + INIT_STENCIL(); + + for( ; n_voxel > 15; n_voxel -= 16 ) + { + f000 = f0; fx00 = fx; fy00 = fy; fz00 = fz; NEXT_STENCIL(); + f001 = f0; fx01 = fx; fy01 = fy; fz01 = fz; NEXT_STENCIL(); + f002 = f0; fx02 = fx; fy02 = fy; fz02 = fz; NEXT_STENCIL(); + f003 = f0; fx03 = fx; fy03 = fy; fz03 = fz; NEXT_STENCIL(); + f004 = f0; fx04 = fx; fy04 = fy; fz04 = fz; NEXT_STENCIL(); + f005 = f0; fx05 = fx; fy05 = fy; fz05 = fz; NEXT_STENCIL(); + f006 = f0; fx06 = fx; fy06 = fy; fz06 = fz; NEXT_STENCIL(); + f007 = f0; fx07 = fx; fy07 = fy; fz07 = fz; NEXT_STENCIL(); + f008 = f0; fx08 = fx; fy08 = fy; fz08 = fz; NEXT_STENCIL(); + f009 = f0; fx09 = fx; fy09 = fy; fz09 = fz; NEXT_STENCIL(); + f010 = f0; fx10 = fx; fy10 = fy; fz10 = fz; NEXT_STENCIL(); + f011 = f0; fx11 = fx; fy11 = fy; fz11 = fz; NEXT_STENCIL(); + f012 = f0; fx12 = fx; fy12 = fy; fz12 = fz; NEXT_STENCIL(); + f013 = f0; fx13 = fx; fy13 = fy; fz13 = fz; NEXT_STENCIL(); + f014 = f0; fx14 = fx; fy14 = fy; fz14 = fz; NEXT_STENCIL(); + f015 = f0; fx15 = fx; fy15 = fy; fz15 = fz; NEXT_STENCIL(); + + //------------------------------------------------------------------------// + // Load field data. 
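+ // Unlike advance_e, this kernel only forms the curl-of-B contribution + // tcax/tcay/tcaz (no damping, decay, or current-drive terms), so only the + // cb words and one tcax..tcaz block are loaded; save1 round-trips the + // field_t word after tcaz through the 4-wide store.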
+ //------------------------------------------------------------------------// + + load_16x3_tr( &f000->cbx, &f001->cbx, &f002->cbx, &f003->cbx, + &f004->cbx, &f005->cbx, &f006->cbx, &f007->cbx, + &f008->cbx, &f009->cbx, &f010->cbx, &f011->cbx, + &f012->cbx, &f013->cbx, &f014->cbx, &f015->cbx, + f0_cbx, f0_cby, f0_cbz ); + + load_16x4_tr( &f000->tcax, &f001->tcax, &f002->tcax, &f003->tcax, + &f004->tcax, &f005->tcax, &f006->tcax, &f007->tcax, + &f008->tcax, &f009->tcax, &f010->tcax, &f011->tcax, + &f012->tcax, &f013->tcax, &f014->tcax, &f015->tcax, + f0_tcax, f0_tcay, f0_tcaz, save1 ); + + load_16x3_tr( &fx00->cbx, &fx01->cbx, &fx02->cbx, &fx03->cbx, + &fx04->cbx, &fx05->cbx, &fx06->cbx, &fx07->cbx, + &fx08->cbx, &fx09->cbx, &fx10->cbx, &fx11->cbx, + &fx12->cbx, &fx13->cbx, &fx14->cbx, &fx15->cbx, + dummy, fx_cby, fx_cbz ); + + load_16x3_tr( &fy00->cbx, &fy01->cbx, &fy02->cbx, &fy03->cbx, + &fy04->cbx, &fy05->cbx, &fy06->cbx, &fy07->cbx, + &fy08->cbx, &fy09->cbx, &fy10->cbx, &fy11->cbx, + &fy12->cbx, &fy13->cbx, &fy14->cbx, &fy15->cbx, + fy_cbx, dummy, fy_cbz ); + + load_16x2_tr( &fz00->cbx, &fz01->cbx, &fz02->cbx, &fz03->cbx, + &fz04->cbx, &fz05->cbx, &fz06->cbx, &fz07->cbx, + &fz08->cbx, &fz09->cbx, &fz10->cbx, &fz11->cbx, + &fz12->cbx, &fz13->cbx, &fz14->cbx, &fz15->cbx, + fz_cbx, fz_cby ); + + f0_tcax = fms( vpy_muz, ( f0_cbz - fy_cbz ), vpz_muy * ( f0_cby - fz_cby ) ); + + f0_tcay = fms( vpz_mux, ( f0_cbx - fz_cbx ), vpx_muz * ( f0_cbz - fx_cbz ) ); + + f0_tcaz = fms( vpx_muy, ( f0_cby - fx_cby ), vpy_mux * ( f0_cbx - fy_cbx ) ); + + //------------------------------------------------------------------------// + // Note: + //------------------------------------------------------------------------// + // Unlike load_16x3 versus load_16x4, store_16x4 is much more efficient + // than store_16x3. + //------------------------------------------------------------------------// + + store_16x4_tr( f0_tcax, f0_tcay, f0_tcaz, save1, + &f000->tcax, &f001->tcax, &f002->tcax, &f003->tcax, + &f004->tcax, &f005->tcax, &f006->tcax, &f007->tcax, + &f008->tcax, &f009->tcax, &f010->tcax, &f011->tcax, + &f012->tcax, &f013->tcax, &f014->tcax, &f015->tcax ); + } +} + +#else + +void +vacuum_compute_curl_b_pipeline_v16( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v16 implementation. + ERROR( ( "No vacuum_compute_curl_b_pipeline_v16 implementation." 
) ); +} + +#endif diff --git a/src/field_advance/standard/pipeline/vacuum_compute_curl_b_pipeline_v4.cc b/src/field_advance/standard/pipeline/vacuum_compute_curl_b_pipeline_v4.cc new file mode 100644 index 00000000..e12fd2a8 --- /dev/null +++ b/src/field_advance/standard/pipeline/vacuum_compute_curl_b_pipeline_v4.cc @@ -0,0 +1,104 @@ +#define IN_sfa +#define IN_vacuum_compute_curl_b_pipeline + +#include "vacuum_compute_curl_b_pipeline.h" + +#include "../sfa_private.h" + +#if defined(V4_ACCELERATION) + +using namespace v4; + +void +vacuum_compute_curl_b_pipeline_v4( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + const v4float vpx_muz( px_muz ), vpx_muy( px_muy ); + const v4float vpy_mux( py_mux ), vpy_muz( py_muz ); + const v4float vpz_muy( pz_muy ), vpz_mux( pz_mux ); + + v4float save1, dummy; + + v4float f0_cbx, f0_cby, f0_cbz; + v4float f0_tcax, f0_tcay, f0_tcaz; + v4float fx_cby, fx_cbz; + v4float fy_cbx, fy_cbz; + v4float fz_cbx, fz_cby; + + field_t * ALIGNED(16) f00, * ALIGNED(16) f01, * ALIGNED(16) f02, * ALIGNED(16) f03; // Voxel block + + field_t * ALIGNED(16) fx0, * ALIGNED(16) fx1, * ALIGNED(16) fx2, * ALIGNED(16) fx3; // Voxel block +x neighbors + + field_t * ALIGNED(16) fy0, * ALIGNED(16) fy1, * ALIGNED(16) fy2, * ALIGNED(16) fy3; // Voxel block +y neighbors + + field_t * ALIGNED(16) fz0, * ALIGNED(16) fz1, * ALIGNED(16) fz2, * ALIGNED(16) fz3; // Voxel block +z neighbors + + // Process the bulk of the voxels 4 at a time + + INIT_STENCIL(); + + for( ; n_voxel > 3; n_voxel -= 4 ) + { + f00 = f0; fx0 = fx; fy0 = fy; fz0 = fz; NEXT_STENCIL(); + f01 = f0; fx1 = fx; fy1 = fy; fz1 = fz; NEXT_STENCIL(); + f02 = f0; fx2 = fx; fy2 = fy; fz2 = fz; NEXT_STENCIL(); + f03 = f0; fx3 = fx; fy3 = fy; fz3 = fz; NEXT_STENCIL(); + + //------------------------------------------------------------------------// + // Load field data. + //------------------------------------------------------------------------// + + load_4x3_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, + f0_cbx, f0_cby, f0_cbz ); + + load_4x4_tr( &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax, + f0_tcax, f0_tcay, f0_tcaz, save1 ); + + load_4x3_tr( &fx0->cbx, &fx1->cbx, &fx2->cbx, &fx3->cbx, + dummy, fx_cby, fx_cbz ); + + load_4x3_tr( &fy0->cbx, &fy1->cbx, &fy2->cbx, &fy3->cbx, + fy_cbx, dummy, fy_cbz ); + + load_4x2_tr( &fz0->cbx, &fz1->cbx, &fz2->cbx, &fz3->cbx, + fz_cbx, fz_cby ); + + f0_tcax = fms( vpy_muz, ( f0_cbz - fy_cbz ), vpz_muy * ( f0_cby - fz_cby ) ); + + f0_tcay = fms( vpz_mux, ( f0_cbx - fz_cbx ), vpx_muz * ( f0_cbz - fx_cbz ) ); + + f0_tcaz = fms( vpx_muy, ( f0_cby - fx_cby ), vpy_mux * ( f0_cbx - fy_cbx ) ); + + //------------------------------------------------------------------------// + // Note: + //------------------------------------------------------------------------// + // Unlike load_4x3 versus load_4x4, store_4x4 is much more efficient + // than store_4x3. + //------------------------------------------------------------------------// + + store_4x4_tr( f0_tcax, f0_tcay, f0_tcaz, save1, + &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax ); + } +} + +#else + +void +vacuum_compute_curl_b_pipeline_v4( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v4 implementation. + ERROR( ( "No vacuum_compute_curl_b_pipeline_v4 implementation." 
) ); +} + +#endif diff --git a/src/field_advance/standard/pipeline/vacuum_compute_curl_b_pipeline_v8.cc b/src/field_advance/standard/pipeline/vacuum_compute_curl_b_pipeline_v8.cc new file mode 100644 index 00000000..b7553963 --- /dev/null +++ b/src/field_advance/standard/pipeline/vacuum_compute_curl_b_pipeline_v8.cc @@ -0,0 +1,118 @@ +#define IN_sfa +#define IN_vacuum_compute_curl_b_pipeline + +#include "vacuum_compute_curl_b_pipeline.h" + +#include "../sfa_private.h" + +#if defined(V8_ACCELERATION) + +using namespace v8; + +void +vacuum_compute_curl_b_pipeline_v8( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + const v8float vpx_muz( px_muz ), vpx_muy( px_muy ); + const v8float vpy_mux( py_mux ), vpy_muz( py_muz ); + const v8float vpz_muy( pz_muy ), vpz_mux( pz_mux ); + + v8float save1, dummy; + + v8float f0_cbx, f0_cby, f0_cbz; + v8float f0_tcax, f0_tcay, f0_tcaz; + v8float fx_cby, fx_cbz; + v8float fy_cbx, fy_cbz; + v8float fz_cbx, fz_cby; + + field_t * ALIGNED(32) f00, * ALIGNED(32) f01, * ALIGNED(32) f02, * ALIGNED(32) f03; // Voxel block + field_t * ALIGNED(32) f04, * ALIGNED(32) f05, * ALIGNED(32) f06, * ALIGNED(32) f07; // Voxel block + + field_t * ALIGNED(32) fx0, * ALIGNED(32) fx1, * ALIGNED(32) fx2, * ALIGNED(32) fx3; // Voxel block +x neighbors + field_t * ALIGNED(32) fx4, * ALIGNED(32) fx5, * ALIGNED(32) fx6, * ALIGNED(32) fx7; // Voxel block +x neighbors + + field_t * ALIGNED(32) fy0, * ALIGNED(32) fy1, * ALIGNED(32) fy2, * ALIGNED(32) fy3; // Voxel block +y neighbors + field_t * ALIGNED(32) fy4, * ALIGNED(32) fy5, * ALIGNED(32) fy6, * ALIGNED(32) fy7; // Voxel block +y neighbors + + field_t * ALIGNED(32) fz0, * ALIGNED(32) fz1, * ALIGNED(32) fz2, * ALIGNED(32) fz3; // Voxel block +z neighbors + field_t * ALIGNED(32) fz4, * ALIGNED(32) fz5, * ALIGNED(32) fz6, * ALIGNED(32) fz7; // Voxel block +z neighbors + + // Process the bulk of the voxels 8 at a time + + INIT_STENCIL(); + + for( ; n_voxel > 7; n_voxel -= 8 ) + { + f00 = f0; fx0 = fx; fy0 = fy; fz0 = fz; NEXT_STENCIL(); + f01 = f0; fx1 = fx; fy1 = fy; fz1 = fz; NEXT_STENCIL(); + f02 = f0; fx2 = fx; fy2 = fy; fz2 = fz; NEXT_STENCIL(); + f03 = f0; fx3 = fx; fy3 = fy; fz3 = fz; NEXT_STENCIL(); + f04 = f0; fx4 = fx; fy4 = fy; fz4 = fz; NEXT_STENCIL(); + f05 = f0; fx5 = fx; fy5 = fy; fz5 = fz; NEXT_STENCIL(); + f06 = f0; fx6 = fx; fy6 = fy; fz6 = fz; NEXT_STENCIL(); + f07 = f0; fx7 = fx; fy7 = fy; fz7 = fz; NEXT_STENCIL(); + + //------------------------------------------------------------------------// + // Load field data. 
+ //------------------------------------------------------------------------// + + load_8x3_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, + &f04->cbx, &f05->cbx, &f06->cbx, &f07->cbx, + f0_cbx, f0_cby, f0_cbz ); + + load_8x4_tr( &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax, + &f04->tcax, &f05->tcax, &f06->tcax, &f07->tcax, + f0_tcax, f0_tcay, f0_tcaz, save1 ); + + load_8x3_tr( &fx0->cbx, &fx1->cbx, &fx2->cbx, &fx3->cbx, + &fx4->cbx, &fx5->cbx, &fx6->cbx, &fx7->cbx, + dummy, fx_cby, fx_cbz ); + + load_8x3_tr( &fy0->cbx, &fy1->cbx, &fy2->cbx, &fy3->cbx, + &fy4->cbx, &fy5->cbx, &fy6->cbx, &fy7->cbx, + fy_cbx, dummy, fy_cbz ); + + load_8x2_tr( &fz0->cbx, &fz1->cbx, &fz2->cbx, &fz3->cbx, + &fz4->cbx, &fz5->cbx, &fz6->cbx, &fz7->cbx, + fz_cbx, fz_cby ); + + f0_tcax = fms( vpy_muz, ( f0_cbz - fy_cbz ), vpz_muy * ( f0_cby - fz_cby ) ); + + f0_tcay = fms( vpz_mux, ( f0_cbx - fz_cbx ), vpx_muz * ( f0_cbz - fx_cbz ) ); + + f0_tcaz = fms( vpx_muy, ( f0_cby - fx_cby ), vpy_mux * ( f0_cbx - fy_cbx ) ); + + //------------------------------------------------------------------------// + // Note: + //------------------------------------------------------------------------// + // Unlike load_8x3 versus load_8x4, store_8x4 is much more efficient + // than store_8x3. + //------------------------------------------------------------------------// + + store_8x4_tr( f0_tcax, f0_tcay, f0_tcaz, save1, + &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax, + &f04->tcax, &f05->tcax, &f06->tcax, &f07->tcax ); + } +} + +#else + +void +vacuum_compute_curl_b_pipeline_v8( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v8 implementation. + ERROR( ( "No vacuum_compute_curl_b_pipeline_v8 implementation." ) ); +} + +#endif diff --git a/src/field_advance/standard/pipeline/vacuum_compute_div_e_err_pipeline.c b/src/field_advance/standard/pipeline/vacuum_compute_div_e_err_pipeline.c new file mode 100644 index 00000000..54cb2de6 --- /dev/null +++ b/src/field_advance/standard/pipeline/vacuum_compute_div_e_err_pipeline.c @@ -0,0 +1,174 @@ +// Note: This is virtually identical to vacuum_compute_rhob + +#define IN_sfa +#define IN_vacuum_compute_div_e_err_pipeline + +#include "vacuum_compute_div_e_err_pipeline.h" + +#include "../sfa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +void +vacuum_compute_div_e_err_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + INIT_STENCIL(); + for( ; n_voxel; n_voxel-- ) + { + UPDATE_DERR_E(); + NEXT_STENCIL(); + } +} + +#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) + +#error "Not implemented" + +#endif + +void +vacuum_compute_div_e_err_pipeline( field_array_t * RESTRICT fa ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); + } + + // Have pipelines compute the interior of local domain (the host + // handles stragglers in the interior) + + // Begin setting normal e ghosts + + begin_remote_ghost_norm_e( fa->f, fa->g ); + + local_ghost_norm_e( fa->f, fa->g ); + + // Have pipelines compute interior of local domain + + pipeline_args_t args[1]; + + args->f = fa->f; + args->p = (sfa_params_t *) fa->params; + args->g = fa->g; + + EXEC_PIPELINES( vacuum_compute_div_e_err, args, 0 ); + + // While pipelines are busy, have host compute the exterior + // of the local domain + + DECLARE_STENCIL(); + + // Finish setting normal e ghosts + end_remote_ghost_norm_e( fa->f, fa->g ); + + // z 
faces, x edges, y edges and all corners + for( y=1; y<=ny+1; y++ ) + { + f0 = &f(1,y, 1); + fx = &f(0,y, 1); + fy = &f(1,y-1,1); + fz = &f(1,y, 0); + + for( x=1; x<=nx+1; x++ ) + { + UPDATE_DERR_E(); + + f0++; + fx++; + fy++; + fz++; + } + } + + for( y=1; y<=ny+1; y++ ) + { + f0 = &f(1,y, nz+1); + fx = &f(0,y, nz+1); + fy = &f(1,y-1,nz+1); + fz = &f(1,y, nz); + + for( x=1; x<=nx+1; x++ ) + { + UPDATE_DERR_E(); + + f0++; + fx++; + fy++; + fz++; + } + } + + // y faces, z edges + for( z=2; z<=nz; z++ ) + { + f0 = &f(1,1,z); + fx = &f(0,1,z); + fy = &f(1,0,z); + fz = &f(1,1,z-1); + + for( x=1; x<=nx+1; x++ ) + { + UPDATE_DERR_E(); + + f0++; + fx++; + fy++; + fz++; + } + } + + for( z=2; z<=nz; z++ ) + { + f0 = &f(1,ny+1,z); + fx = &f(0,ny+1,z); + fy = &f(1,ny, z); + fz = &f(1,ny+1,z-1); + + for( x=1; x<=nx+1; x++ ) + { + UPDATE_DERR_E(); + + f0++; + fx++; + fy++; + fz++; + } + } + + // x faces + for( z=2; z<=nz; z++ ) + { + for( y=2; y<=ny; y++ ) + { + f0 = &f(1,y, z); + fx = &f(0,y, z); + fy = &f(1,y-1,z); + fz = &f(1,y, z-1); + + UPDATE_DERR_E(); + + f0 = &f(nx+1,y, z); + fx = &f(nx, y, z); + fy = &f(nx+1,y-1,z); + fz = &f(nx+1,y, z-1); + + UPDATE_DERR_E(); + } + } + + // Finish up setting interior + + WAIT_PIPELINES(); + + local_adjust_div_e( fa->f, fa->g ); +} diff --git a/src/field_advance/standard/pipeline/vacuum_compute_div_e_err_pipeline.h b/src/field_advance/standard/pipeline/vacuum_compute_div_e_err_pipeline.h new file mode 100644 index 00000000..b7690a30 --- /dev/null +++ b/src/field_advance/standard/pipeline/vacuum_compute_div_e_err_pipeline.h @@ -0,0 +1,59 @@ +#ifndef _vacuum_compute_div_e_err_pipeline_h_ +#define _vacuum_compute_div_e_err_pipeline_h_ + +#ifndef IN_vacuum_compute_div_e_err_pipeline +#error "Only include vacuum_compute_div_e_err_pipeline.h in vacuum_compute_div_e_err_pipeline source files." +#endif + +#include "../sfa_private.h" + +typedef struct pipeline_args +{ + /**/ field_t * ALIGNED(128) f; + const sfa_params_t * p; + const grid_t * g; +} pipeline_args_t; + +#define DECLARE_STENCIL() \ + /**/ field_t * ALIGNED(128) f = args->f; \ + const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ + const grid_t * g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float nc = m->nonconductive; \ + const float px = ((nx>1) ? g->rdx : 0)*m->epsx; \ + const float py = ((ny>1) ? g->rdy : 0)*m->epsy; \ + const float pz = ((nz>1) ? 
g->rdz : 0)*m->epsz; \ + const float cj = 1./g->eps0; \ + \ + field_t * ALIGNED(16) f0; \ + field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ + int x, y, z + +#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] + +#define INIT_STENCIL() \ + f0 = &f(x, y, z ); \ + fx = &f(x-1,y, z ); \ + fy = &f(x, y-1,z ); \ + fz = &f(x, y, z-1) + +#define NEXT_STENCIL() \ + f0++; fx++; fy++; fz++; x++; \ + if( x>nx ) { \ + /**/ y++; x = 2; \ + if( y>ny ) z++; if( y>ny ) y = 2; \ + INIT_STENCIL(); \ + } + +#define UPDATE_DERR_E() f0->div_e_err = nc*( px*( f0->ex - fx->ex ) + \ + py*( f0->ey - fy->ey ) + \ + pz*( f0->ez - fz->ez ) - \ + cj*( f0->rhof + f0->rhob ) ) + +void +vacuum_compute_div_e_err_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +#endif // _vacuum_compute_div_e_err_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/vacuum_compute_rhob_pipeline.c b/src/field_advance/standard/pipeline/vacuum_compute_rhob_pipeline.c new file mode 100644 index 00000000..c30a0ccf --- /dev/null +++ b/src/field_advance/standard/pipeline/vacuum_compute_rhob_pipeline.c @@ -0,0 +1,174 @@ +// Note: This is virtually identical to vacuum_compute_div_e_err + +#define IN_sfa +#define IN_vacuum_compute_rhob_pipeline + +#include "vacuum_compute_rhob_pipeline.h" + +#include "../sfa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +void +vacuum_compute_rhob_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + INIT_STENCIL(); + for( ; n_voxel; n_voxel-- ) + { + UPDATE_DERR_E(); + + NEXT_STENCIL(); + } +} + +#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) + +#error "Not implemented" + +#endif + +void +vacuum_compute_rhob_pipeline( field_array_t * RESTRICT fa ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); + } + + // Have pipelines compute the interior of local domain (the host + // handles stragglers in the interior) + + // Begin setting normal e ghosts + + begin_remote_ghost_norm_e( fa->f, fa->g ); + + local_ghost_norm_e( fa->f, fa->g ); + + // Have pipelines compute interior of local domain + + pipeline_args_t args[1]; + + args->f = fa->f; + args->p = (sfa_params_t *) fa->params; + args->g = fa->g; + + EXEC_PIPELINES( vacuum_compute_rhob, args, 0 ); + + // While pipelines are busy, have host compute the exterior + // of the local domain + + DECLARE_STENCIL(); + + // Finish setting normal e ghosts + end_remote_ghost_norm_e( fa->f, fa->g ); + + // z faces, x edges, y edges and all corners + for( y=1; y<=ny+1; y++ ) + { + f0 = &f(1,y, 1); + fx = &f(0,y, 1); + fy = &f(1,y-1,1); + fz = &f(1,y, 0); + + for( x=1; x<=nx+1; x++ ) + { + UPDATE_DERR_E(); + + f0++; + fx++; + fy++; + fz++; + } + } + + for( y=1; y<=ny+1; y++ ) + { + f0 = &f(1,y, nz+1); + fx = &f(0,y, nz+1); + fy = &f(1,y-1,nz+1); + fz = &f(1,y, nz); + + for( x=1; x<=nx+1; x++ ) + { + UPDATE_DERR_E(); + + f0++; + fx++; + fy++; + fz++; + } + } + + // y faces, z edges + for( z=2; z<=nz; z++ ) + { + f0 = &f(1,1,z); + fx = &f(0,1,z); + fy = &f(1,0,z); + fz = &f(1,1,z-1); + + for( x=1; x<=nx+1; x++ ) + { + UPDATE_DERR_E(); + + f0++; + fx++; + fy++; + fz++; + } + } + + for( z=2; z<=nz; z++ ) + { + f0 = &f(1,ny+1,z); + fx = &f(0,ny+1,z); + fy = &f(1,ny, z); + fz = &f(1,ny+1,z-1); + + for( x=1; x<=nx+1; x++ ) + { + UPDATE_DERR_E(); + f0++; + fx++; + fy++; + fz++; + } + } + + // x faces + for( z=2; z<=nz; z++ ) + { + for( y=2; y<=ny; 
y++ ) + { + f0 = &f(1,y, z); + fx = &f(0,y, z); + fy = &f(1,y-1,z); + fz = &f(1,y, z-1); + + UPDATE_DERR_E(); + + f0 = &f(nx+1,y, z); + fx = &f(nx, y, z); + fy = &f(nx+1,y-1,z); + fz = &f(nx+1,y, z-1); + + UPDATE_DERR_E(); + } + } + + // Finish up setting interior + + WAIT_PIPELINES(); + + local_adjust_rhob( fa->f, fa->g ); +} diff --git a/src/field_advance/standard/pipeline/vacuum_compute_rhob_pipeline.h b/src/field_advance/standard/pipeline/vacuum_compute_rhob_pipeline.h new file mode 100644 index 00000000..e6f54bd6 --- /dev/null +++ b/src/field_advance/standard/pipeline/vacuum_compute_rhob_pipeline.h @@ -0,0 +1,57 @@ +#ifndef _vacuum_compute_rhob_pipeline_h_ +#define _vacuum_compute_rhob_pipeline_h_ + +#ifndef IN_vacuum_compute_rhob_pipeline +#error "Only include vacuum_compute_rhob_pipeline.h in vacuum_compute_rhob_pipeline source files." +#endif + +#include "../sfa_private.h" + +typedef struct pipeline_args +{ + /**/ field_t * ALIGNED(128) f; + const sfa_params_t * p; + const grid_t * g; +} pipeline_args_t; + +#define DECLARE_STENCIL() \ + /**/ field_t * ALIGNED(128) f = args->f; \ + const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ + const grid_t * g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float nc = m->nonconductive; \ + const float px = (nx>1) ? g->eps0*m->epsx*g->rdx : 0; \ + const float py = (ny>1) ? g->eps0*m->epsy*g->rdy : 0; \ + const float pz = (nz>1) ? g->eps0*m->epsz*g->rdz : 0; \ + \ + field_t * ALIGNED(16) f0; \ + field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ + int x, y, z + +#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] + +#define INIT_STENCIL() \ + f0 = &f(x, y, z ); \ + fx = &f(x-1,y, z ); \ + fy = &f(x, y-1,z ); \ + fz = &f(x, y, z-1) + +#define NEXT_STENCIL() \ + f0++; fx++; fy++; fz++; x++; \ + if( x>nx ) { \ + /**/ y++; x = 2; \ + if( y>ny ) z++; if( y>ny ) y = 2; \ + INIT_STENCIL(); \ + } + +#define UPDATE_DERR_E() f0->rhob = nc*( px*( f0->ex - fx->ex ) + \ + py*( f0->ey - fy->ey ) + \ + pz*( f0->ez - fz->ez ) - f0->rhof ) + +void +vacuum_compute_rhob_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +#endif // _vacuum_compute_rhob_pipeline_h_ diff --git a/src/field_advance/standard/pipeline/vacuum_energy_f_pipeline.c b/src/field_advance/standard/pipeline/vacuum_energy_f_pipeline.c new file mode 100644 index 00000000..2ca13330 --- /dev/null +++ b/src/field_advance/standard/pipeline/vacuum_energy_f_pipeline.c @@ -0,0 +1,97 @@ +// FIXME: USE THE DISCRETIZED VARIATIONAL DEFINITION OF ENERGY + +#define IN_sfa +#define IN_vacuum_energy_f_pipeline + +#include "vacuum_energy_f_pipeline.h" + +#include "../sfa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +void +vacuum_energy_f_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + DECLARE_STENCIL(); + + int n_voxel; + + DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + + INIT_STENCIL(); + for( ; n_voxel; n_voxel-- ) + { + REDUCE_EN(); + + NEXT_STENCIL(); + } + + args->en[pipeline_rank][0] = en_ex; + args->en[pipeline_rank][1] = en_ey; + args->en[pipeline_rank][2] = en_ez; + args->en[pipeline_rank][3] = en_bx; + args->en[pipeline_rank][4] = en_by; + args->en[pipeline_rank][5] = en_bz; +} + +#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) + +#error "Not implemented" + +#endif + +void +vacuum_energy_f_pipeline( double * global, + const field_array_t * RESTRICT fa ) +{ + if ( !global || !fa ) + { + ERROR( ( "Bad args" ) ); + 
} + + // Have each pipeline and the host handle a portion of the + // local voxels + + pipeline_args_t args[1]; + + args->f = fa->f; + args->p = (sfa_params_t *) fa->params; + args->g = fa->g; + + EXEC_PIPELINES( vacuum_energy_f, args, 0 ); + + WAIT_PIPELINES(); + + // Reduce results from each pipelines + + int p; + + for( p=1; p<=N_PIPELINE; p++ ) + { + args->en[0][0] += args->en[p][0]; + args->en[0][1] += args->en[p][1]; + args->en[0][2] += args->en[p][2]; + args->en[0][3] += args->en[p][3]; + args->en[0][4] += args->en[p][4]; + args->en[0][5] += args->en[p][5]; + } + + // Convert to physical units and reduce results between nodes + + double v0 = 0.5*fa->g->eps0*fa->g->dV; + + args->en[0][0] *= v0; + args->en[0][1] *= v0; + args->en[0][2] *= v0; + args->en[0][3] *= v0; + args->en[0][4] *= v0; + args->en[0][5] *= v0; + + // Reduce results between nodes + + mp_allsum_d( args->en[0], global, 6 ); +} diff --git a/src/field_advance/standard/pipeline/vacuum_energy_f_pipeline.h b/src/field_advance/standard/pipeline/vacuum_energy_f_pipeline.h new file mode 100644 index 00000000..902d0c5a --- /dev/null +++ b/src/field_advance/standard/pipeline/vacuum_energy_f_pipeline.h @@ -0,0 +1,81 @@ +#ifndef _vacuum_energy_f_pipeline_h_ +#define _vacuum_energy_f_pipeline_h_ + +#ifndef IN_vacuum_energy_f_pipeline +#error "Only include vacuum_energy_f_pipeline.h in vacuum_energy_f_pipeline source files." +#endif + +#include "../sfa_private.h" + +typedef struct pipeline_args +{ + const field_t * ALIGNED(128) f; + const sfa_params_t * p; + const grid_t * g; + double en[MAX_PIPELINE+1][6]; +} pipeline_args_t; + +#define DECLARE_STENCIL() \ + const field_t * ALIGNED(128) f = args->f; \ + const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ + const grid_t * g = args->g; \ + const int nx = g->nx, ny = g->ny, nz = g->nz; \ + \ + const float qepsx = 0.25*m->epsx; \ + const float qepsy = 0.25*m->epsy; \ + const float qepsz = 0.25*m->epsz; \ + const float hrmux = 0.50*m->rmux; /* was previously 0.25 in master */ \ + const float hrmuy = 0.50*m->rmuy; /* was previously 0.25 in master */ \ + const float hrmuz = 0.50*m->rmuz; /* was previously 0.25 in master */ \ + \ + const field_t * ALIGNED(16) f0; \ + const field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ + const field_t * ALIGNED(16) fyz, * ALIGNED(16) fzx, * ALIGNED(16) fxy; \ + double en_ex = 0, en_ey = 0, en_ez = 0, en_bx = 0, en_by = 0, en_bz = 0; \ + int x, y, z + +#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] + +#define INIT_STENCIL() \ + f0 = &f(x, y, z ); \ + fx = &f(x+1,y, z ); \ + fy = &f(x, y+1,z ); \ + fz = &f(x, y, z+1); \ + fyz = &f(x, y+1,z+1); \ + fzx = &f(x+1,y, z+1); \ + fxy = &f(x+1,y+1,z ) + +#define NEXT_STENCIL() \ + f0++; fx++; fy++; fz++; fyz++; fzx++; fxy++; x++; \ + if( x>nx ) { \ + /**/ y++; x = 1; \ + if( y>ny ) z++; if( y>ny ) y = 1; \ + INIT_STENCIL(); \ + } + +#define REDUCE_EN() \ + en_ex += qepsx*( f0->ex * f0->ex + \ + fy->ex * fy->ex + \ + fz->ex * fz->ex + \ + fyz->ex *fyz->ex ); \ + en_ey += qepsy*( f0->ey * f0->ey + \ + fz->ey * fz->ey + \ + fx->ey * fx->ey + \ + fzx->ey *fzx->ey ); \ + en_ez += qepsz*( f0->ez * f0->ez + \ + fx->ez * fx->ez + \ + fy->ez * fy->ez + \ + fxy->ez *fxy->ez ); \ + en_bx += hrmux*( f0->cbx* f0->cbx + \ + fx->cbx* fx->cbx ); \ + en_by += hrmuy*( f0->cby* f0->cby + \ + fy->cby* fy->cby ); \ + en_bz += hrmuz*( f0->cbz* f0->cbz + \ + fz->cbz* fz->cbz ) + +void +vacuum_energy_f_pipeline_scalar( pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +#endif // 
_vacuum_energy_f_pipeline_h_ diff --git a/src/field_advance/standard/remote.c b/src/field_advance/standard/remote.c index 1bbde9eb..f1daf24b 100644 --- a/src/field_advance/standard/remote.c +++ b/src/field_advance/standard/remote.c @@ -67,9 +67,9 @@ begin_remote_ghost_tang_b( field_t * ALIGNED(128) field, # define BEGIN_RECV(i,j,k,X,Y,Z) \ begin_recv_port(i,j,k,(1+n##Y*(n##Z+1)+n##Z*(n##Y+1))*sizeof(float),g) - BEGIN_RECV(-1, 0, 0,x,y,z); - BEGIN_RECV( 0,-1, 0,y,z,x); - BEGIN_RECV( 0, 0,-1,z,x,y); + BEGIN_RECV((-1), 0, 0,x,y,z); + BEGIN_RECV( 0,(-1), 0,y,z,x); + BEGIN_RECV( 0, 0,(-1),z,x,y); BEGIN_RECV( 1, 0, 0,x,y,z); BEGIN_RECV( 0, 1, 0,y,z,x); BEGIN_RECV( 0, 0, 1,z,x,y); @@ -86,9 +86,9 @@ begin_remote_ghost_tang_b( field_t * ALIGNED(128) field, begin_send_port( i, j, k, size, g ); \ } \ } END_PRIMITIVE - BEGIN_SEND(-1, 0, 0,x,y,z); - BEGIN_SEND( 0,-1, 0,y,z,x); - BEGIN_SEND( 0, 0,-1,z,x,y); + BEGIN_SEND((-1), 0, 0,x,y,z); + BEGIN_SEND( 0,(-1), 0,y,z,x); + BEGIN_SEND( 0, 0,(-1),z,x,y); BEGIN_SEND( 1, 0, 0,x,y,z); BEGIN_SEND( 0, 1, 0,y,z,x); BEGIN_SEND( 0, 0, 1,z,x,y); @@ -115,18 +115,18 @@ end_remote_ghost_tang_b( field_t * ALIGNED(128) field, field(x,y,z).cb##Z = rw*(*(p++)) + lw*field(x+i,y+j,z+k).cb##Z; \ } \ } END_PRIMITIVE - END_RECV(-1, 0, 0,x,y,z); - END_RECV( 0,-1, 0,y,z,x); - END_RECV( 0, 0,-1,z,x,y); + END_RECV((-1), 0, 0,x,y,z); + END_RECV( 0,(-1), 0,y,z,x); + END_RECV( 0, 0,(-1),z,x,y); END_RECV( 1, 0, 0,x,y,z); END_RECV( 0, 1, 0,y,z,x); END_RECV( 0, 0, 1,z,x,y); # undef END_RECV # define END_SEND(i,j,k,X,Y,Z) end_send_port(i,j,k,g) - END_SEND(-1, 0, 0,x,y,z); - END_SEND( 0,-1, 0,y,z,x); - END_SEND( 0, 0,-1,z,x,y); + END_SEND((-1), 0, 0,x,y,z); + END_SEND( 0,(-1), 0,y,z,x); + END_SEND( 0, 0,(-1),z,x,y); END_SEND( 1, 0, 0,x,y,z); END_SEND( 0, 1, 0,y,z,x); END_SEND( 0, 0, 1,z,x,y); @@ -142,9 +142,9 @@ begin_remote_ghost_norm_e( field_t * ALIGNED(128) field, # define BEGIN_RECV(i,j,k,X,Y,Z) \ begin_recv_port(i,j,k,( 1 + (n##Y+1)*(n##Z+1) )*sizeof(float),g) - BEGIN_RECV(-1, 0, 0,x,y,z); - BEGIN_RECV( 0,-1, 0,y,z,x); - BEGIN_RECV( 0, 0,-1,z,x,y); + BEGIN_RECV((-1), 0, 0,x,y,z); + BEGIN_RECV( 0,(-1), 0,y,z,x); + BEGIN_RECV( 0, 0,(-1),z,x,y); BEGIN_RECV( 1, 0, 0,x,y,z); BEGIN_RECV( 0, 1, 0,y,z,x); BEGIN_RECV( 0, 0, 1,z,x,y); @@ -160,9 +160,9 @@ begin_remote_ghost_norm_e( field_t * ALIGNED(128) field, begin_send_port( i, j, k, size, g ); \ } \ } END_PRIMITIVE - BEGIN_SEND(-1, 0, 0,x,y,z); - BEGIN_SEND( 0,-1, 0,y,z,x); - BEGIN_SEND( 0, 0,-1,z,x,y); + BEGIN_SEND((-1), 0, 0,x,y,z); + BEGIN_SEND( 0,(-1), 0,y,z,x); + BEGIN_SEND( 0, 0,(-1),z,x,y); BEGIN_SEND( 1, 0, 0,x,y,z); BEGIN_SEND( 0, 1, 0,y,z,x); BEGIN_SEND( 0, 0, 1,z,x,y); @@ -187,18 +187,18 @@ end_remote_ghost_norm_e( field_t * ALIGNED(128) field, field(x,y,z).e##X = rw*(*(p++)) + lw*field(x+i,y+j,z+k).e##X; \ } \ } END_PRIMITIVE - END_RECV(-1, 0, 0,x,y,z); - END_RECV( 0,-1, 0,y,z,x); - END_RECV( 0, 0,-1,z,x,y); + END_RECV((-1), 0, 0,x,y,z); + END_RECV( 0,(-1), 0,y,z,x); + END_RECV( 0, 0,(-1),z,x,y); END_RECV( 1, 0, 0,x,y,z); END_RECV( 0, 1, 0,y,z,x); END_RECV( 0, 0, 1,z,x,y); # undef END_RECV # define END_SEND(i,j,k,X,Y,Z) end_send_port(i,j,k,g) - END_SEND(-1, 0, 0,x,y,z); - END_SEND( 0,-1, 0,y,z,x); - END_SEND( 0, 0,-1,z,x,y); + END_SEND((-1), 0, 0,x,y,z); + END_SEND( 0,(-1), 0,y,z,x); + END_SEND( 0, 0,(-1),z,x,y); END_SEND( 1, 0, 0,x,y,z); END_SEND( 0, 1, 0,y,z,x); END_SEND( 0, 0, 1,z,x,y); @@ -214,9 +214,9 @@ begin_remote_ghost_div_b( field_t * ALIGNED(128) field, # define BEGIN_RECV(i,j,k,X,Y,Z) \ 
begin_recv_port(i,j,k,(1+n##Y*n##Z)*sizeof(float),g) - BEGIN_RECV(-1, 0, 0,x,y,z); - BEGIN_RECV( 0,-1, 0,y,z,x); - BEGIN_RECV( 0, 0,-1,z,x,y); + BEGIN_RECV((-1), 0, 0,x,y,z); + BEGIN_RECV( 0,(-1), 0,y,z,x); + BEGIN_RECV( 0, 0,(-1),z,x,y); BEGIN_RECV( 1, 0, 0,x,y,z); BEGIN_RECV( 0, 1, 0,y,z,x); BEGIN_RECV( 0, 0, 1,z,x,y); @@ -232,9 +232,9 @@ begin_remote_ghost_div_b( field_t * ALIGNED(128) field, begin_send_port( i, j, k, size, g ); \ } \ } END_PRIMITIVE - BEGIN_SEND(-1, 0, 0,x,y,z); - BEGIN_SEND( 0,-1, 0,y,z,x); - BEGIN_SEND( 0, 0,-1,z,x,y); + BEGIN_SEND((-1), 0, 0,x,y,z); + BEGIN_SEND( 0,(-1), 0,y,z,x); + BEGIN_SEND( 0, 0,(-1),z,x,y); BEGIN_SEND( 1, 0, 0,x,y,z); BEGIN_SEND( 0, 1, 0,y,z,x); BEGIN_SEND( 0, 0, 1,z,x,y); @@ -260,18 +260,18 @@ end_remote_ghost_div_b( field_t * ALIGNED(128) field, lw*field(x+i,y+j,z+k).div_b_err; \ } \ } END_PRIMITIVE - END_RECV(-1, 0, 0,x,y,z); - END_RECV( 0,-1, 0,y,z,x); - END_RECV( 0, 0,-1,z,x,y); + END_RECV((-1), 0, 0,x,y,z); + END_RECV( 0,(-1), 0,y,z,x); + END_RECV( 0, 0,(-1),z,x,y); END_RECV( 1, 0, 0,x,y,z); END_RECV( 0, 1, 0,y,z,x); END_RECV( 0, 0, 1,z,x,y); # undef END_RECV # define END_SEND(i,j,k,X,Y,Z) end_send_port(i,j,k,g) - END_SEND(-1, 0, 0,x,y,z); - END_SEND( 0,-1, 0,y,z,x); - END_SEND( 0, 0,-1,z,x,y); + END_SEND((-1), 0, 0,x,y,z); + END_SEND( 0,(-1), 0,y,z,x); + END_SEND( 0, 0,(-1),z,x,y); END_SEND( 1, 0, 0,x,y,z); END_SEND( 0, 1, 0,y,z,x); END_SEND( 0, 0, 1,z,x,y); @@ -376,33 +376,33 @@ synchronize_tang_e_norm_b( field_array_t * RESTRICT fa ) { # define END_SEND(i,j,k,X,Y,Z) end_send_port( i, j, k, g ) // Exchange x-faces - BEGIN_RECV(-1, 0, 0,x,y,z); + BEGIN_RECV((-1), 0, 0,x,y,z); BEGIN_RECV( 1, 0, 0,x,y,z); - BEGIN_SEND(-1, 0, 0,x,y,z); + BEGIN_SEND((-1), 0, 0,x,y,z); BEGIN_SEND( 1, 0, 0,x,y,z); - END_SEND(-1, 0, 0,x,y,z); + END_SEND((-1), 0, 0,x,y,z); END_SEND( 1, 0, 0,x,y,z); - END_RECV(-1, 0, 0,x,y,z); + END_RECV((-1), 0, 0,x,y,z); END_RECV( 1, 0, 0,x,y,z); // Exchange y-faces - BEGIN_SEND( 0,-1, 0,y,z,x); + BEGIN_SEND( 0,(-1), 0,y,z,x); BEGIN_SEND( 0, 1, 0,y,z,x); - BEGIN_RECV( 0,-1, 0,y,z,x); + BEGIN_RECV( 0,(-1), 0,y,z,x); BEGIN_RECV( 0, 1, 0,y,z,x); - END_RECV( 0,-1, 0,y,z,x); + END_RECV( 0,(-1), 0,y,z,x); END_RECV( 0, 1, 0,y,z,x); - END_SEND( 0,-1, 0,y,z,x); + END_SEND( 0,(-1), 0,y,z,x); END_SEND( 0, 1, 0,y,z,x); // Exchange z-faces - BEGIN_SEND( 0, 0,-1,z,x,y); + BEGIN_SEND( 0, 0,(-1),z,x,y); BEGIN_SEND( 0, 0, 1,z,x,y); - BEGIN_RECV( 0, 0,-1,z,x,y); + BEGIN_RECV( 0, 0,(-1),z,x,y); BEGIN_RECV( 0, 0, 1,z,x,y); - END_RECV( 0, 0,-1,z,x,y); + END_RECV( 0, 0,(-1),z,x,y); END_RECV( 0, 0, 1,z,x,y); - END_SEND( 0, 0,-1,z,x,y); + END_SEND( 0, 0,(-1),z,x,y); END_SEND( 0, 0, 1,z,x,y); # undef BEGIN_RECV @@ -472,33 +472,33 @@ synchronize_jf( field_array_t * RESTRICT fa ) { # define END_SEND(i,j,k,X,Y,Z) end_send_port( i, j, k, g ) // Exchange x-faces - BEGIN_SEND(-1, 0, 0,x,y,z); + BEGIN_SEND((-1), 0, 0,x,y,z); BEGIN_SEND( 1, 0, 0,x,y,z); - BEGIN_RECV(-1, 0, 0,x,y,z); + BEGIN_RECV((-1), 0, 0,x,y,z); BEGIN_RECV( 1, 0, 0,x,y,z); - END_RECV(-1, 0, 0,x,y,z); + END_RECV((-1), 0, 0,x,y,z); END_RECV( 1, 0, 0,x,y,z); - END_SEND(-1, 0, 0,x,y,z); + END_SEND((-1), 0, 0,x,y,z); END_SEND( 1, 0, 0,x,y,z); // Exchange y-faces - BEGIN_SEND( 0,-1, 0,y,z,x); + BEGIN_SEND( 0,(-1), 0,y,z,x); BEGIN_SEND( 0, 1, 0,y,z,x); - BEGIN_RECV( 0,-1, 0,y,z,x); + BEGIN_RECV( 0,(-1), 0,y,z,x); BEGIN_RECV( 0, 1, 0,y,z,x); - END_RECV( 0,-1, 0,y,z,x); + END_RECV( 0,(-1), 0,y,z,x); END_RECV( 0, 1, 0,y,z,x); - END_SEND( 0,-1, 0,y,z,x); + END_SEND( 0,(-1), 0,y,z,x); END_SEND( 0, 1, 
0,y,z,x); // Exchange z-faces - BEGIN_SEND( 0, 0,-1,z,x,y); + BEGIN_SEND( 0, 0,(-1),z,x,y); BEGIN_SEND( 0, 0, 1,z,x,y); - BEGIN_RECV( 0, 0,-1,z,x,y); + BEGIN_RECV( 0, 0,(-1),z,x,y); BEGIN_RECV( 0, 0, 1,z,x,y); - END_RECV( 0, 0,-1,z,x,y); + END_RECV( 0, 0,(-1),z,x,y); END_RECV( 0, 0, 1,z,x,y); - END_SEND( 0, 0,-1,z,x,y); + END_SEND( 0, 0,(-1),z,x,y); END_SEND( 0, 0, 1,z,x,y); # undef BEGIN_RECV @@ -588,33 +588,33 @@ synchronize_rho( field_array_t * RESTRICT fa ) { # define END_SEND(i,j,k,X,Y,Z) end_send_port( i, j, k, g ) // Exchange x-faces - BEGIN_SEND(-1, 0, 0,x,y,z); + BEGIN_SEND((-1), 0, 0,x,y,z); BEGIN_SEND( 1, 0, 0,x,y,z); - BEGIN_RECV(-1, 0, 0,x,y,z); + BEGIN_RECV((-1), 0, 0,x,y,z); BEGIN_RECV( 1, 0, 0,x,y,z); - END_RECV(-1, 0, 0,x,y,z); + END_RECV((-1), 0, 0,x,y,z); END_RECV( 1, 0, 0,x,y,z); - END_SEND(-1, 0, 0,x,y,z); + END_SEND((-1), 0, 0,x,y,z); END_SEND( 1, 0, 0,x,y,z); // Exchange y-faces - BEGIN_SEND( 0,-1, 0,y,z,x); + BEGIN_SEND( 0,(-1), 0,y,z,x); BEGIN_SEND( 0, 1, 0,y,z,x); - BEGIN_RECV( 0,-1, 0,y,z,x); + BEGIN_RECV( 0,(-1), 0,y,z,x); BEGIN_RECV( 0, 1, 0,y,z,x); - END_RECV( 0,-1, 0,y,z,x); + END_RECV( 0,(-1), 0,y,z,x); END_RECV( 0, 1, 0,y,z,x); - END_SEND( 0,-1, 0,y,z,x); + END_SEND( 0,(-1), 0,y,z,x); END_SEND( 0, 1, 0,y,z,x); // Exchange z-faces - BEGIN_SEND( 0, 0,-1,z,x,y); + BEGIN_SEND( 0, 0,(-1),z,x,y); BEGIN_SEND( 0, 0, 1,z,x,y); - BEGIN_RECV( 0, 0,-1,z,x,y); + BEGIN_RECV( 0, 0,(-1),z,x,y); BEGIN_RECV( 0, 0, 1,z,x,y); - END_RECV( 0, 0,-1,z,x,y); + END_RECV( 0, 0,(-1),z,x,y); END_RECV( 0, 0, 1,z,x,y); - END_SEND( 0, 0,-1,z,x,y); + END_SEND( 0, 0,(-1),z,x,y); END_SEND( 0, 0, 1,z,x,y); # undef BEGIN_RECV diff --git a/src/field_advance/standard/sfa.c b/src/field_advance/standard/sfa.c index f3025857..070f327b 100644 --- a/src/field_advance/standard/sfa.c +++ b/src/field_advance/standard/sfa.c @@ -1,4 +1,5 @@ #define IN_sfa + #include "sfa_private.h" static field_advance_kernels_t sfa_kernels = { @@ -46,14 +47,16 @@ static field_advance_kernels_t sfa_kernels = { static float minf( float a, - float b ) { + float b ) +{ return a pB/pt = -curl E + alpha grad div B @@ -239,9 +268,15 @@ vacuum_compute_curl_b( field_array_t * RESTRICT fa ); void compute_rhob( field_array_t * RESTRICT fa ); +void +compute_rhob_pipeline( field_array_t * RESTRICT fa ); + void vacuum_compute_rhob( field_array_t * RESTRICT fa ); +void +vacuum_compute_rhob_pipeline( field_array_t * RESTRICT fa ); + // In compute_div_e_err.c // compute_div_e_err applies the following difference equation: @@ -254,9 +289,15 @@ vacuum_compute_rhob( field_array_t * RESTRICT fa ); void compute_div_e_err( field_array_t * RESTRICT fa ); +void +compute_div_e_err_pipeline( field_array_t * RESTRICT fa ); + void vacuum_compute_div_e_err( field_array_t * RESTRICT fa ); +void +vacuum_compute_div_e_err_pipeline( field_array_t * RESTRICT fa ); + // In compute_rms_div_e_err.c // compute_rms_div_e_err returns @@ -269,6 +310,9 @@ vacuum_compute_div_e_err( field_array_t * RESTRICT fa ); double compute_rms_div_e_err( const field_array_t * RESTRICT fa ); +double +compute_rms_div_e_err_pipeline( const field_array_t * RESTRICT fa ); + // In clean_div_e.c // clean_div_e applies the following difference equation: @@ -280,9 +324,15 @@ compute_rms_div_e_err( const field_array_t * RESTRICT fa ); void clean_div_e( field_array_t * RESTRICT fa ); +void +clean_div_e_pipeline( field_array_t * fa ); + void vacuum_clean_div_e( field_array_t * RESTRICT fa ); +void +vacuum_clean_div_e_pipeline( field_array_t * fa ); + // In compute_div_b_err.c // 
compute_div_b_err applies the following difference equation: @@ -291,6 +341,9 @@ vacuum_clean_div_e( field_array_t * RESTRICT fa ); void compute_div_b_err( field_array_t * RESTRICT fa ); +void +compute_div_b_err_pipeline( field_array_t * RESTRICT fa ); + // In compute_rms_div_b_err.c // compute_rms_div_b_err returns @@ -304,6 +357,9 @@ compute_div_b_err( field_array_t * RESTRICT fa ); double compute_rms_div_b_err( const field_array_t * RESTRICT fa ); +double +compute_rms_div_b_err_pipeline( const field_array_t * fa ); + // In clean_div_b.c // clean_div_b applies the following difference equation: @@ -313,6 +369,9 @@ compute_rms_div_b_err( const field_array_t * RESTRICT fa ); void clean_div_b( field_array_t * RESTRICT fa ); +void +clean_div_b_pipeline( field_array_t * fa ); + // Internode functions // In remote.c @@ -329,66 +388,66 @@ synchronize_rho( field_array_t * RESTRICT fa ); // In local.c void -local_ghost_tang_b( field_t * ALIGNED(128) f, - const grid_t * g ); +local_ghost_tang_b( field_t * ALIGNED(128) f, + const grid_t * g ); void -local_ghost_norm_e( field_t * ALIGNED(128) f, - const grid_t * g ); +local_ghost_norm_e( field_t * ALIGNED(128) f, + const grid_t * g ); void -local_ghost_div_b( field_t * ALIGNED(128) f, - const grid_t * g ); +local_ghost_div_b( field_t * ALIGNED(128) f, + const grid_t * g ); void -local_adjust_tang_e( field_t * ALIGNED(128) f, - const grid_t * g ); +local_adjust_tang_e( field_t * ALIGNED(128) f, + const grid_t * g ); void -local_adjust_div_e( field_t * ALIGNED(128) f, - const grid_t * g ); +local_adjust_div_e( field_t * ALIGNED(128) f, + const grid_t * g ); void -local_adjust_norm_b( field_t * ALIGNED(128) f, - const grid_t * g ); +local_adjust_norm_b( field_t * ALIGNED(128) f, + const grid_t * g ); void -local_adjust_jf( field_t * ALIGNED(128) f, - const grid_t * g ); +local_adjust_jf( field_t * ALIGNED(128) f, + const grid_t * g ); void -local_adjust_rhof( field_t * ALIGNED(128) f, - const grid_t * g ); +local_adjust_rhof( field_t * ALIGNED(128) f, + const grid_t * g ); void -local_adjust_rhob( field_t * ALIGNED(128) f, - const grid_t * g ); +local_adjust_rhob( field_t * ALIGNED(128) f, + const grid_t * g ); // In remote.c void -begin_remote_ghost_tang_b( field_t * ALIGNED(128) f, - const grid_t * g ); +begin_remote_ghost_tang_b( field_t * ALIGNED(128) f, + const grid_t * g ); void -end_remote_ghost_tang_b( field_t * ALIGNED(128) f, - const grid_t * g ); +end_remote_ghost_tang_b( field_t * ALIGNED(128) f, + const grid_t * g ); void -begin_remote_ghost_norm_e( field_t * ALIGNED(128) f, - const grid_t * g ); +begin_remote_ghost_norm_e( field_t * ALIGNED(128) f, + const grid_t * g ); void -end_remote_ghost_norm_e( field_t * ALIGNED(128) f, - const grid_t * g ); +end_remote_ghost_norm_e( field_t * ALIGNED(128) f, + const grid_t * g ); void -begin_remote_ghost_div_b( field_t * ALIGNED(128) f, - const grid_t * g ); +begin_remote_ghost_div_b( field_t * ALIGNED(128) f, + const grid_t * g ); void -end_remote_ghost_div_b( field_t * ALIGNED(128) f, - const grid_t * g ); +end_remote_ghost_div_b( field_t * ALIGNED(128) f, + const grid_t * g ); END_C_DECLS diff --git a/src/field_advance/standard/vacuum_advance_e.cc b/src/field_advance/standard/vacuum_advance_e.cc index f58f379c..6391b651 100644 --- a/src/field_advance/standard/vacuum_advance_e.cc +++ b/src/field_advance/standard/vacuum_advance_e.cc @@ -1,363 +1,25 @@ -// Note: This is similar to vacuum_compute_curl_b - #define IN_sfa -#define HAS_V4_PIPELINE -#include "sfa_private.h" - -typedef struct 
pipeline_args { - /**/ field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; -} pipeline_args_t; - -#define DECLARE_STENCIL() \ - /**/ field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float decayx = m->decayx, drivex = m->drivex; \ - const float decayy = m->decayy, drivey = m->drivey; \ - const float decayz = m->decayz, drivez = m->drivez; \ - const float damp = args->p->damp; \ - const float px_muz = ((nx>1) ? (1+damp)*g->cvac*g->dt*g->rdx : 0)*m->rmuz; \ - const float px_muy = ((nx>1) ? (1+damp)*g->cvac*g->dt*g->rdx : 0)*m->rmuy; \ - const float py_mux = ((ny>1) ? (1+damp)*g->cvac*g->dt*g->rdy : 0)*m->rmux; \ - const float py_muz = ((ny>1) ? (1+damp)*g->cvac*g->dt*g->rdy : 0)*m->rmuz; \ - const float pz_muy = ((nz>1) ? (1+damp)*g->cvac*g->dt*g->rdz : 0)*m->rmuy; \ - const float pz_mux = ((nz>1) ? (1+damp)*g->cvac*g->dt*g->rdz : 0)*m->rmux; \ - const float cj = g->dt/g->eps0; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z - -#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] - -#define INIT_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x-1,y, z ); \ - fy = &f(x, y-1,z ); \ - fz = &f(x, y, z-1) - -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if( x>nx ) { \ - /**/ y++; x = 2; \ - if( y>ny ) z++; if( y>ny ) y = 2; \ - INIT_STENCIL(); \ - } - -#define UPDATE_EX() \ - f0->tcax = ( py_muz*(f0->cbz-fy->cbz) - pz_muy*(f0->cby-fz->cby) ) - \ - damp*f0->tcax; \ - f0->ex = decayx*f0->ex + drivex*( f0->tcax - cj*f0->jfx ) -#define UPDATE_EY() \ - f0->tcay = ( pz_mux*(f0->cbx-fz->cbx) - px_muz*(f0->cbz-fx->cbz) ) - \ - damp*f0->tcay; \ - f0->ey = decayy*f0->ey + drivey*( f0->tcay - cj*f0->jfy ) -#define UPDATE_EZ() \ - f0->tcaz = ( px_muy*(f0->cby-fx->cby) - py_mux*(f0->cbx-fy->cbx) ) - \ - damp*f0->tcaz; \ - f0->ez = decayz*f0->ez + drivez*( f0->tcaz - cj*f0->jfz ) - -void -vacuum_advance_e_pipeline( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - DECLARE_STENCIL(); - - int n_voxel; - DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - INIT_STENCIL(); - for( ; n_voxel; n_voxel-- ) { - UPDATE_EX(); UPDATE_EY(); UPDATE_EZ(); - NEXT_STENCIL(); - } -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) - -using namespace v4; - -void -vacuum_advance_e_pipeline_v4( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - DECLARE_STENCIL(); - - int n_voxel; - DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - const v4float vdecayx( decayx ), vdrivex( drivex ); - const v4float vdecayy( decayy ), vdrivey( drivey ); - const v4float vdecayz( decayz ), vdrivez( drivez ); - const v4float vdamp( damp ); - const v4float vpx_muz( px_muz ), vpx_muy( px_muy ); - const v4float vpy_mux( py_mux ), vpy_muz( py_muz ); - const v4float vpz_muy( pz_muy ), vpz_mux( pz_mux ); - const v4float vcj( cj ); - - v4float save0, save1, dummy; - - v4float f0_ex, f0_ey, f0_ez; - v4float f0_cbx, f0_cby, f0_cbz; - v4float f0_tcax, f0_tcay, f0_tcaz; - v4float f0_jfx, f0_jfy, f0_jfz; - v4float fx_cby, fx_cbz; - v4float fy_cbx, fy_cbz; - v4float fz_cbx, fz_cby; - - field_t * ALIGNED(16) f00, * ALIGNED(16) f01, * ALIGNED(16) f02, * ALIGNED(16) f03; // Voxel quad - field_t * ALIGNED(16) fx0, * ALIGNED(16) fx1, * ALIGNED(16) fx2, * ALIGNED(16) fx3; // Voxel quad +x neighbors - 
field_t * ALIGNED(16) fy0, * ALIGNED(16) fy1, * ALIGNED(16) fy2, * ALIGNED(16) fy3; // Voxel quad +y neighbors - field_t * ALIGNED(16) fz0, * ALIGNED(16) fz1, * ALIGNED(16) fz2, * ALIGNED(16) fz3; // Voxel quad +z neighbors - - // Process the bulk of the voxels 4 at a time - - INIT_STENCIL(); - for( ; n_voxel>3; n_voxel-=4 ) { - f00 = f0; fx0 = fx; fy0 = fy; fz0 = fz; NEXT_STENCIL(); - f01 = f0; fx1 = fx; fy1 = fy; fz1 = fz; NEXT_STENCIL(); - f02 = f0; fx2 = fx; fy2 = fy; fz2 = fz; NEXT_STENCIL(); - f03 = f0; fx3 = fx; fy3 = fy; fz3 = fz; NEXT_STENCIL(); - - load_4x4_tr( &f00->ex, &f01->ex, &f02->ex, &f03->ex, f0_ex, f0_ey, f0_ez, save0 ); - load_4x3_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, f0_cbx, f0_cby, f0_cbz ); - load_4x4_tr( &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax, f0_tcax, f0_tcay, f0_tcaz, save1 ); - load_4x3_tr( &f00->jfx, &f01->jfx, &f02->jfx, &f03->jfx, f0_jfx, f0_jfy, f0_jfz ); - - load_4x3_tr( &fx0->cbx, &fx1->cbx, &fx2->cbx, &fx3->cbx, dummy, fx_cby, fx_cbz ); - load_4x3_tr( &fy0->cbx, &fy1->cbx, &fy2->cbx, &fy3->cbx, fy_cbx, dummy, fy_cbz ); - load_4x2_tr( &fz0->cbx, &fz1->cbx, &fz2->cbx, &fz3->cbx, fz_cbx, fz_cby /**/ ); - - f0_tcax = fnms( vdamp,f0_tcax, fms( vpy_muz,(f0_cbz-fy_cbz), vpz_muy*(f0_cby-fz_cby) ) ); - f0_tcay = fnms( vdamp,f0_tcay, fms( vpz_mux,(f0_cbx-fz_cbx), vpx_muz*(f0_cbz-fx_cbz) ) ); - f0_tcaz = fnms( vdamp,f0_tcaz, fms( vpx_muy,(f0_cby-fx_cby), vpy_mux*(f0_cbx-fy_cbx) ) ); - - f0_ex = fma( vdecayx,f0_ex, vdrivex*fnms( vcj,f0_jfx, f0_tcax ) ); - f0_ey = fma( vdecayy,f0_ey, vdrivey*fnms( vcj,f0_jfy, f0_tcay ) ); - f0_ez = fma( vdecayz,f0_ez, vdrivez*fnms( vcj,f0_jfz, f0_tcaz ) ); - // Note: Unlike load_4x3 versus load_4x4, store_4x4 is much more efficient than store_4x3! - store_4x4_tr( f0_ex, f0_ey, f0_ez, save0, &f00->ex, &f01->ex, &f02->ex, &f03->ex ); - store_4x4_tr( f0_tcax, f0_tcay, f0_tcaz, save1, &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax ); - } -} +#include "sfa_private.h" -#endif +//----------------------------------------------------------------------------// +// Top level function to select and call the proper vacuum_advance_e function. +//----------------------------------------------------------------------------// void vacuum_advance_e( field_array_t * RESTRICT fa, - float frac ) { - if( !fa ) ERROR(( "Bad args" )); - if( frac!=1 ) ERROR(( "standard advance_e does not support frac!=1 yet" )); - - /*************************************************************************** - * Begin tangential B ghost setup - ***************************************************************************/ - - begin_remote_ghost_tang_b( fa->f, fa->g ); - local_ghost_tang_b( fa->f, fa->g ); - - /*************************************************************************** - * Update interior fields - * Note: ex all (1:nx, 1:ny+1,1,nz+1) interior (1:nx,2:ny,2:nz) - * Note: ey all (1:nx+1,1:ny, 1:nz+1) interior (2:nx,1:ny,2:nz) - * Note: ez all (1:nx+1,1:ny+1,1:nz ) interior (1:nx,1:ny,2:nz) - ***************************************************************************/ - - // Do majority interior in a single pass. The host handles - // stragglers. 
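/* Editorial aside (not part of the patch): every field kernel touched by this
 * diff overlaps ghost exchange with computation using the same recipe, which
 * the old body below and the new pipeline/*.c files both follow.  Outline,
 * using only calls that appear in this hunk:
 *
 *   begin_remote_ghost_tang_b( fa->f, fa->g );    // 1. post ghost sends/recvs
 *   local_ghost_tang_b( fa->f, fa->g );           // 2. fill local ghosts
 *   EXEC_PIPELINES( vacuum_advance_e, args, 0 );  // 3. pipelines: bulk interior
 *   // 4. host: leftover interior strips, overlapped with the pipelines
 *   WAIT_PIPELINES();
 *   end_remote_ghost_tang_b( fa->f, fa->g );      // 5. ghost values now valid
 *   // 6. host: exterior faces/edges that need the fresh ghosts
 *   local_adjust_tang_e( fa->f, fa->g );          // 7. apply local boundary conditions
 */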
- - pipeline_args_t args[1]; - args->f = fa->f; - args->p = (sfa_params_t *)fa->params; - args->g = fa->g; - EXEC_PIPELINES( vacuum_advance_e, args, 0 ); - - // While the pipelines are busy, do non-bulk interior fields - - DECLARE_STENCIL(); - - // Do left over interior ex - for( z=2; z<=nz; z++ ) { - for( y=2; y<=ny; y++ ) { - f0 = &f(1,y, z); - fy = &f(1,y-1,z); - fz = &f(1,y, z-1); - UPDATE_EX(); - } - } - - // Do left over interior ey - for( z=2; z<=nz; z++ ) { - f0 = &f(2,1,z); - fx = &f(1,1,z); - fz = &f(2,1,z-1); - for( x=2; x<=nx; x++ ) { - UPDATE_EY(); - f0++; - fx++; - fz++; - } - } - - // Do left over interior ez - for( y=2; y<=ny; y++ ) { - f0 = &f(2,y, 1); - fx = &f(1,y, 1); - fy = &f(2,y-1,1); - for( x=2; x<=nx; x++ ) { - UPDATE_EZ(); - f0++; - fx++; - fy++; - } - } - - WAIT_PIPELINES(); - - /*************************************************************************** - * Finish tangential B ghost setup - ***************************************************************************/ - - end_remote_ghost_tang_b( fa->f, fa->g ); - - /*************************************************************************** - * Update exterior fields - ***************************************************************************/ - - // Do exterior ex - for( y=1; y<=ny+1; y++ ) { - f0 = &f(1,y, 1); - fy = &f(1,y-1,1); - fz = &f(1,y, 0); - for( x=1; x<=nx; x++ ) { - UPDATE_EX(); - f0++; - fy++; - fz++; - } - } - for( y=1; y<=ny+1; y++ ) { - f0 = &f(1,y, nz+1); - fy = &f(1,y-1,nz+1); - fz = &f(1,y, nz); - for( x=1; x<=nx; x++ ) { - UPDATE_EX(); - f0++; - fy++; - fz++; - } - } - for( z=2; z<=nz; z++ ) { - f0 = &f(1,1,z); - fy = &f(1,0,z); - fz = &f(1,1,z-1); - for( x=1; x<=nx; x++ ) { - UPDATE_EX(); - f0++; - fy++; - fz++; - } - } - for( z=2; z<=nz; z++ ) { - f0 = &f(1,ny+1,z); - fy = &f(1,ny, z); - fz = &f(1,ny+1,z-1); - for( x=1; x<=nx; x++ ) { - UPDATE_EX(); - f0++; - fy++; - fz++; - } + float frac ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); } - // Do exterior ey - for( z=1; z<=nz+1; z++ ) { - for( y=1; y<=ny; y++ ) { - f0 = &f(1,y,z); - fx = &f(0,y,z); - fz = &f(1,y,z-1); - UPDATE_EY(); - } - } - for( z=1; z<=nz+1; z++ ) { - for( y=1; y<=ny; y++ ) { - f0 = &f(nx+1,y,z); - fx = &f(nx, y,z); - fz = &f(nx+1,y,z-1); - UPDATE_EY(); - } - } - for( y=1; y<=ny; y++ ) { - f0 = &f(2,y,1); - fx = &f(1,y,1); - fz = &f(2,y,0); - for( x=2; x<=nx; x++ ) { - UPDATE_EY(); - f0++; - fx++; - fz++; - } - } - for( y=1; y<=ny; y++ ) { - f0 = &f(2,y,nz+1); - fx = &f(1,y,nz+1); - fz = &f(2,y,nz ); - for( x=2; x<=nx; x++ ) { - UPDATE_EY(); - f0++; - fx++; - fz++; - } - } - - // Do exterior ez - for( z=1; z<=nz; z++ ) { - f0 = &f(1,1,z); - fx = &f(0,1,z); - fy = &f(1,0,z); - for( x=1; x<=nx+1; x++ ) { - UPDATE_EZ(); - f0++; - fx++; - fy++; - } - } - for( z=1; z<=nz; z++ ) { - f0 = &f(1,ny+1,z); - fx = &f(0,ny+1,z); - fy = &f(1,ny, z); - for( x=1; x<=nx+1; x++ ) { - UPDATE_EZ(); - f0++; - fx++; - fy++; - } - } - for( z=1; z<=nz; z++ ) { - for( y=2; y<=ny; y++ ) { - f0 = &f(1,y, z); - fx = &f(0,y, z); - fy = &f(1,y-1,z); - UPDATE_EZ(); - } - } - for( z=1; z<=nz; z++ ) { - for( y=2; y<=ny; y++ ) { - f0 = &f(nx+1,y, z); - fx = &f(nx, y, z); - fy = &f(nx+1,y-1,z); - UPDATE_EZ(); - } + if ( frac != 1 ) + { + ERROR( ( "standard advance_e does not support frac != 1 yet" ) ); } - local_adjust_tang_e( fa->f, fa->g ); + // Conditionally execute this when more abstractions are available. 
+ vacuum_advance_e_pipeline( fa, frac ); } diff --git a/src/field_advance/standard/vacuum_clean_div_e.c b/src/field_advance/standard/vacuum_clean_div_e.c index 0fec2207..35c5470c 100644 --- a/src/field_advance/standard/vacuum_clean_div_e.c +++ b/src/field_advance/standard/vacuum_clean_div_e.c @@ -1,145 +1,20 @@ #define IN_sfa -#include "sfa_private.h" - -typedef struct pipeline_args { - field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; -} pipeline_args_t; - -#define DECLARE_STENCIL() \ - field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float _rdx = (nx>1) ? g->rdx : 0; \ - const float _rdy = (ny>1) ? g->rdy : 0; \ - const float _rdz = (nz>1) ? g->rdz : 0; \ - const float alphadt = 0.3888889/( _rdx*_rdx + _rdy*_rdy + _rdz*_rdz ); \ - const float px = (alphadt*_rdx)*m->drivex; \ - const float py = (alphadt*_rdy)*m->drivey; \ - const float pz = (alphadt*_rdz)*m->drivez; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z - -#define f(x,y,z) f[ VOXEL(x,y,z,nx,ny,nz) ] - -#define INIT_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x+1,y, z ); \ - fy = &f(x, y+1,z ); \ - fz = &f(x, y, z+1) - -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if( x>nx ) { \ - /**/ y++; x = 1; \ - if( y>ny ) z++; if( y>ny ) y = 1; \ - INIT_STENCIL(); \ - } - -#define MARDER_EX() f0->ex += px*(fx->div_e_err-f0->div_e_err) -#define MARDER_EY() f0->ey += py*(fy->div_e_err-f0->div_e_err) -#define MARDER_EZ() f0->ez += pz*(fz->div_e_err-f0->div_e_err) -static void -vacuum_clean_div_e_pipeline( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - DECLARE_STENCIL(); - - int n_voxel; - DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - INIT_STENCIL(); - for( ; n_voxel; n_voxel-- ) { - MARDER_EX(); MARDER_EY(); MARDER_EZ(); - NEXT_STENCIL(); - } -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) - -#error "Not implemented" +#include "sfa_private.h" -#endif +//----------------------------------------------------------------------------// +// Top level function to select and call the proper vacuum_clean_div_e +// function. +//----------------------------------------------------------------------------// void -vacuum_clean_div_e( field_array_t * fa ) { - if( !fa ) ERROR(( "Bad args" )); - - // Do majority of field components in single pass on the pipelines. - // The host handles stragglers. 
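/* Editorial aside (not part of the patch): the MARDER_E* macros above are a
 * Marder divergence-cleaning pass -- each E component is nudged along the
 * forward difference of div_e_err, e.g. MARDER_EX() is
 *   f0->ex += px*( fx->div_e_err - f0->div_e_err )
 * with px = (alphadt*_rdx)*m->drivex from DECLARE_STENCIL().  The
 * INIT_STENCIL()/NEXT_STENCIL() walk used by the pipeline is equivalent to the
 * plain loop below, carved into per-pipeline chunks by DISTRIBUTE_VOXELS():
 *
 *   for( z=1; z<=nz; z++ )
 *     for( y=1; y<=ny; y++ )
 *       for( x=1; x<=nx; x++ ) {
 *         f0 = &f(x,y,z); fx = &f(x+1,y,z); fy = &f(x,y+1,z); fz = &f(x,y,z+1);
 *         MARDER_EX(); MARDER_EY(); MARDER_EZ();
 *       }
 *
 * NEXT_STENCIL() just advances the four pointers in lock-step and re-seeds
 * them via INIT_STENCIL() when x wraps past nx, which is why it tests y>ny
 * twice (once to bump z, once to reset y). */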
- - pipeline_args_t args[1]; - args->f = fa->f; - args->p = (sfa_params_t *)fa->params; - args->g = fa->g; - EXEC_PIPELINES( vacuum_clean_div_e, args, 0 ); - - // While pipelines are busy, do left overs on the host - - DECLARE_STENCIL(); - - // Do left over ex - for( y=1; y<=ny+1; y++ ) { - f0 = &f(1,y,nz+1); - fx = &f(2,y,nz+1); - for( x=1; x<=nx; x++ ) { - MARDER_EX(); - f0++; fx++; - } - } - for( z=1; z<=nz; z++ ) { - f0 = &f(1,ny+1,z); - fx = &f(2,ny+1,z); - for( x=1; x<=nx; x++ ) { - MARDER_EX(); - f0++; fx++; - } +vacuum_clean_div_e( field_array_t * fa ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); } - // Do left over ey - for( z=1; z<=nz+1; z++ ) { - for( y=1; y<=ny; y++ ) { - f0 = &f(nx+1,y, z); - fy = &f(nx+1,y+1,z); - MARDER_EY(); - } - } - for( y=1; y<=ny; y++ ) { - f0 = &f(1,y, nz+1); - fy = &f(1,y+1,nz+1); - for( x=1; x<=nx; x++ ) { - MARDER_EY(); - f0++; fy++; - } - } - - // Do left over ez - for( z=1; z<=nz; z++ ) { - f0 = &f(1,ny+1,z); - fz = &f(1,ny+1,z+1); - for( x=1; x<=nx+1; x++ ) { - MARDER_EZ(); - f0++; fz++; - } - } - for( z=1; z<=nz; z++ ) { - for( y=1; y<=ny; y++ ) { - f0 = &f(nx+1,y,z); - fz = &f(nx+1,y,z+1); - MARDER_EZ(); - } - } - - WAIT_PIPELINES(); - - local_adjust_tang_e( fa->f, fa->g ); + // Conditionally execute this when more abstractions are available. + vacuum_clean_div_e_pipeline( fa ); } - diff --git a/src/field_advance/standard/vacuum_compute_curl_b.cc b/src/field_advance/standard/vacuum_compute_curl_b.cc index e48ab407..0d79e7f8 100644 --- a/src/field_advance/standard/vacuum_compute_curl_b.cc +++ b/src/field_advance/standard/vacuum_compute_curl_b.cc @@ -1,336 +1,20 @@ -// Note: This is similar to vacuum_advance_e - #define IN_sfa -#define HAS_V4_PIPELINE -#include "sfa_private.h" - -typedef struct pipeline_args { - /**/ field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; -} pipeline_args_t; - -#define DECLARE_STENCIL() \ - /**/ field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float px_muz = ((nx>1) ? g->cvac*g->dt*g->rdx : 0)*m->rmuz; \ - const float px_muy = ((nx>1) ? g->cvac*g->dt*g->rdx : 0)*m->rmuy; \ - const float py_mux = ((ny>1) ? g->cvac*g->dt*g->rdy : 0)*m->rmux; \ - const float py_muz = ((ny>1) ? g->cvac*g->dt*g->rdy : 0)*m->rmuz; \ - const float pz_muy = ((nz>1) ? g->cvac*g->dt*g->rdz : 0)*m->rmuy; \ - const float pz_mux = ((nz>1) ? 
g->cvac*g->dt*g->rdz : 0)*m->rmux; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z - -#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] - -#define INIT_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x-1,y, z ); \ - fy = &f(x, y-1,z ); \ - fz = &f(x, y, z-1) - -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if( x>nx ) { \ - /**/ y++; x = 2; \ - if( y>ny ) z++; if( y>ny ) y = 2; \ - INIT_STENCIL(); \ - } - -#define UPDATE_EX() f0->tcax = ( py_muz*(f0->cbz-fy->cbz) - \ - pz_muy*(f0->cby-fz->cby) ) -#define UPDATE_EY() f0->tcay = ( pz_mux*(f0->cbx-fz->cbx) - \ - px_muz*(f0->cbz-fx->cbz) ) -#define UPDATE_EZ() f0->tcaz = ( px_muy*(f0->cby-fx->cby) - \ - py_mux*(f0->cbx-fy->cbx) ) - -void -vacuum_compute_curl_b_pipeline( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - DECLARE_STENCIL(); - - int n_voxel; - DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - INIT_STENCIL(); - for( ; n_voxel; n_voxel-- ) { - UPDATE_EX(); UPDATE_EY(); UPDATE_EZ(); - NEXT_STENCIL(); - } -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) -using namespace v4; - -void -vacuum_compute_curl_b_pipeline_v4( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - DECLARE_STENCIL(); - - int n_voxel; - DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - const v4float vpx_muz( px_muz ), vpx_muy( px_muy ); - const v4float vpy_mux( py_mux ), vpy_muz( py_muz ); - const v4float vpz_muy( pz_muy ), vpz_mux( pz_mux ); - - v4float save1, dummy; - - v4float f0_cbx, f0_cby, f0_cbz; - v4float f0_tcax, f0_tcay, f0_tcaz; - v4float fx_cby, fx_cbz; - v4float fy_cbx, fy_cbz; - v4float fz_cbx, fz_cby; - - field_t * ALIGNED(16) f00, * ALIGNED(16) f01, * ALIGNED(16) f02, * ALIGNED(16) f03; // Voxel quad - field_t * ALIGNED(16) fx0, * ALIGNED(16) fx1, * ALIGNED(16) fx2, * ALIGNED(16) fx3; // Voxel quad +x neighbors - field_t * ALIGNED(16) fy0, * ALIGNED(16) fy1, * ALIGNED(16) fy2, * ALIGNED(16) fy3; // Voxel quad +y neighbors - field_t * ALIGNED(16) fz0, * ALIGNED(16) fz1, * ALIGNED(16) fz2, * ALIGNED(16) fz3; // Voxel quad +z neighbors - - // Process the bulk of the voxels 4 at a time - - INIT_STENCIL(); - for( ; n_voxel>3; n_voxel-=4 ) { - f00 = f0; fx0 = fx; fy0 = fy; fz0 = fz; NEXT_STENCIL(); - f01 = f0; fx1 = fx; fy1 = fy; fz1 = fz; NEXT_STENCIL(); - f02 = f0; fx2 = fx; fy2 = fy; fz2 = fz; NEXT_STENCIL(); - f03 = f0; fx3 = fx; fy3 = fy; fz3 = fz; NEXT_STENCIL(); - - load_4x3_tr( &f00->cbx, &f01->cbx, &f02->cbx, &f03->cbx, f0_cbx, f0_cby, f0_cbz ); - load_4x4_tr( &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax, f0_tcax, f0_tcay, f0_tcaz, save1 ); - - load_4x3_tr( &fx0->cbx, &fx1->cbx, &fx2->cbx, &fx3->cbx, dummy, fx_cby, fx_cbz ); - load_4x3_tr( &fy0->cbx, &fy1->cbx, &fy2->cbx, &fy3->cbx, fy_cbx, dummy, fy_cbz ); - load_4x2_tr( &fz0->cbx, &fz1->cbx, &fz2->cbx, &fz3->cbx, fz_cbx, fz_cby /**/ ); - - f0_tcax = fms( vpy_muz,(f0_cbz-fy_cbz), vpz_muy*(f0_cby-fz_cby) ); - f0_tcay = fms( vpz_mux,(f0_cbx-fz_cbx), vpx_muz*(f0_cbz-fx_cbz) ); - f0_tcaz = fms( vpx_muy,(f0_cby-fx_cby), vpy_mux*(f0_cbx-fy_cbx) ); - - // Note: Unlike load_4x3 versus load_4x4, store_4x4 is much more efficient than store_4x3! 
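/* Editorial aside (not part of the patch): in these V4 variants the
 * load_4x3_tr()/load_4x4_tr() calls gather one field member (cbx, cby, cbz,
 * tcax, ...) from four consecutive voxels and transpose it so each v4float
 * holds that member for the whole voxel quad; the fms()/fnms() lines then
 * evaluate the curl for all four voxels at once.  save1 carries the fourth,
 * untouched member of the tca block so the result can be written back with
 * store_4x4_tr() -- the faster store noted just above -- rather than
 * store_4x3. */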
- store_4x4_tr( f0_tcax, f0_tcay, f0_tcaz, save1, &f00->tcax, &f01->tcax, &f02->tcax, &f03->tcax ); - } -} +#include "sfa_private.h" -#endif +//----------------------------------------------------------------------------// +// Top level function to select and call the proper vacuum_compute_curl_b +// function. +//----------------------------------------------------------------------------// void -vacuum_compute_curl_b( field_array_t * RESTRICT fa ) { - if( !fa ) ERROR(( "Bad args" )); - - /*************************************************************************** - * Begin tangential B ghost setup - ***************************************************************************/ - - begin_remote_ghost_tang_b( fa->f, fa->g ); - local_ghost_tang_b( fa->f, fa->g ); - - /*************************************************************************** - * Update interior fields - * Note: ex all (1:nx, 1:ny+1,1,nz+1) interior (1:nx,2:ny,2:nz) - * Note: ey all (1:nx+1,1:ny, 1:nz+1) interior (2:nx,1:ny,2:nz) - * Note: ez all (1:nx+1,1:ny+1,1:nz ) interior (1:nx,1:ny,2:nz) - ***************************************************************************/ - - // Do majority interior in a single pass. The host handles - // stragglers. - - pipeline_args_t args[1]; - args->f = fa->f; - args->p = (sfa_params_t *)fa->params; - args->g = fa->g; - EXEC_PIPELINES( vacuum_compute_curl_b, args, 0 ); - - // While the pipelines are busy, do non-bulk interior fields - - DECLARE_STENCIL(); - - // Do left over interior ex - for( z=2; z<=nz; z++ ) { - for( y=2; y<=ny; y++ ) { - f0 = &f(1,y, z); - fy = &f(1,y-1,z); - fz = &f(1,y, z-1); - UPDATE_EX(); - } - } - - // Do left over interior ey - for( z=2; z<=nz; z++ ) { - f0 = &f(2,1,z); - fx = &f(1,1,z); - fz = &f(2,1,z-1); - for( x=2; x<=nx; x++ ) { - UPDATE_EY(); - f0++; - fx++; - fz++; - } - } - - // Do left over interior ez - for( y=2; y<=ny; y++ ) { - f0 = &f(2,y, 1); - fx = &f(1,y, 1); - fy = &f(2,y-1,1); - for( x=2; x<=nx; x++ ) { - UPDATE_EZ(); - f0++; - fx++; - fy++; - } - } - - WAIT_PIPELINES(); - - /*************************************************************************** - * Finish tangential B ghost setup - ***************************************************************************/ - - end_remote_ghost_tang_b( fa->f, fa->g ); - - /*************************************************************************** - * Update exterior fields - ***************************************************************************/ - - // Do exterior ex - for( y=1; y<=ny+1; y++ ) { - f0 = &f(1,y, 1); - fy = &f(1,y-1,1); - fz = &f(1,y, 0); - for( x=1; x<=nx; x++ ) { - UPDATE_EX(); - f0++; - fy++; - fz++; - } - } - for( y=1; y<=ny+1; y++ ) { - f0 = &f(1,y, nz+1); - fy = &f(1,y-1,nz+1); - fz = &f(1,y, nz); - for( x=1; x<=nx; x++ ) { - UPDATE_EX(); - f0++; - fy++; - fz++; - } - } - for( z=2; z<=nz; z++ ) { - f0 = &f(1,1,z); - fy = &f(1,0,z); - fz = &f(1,1,z-1); - for( x=1; x<=nx; x++ ) { - UPDATE_EX(); - f0++; - fy++; - fz++; - } - } - for( z=2; z<=nz; z++ ) { - f0 = &f(1,ny+1,z); - fy = &f(1,ny, z); - fz = &f(1,ny+1,z-1); - for( x=1; x<=nx; x++ ) { - UPDATE_EX(); - f0++; - fy++; - fz++; - } - } - - // Do exterior ey - for( z=1; z<=nz+1; z++ ) { - for( y=1; y<=ny; y++ ) { - f0 = &f(1,y,z); - fx = &f(0,y,z); - fz = &f(1,y,z-1); - UPDATE_EY(); - } - } - for( z=1; z<=nz+1; z++ ) { - for( y=1; y<=ny; y++ ) { - f0 = &f(nx+1,y,z); - fx = &f(nx, y,z); - fz = &f(nx+1,y,z-1); - UPDATE_EY(); - } - } - for( y=1; y<=ny; y++ ) { - f0 = &f(2,y,1); - fx = &f(1,y,1); - fz = &f(2,y,0); - for( x=2; x<=nx; x++ 
) { - UPDATE_EY(); - f0++; - fx++; - fz++; - } - } - for( y=1; y<=ny; y++ ) { - f0 = &f(2,y,nz+1); - fx = &f(1,y,nz+1); - fz = &f(2,y,nz ); - for( x=2; x<=nx; x++ ) { - UPDATE_EY(); - f0++; - fx++; - fz++; - } - } - - // Do exterior ez - for( z=1; z<=nz; z++ ) { - f0 = &f(1,1,z); - fx = &f(0,1,z); - fy = &f(1,0,z); - for( x=1; x<=nx+1; x++ ) { - UPDATE_EZ(); - f0++; - fx++; - fy++; - } - } - for( z=1; z<=nz; z++ ) { - f0 = &f(1,ny+1,z); - fx = &f(0,ny+1,z); - fy = &f(1,ny, z); - for( x=1; x<=nx+1; x++ ) { - UPDATE_EZ(); - f0++; - fx++; - fy++; - } - } - for( z=1; z<=nz; z++ ) { - for( y=2; y<=ny; y++ ) { - f0 = &f(1,y, z); - fx = &f(0,y, z); - fy = &f(1,y-1,z); - UPDATE_EZ(); - } - } - for( z=1; z<=nz; z++ ) { - for( y=2; y<=ny; y++ ) { - f0 = &f(nx+1,y, z); - fx = &f(nx, y, z); - fy = &f(nx+1,y-1,z); - UPDATE_EZ(); - } +vacuum_compute_curl_b( field_array_t * RESTRICT fa ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); } - local_adjust_tang_e( fa->f, fa->g ); + // Conditionally execute this when more abstractions are available. + vacuum_compute_curl_b_pipeline( fa ); } diff --git a/src/field_advance/standard/vacuum_compute_div_e_err.c b/src/field_advance/standard/vacuum_compute_div_e_err.c index 583d2491..5d606661 100644 --- a/src/field_advance/standard/vacuum_compute_div_e_err.c +++ b/src/field_advance/standard/vacuum_compute_div_e_err.c @@ -1,177 +1,22 @@ // Note: This is virtually identical to vacuum_compute_rhob -#define IN_sfa -#include "sfa_private.h" - -typedef struct pipeline_args { - /**/ field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; -} pipeline_args_t; - -#define DECLARE_STENCIL() \ - /**/ field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float nc = m->nonconductive; \ - const float px = ((nx>1) ? g->rdx : 0)*m->epsx; \ - const float py = ((ny>1) ? g->rdy : 0)*m->epsy; \ - const float pz = ((nz>1) ? g->rdz : 0)*m->epsz; \ - const float cj = 1./g->eps0; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z - -#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] - -#define INIT_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x-1,y, z ); \ - fy = &f(x, y-1,z ); \ - fz = &f(x, y, z-1) - -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if( x>nx ) { \ - /**/ y++; x = 2; \ - if( y>ny ) z++; if( y>ny ) y = 2; \ - INIT_STENCIL(); \ - } - -#define UPDATE_DERR_E() f0->div_e_err = nc*( px*( f0->ex - fx->ex ) + \ - py*( f0->ey - fy->ey ) + \ - pz*( f0->ez - fz->ez ) - \ - cj*( f0->rhof + f0->rhob ) ) -void -vacuum_compute_div_e_err_pipeline( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - DECLARE_STENCIL(); - - int n_voxel; - DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - INIT_STENCIL(); - for( ; n_voxel; n_voxel-- ) { - UPDATE_DERR_E(); - NEXT_STENCIL(); - } -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) +#define IN_sfa -#error "Not implemented" +#include "sfa_private.h" -#endif +//----------------------------------------------------------------------------// +// Top level function to select and call the proper vacuum_compute_div_e_err +// function. 
+//----------------------------------------------------------------------------// void -vacuum_compute_div_e_err( field_array_t * RESTRICT fa ) { - if( !fa ) ERROR(( "Bad args" )); - - // Have pipelines compute the interior of local domain (the host - // handles stragglers in the interior) - - // Begin setting normal e ghosts - - begin_remote_ghost_norm_e( fa->f, fa->g ); - local_ghost_norm_e( fa->f, fa->g ); - - // Have pipelines compute interior of local domain - - pipeline_args_t args[1]; - args->f = fa->f; - args->p = (sfa_params_t *)fa->params; - args->g = fa->g; - EXEC_PIPELINES( vacuum_compute_div_e_err, args, 0 ); - - // While pipelines are busy, have host compute the exterior - // of the local domain - - DECLARE_STENCIL(); - - // Finish setting normal e ghosts - end_remote_ghost_norm_e( fa->f, fa->g ); - - // z faces, x edges, y edges and all corners - for( y=1; y<=ny+1; y++ ) { - f0 = &f(1,y, 1); - fx = &f(0,y, 1); - fy = &f(1,y-1,1); - fz = &f(1,y, 0); - for( x=1; x<=nx+1; x++ ) { - UPDATE_DERR_E(); - f0++; - fx++; - fy++; - fz++; - } +vacuum_compute_div_e_err( field_array_t * RESTRICT fa ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); } - for( y=1; y<=ny+1; y++ ) { - f0 = &f(1,y, nz+1); - fx = &f(0,y, nz+1); - fy = &f(1,y-1,nz+1); - fz = &f(1,y, nz); - for( x=1; x<=nx+1; x++ ) { - UPDATE_DERR_E(); - f0++; - fx++; - fy++; - fz++; - } - } - - // y faces, z edges - for( z=2; z<=nz; z++ ) { - f0 = &f(1,1,z); - fx = &f(0,1,z); - fy = &f(1,0,z); - fz = &f(1,1,z-1); - for( x=1; x<=nx+1; x++ ) { - UPDATE_DERR_E(); - f0++; - fx++; - fy++; - fz++; - } - } - for( z=2; z<=nz; z++ ) { - f0 = &f(1,ny+1,z); - fx = &f(0,ny+1,z); - fy = &f(1,ny, z); - fz = &f(1,ny+1,z-1); - for( x=1; x<=nx+1; x++ ) { - UPDATE_DERR_E(); - f0++; - fx++; - fy++; - fz++; - } - } - - // x faces - for( z=2; z<=nz; z++ ) { - for( y=2; y<=ny; y++ ) { - f0 = &f(1,y, z); - fx = &f(0,y, z); - fy = &f(1,y-1,z); - fz = &f(1,y, z-1); - UPDATE_DERR_E(); - f0 = &f(nx+1,y, z); - fx = &f(nx, y, z); - fy = &f(nx+1,y-1,z); - fz = &f(nx+1,y, z-1); - UPDATE_DERR_E(); - } - } - - // Finish up setting interior - - WAIT_PIPELINES(); - local_adjust_div_e( fa->f, fa->g ); + // Conditionally execute this when more abstractions are available. + vacuum_compute_div_e_err_pipeline( fa ); } diff --git a/src/field_advance/standard/vacuum_compute_rhob.c b/src/field_advance/standard/vacuum_compute_rhob.c index d9857b03..98315923 100644 --- a/src/field_advance/standard/vacuum_compute_rhob.c +++ b/src/field_advance/standard/vacuum_compute_rhob.c @@ -1,175 +1,22 @@ // Note: This is virtually identical to vacuum_compute_div_e_err -#define IN_sfa -#include "sfa_private.h" - -typedef struct pipeline_args { - /**/ field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; -} pipeline_args_t; - -#define DECLARE_STENCIL() \ - /**/ field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float nc = m->nonconductive; \ - const float px = (nx>1) ? g->eps0*m->epsx*g->rdx : 0; \ - const float py = (ny>1) ? g->eps0*m->epsy*g->rdy : 0; \ - const float pz = (nz>1) ? 
g->eps0*m->epsz*g->rdz : 0; \ - \ - field_t * ALIGNED(16) f0; \ - field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - int x, y, z - -#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] - -#define INIT_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x-1,y, z ); \ - fy = &f(x, y-1,z ); \ - fz = &f(x, y, z-1) - -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; x++; \ - if( x>nx ) { \ - /**/ y++; x = 2; \ - if( y>ny ) z++; if( y>ny ) y = 2; \ - INIT_STENCIL(); \ - } - -#define UPDATE_DERR_E() f0->rhob = nc*( px*( f0->ex - fx->ex ) + \ - py*( f0->ey - fy->ey ) + \ - pz*( f0->ez - fz->ez ) - f0->rhof ) -void -vacuum_compute_rhob_pipeline( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - DECLARE_STENCIL(); - - int n_voxel; - DISTRIBUTE_VOXELS( 2,nx, 2,ny, 2,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - INIT_STENCIL(); - for( ; n_voxel; n_voxel-- ) { - UPDATE_DERR_E(); - NEXT_STENCIL(); - } -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) +#define IN_sfa -#error "Not implemented" +#include "sfa_private.h" -#endif +//----------------------------------------------------------------------------// +// Top level function to select and call the proper vacuum_compute_rhob +// function. +//----------------------------------------------------------------------------// void -vacuum_compute_rhob( field_array_t * RESTRICT fa ) { - if( !fa ) ERROR(( "Bad args" )); - - // Have pipelines compute the interior of local domain (the host - // handles stragglers in the interior) - - // Begin setting normal e ghosts - - begin_remote_ghost_norm_e( fa->f, fa->g ); - local_ghost_norm_e( fa->f, fa->g ); - - // Have pipelines compute interior of local domain - - pipeline_args_t args[1]; - args->f = fa->f; - args->p = (sfa_params_t *)fa->params; - args->g = fa->g; - EXEC_PIPELINES( vacuum_compute_rhob, args, 0 ); - - // While pipelines are busy, have host compute the exterior - // of the local domain - - DECLARE_STENCIL(); - - // Finish setting normal e ghosts - end_remote_ghost_norm_e( fa->f, fa->g ); - - // z faces, x edges, y edges and all corners - for( y=1; y<=ny+1; y++ ) { - f0 = &f(1,y, 1); - fx = &f(0,y, 1); - fy = &f(1,y-1,1); - fz = &f(1,y, 0); - for( x=1; x<=nx+1; x++ ) { - UPDATE_DERR_E(); - f0++; - fx++; - fy++; - fz++; - } +vacuum_compute_rhob( field_array_t * RESTRICT fa ) +{ + if ( !fa ) + { + ERROR( ( "Bad args" ) ); } - for( y=1; y<=ny+1; y++ ) { - f0 = &f(1,y, nz+1); - fx = &f(0,y, nz+1); - fy = &f(1,y-1,nz+1); - fz = &f(1,y, nz); - for( x=1; x<=nx+1; x++ ) { - UPDATE_DERR_E(); - f0++; - fx++; - fy++; - fz++; - } - } - - // y faces, z edges - for( z=2; z<=nz; z++ ) { - f0 = &f(1,1,z); - fx = &f(0,1,z); - fy = &f(1,0,z); - fz = &f(1,1,z-1); - for( x=1; x<=nx+1; x++ ) { - UPDATE_DERR_E(); - f0++; - fx++; - fy++; - fz++; - } - } - for( z=2; z<=nz; z++ ) { - f0 = &f(1,ny+1,z); - fx = &f(0,ny+1,z); - fy = &f(1,ny, z); - fz = &f(1,ny+1,z-1); - for( x=1; x<=nx+1; x++ ) { - UPDATE_DERR_E(); - f0++; - fx++; - fy++; - fz++; - } - } - - // x faces - for( z=2; z<=nz; z++ ) { - for( y=2; y<=ny; y++ ) { - f0 = &f(1,y, z); - fx = &f(0,y, z); - fy = &f(1,y-1,z); - fz = &f(1,y, z-1); - UPDATE_DERR_E(); - f0 = &f(nx+1,y, z); - fx = &f(nx, y, z); - fy = &f(nx+1,y-1,z); - fz = &f(nx+1,y, z-1); - UPDATE_DERR_E(); - } - } - - // Finish up setting interior - - WAIT_PIPELINES(); - local_adjust_rhob( fa->f, fa->g ); + // Conditionally execute this when more abstractions are available. 
+ vacuum_compute_rhob_pipeline( fa ); } diff --git a/src/field_advance/standard/vacuum_energy_f.c b/src/field_advance/standard/vacuum_energy_f.c index fd49c3d4..4977b02c 100644 --- a/src/field_advance/standard/vacuum_energy_f.c +++ b/src/field_advance/standard/vacuum_energy_f.c @@ -1,137 +1,22 @@ // FIXME: USE THE DISCRETIZED VARIATIONAL DEFINITION OF ENERGY #define IN_sfa -#include "sfa_private.h" - -typedef struct pipeline_args { - const field_t * ALIGNED(128) f; - const sfa_params_t * p; - const grid_t * g; - double en[MAX_PIPELINE+1][6]; -} pipeline_args_t; - -#define DECLARE_STENCIL() \ - const field_t * ALIGNED(128) f = args->f; \ - const material_coefficient_t * ALIGNED(128) m = args->p->mc; \ - const grid_t * g = args->g; \ - const int nx = g->nx, ny = g->ny, nz = g->nz; \ - \ - const float qepsx = 0.25*m->epsx; \ - const float qepsy = 0.25*m->epsy; \ - const float qepsz = 0.25*m->epsz; \ - const float hrmux = 0.5*m->rmux; /* was previously 0.25 in master */ \ - const float hrmuy = 0.5*m->rmuy; /* was previously 0.25 in master */ \ - const float hrmuz = 0.5*m->rmuz; /* was previously 0.25 in master*/ \ - \ - const field_t * ALIGNED(16) f0; \ - const field_t * ALIGNED(16) fx, * ALIGNED(16) fy, * ALIGNED(16) fz; \ - const field_t * ALIGNED(16) fyz, * ALIGNED(16) fzx, * ALIGNED(16) fxy; \ - double en_ex = 0, en_ey = 0, en_ez = 0, en_bx = 0, en_by = 0, en_bz = 0; \ - int x, y, z - -#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] - -#define INIT_STENCIL() \ - f0 = &f(x, y, z ); \ - fx = &f(x+1,y, z ); \ - fy = &f(x, y+1,z ); \ - fz = &f(x, y, z+1); \ - fyz = &f(x, y+1,z+1); \ - fzx = &f(x+1,y, z+1); \ - fxy = &f(x+1,y+1,z ) - -#define NEXT_STENCIL() \ - f0++; fx++; fy++; fz++; fyz++; fzx++; fxy++; x++; \ - if( x>nx ) { \ - /**/ y++; x = 1; \ - if( y>ny ) z++; if( y>ny ) y = 1; \ - INIT_STENCIL(); \ - } - -#define REDUCE_EN() \ - en_ex += qepsx*( f0->ex * f0->ex + \ - fy->ex * fy->ex + \ - fz->ex * fz->ex + \ - fyz->ex *fyz->ex ); \ - en_ey += qepsy*( f0->ey * f0->ey + \ - fz->ey * fz->ey + \ - fx->ey * fx->ey + \ - fzx->ey *fzx->ey ); \ - en_ez += qepsz*( f0->ez * f0->ez + \ - fx->ez * fx->ez + \ - fy->ez * fy->ez + \ - fxy->ez *fxy->ez ); \ - en_bx += hrmux*( f0->cbx* f0->cbx + \ - fx->cbx* fx->cbx ); \ - en_by += hrmuy*( f0->cby* f0->cby + \ - fy->cby* fy->cby ); \ - en_bz += hrmuz*( f0->cbz* f0->cbz + \ - fz->cbz* fz->cbz ) - -void -vacuum_energy_f_pipeline( pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - DECLARE_STENCIL(); - - int n_voxel; - DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 16, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - - INIT_STENCIL(); - for( ; n_voxel; n_voxel-- ) { - REDUCE_EN(); - NEXT_STENCIL(); - } - - args->en[pipeline_rank][0] = en_ex; - args->en[pipeline_rank][1] = en_ey; - args->en[pipeline_rank][2] = en_ez; - args->en[pipeline_rank][3] = en_bx; - args->en[pipeline_rank][4] = en_by; - args->en[pipeline_rank][5] = en_bz; -} -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) - -#error "Not implemented" +#include "sfa_private.h" -#endif +//----------------------------------------------------------------------------// +// Top level function to select and call the proper vacuum_energy_f function. 
+//----------------------------------------------------------------------------// void -vacuum_energy_f( double * global, - const field_array_t * RESTRICT fa ) { - if( !global || !fa ) ERROR(( "Bad args" )); - - // Have each pipeline and the host handle a portion of the - // local voxels - - pipeline_args_t args[1]; - args->f = fa->f; - args->p = (sfa_params_t *)fa->params; - args->g = fa->g; - EXEC_PIPELINES( vacuum_energy_f, args, 0 ); - WAIT_PIPELINES(); - - // Reduce results from each pipelines - - int p; - for( p=1; p<=N_PIPELINE; p++ ) { - args->en[0][0] += args->en[p][0]; args->en[0][1] += args->en[p][1]; - args->en[0][2] += args->en[p][2]; args->en[0][3] += args->en[p][3]; - args->en[0][4] += args->en[p][4]; args->en[0][5] += args->en[p][5]; +vacuum_energy_f( double * global, + const field_array_t * RESTRICT fa ) +{ + if ( !global || !fa ) + { + ERROR( ( "Bad args" ) ); } - - // Convert to physical units and reduce results between nodes - - double v0 = 0.5*fa->g->eps0*fa->g->dV; - args->en[0][0] *= v0; args->en[0][1] *= v0; - args->en[0][2] *= v0; args->en[0][3] *= v0; - args->en[0][4] *= v0; args->en[0][5] *= v0; - // Reduce results between nodes - - mp_allsum_d( args->en[0], global, 6 ); + // Conditionally execute this when more abstractions are available. + vacuum_energy_f_pipeline( global, fa ); } - diff --git a/src/sf_interface/accumulator_array.c b/src/sf_interface/accumulator_array.c index 443cc002..1acbfa06 100644 --- a/src/sf_interface/accumulator_array.c +++ b/src/sf_interface/accumulator_array.c @@ -11,10 +11,21 @@ #include "sf_interface.h" static int -aa_n_pipeline(void) { - int n = serial.n_pipeline; - if( na; - int n = args->n, n_array = args->n_array, s_array = args->s_array, i; - DISTRIBUTE(n, accumulators_n_block, pipeline_rank, n_pipeline, i, n); a += i; - for( ; n_array; n_array--, a+=s_array ) CLEAR( a, n ); -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) - -#error "V4 version not hooked up yet!" -#endif +#include "sf_interface_private.h" -#define VOX(x,y,z) VOXEL(x,y,z, aa->g->nx,aa->g->ny,aa->g->nz) +//----------------------------------------------------------------------------// +// Top level function to select and call the proper clear_accumulator_array +// function. +//----------------------------------------------------------------------------// void -clear_accumulator_array( accumulator_array_t * RESTRICT aa ) { - DECLARE_ALIGNED_ARRAY( accumulators_pipeline_args_t, 128, args, 1 ); - int i0; - - if( !aa ) ERROR(( "Bad args" )); - - i0 = (VOX(1,1,1)/2)*2; // Round i0 down to even for 128B align on Cell */ - - args->a = aa->a + i0; - args->n = ((( VOX(aa->g->nx,aa->g->ny,aa->g->nz) - i0 + 1 )+1)/2)*2; - args->n_array = aa->n_pipeline + 1; - args->s_array = aa->stride; - EXEC_PIPELINES( clear_accumulators, args, 0 ); - WAIT_PIPELINES(); +clear_accumulator_array( accumulator_array_t * RESTRICT aa ) +{ + if ( !aa ) + { + ERROR( ( "Bad args" ) ); + } + + // Conditionally execute this when more abstractions are available. 
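/* Editorial aside (not part of the patch): as elsewhere in this diff, the old
 * monolithic vacuum_energy_f() becomes a thin wrapper -- validate the
 * arguments, then forward to the *_pipeline() implementation that now lives
 * under src/field_advance/standard/pipeline/.  The "conditionally execute"
 * comment above hints at a future dispatch over implementations; one purely
 * illustrative shape (the guard and alternate name are hypothetical, not
 * implied by the patch):
 *
 *   #if defined(VPIC_USE_ALTERNATE_ENERGY_F)   // hypothetical build option
 *     vacuum_energy_f_alternate( global, fa );
 *   #else
 *     vacuum_energy_f_pipeline( global, fa );
 *   #endif
 */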
+ clear_accumulator_array_pipeline( aa ); } - diff --git a/src/sf_interface/interpolator_array.cc b/src/sf_interface/interpolator_array.cc index 42a1066a..4ddcb75f 100644 --- a/src/sf_interface/interpolator_array.cc +++ b/src/sf_interface/interpolator_array.cc @@ -1,16 +1,18 @@ #define IN_sf_interface -#define HAS_V4_PIPELINE + #include "sf_interface_private.h" void -checkpt_interpolator_array( const interpolator_array_t * ia ) { +checkpt_interpolator_array( const interpolator_array_t * ia ) +{ CHECKPT( ia, 1 ); CHECKPT_ALIGNED( ia->i, ia->g->nv, 128 ); CHECKPT_PTR( ia->g ); } interpolator_array_t * -restore_interpolator_array( void ) { +restore_interpolator_array( void ) +{ interpolator_array_t * ia; RESTORE( ia ); RESTORE_ALIGNED( ia->i ); @@ -19,7 +21,8 @@ restore_interpolator_array( void ) { } interpolator_array_t * -new_interpolator_array( grid_t * g ) { +new_interpolator_array( grid_t * g ) +{ interpolator_array_t * ia; if( !g ) ERROR(( "NULL grid" )); MALLOC( ia, 1 ); @@ -32,236 +35,32 @@ new_interpolator_array( grid_t * g ) { } void -delete_interpolator_array( interpolator_array_t * ia ) { +delete_interpolator_array( interpolator_array_t * ia ) +{ if( !ia ) return; UNREGISTER_OBJECT( ia ); FREE_ALIGNED( ia->i ); FREE( ia ); } -#define fi(x,y,z) fi[ VOXEL(x,y,z, nx,ny,nz) ] -#define f(x,y,z) f [ VOXEL(x,y,z, nx,ny,nz) ] -#define nb(x,y,z) nb[ 6*VOXEL(x,y,z, nx,ny,nz) ] +//----------------------------------------------------------------------------// +// Top level function to select and call the proper load_interpolator_array +// function. +//----------------------------------------------------------------------------// void -load_interpolator_pipeline( load_interpolator_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - interpolator_t * ALIGNED(128) fi = args->fi; - const field_t * ALIGNED(128) f = args->f; - - interpolator_t * ALIGNED(16) pi; - - const field_t * ALIGNED(16) pf0; - const field_t * ALIGNED(16) pfx, * ALIGNED(16) pfy, * ALIGNED(16) pfz; - const field_t * ALIGNED(16) pfyz, * ALIGNED(16) pfzx, * ALIGNED(16) pfxy; - int x, y, z, n_voxel; - - const int nx = args->nx; - const int ny = args->ny; - const int nz = args->nz; - - const float fourth = 0.25; - const float half = 0.5; - - float w0, w1, w2, w3; - - // Process the voxels assigned to this pipeline - - if( pipeline_rank==n_pipeline ) return; // No straggler cleanup needed - DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 1, - pipeline_rank, n_pipeline, x, y, z, n_voxel ); - -# define LOAD_STENCIL() \ - pi = &fi(x, y, z ); \ - pf0 = &f(x, y, z ); \ - pfx = &f(x+1,y, z ); \ - pfy = &f(x, y+1,z ); \ - pfz = &f(x, y, z+1); \ - pfyz = &f(x, y+1,z+1); \ - pfzx = &f(x+1,y, z+1); \ - pfxy = &f(x+1,y+1,z ) - - LOAD_STENCIL(); - - for( ; n_voxel; n_voxel-- ) { - - // ex interpolation - w0 = pf0->ex; - w1 = pfy->ex; - w2 = pfz->ex; - w3 = pfyz->ex; - pi->ex = fourth*( (w3 + w0) + (w1 + w2) ); - pi->dexdy = fourth*( (w3 - w0) + (w1 - w2) ); - pi->dexdz = fourth*( (w3 - w0) - (w1 - w2) ); - pi->d2exdydz = fourth*( (w3 + w0) - (w1 + w2) ); - - // ey interpolation coefficients - w0 = pf0->ey; - w1 = pfz->ey; - w2 = pfx->ey; - w3 = pfzx->ey; - pi->ey = fourth*( (w3 + w0) + (w1 + w2) ); - pi->deydz = fourth*( (w3 - w0) + (w1 - w2) ); - pi->deydx = fourth*( (w3 - w0) - (w1 - w2) ); - pi->d2eydzdx = fourth*( (w3 + w0) - (w1 + w2) ); - - // ez interpolation coefficients - w0 = pf0->ez; - w1 = pfx->ez; - w2 = pfy->ez; - w3 = pfxy->ez; - pi->ez = fourth*( (w3 + w0) + (w1 + w2) ); - pi->dezdx = fourth*( (w3 - w0) + (w1 - w2) ); - 
pi->dezdy = fourth*( (w3 - w0) - (w1 - w2) ); - pi->d2ezdxdy = fourth*( (w3 + w0) - (w1 + w2) ); - - // bx interpolation coefficients - w0 = pf0->cbx; - w1 = pfx->cbx; - pi->cbx = half*( w1 + w0 ); - pi->dcbxdx = half*( w1 - w0 ); - - // by interpolation coefficients - w0 = pf0->cby; - w1 = pfy->cby; - pi->cby = half*( w1 + w0 ); - pi->dcbydy = half*( w1 - w0 ); - - // bz interpolation coefficients - w0 = pf0->cbz; - w1 = pfz->cbz; - pi->cbz = half*( w1 + w0 ); - pi->dcbzdz = half*( w1 - w0 ); - - pi++; pf0++; pfx++; pfy++; pfz++; pfyz++; pfzx++; pfxy++; - - x++; - if( x>nx ) { - x=1, y++; - if( y>ny ) y=1, z++; - LOAD_STENCIL(); - } +load_interpolator_array( interpolator_array_t * RESTRICT ia, + const field_array_t * RESTRICT fa ) +{ + if ( !ia || + !fa || + ia->g != fa->g ) + { + ERROR( ( "Bad args" ) ); } -# undef LOAD_STENCIL - -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) - -using namespace v4; - -void -load_interpolator_pipeline_v4( load_interpolator_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - interpolator_t * ALIGNED(128) fi = args->fi; - const field_t * ALIGNED(128) f = args->f; - - interpolator_t * ALIGNED(16) pi; - - const field_t * ALIGNED(16) pf0; - const field_t * ALIGNED(16) pfx, * ALIGNED(16) pfy, * ALIGNED(16) pfz; - const field_t * ALIGNED(16) pfyz, * ALIGNED(16) pfzx, * ALIGNED(16) pfxy; - int x, y, z, n_voxel; - - const int nx = args->nx; - const int ny = args->ny; - const int nz = args->nz; - - const v4float fourth(0.25); - const v4float half( 0.5 ); - - const v4int sgn_1_2( 0, 1<<31, 1<<31, 0 ); - const v4int sgn_2_3( 0, 0, 1<<31, 1<<31 ); - const v4int sgn_1_3( 0, 1<<31, 0, 1<<31 ); - const v4int sel_0_1( -1, -1, 0, 0 ); - - v4float w0, w1, w2, w3; - - // Process the voxels assigned to this pipeline - - if( pipeline_rank==n_pipeline ) return; // No straggler cleanup needed - DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 1, - pipeline_rank, n_pipeline, - x, y, z, n_voxel ); - -# define LOAD_STENCIL() \ - pi = &fi(x, y, z ); \ - pf0 = &f(x, y, z ); \ - pfx = &f(x+1,y, z ); \ - pfy = &f(x, y+1,z ); \ - pfz = &f(x, y, z+1); \ - pfyz = &f(x, y+1,z+1); \ - pfzx = &f(x+1,y, z+1); \ - pfxy = &f(x+1,y+1,z ) - - LOAD_STENCIL(); - - for( ; n_voxel; n_voxel-- ) { - - // ex interpolation coefficients - w0 = toggle_bits( sgn_1_2, v4float( pf0->ex) ); // [ w0 -w0 -w0 w0 ] - w1 = v4float( pfy->ex); // [ w1 w1 w1 w1 ] - w2 = toggle_bits( sgn_1_2, v4float( pfz->ex) ); // [ w2 -w2 -w2 w2 ] - w3 = v4float(pfyz->ex); // [ w3 w3 w3 w3 ] - store_4x1( fourth*( ( w3 + w0 ) + toggle_bits( sgn_2_3, w1 + w2 ) ), - &pi->ex ); - - // ey interpolation coefficients - w0 = toggle_bits( sgn_1_2, v4float( pf0->ey) ); // [ w0 -w0 -w0 w0 ] - w1 = v4float( pfz->ey); // [ w1 w1 w1 w1 ] - w2 = toggle_bits( sgn_1_2, v4float( pfx->ey) ); // [ w2 -w2 -w2 w2 ] - w3 = v4float(pfzx->ey); // [ w3 w3 w3 w3 ] - store_4x1( fourth*( ( w3 + w0 ) + toggle_bits( sgn_2_3, w1 + w2 ) ), - &pi->ey ); - - // ez interpolation coefficients - w0 = toggle_bits( sgn_1_2, v4float( pf0->ez) ); // [ w0 -w0 -w0 w0 ] - w1 = v4float( pfx->ez); // [ w1 w1 w1 w1 ] - w2 = toggle_bits( sgn_1_2, v4float( pfy->ez) ); // [ w2 -w2 -w2 w2 ] - w3 = v4float(pfxy->ez); // [ w3 w3 w3 w3 ] - store_4x1( fourth*( ( w3 + w0 ) + toggle_bits( sgn_2_3, w1 + w2 ) ), - &pi->ez ); - - // bx and by interpolation coefficients - w0 = toggle_bits( sgn_1_3, - merge( sel_0_1, - v4float(pf0->cbx), - v4float(pf0->cby) ) ); // [ w0x -w0x w0y -w0y ] - w1 = merge( sel_0_1, - v4float(pfx->cbx), - v4float(pfy->cby) ); // [ w1x w1x w1y w1y ] 
- store_4x1( half*( w1 + w0 ), &pi->cbx ); - - // bz interpolation coefficients - w0 = toggle_bits( sgn_1_3, v4float(pf0->cbz) ); // [ w0 -w0 d/c d/c ] - w1 = v4float(pfz->cbz); // [ w1 -w1 d/c d/c ] - store_4x1( half*( w1 + w0 ), &pi->cbz ); // Note: Padding after bz coeff! - - pi++; pf0++; pfx++; pfy++; pfz++; pfyz++; pfzx++; pfxy++; - - x++; - if( x>nx ) { - x=1, y++; - if( y>ny ) y=1, z++; - LOAD_STENCIL(); - } - } - -# undef LOAD_STENCIL - -} - -#endif - -void -load_interpolator_array( /**/ interpolator_array_t * RESTRICT ia, - const field_array_t * RESTRICT fa ) { - DECLARE_ALIGNED_ARRAY( load_interpolator_pipeline_args_t, 128, args, 1 ); - - if( !ia || !fa || ia->g!=fa->g ) ERROR(( "Bad args" )); + // Conditionally execute this when more abstractions are available. + load_interpolator_array_pipeline( ia, fa ); # if 0 // Original non-pipelined version for( z=1; z<=nz; z++ ) { @@ -331,14 +130,4 @@ load_interpolator_array( /**/ interpolator_array_t * RESTRICT ia, } } # endif - - args->fi = ia->i; - args->f = fa->f; - args->nb = ia->g->neighbor; - args->nx = ia->g->nx; - args->ny = ia->g->ny; - args->nz = ia->g->nz; - - EXEC_PIPELINES( load_interpolator, args, 0 ); - WAIT_PIPELINES(); } diff --git a/src/sf_interface/pipeline/clear_accumulators_pipeline.c b/src/sf_interface/pipeline/clear_accumulators_pipeline.c new file mode 100644 index 00000000..bf265edd --- /dev/null +++ b/src/sf_interface/pipeline/clear_accumulators_pipeline.c @@ -0,0 +1,61 @@ +#define IN_sf_interface + +#include "sf_interface_pipeline.h" + +#include "../sf_interface_private.h" + +#include "../../util/pipelines/pipelines_exec.h" + +void +clear_accumulators_pipeline_scalar( accumulators_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + accumulator_t * ALIGNED(16) a = args->a; + + int n = args->n; + int n_array = args->n_array; + int s_array = args->s_array; + int i; + + DISTRIBUTE( n, accumulators_n_block, pipeline_rank, n_pipeline, i, n ); + + a += i; + + for( ; n_array; n_array--, a+=s_array ) + { + CLEAR( a, n ); + } +} + +#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) + +#error "V4 version not hooked up yet!" + +#endif + +#define VOX(x,y,z) VOXEL( x, y, z, aa->g->nx, aa->g->ny, aa->g->nz ) + +void +clear_accumulator_array_pipeline( accumulator_array_t * RESTRICT aa ) +{ + DECLARE_ALIGNED_ARRAY( accumulators_pipeline_args_t, 128, args, 1 ); + + int i0; + + if ( !aa ) + { + ERROR( ( "Bad args" ) ); + } + + i0 = ( VOX(1,1,1) / 2 ) * 2; // Round i0 down to even for 128B align on Cell */ + + args->a = aa->a + i0; + args->n = ( ( ( VOX(aa->g->nx,aa->g->ny,aa->g->nz) - i0 + 1 ) + 1 ) / 2 ) * 2; + args->n_array = aa->n_pipeline + 1; + args->s_array = aa->stride; + + EXEC_PIPELINES( clear_accumulators, args, 0 ); + + WAIT_PIPELINES(); +} diff --git a/src/sf_interface/pipeline/interpolator_array_pipeline.cc b/src/sf_interface/pipeline/interpolator_array_pipeline.cc new file mode 100644 index 00000000..9cb419f9 --- /dev/null +++ b/src/sf_interface/pipeline/interpolator_array_pipeline.cc @@ -0,0 +1,335 @@ +#define IN_sf_interface + +#define HAS_V4_PIPELINE + +// It appears that the use of SIMD vectors in this file is not for vectors over +// particles or fields that can be easily extended to longer SIMD vector lengths. +// Thus, it appears that this file is not a candidate for V8_ACCELERATION. 
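// More concretely: in the pipeline below, each 4-wide vector holds the four
// interpolation coefficients of one field component for a single voxel (for
// example, one store_4x1 fills ex, dexdy, dexdz and d2exdydz of a single
// interpolator_t), rather than the same quantity for four different voxels or
// particles.  A wider vector length therefore exposes no extra parallelism
// without first restructuring the data layout.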
+// #define HAS_V8_PIPELINE + +#include "sf_interface_pipeline.h" + +#include "../sf_interface_private.h" + +#include "../../util/pipelines/pipelines_exec.h" + +#define fi(x,y,z) fi[ VOXEL( x, y, z, nx, ny, nz ) ] +#define f(x,y,z) f [ VOXEL( x, y, z, nx, ny, nz ) ] +#define nb(x,y,z) nb[ 6*VOXEL( x, y, z, nx, ny, nz ) ] + +void +load_interpolator_pipeline_scalar( load_interpolator_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + interpolator_t * ALIGNED(128) fi = args->fi; + const field_t * ALIGNED(128) f = args->f; + + interpolator_t * ALIGNED(16) pi; + + const field_t * ALIGNED(16) pf0; + const field_t * ALIGNED(16) pfx, * ALIGNED(16) pfy, * ALIGNED(16) pfz; + const field_t * ALIGNED(16) pfyz, * ALIGNED(16) pfzx, * ALIGNED(16) pfxy; + + int x, y, z, n_voxel; + + const int nx = args->nx; + const int ny = args->ny; + const int nz = args->nz; + + const float fourth = 0.25; + const float half = 0.50; + + float w0, w1, w2, w3; + + // Process the voxels assigned to this pipeline + + if( pipeline_rank==n_pipeline ) return; // No straggler cleanup needed + + DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 1, + pipeline_rank, n_pipeline, x, y, z, n_voxel ); + +# define LOAD_STENCIL() \ + pi = &fi(x, y, z ); \ + pf0 = &f(x, y, z ); \ + pfx = &f(x+1,y, z ); \ + pfy = &f(x, y+1,z ); \ + pfz = &f(x, y, z+1); \ + pfyz = &f(x, y+1,z+1); \ + pfzx = &f(x+1,y, z+1); \ + pfxy = &f(x+1,y+1,z ) + + LOAD_STENCIL(); + + for( ; n_voxel; n_voxel-- ) + { + // ex interpolation + w0 = pf0->ex; + w1 = pfy->ex; + w2 = pfz->ex; + w3 = pfyz->ex; + pi->ex = fourth*( (w3 + w0) + (w1 + w2) ); + pi->dexdy = fourth*( (w3 - w0) + (w1 - w2) ); + pi->dexdz = fourth*( (w3 - w0) - (w1 - w2) ); + pi->d2exdydz = fourth*( (w3 + w0) - (w1 + w2) ); + + // ey interpolation coefficients + w0 = pf0->ey; + w1 = pfz->ey; + w2 = pfx->ey; + w3 = pfzx->ey; + pi->ey = fourth*( (w3 + w0) + (w1 + w2) ); + pi->deydz = fourth*( (w3 - w0) + (w1 - w2) ); + pi->deydx = fourth*( (w3 - w0) - (w1 - w2) ); + pi->d2eydzdx = fourth*( (w3 + w0) - (w1 + w2) ); + + // ez interpolation coefficients + w0 = pf0->ez; + w1 = pfx->ez; + w2 = pfy->ez; + w3 = pfxy->ez; + pi->ez = fourth*( (w3 + w0) + (w1 + w2) ); + pi->dezdx = fourth*( (w3 - w0) + (w1 - w2) ); + pi->dezdy = fourth*( (w3 - w0) - (w1 - w2) ); + pi->d2ezdxdy = fourth*( (w3 + w0) - (w1 + w2) ); + + // bx interpolation coefficients + w0 = pf0->cbx; + w1 = pfx->cbx; + pi->cbx = half*( w1 + w0 ); + pi->dcbxdx = half*( w1 - w0 ); + + // by interpolation coefficients + w0 = pf0->cby; + w1 = pfy->cby; + pi->cby = half*( w1 + w0 ); + pi->dcbydy = half*( w1 - w0 ); + + // bz interpolation coefficients + w0 = pf0->cbz; + w1 = pfz->cbz; + pi->cbz = half*( w1 + w0 ); + pi->dcbzdz = half*( w1 - w0 ); + + pi++; pf0++; pfx++; pfy++; pfz++; pfyz++; pfzx++; pfxy++; + + x++; + if ( x > nx ) + { + x=1, y++; + if ( y > ny ) y=1, z++; + LOAD_STENCIL(); + } + } + +# undef LOAD_STENCIL +} + +#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) + +using namespace v4; + +void +load_interpolator_pipeline_v4( load_interpolator_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + interpolator_t * ALIGNED(128) fi = args->fi; + const field_t * ALIGNED(128) f = args->f; + + interpolator_t * ALIGNED(16) pi; + + const field_t * ALIGNED(16) pf0; + const field_t * ALIGNED(16) pfx, * ALIGNED(16) pfy, * ALIGNED(16) pfz; + const field_t * ALIGNED(16) pfyz, * ALIGNED(16) pfzx, * ALIGNED(16) pfxy; + int x, y, z, n_voxel; + + const int nx = args->nx; + const int ny = args->ny; + const int nz = args->nz; + + const 
v4float fourth(0.25); + const v4float half( 0.50); + + const v4int sgn_1_2( 0, 1<<31, 1<<31, 0 ); + const v4int sgn_2_3( 0, 0, 1<<31, 1<<31 ); + const v4int sgn_1_3( 0, 1<<31, 0, 1<<31 ); + const v4int sel_0_1( -1, -1, 0, 0 ); + + v4float w0, w1, w2, w3; + + // Process the voxels assigned to this pipeline + + if( pipeline_rank==n_pipeline ) return; // No straggler cleanup needed + + DISTRIBUTE_VOXELS( 1,nx, 1,ny, 1,nz, 1, + pipeline_rank, n_pipeline, + x, y, z, n_voxel ); + +# define LOAD_STENCIL() \ + pi = &fi(x, y, z ); \ + pf0 = &f(x, y, z ); \ + pfx = &f(x+1,y, z ); \ + pfy = &f(x, y+1,z ); \ + pfz = &f(x, y, z+1); \ + pfyz = &f(x, y+1,z+1); \ + pfzx = &f(x+1,y, z+1); \ + pfxy = &f(x+1,y+1,z ) + + LOAD_STENCIL(); + + for( ; n_voxel; n_voxel-- ) + { + // ex interpolation coefficients + w0 = toggle_bits( sgn_1_2, v4float( pf0->ex) ); // [ w0 -w0 -w0 w0 ] + w1 = v4float( pfy->ex); // [ w1 w1 w1 w1 ] + w2 = toggle_bits( sgn_1_2, v4float( pfz->ex) ); // [ w2 -w2 -w2 w2 ] + w3 = v4float(pfyz->ex); // [ w3 w3 w3 w3 ] + + store_4x1( fourth*( ( w3 + w0 ) + toggle_bits( sgn_2_3, w1 + w2 ) ), + &pi->ex ); + + // ey interpolation coefficients + w0 = toggle_bits( sgn_1_2, v4float( pf0->ey) ); // [ w0 -w0 -w0 w0 ] + w1 = v4float( pfz->ey); // [ w1 w1 w1 w1 ] + w2 = toggle_bits( sgn_1_2, v4float( pfx->ey) ); // [ w2 -w2 -w2 w2 ] + w3 = v4float(pfzx->ey); // [ w3 w3 w3 w3 ] + + store_4x1( fourth*( ( w3 + w0 ) + toggle_bits( sgn_2_3, w1 + w2 ) ), + &pi->ey ); + + // ez interpolation coefficients + w0 = toggle_bits( sgn_1_2, v4float( pf0->ez) ); // [ w0 -w0 -w0 w0 ] + w1 = v4float( pfx->ez); // [ w1 w1 w1 w1 ] + w2 = toggle_bits( sgn_1_2, v4float( pfy->ez) ); // [ w2 -w2 -w2 w2 ] + w3 = v4float(pfxy->ez); // [ w3 w3 w3 w3 ] + + store_4x1( fourth*( ( w3 + w0 ) + toggle_bits( sgn_2_3, w1 + w2 ) ), + &pi->ez ); + + // bx and by interpolation coefficients + w0 = toggle_bits( sgn_1_3, + merge( sel_0_1, + v4float(pf0->cbx), + v4float(pf0->cby) ) ); // [ w0x -w0x w0y -w0y ] + w1 = merge( sel_0_1, + v4float(pfx->cbx), + v4float(pfy->cby) ); // [ w1x w1x w1y w1y ] + + store_4x1( half*( w1 + w0 ), &pi->cbx ); + + // bz interpolation coefficients + w0 = toggle_bits( sgn_1_3, v4float(pf0->cbz) ); // [ w0 -w0 d/c d/c ] + w1 = v4float(pfz->cbz); // [ w1 -w1 d/c d/c ] + + store_4x1( half*( w1 + w0 ), &pi->cbz ); // Note: Padding after bz coeff! 
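    // How the sign toggles reproduce the scalar formulas: sgn_1_2 negates
    // lanes 1 and 2 of w0 and w2, so (w3 + w0) = [ w3+w0, w3-w0, w3-w0, w3+w0 ]
    // and (w1 + w2) = [ w1+w2, w1-w2, w1-w2, w1+w2 ]; toggle_bits( sgn_2_3, ... )
    // then negates lanes 2 and 3 of the second term, so each field's sum is
    // [ (w3+w0)+(w1+w2), (w3-w0)+(w1-w2), (w3-w0)-(w1-w2), (w3+w0)-(w1+w2) ],
    // i.e. exactly the { e, de/d1, de/d2, d2e/d1d2 } quadruple computed by the
    // scalar pipeline, packed into one store.  The cbx store likewise packs
    // { cbx, dcbxdx, cby, dcbydy } via sgn_1_3 and the sel_0_1 merge, and the
    // cbz store writes { cbz, dcbzdz } plus two don't-care lanes that land in
    // the interpolator's trailing padding.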
+ + pi++; pf0++; pfx++; pfy++; pfz++; pfyz++; pfzx++; pfxy++; + + x++; + if ( x > nx ) + { + x=1, y++; + if ( y > ny ) y=1, z++; + LOAD_STENCIL(); + } + } + +# undef LOAD_STENCIL +} + +#endif + +void +load_interpolator_array_pipeline( interpolator_array_t * RESTRICT ia, + const field_array_t * RESTRICT fa ) +{ + DECLARE_ALIGNED_ARRAY( load_interpolator_pipeline_args_t, 128, args, 1 ); + + if ( !ia || + !fa || + ia->g != fa->g ) + { + ERROR( ( "Bad args" ) ); + } + +# if 0 // Original non-pipelined version + for( z=1; z<=nz; z++ ) { + for( y=1; y<=ny; y++ ) { + + pi = &fi(1,y,z); + pf0 = &f(1,y,z); + pfx = &f(2,y,z); + pfy = &f(1,y+1,z); + pfz = &f(1,y,z+1); + pfyz = &f(1,y+1,z+1); + pfzx = &f(2,y,z+1); + pfxy = &f(2,y+1,z); + + for( x=1; x<=nx; x++ ) { + + // ex interpolation coefficients + w0 = pf0->ex; + w1 = pfy->ex; + w2 = pfz->ex; + w3 = pfyz->ex; + pi->ex = 0.25*( w0 + w1 + w2 + w3 ); + pi->dexdy = 0.25*( -w0 + w1 - w2 + w3 ); + pi->dexdz = 0.25*( -w0 - w1 + w2 + w3 ); + pi->d2exdydz = 0.25*( w0 - w1 - w2 + w3 ); + + // ey interpolation coefficients + w0 = pf0->ey; + w1 = pfz->ey; + w2 = pfx->ey; + w3 = pfzx->ey; + pi->ey = 0.25*( w0 + w1 + w2 + w3 ); + pi->deydz = 0.25*( -w0 + w1 - w2 + w3 ); + pi->deydx = 0.25*( -w0 - w1 + w2 + w3 ); + pi->d2eydzdx = 0.25*( w0 - w1 - w2 + w3 ); + + // ez interpolation coefficients + w0 = pf0->ez; + w1 = pfx->ez; + w2 = pfy->ez; + w3 = pfxy->ez; + pi->ez = 0.25*( w0 + w1 + w2 + w3 ); + pi->dezdx = 0.25*( -w0 + w1 - w2 + w3 ); + pi->dezdy = 0.25*( -w0 - w1 + w2 + w3 ); + pi->d2ezdxdy = 0.25*( w0 - w1 - w2 + w3 ); + + // bx interpolation coefficients + w0 = pf0->cbx; + w1 = pfx->cbx; + pi->cbx = 0.5*( w0 + w1 ); + pi->dcbxdx = 0.5*( -w0 + w1 ); + + // by interpolation coefficients + w0 = pf0->cby; + w1 = pfy->cby; + pi->cby = 0.5*( w0 + w1 ); + pi->dcbydy = 0.5*( -w0 + w1 ); + + // bz interpolation coefficients + w0 = pf0->cbz; + w1 = pfz->cbz; + pi->cbz = 0.5*( w0 + w1 ); + pi->dcbzdz = 0.5*( -w0 + w1 ); + + pi++; pf0++; pfx++; pfy++; pfz++; pfyz++; pfzx++; pfxy++; + } + } + } +# endif + + args->fi = ia->i; + args->f = fa->f; + args->nb = ia->g->neighbor; + args->nx = ia->g->nx; + args->ny = ia->g->ny; + args->nz = ia->g->nz; + + EXEC_PIPELINES( load_interpolator, args, 0 ); + + WAIT_PIPELINES(); +} diff --git a/src/sf_interface/pipeline/reduce_accumulators_pipeline.cc b/src/sf_interface/pipeline/reduce_accumulators_pipeline.cc new file mode 100644 index 00000000..50c3c44f --- /dev/null +++ b/src/sf_interface/pipeline/reduce_accumulators_pipeline.cc @@ -0,0 +1,243 @@ +#define IN_sf_interface + +#include "sf_interface_pipeline.h" + +#include "../sf_interface_private.h" + +#include "../../util/pipelines/pipelines_exec.h" + +// FIXME: N_ARRAY>1 ALWAYS BUT THIS ISN'T STRICTLY NECESSARY BECAUSE +// HOST IS THREAD FOR THE SERIAL AND THREADED DISPATCHERS. 
SHOULD +// PROBABLY CHANGE N_ARRAY TO +// max({serial,thread}.n_pipeline,spu.n_pipeline+1) + +void +reduce_accumulators_pipeline_scalar( accumulators_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + int i; + int i1; + int si = sizeof(accumulator_t) / sizeof(float); + int r; + int nr = args->n_array - 1; + int sr = si*args->s_array; + int j, k; + + DISTRIBUTE( args->n, accumulators_n_block, + pipeline_rank, n_pipeline, i, i1 ); + + i1 += i; + + // a is broken into restricted rw and ro parts to allow the compiler + // to do more aggresive optimizations + + /**/ float * RESTRICT ALIGNED(16) a = args->a->jx; + const float * RESTRICT ALIGNED(16) b = a + sr; + +# if defined(V4_ACCELERATION) + + using namespace v4; + + v4float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9; + +# define LOOP(OP) \ + for( ; i < i1; i++ ) \ + { \ + k = i*si; \ + OP(k); OP(k + 4); OP(k + 8); \ + } + +# define A(k) load_4x1( &a[k], v0 ); +# define B(k,r) load_4x1( &b[k+(r-1)*sr], v##r ); +# define C(k,v) store_4x1( v, &a[k] ) +# define O1(k)A(k )B(k,1) \ + C(k, v0+v1) +# define O2(k)A(k )B(k,1)B(k,2) \ + C(k, (v0+v1)+ v2) +# define O3(k)A(k )B(k,1)B(k,2)B(k,3) \ + C(k, (v0+v1)+(v2+v3)) +# define O4(k)A(k )B(k,1)B(k,2)B(k,3)B(k,4) \ + C(k, ((v0+v1)+(v2+v3))+ v4) +# define O5(k)A(k )B(k,1)B(k,2)B(k,3)B(k,4)B(k,5) \ + C(k, ((v0+v1)+(v2+v3))+ (v4+v5)) +# define O6(k)A(k )B(k,1)B(k,2)B(k,3)B(k,4)B(k,5)B(k,6) \ + C(k, ((v0+v1)+(v2+v3))+((v4+v5)+ v6)) +# define O7(k)A(k )B(k,1)B(k,2)B(k,3)B(k,4)B(k,5)B(k,6)B(k,7) \ + C(k, ((v0+v1)+(v2+v3))+((v4+v5)+(v6+v7))) +# define O8(k)A(k )B(k,1)B(k,2)B(k,3)B(k,4)B(k,5)B(k,6)B(k,7)B(k,8) \ + C(k,(((v0+v1)+(v2+v3))+((v4+v5)+(v6+v7)))+ v8) +# define O9(k)A(k )B(k,1)B(k,2)B(k,3)B(k,4)B(k,5)B(k,6)B(k,7)B(k,8)B(k,9) \ + C(k,(((v0+v1)+(v2+v3))+((v4+v5)+(v6+v7)))+ (v8+v9)) + +# else + + float f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11; + +# define LOOP(OP) \ + for( ; i < i1; i++ ) \ + { \ + k = i*si; \ + OP(k ); OP(k + 1); OP(k + 2); OP(k + 3); \ + OP(k + 4); OP(k + 5); OP(k + 6); OP(k + 7); \ + OP(k + 8); OP(k + 9); OP(k +10); OP(k +11); \ + } + +# define O1(k) a[k] = a[k ] + b[k ] +# define O2(k) a[k] = (a[k ] + b[k ]) + b[k+ sr] +# define O3(k) a[k] = (a[k ] + b[k ]) + (b[k+ sr] + b[k+2*sr]) +# define O4(k) a[k] = ((a[k ] + b[k ]) + (b[k+ sr] + b[k+2*sr])) + \ + /**/ b[k+3*sr] +# define O5(k) a[k] = ((a[k ] + b[k ]) + (b[k+ sr] + b[k+2*sr])) + \ + /**/ (b[k+3*sr] + b[k+4*sr]) +# define O6(k) a[k] = ((a[k ] + b[k ]) + (b[k+ sr] + b[k+2*sr])) + \ + /**/ ((b[k+3*sr] + b[k+4*sr]) + b[k+5*sr] ) +# define O7(k) a[k] = ((a[k ] + b[k ]) + (b[k+ sr] + b[k+2*sr])) + \ + /**/ ((b[k+3*sr] + b[k+4*sr]) + (b[k+5*sr] + b[k+6*sr])) +# define O8(k) a[k] = (((a[k ] + b[k ]) + (b[k+ sr] + b[k+2*sr])) + \ + /**/ ((b[k+3*sr] + b[k+4*sr]) + (b[k+5*sr] + b[k+6*sr]))) + \ + /**/ b[k+7*sr] +# define O9(k) a[k] = (((a[k ] + b[k ]) + (b[k+ sr] + b[k+2*sr])) + \ + /**/ ((b[k+3*sr] + b[k+4*sr]) + (b[k+5*sr] + b[k+6*sr]))) + \ + /**/ (b[k+7*sr] + b[k+8*sr]) + +# endif + + switch( nr ) { + case 0: break; + case 1: LOOP(O1); break; + case 2: LOOP(O2); break; + case 3: LOOP(O3); break; + case 4: LOOP(O4); break; + case 5: LOOP(O5); break; + case 6: LOOP(O6); break; + case 7: LOOP(O7); break; + case 8: LOOP(O8); break; + case 9: LOOP(O9); break; + default: +# if defined(V4_ACCELERATION) + for( ; i < i1; i++ ) + { + j = i*si; + + load_4x1( &a[j+0], v0 ); + load_4x1( &a[j+4], v1 ); + load_4x1( &a[j+8], v2 ); + + for( r = 0; r < nr; r++ ) + { + k = j + r * sr; + + load_4x1( &b[k+0], v3 ); + load_4x1( &b[k+4], 
v4 ); + load_4x1( &b[k+8], v5 ); + + v0 += v3; + v1 += v4; + v2 += v5; + } + + store_4x1( v0, &a[j+0] ); + store_4x1( v1, &a[j+4] ); + store_4x1( v2, &a[j+8] ); + } +# else + for( ; i < i1; i++ ) + { + j = i * si; + + f0 = a[j+ 0]; + f1 = a[j+ 1]; + f2 = a[j+ 2]; + f3 = a[j+ 3]; + f4 = a[j+ 4]; + f5 = a[j+ 5]; + f6 = a[j+ 6]; + f7 = a[j+ 7]; + f8 = a[j+ 8]; + f9 = a[j+ 9]; + f10 = a[j+10]; + f11 = a[j+11]; + + for( r = 0; r < nr; r++ ) + { + k = j + r * sr; + + f0 += b[k+ 0]; + f1 += b[k+ 1]; + f2 += b[k+ 2]; + f3 += b[k+ 3]; + f4 += b[k+ 4]; + f5 += b[k+ 5]; + f6 += b[k+ 6]; + f7 += b[k+ 7]; + f8 += b[k+ 8]; + f9 += b[k+ 9]; + f10 += b[k+10]; + f11 += b[k+11]; + } + + a[j+ 0] = f0; + a[j+ 1] = f1; + a[j+ 2] = f2; + a[j+ 3] = f3; + a[j+ 4] = f4; + a[j+ 5] = f5; + a[j+ 6] = f6; + a[j+ 7] = f7; + a[j+ 8] = f8; + a[j+ 9] = f9; + a[j+10] = f10; + a[j+11] = f11; + } +# endif + break; + } + +# undef O9 +# undef O8 +# undef O7 +# undef O6 +# undef O5 +# undef O4 +# undef O3 +# undef O2 +# undef O1 +# undef C +# undef B +# undef A +# undef LOOP + +} + +#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) + +#error "The regular pipeline is already V4 accelerated." + +#endif + +#define VOX(x,y,z) VOXEL( x, y, z, aa->g->nx, aa->g->ny, aa->g->nz ) + +void +reduce_accumulator_array_pipeline( accumulator_array_t * RESTRICT aa ) +{ + DECLARE_ALIGNED_ARRAY( accumulators_pipeline_args_t, 128, args, 1 ); + + int i0; + + if ( !aa ) + { + ERROR( ( "Bad args" ) ); + } + + i0 = ( VOX(1,1,1) / 2 ) * 2; // Round i0 down to even for 128B align on Cell + + args->a = aa->a + i0; + args->n = ( ( ( VOX( aa->g->nx, aa->g->ny, aa->g->nz ) - i0 + 1 ) + 1 ) / 2 ) * 2; + args->n_array = aa->n_pipeline + 1; + args->s_array = aa->stride; + + EXEC_PIPELINES( reduce_accumulators, args, 0 ); + + WAIT_PIPELINES(); +} diff --git a/src/sf_interface/pipeline/sf_interface_pipeline.h b/src/sf_interface/pipeline/sf_interface_pipeline.h new file mode 100644 index 00000000..f43ca19c --- /dev/null +++ b/src/sf_interface/pipeline/sf_interface_pipeline.h @@ -0,0 +1,91 @@ +#ifndef _sf_interface_pipeline_h_ +#define _sf_interface_pipeline_h_ + +#ifndef IN_sf_interface +#error "Do not include sf_interface_pipeline.h; include sf_interface.h" +#endif + +#include "../sf_interface.h" + +/////////////////////////////////////////////////////////////////////////////// +// load_interpolator_pipeline interface + +typedef struct load_interpolator_pipeline_args +{ + MEM_PTR( interpolator_t, 128 ) fi; + MEM_PTR( const field_t, 128 ) f; + MEM_PTR( const int64_t, 128 ) nb; + int nx; + int ny; + int nz; + + PAD_STRUCT( 3*SIZEOF_MEM_PTR + 3*sizeof(int) ) + +} load_interpolator_pipeline_args_t; + +void +load_interpolator_pipeline_scalar( load_interpolator_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +load_interpolator_pipeline_v4( load_interpolator_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +/////////////////////////////////////////////////////////////////////////////// +// clear_accumulators_pipeline interface + +// Pipelines are be assigned accumulator blocks in multiples of 256 +// (16KB) which is particularly convenient on Cell. The pipeline +// dispatcher will handle any stragglers. 
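The enum that follows fixes the block size at 256 accumulators, matching the comment above. For orientation, a minimal sketch of this style of fixed-block work assignment is given here; it is illustrative only and does not reproduce the actual DISTRIBUTE macro in util/pipelines, which defines the real split (including how the straggler block is handed to the dispatcher).

/* Illustrative sketch only; the real assignment is done by DISTRIBUTE. */
static void
block_range( int n_items, int block, int rank, int n_ranks,
             int * first, int * count )
{
  int n_blocks  = n_items / block;              /* whole blocks only */
  int per_rank  = n_blocks / n_ranks;
  int remainder = n_blocks % n_ranks;

  int b0 = rank*per_rank + ( rank < remainder ? rank : remainder );
  int nb = per_rank      + ( rank < remainder ? 1    : 0         );

  *first = b0*block;
  *count = nb*block;   /* the n_items % block stragglers are left over */
}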
+ +enum { accumulators_n_block = 256 }; + +typedef struct accumulators_pipeline_args +{ + MEM_PTR( accumulator_t, 128) a; // First accumulator to reduce + int n; // Number of accumulators to reduce + int n_array; // Number of accumulator arrays + int s_array; // Stride between each array + + PAD_STRUCT( SIZEOF_MEM_PTR + 3*sizeof(int) ) + +} accumulators_pipeline_args_t; + +void +clear_accumulators_pipeline_scalar( accumulators_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +/////////////////////////////////////////////////////////////////////////////// +// reduce_accumulators_pipeline interface + +void +reduce_accumulators_pipeline_scalar( accumulators_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +/////////////////////////////////////////////////////////////////////////////// + +typedef struct unload_accumulator_pipeline_args +{ + MEM_PTR( field_t, 128 ) f; // Reduce accumulators to this + MEM_PTR( const accumulator_t, 128 ) a; // Accumulator array to reduce + int nx; // Local domain x-resolution + int ny; // Local domain y-resolution + int nz; // Local domain z-resolution + float cx; // x-axis coupling constant + float cy; // y-axis coupling constant + float cz; // z-axis coupling constant + + PAD_STRUCT( 2*SIZEOF_MEM_PTR + 3*sizeof(int) + 3*sizeof(float) ) + +} unload_accumulator_pipeline_args_t; + +void +unload_accumulator_pipeline_scalar( unload_accumulator_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +#endif // _sf_interface_pipeline_h_ diff --git a/src/sf_interface/pipeline/unload_accumulator_pipeline.cc b/src/sf_interface/pipeline/unload_accumulator_pipeline.cc new file mode 100644 index 00000000..df254f99 --- /dev/null +++ b/src/sf_interface/pipeline/unload_accumulator_pipeline.cc @@ -0,0 +1,137 @@ +// FIXME: This function assumes that the accumlator ghost values are +// zero. Further, assumes that the ghost values of jfx, jfy, jfz are +// meaningless. This might be changed to a more robust but slightly +// slower implementation in the near future. 
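One note on the scaling this file applies when unloading: the wrapper at the bottom sets cx = 0.25*rdy*rdz/dt and its cyclic permutations. Read together with the comment elsewhere in this change that an accumulator entry holds four times the physical charge that crossed a current quadrant during the step, the 0.25 removes that factor of four while rdy*rdz/dt converts charge per step into a current density through the x-face. A hedged sketch of that reading:

/* Hedged reading of the unload scaling; dq is one accumulated quadrant value,
   assumed (per the accumulator comments) to be 4x the physical charge that
   crossed the quadrant this step.  dy, dz, dt are the cell spacings and step. */
static float
jfx_contribution( float dq, float dy, float dz, float dt )
{
  return 0.25f*dq/( dy*dz*dt );   /* charge / (area * time) = current density */
}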
+ +#define IN_sf_interface + +#include "sf_interface_pipeline.h" + +#include "../sf_interface_private.h" + +#include "../../util/pipelines/pipelines_exec.h" + +#define f(x,y,z) f[ VOXEL( x, y, z, nx, ny, nz ) ] +#define a(x,y,z) a[ VOXEL( x, y, z, nx, ny, nz ) ] + +void +unload_accumulator_pipeline_scalar( unload_accumulator_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + field_t * ALIGNED(128) f = args->f; + const accumulator_t * ALIGNED(128) a = args->a; + + const accumulator_t * ALIGNED(16) a0; + const accumulator_t * ALIGNED(16) ax, * ALIGNED(16) ay, * ALIGNED(16) az; + const accumulator_t * ALIGNED(16) ayz, * ALIGNED(16) azx, * ALIGNED(16) axy; + + field_t * ALIGNED(16) f0; + + int x, y, z, n_voxel; + + const int nx = args->nx; + const int ny = args->ny; + const int nz = args->nz; + + const float cx = args->cx; + const float cy = args->cy; + const float cz = args->cz; + + // Process the voxels assigned to this pipeline + + if ( pipeline_rank == n_pipeline ) + { + return; // No need for straggler cleanup + } + + DISTRIBUTE_VOXELS( 1, nx+1, 1, ny+1, 1, nz+1, 1, + pipeline_rank, n_pipeline, x, y, z, n_voxel ); + +# define LOAD_STENCIL() \ + f0 = &f(x, y, z ); \ + a0 = &a(x, y, z ); \ + ax = &a(x-1,y, z ); ay = &a(x, y-1,z ); az = &a(x, y, z-1); \ + ayz = &a(x, y-1,z-1); azx = &a(x-1,y, z-1); axy = &a(x-1,y-1,z ) + + LOAD_STENCIL(); + + for( ; n_voxel; n_voxel-- ) + { + f0->jfx += cx*( a0->jx[0] + ay->jx[1] + az->jx[2] + ayz->jx[3] ); + f0->jfy += cy*( a0->jy[0] + az->jy[1] + ax->jy[2] + azx->jy[3] ); + f0->jfz += cz*( a0->jz[0] + ax->jz[1] + ay->jz[2] + axy->jz[3] ); + + f0++; a0++; ax++; ay++; az++; ayz++; azx++; axy++; + + x++; + if ( x > nx + 1 ) + { + x=1, y++; + if ( y > ny + 1 ) y=1, z++; + LOAD_STENCIL(); + } + } + +# undef LOAD_STENCIL + +} + +#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) + +#error "V4 version not hooked up yet." + +#endif + +void +unload_accumulator_array_pipeline( field_array_t * RESTRICT fa, + const accumulator_array_t * RESTRICT aa ) +{ + unload_accumulator_pipeline_args_t args[1]; + + if ( !fa || + !aa || + fa->g != aa->g ) + { + ERROR( ( "Bad args" ) ); + } + +# if 0 // Original non-pipelined version + + for( z=1; z<=nz+1; z++ ) { + for( y=1; y<=ny+1; y++ ) { + + x = 1; + f0 = &f(x, y, z ); + a0 = &a(x, y, z ); + ax = &a(x-1,y, z ); ay = &a(x, y-1,z ); az = &a(x, y, z-1); + ayz = &a(x, y-1,z-1); azx = &a(x-1,y, z-1); axy = &a(x-1,y-1,z ); + + for( x=1; x<=nx+1; x++ ) { + + f0->jfx += cx*( a0->jx[0] + ay->jx[1] + az->jx[2] + ayz->jx[3] ); + f0->jfy += cy*( a0->jy[0] + az->jy[1] + ax->jy[2] + azx->jy[3] ); + f0->jfz += cz*( a0->jz[0] + ax->jz[1] + ay->jz[2] + axy->jz[3] ); + + f0++; a0++; ax++; ay++; az++; ayz++; azx++; axy++; + + } + } + } + +# endif + + args->f = fa->f; + args->a = aa->a; + args->nx = fa->g->nx; + args->ny = fa->g->ny; + args->nz = fa->g->nz; + + args->cx = 0.25 * fa->g->rdy * fa->g->rdz / fa->g->dt; + args->cy = 0.25 * fa->g->rdz * fa->g->rdx / fa->g->dt; + args->cz = 0.25 * fa->g->rdx * fa->g->rdy / fa->g->dt; + + EXEC_PIPELINES( unload_accumulator, args, 0 ); + + WAIT_PIPELINES(); +} diff --git a/src/sf_interface/reduce_accumulators.cc b/src/sf_interface/reduce_accumulators.cc index f404b501..5fb0e378 100644 --- a/src/sf_interface/reduce_accumulators.cc +++ b/src/sf_interface/reduce_accumulators.cc @@ -1,173 +1,20 @@ #define IN_sf_interface + #include "sf_interface_private.h" -// FIXME: N_ARRAY>1 ALWAYS BUT THIS ISN'T STRICTLY NECESSARY BECAUSE -// HOST IS THREAD FOR THE SERIAL AND THREADED DISPATCHERS. 
SHOULD -// PROBABLY CHANGE N_ARRAY TO -// max({serial,thread}.n_pipeline,spu.n_pipeline+1) +//----------------------------------------------------------------------------// +// Top level function to select and call the proper reduce_accumulator_array +// function. +//----------------------------------------------------------------------------// void -reduce_accumulators_pipeline( accumulators_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - int i, i1, si = sizeof(accumulator_t) / sizeof(float); - int r, nr = args->n_array-1, sr = si*args->s_array; - int j, k; - - DISTRIBUTE( args->n, accumulators_n_block, - pipeline_rank, n_pipeline, i, i1 ); i1 += i; - - // a is broken into restricted rw and ro parts to allow the compiler - // to do more aggresive optimizations - - /**/ float * RESTRICT ALIGNED(16) a = args->a->jx; - const float * RESTRICT ALIGNED(16) b = a + sr; - -# if defined(V4_ACCELERATION) - - using namespace v4; - - v4float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9; - -# define LOOP(OP) \ - for( ; ig->nx,aa->g->ny,aa->g->nz) - -void -reduce_accumulator_array( accumulator_array_t * RESTRICT aa ) { - DECLARE_ALIGNED_ARRAY( accumulators_pipeline_args_t, 128, args, 1 ); - int i0; - - if( !aa ) ERROR(( "Bad args" )); - - i0 = (VOX(1,1,1)/2)*2; // Round i0 down to even for 128B align on Cell - - args->a = aa->a + i0; - args->n = (((VOX(aa->g->nx,aa->g->ny,aa->g->nz) - i0 + 1 )+1)/2)*2; - args->n_array = aa->n_pipeline + 1; - args->s_array = aa->stride; - - EXEC_PIPELINES( reduce_accumulators, args, 0 ); - WAIT_PIPELINES(); + // Conditionally execute this when more abstractions are available. + reduce_accumulator_array_pipeline( aa ); } diff --git a/src/sf_interface/sf_interface.h b/src/sf_interface/sf_interface.h index 1a9ee9f4..6dc86883 100644 --- a/src/sf_interface/sf_interface.h +++ b/src/sf_interface/sf_interface.h @@ -13,6 +13,45 @@ #include "../field_advance/field_advance.h" // FIXME: SHOULD INCLUDE SPECIES_ADVANCE TOO ONCE READY +//----------------------------------------------------------------------------// +// We want to conditionally define pad sizes for various structs so they will +// be properly aligned for performance and also for various intrinsics calls +// because many intrinsics will fail if not operating on properly aligned +// data. Check for the most restrictive alignment need first and make sure it +// has priority in being satisfied. +//----------------------------------------------------------------------------// + +//----------------------------------------------------------------------------// +// 64-byte align + +#if defined(USE_V16_PORTABLE) || \ + defined(USE_V16_AVX512) + +#define PAD_SIZE_INTERPOLATOR 14 +#define PAD_SIZE_ACCUMULATOR 4 +#define PAD_SIZE_HYDRO 2 + +//----------------------------------------------------------------------------// +// 32-byte align + +#elif defined(USE_V8_PORTABLE) || \ + defined(USE_V8_AVX) || \ + defined(USE_V8_AVX2) + +#define PAD_SIZE_INTERPOLATOR 6 +#define PAD_SIZE_ACCUMULATOR 4 +#define PAD_SIZE_HYDRO 2 + +//----------------------------------------------------------------------------// +// 16-byte align + +#else + +#define PAD_SIZE_INTERPOLATOR 2 +#define PAD_SIZE_HYDRO 2 + +#endif + /*****************************************************************************/ // Interpolator arrays shall be a (nx+2) x (ny+2) x (nz+2) allocation @@ -20,17 +59,22 @@ // for voxels on the surface of the local domain (for example // fi(0,:,:) or fi(nx+1,:,:)) are not used. 
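The pad sizes above make the structs defined in the rest of this header whole multiples of the active SIMD alignment: interpolator_t carries 18 payload floats, so pads of 2, 6 and 14 give 80-, 96- and 128-byte structs for the 16-, 32- and 64-byte cases; accumulator_t carries 12 floats, so the extra pad of 4 in the V8/V16 cases rounds it to 64 bytes (the default branch defines no PAD_SIZE_ACCUMULATOR and the struct stays at 48 bytes, already a multiple of 16); hydro_t carries 14 floats and a pad of 2 gives 64 bytes in every case. A compile-time check along these lines would make the invariant explicit; it is only a sketch, not part of this change:

/* Sketch: ALIGN_BYTES stands for whichever of 16, 32 or 64 is in effect. */
#define ALIGN_BYTES 16

_Static_assert( sizeof(interpolator_t) % ALIGN_BYTES == 0,
                "interpolator_t is not padded to the SIMD alignment" );
_Static_assert( sizeof(accumulator_t)  % ALIGN_BYTES == 0,
                "accumulator_t is not padded to the SIMD alignment" );
_Static_assert( sizeof(hydro_t)        % ALIGN_BYTES == 0,
                "hydro_t is not padded to the SIMD alignment" );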
-typedef struct interpolator { +typedef struct interpolator +{ float ex, dexdy, dexdz, d2exdydz; float ey, deydz, deydx, d2eydzdx; float ez, dezdx, dezdy, d2ezdxdy; float cbx, dcbxdx; float cby, dcbydy; float cbz, dcbzdz; - float _pad[2]; // 16-byte align + float _pad1[PAD_SIZE_INTERPOLATOR]; + // float _pad1[2]; // 16-byte align + // float _pad2[4]; // More padding to get 32-byte align, make conditional + // float _pad3[8]; // More padding to get 64-byte align, make conditional } interpolator_t; -typedef struct interpolator_array { +typedef struct interpolator_array +{ interpolator_t * ALIGNED(128) i; grid_t * g; } interpolator_array_t; @@ -65,16 +109,21 @@ END_C_DECLS // allocation indexed FORTRAN style. That is, the accumulator array // is a 4d array. a(:,:,:,0) is the accumulator used by the host // processor. a(:,:,:,1:n_pipeline) are the accumulators used by -// pipelines during operations. Like the interpolator, accumualtors +// pipelines during operations. Like the interpolator, accumulators // on the surface of the local domain are not used. -typedef struct accumulator { +typedef struct accumulator +{ float jx[4]; // jx0@(0,-1,-1),jx1@(0,1,-1),jx2@(0,-1,1),jx3@(0,1,1) float jy[4]; // jy0@(-1,0,-1),jy1@(-1,0,1),jy2@(1,0,-1),jy3@(1,0,1) float jz[4]; // jz0@(-1,-1,0),jz1@(1,-1,0),jz2@(-1,1,0),jz3@(1,1,0) + #if defined PAD_SIZE_ACCUMULATOR + float pad2[PAD_SIZE_ACCUMULATOR]; // Padding for 32 and 64-byte align + #endif } accumulator_t; -typedef struct accumulator_array { +typedef struct accumulator_array +{ accumulator_t * ALIGNED(128) a; int n_pipeline; // Number of pipelines supported by this accumulator int stride; // Stride be each pipeline's accumulator array @@ -133,15 +182,17 @@ END_C_DECLS // the surface of the local domain (for example h(0,:,:) or // h(nx+1,:,:)) are not used. -typedef struct hydro { +typedef struct hydro +{ float jx, jy, jz, rho; // Current and charge density => , float px, py, pz, ke; // Momentum and K.E. density => , float txx, tyy, tzz; // Stress diagonal => , i==j float tyz, tzx, txy; // Stress off-diagonal => , i!=j - float _pad[2]; // 16-byte align + float _pad[PAD_SIZE_HYDRO]; // 16, 32 and 64-byte align } hydro_t; -typedef struct hydro_array { +typedef struct hydro_array +{ hydro_t * ALIGNED(128) h; grid_t * g; } hydro_array_t; diff --git a/src/sf_interface/sf_interface_private.h b/src/sf_interface/sf_interface_private.h index 0ae72791..6c36a21b 100644 --- a/src/sf_interface/sf_interface_private.h +++ b/src/sf_interface/sf_interface_private.h @@ -10,63 +10,23 @@ /////////////////////////////////////////////////////////////////////////////// // load_interpolator_pipeline interface -typedef struct load_interpolator_pipeline_args { - - MEM_PTR( interpolator_t, 128 ) fi; - MEM_PTR( const field_t, 128 ) f; - MEM_PTR( const int64_t, 128 ) nb; - int nx; - int ny; - int nz; - - PAD_STRUCT( 3*SIZEOF_MEM_PTR + 3*sizeof(int) ) - -} load_interpolator_pipeline_args_t; - -PROTOTYPE_PIPELINE( load_interpolator, load_interpolator_pipeline_args_t ); +void +load_interpolator_array_pipeline( interpolator_array_t * RESTRICT ia, + const field_array_t * RESTRICT fa ); /////////////////////////////////////////////////////////////////////////////// // clear_accumulators_pipeline interface -// Pipelines are be assigned accumulator blocks in multiples of 256 -// (16KB) which is particularly convenient on Cell. The pipeline -// dispatcher will handle any stragglers. 
- -enum { accumulators_n_block = 256 }; - -typedef struct accumulators_pipeline_args { - - MEM_PTR( accumulator_t, 128) a; // First accumulator to reduce - int n; // Number of accumulators to reduce - int n_array; // Number of accumulator arrays - int s_array; // Stride between each array +void +clear_accumulator_array_pipeline( accumulator_array_t * RESTRICT aa ); - PAD_STRUCT( SIZEOF_MEM_PTR + 3*sizeof(int) ) - -} accumulators_pipeline_args_t; - -PROTOTYPE_PIPELINE( clear_accumulators, accumulators_pipeline_args_t ); -PROTOTYPE_PIPELINE( reduce_accumulators, accumulators_pipeline_args_t ); +void +reduce_accumulator_array_pipeline( accumulator_array_t * RESTRICT aa ); /////////////////////////////////////////////////////////////////////////////// -typedef struct unload_accumulator_pipeline_args { - - MEM_PTR( field_t, 128 ) f; // Reduce accumulators to this - MEM_PTR( const accumulator_t, 128 ) a; // Accumulator array to reduce - int nx; // Local domain x-resolution - int ny; // Local domain y-resolution - int nz; // Local domain z-resolution - float cx; // x-axis coupling constant - float cy; // y-axis coupling constant - float cz; // z-axis coupling constant - - PAD_STRUCT( 2*SIZEOF_MEM_PTR + 3*sizeof(int) + 3*sizeof(float) ) - -} unload_accumulator_pipeline_args_t; - -PROTOTYPE_PIPELINE( unload_accumulator, unload_accumulator_pipeline_args_t ); - -#undef FOR_SPU +void +unload_accumulator_array_pipeline( field_array_t * RESTRICT fa, + const accumulator_array_t * RESTRICT aa ); #endif // _sf_interface_private_h_ diff --git a/src/sf_interface/unload_accumulator.cc b/src/sf_interface/unload_accumulator.cc index 8eb1513d..7456309d 100644 --- a/src/sf_interface/unload_accumulator.cc +++ b/src/sf_interface/unload_accumulator.cc @@ -1,117 +1,23 @@ -// FIXME: This function assumes that the accumlator ghost values are -// zero. Further, assumes that the ghost values of jfx, jfy, jfz are -// meaningless. This might be changed to a more robust but slightly -// slower implementation in the near future. 
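From here the file follows the same pattern as the rest of this change: the in-file pipeline implementation is deleted and the public unload_accumulator_array() becomes a thin wrapper that validates its arguments and forwards to unload_accumulator_array_pipeline(). The recurring "conditionally execute this when more abstractions are available" comment marks the wrapper as the natural place for a future backend switch; a speculative sketch follows, using a hypothetical USE_HYPOTHETICAL_BACKEND option and backend function that this change does not define:

/* Speculative sketch only; neither USE_HYPOTHETICAL_BACKEND nor any
   non-pipeline backend exists in this change. */
void
unload_accumulator_array( field_array_t * RESTRICT fa,
                          const accumulator_array_t * RESTRICT aa )
{
  if ( !fa || !aa || fa->g != aa->g )
  {
    ERROR( ( "Bad args" ) );
  }

#if defined(USE_HYPOTHETICAL_BACKEND)
  unload_accumulator_array_hypothetical( fa, aa );   /* placeholder backend  */
#else
  unload_accumulator_array_pipeline( fa, aa );       /* today's only backend */
#endif
}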
- #define IN_sf_interface -#include "sf_interface_private.h" - -#define f(x,y,z) f[ VOXEL(x,y,z, nx,ny,nz) ] -#define a(x,y,z) a[ VOXEL(x,y,z, nx,ny,nz) ] - -void -unload_accumulator_pipeline( unload_accumulator_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - field_t * ALIGNED(128) f = args->f; - const accumulator_t * ALIGNED(128) a = args->a; - - const accumulator_t * ALIGNED(16) a0; - const accumulator_t * ALIGNED(16) ax, * ALIGNED(16) ay, * ALIGNED(16) az; - const accumulator_t * ALIGNED(16) ayz, * ALIGNED(16) azx, * ALIGNED(16) axy; - field_t * ALIGNED(16) f0; - int x, y, z, n_voxel; - - const int nx = args->nx; - const int ny = args->ny; - const int nz = args->nz; - - const float cx = args->cx; - const float cy = args->cy; - const float cz = args->cz; - - // Process the voxels assigned to this pipeline - - if( pipeline_rank==n_pipeline ) return; // No need for straggler cleanup - DISTRIBUTE_VOXELS( 1,nx+1, 1,ny+1, 1,nz+1, 1, - pipeline_rank, n_pipeline, x, y, z, n_voxel ); - -# define LOAD_STENCIL() \ - f0 = &f(x, y, z ); \ - a0 = &a(x, y, z ); \ - ax = &a(x-1,y, z ); ay = &a(x, y-1,z ); az = &a(x, y, z-1); \ - ayz = &a(x, y-1,z-1); azx = &a(x-1,y, z-1); axy = &a(x-1,y-1,z ) - - LOAD_STENCIL(); - - for( ; n_voxel; n_voxel-- ) { - - f0->jfx += cx*( a0->jx[0] + ay->jx[1] + az->jx[2] + ayz->jx[3] ); - f0->jfy += cy*( a0->jy[0] + az->jy[1] + ax->jy[2] + azx->jy[3] ); - f0->jfz += cz*( a0->jz[0] + ax->jz[1] + ay->jz[2] + axy->jz[3] ); - - f0++; a0++; ax++; ay++; az++; ayz++; azx++; axy++; - - x++; - if( x>nx+1 ) { - x=1, y++; - if( y>ny+1 ) y=1, z++; - LOAD_STENCIL(); - } - - } -# undef LOAD_STENCIL - -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) - -#error "V4 version not hooked up yet!" +#include "sf_interface_private.h" -#endif +//----------------------------------------------------------------------------// +// Top level function to select and call the proper unload_accumulator_array +// function. +//----------------------------------------------------------------------------// void -unload_accumulator_array( /**/ field_array_t * RESTRICT fa, - const accumulator_array_t * RESTRICT aa ) { - unload_accumulator_pipeline_args_t args[1]; - - if( !fa || !aa || fa->g!=aa->g ) ERROR(( "Bad args" )); - -# if 0 // Original non-pipelined version - - for( z=1; z<=nz+1; z++ ) { - for( y=1; y<=ny+1; y++ ) { - - x = 1; - f0 = &f(x, y, z ); - a0 = &a(x, y, z ); - ax = &a(x-1,y, z ); ay = &a(x, y-1,z ); az = &a(x, y, z-1); - ayz = &a(x, y-1,z-1); azx = &a(x-1,y, z-1); axy = &a(x-1,y-1,z ); - - for( x=1; x<=nx+1; x++ ) { - - f0->jfx += cx*( a0->jx[0] + ay->jx[1] + az->jx[2] + ayz->jx[3] ); - f0->jfy += cy*( a0->jy[0] + az->jy[1] + ax->jy[2] + azx->jy[3] ); - f0->jfz += cz*( a0->jz[0] + ax->jz[1] + ay->jz[2] + axy->jz[3] ); - - f0++; a0++; ax++; ay++; az++; ayz++; azx++; axy++; - - } - } +unload_accumulator_array( field_array_t * RESTRICT fa, + const accumulator_array_t * RESTRICT aa ) +{ + if ( !fa || + !aa || + fa->g != aa->g ) + { + ERROR( ( "Bad args" ) ); } -# endif - - args->f = fa->f; - args->a = aa->a; - args->nx = fa->g->nx; - args->ny = fa->g->ny; - args->nz = fa->g->nz; - args->cx = 0.25*fa->g->rdy*fa->g->rdz/fa->g->dt; - args->cy = 0.25*fa->g->rdz*fa->g->rdx/fa->g->dt; - args->cz = 0.25*fa->g->rdx*fa->g->rdy/fa->g->dt; - - EXEC_PIPELINES( unload_accumulator, args, 0 ); - WAIT_PIPELINES(); + // Conditionally execute this when more abstractions are available. 
+ unload_accumulator_array_pipeline( fa, aa ); } diff --git a/src/species_advance/species_advance.c b/src/species_advance/species_advance.c index 34a3ffee..33ffd435 100644 --- a/src/species_advance/species_advance.c +++ b/src/species_advance/species_advance.c @@ -103,8 +103,8 @@ species_t * species( const char * name, float q, float m, - int max_local_np, - int max_local_nm, + size_t max_local_np, + size_t max_local_nm, int sort_interval, int sort_out_of_place, grid_t * g ) { @@ -144,4 +144,3 @@ species( const char * name, REGISTER_OBJECT( sp, checkpt_species, restore_species, NULL ); return sp; } - diff --git a/src/species_advance/species_advance.h b/src/species_advance/species_advance.h index 6abb63f1..80141adc 100644 --- a/src/species_advance/species_advance.h +++ b/src/species_advance/species_advance.h @@ -14,86 +14,15 @@ #include "../sf_interface/sf_interface.h" -typedef int32_t species_id; // Must be 32-bit wide for particle_injector_t - -// FIXME: Eventually particle_t (definitely) and ther other formats -// (maybe) should be opaque and specific to a particular -// species_advance implementation - -typedef struct particle { - float dx, dy, dz; // Particle position in cell coordinates (on [-1,1]) - int32_t i; // Voxel containing the particle. Note that - /**/ // particled awaiting processing by boundary_p - /**/ // have actually set this to 8*voxel + face where - /**/ // face is the index of the face they interacted - /**/ // with (on 0:5). This limits the local number of - /**/ // voxels to 2^28 but emitter handling already - /**/ // has a stricter limit on this (2^26). - float ux, uy, uz; // Particle normalized momentum - float w; // Particle weight (number of physical particles) -} particle_t; - -// WARNING: FUNCTIONS THAT USE A PARTICLE_MOVER ASSUME THAT EVERYBODY -// WHO USES THAT PARTICLE MOVER WILL HAVE ACCESS TO PARTICLE ARRAY - -typedef struct particle_mover { - float dispx, dispy, dispz; // Displacement of particle - int32_t i; // Index of the particle to move -} particle_mover_t; - -// NOTE: THE LAYOUT OF A PARTICLE_INJECTOR _MUST_ BE COMPATIBLE WITH -// THE CONCATENATION OF A PARTICLE_T AND A PARTICLE_MOVER! - -typedef struct particle_injector { - float dx, dy, dz; // Particle position in cell coords (on [-1,1]) - int32_t i; // Index of cell containing the particle - float ux, uy, uz; // Particle normalized momentum - float w; // Particle weight (number of physical particles) - float dispx, dispy, dispz; // Displacement of particle - species_id sp_id; // Species of particle -} particle_injector_t; - -typedef struct species { - char * name; // Species name - float q; // Species particle charge - float m; // Species particle rest mass - - int np, max_np; // Number and max local particles - particle_t * ALIGNED(128) p; // Array of particles for the species - - int nm, max_nm; // Number and max local movers in use - particle_mover_t * ALIGNED(128) pm; // Particle movers - - int64_t last_sorted; // Step when the particles were last - // sorted. - int sort_interval; // How often to sort the species - int sort_out_of_place; // Sort method - int * ALIGNED(128) partition; // Static array indexed 0: - /**/ // (nx+2)*(ny+2)*(nz+2). Each value - /**/ // corresponds to the associated particle - /**/ // array index of the first particle in - /**/ // the cell. Array is allocated and - /**/ // values computed in sort_p. Purpose is - /**/ // for implementing collision models - /**/ // This is given in terms of the - /**/ // underlying's grids space filling - /**/ // curve indexing. 
Thus, immediately - /**/ // after a sort: - /**/ // sp->p[sp->partition[g->sfc[i] ]: - /**/ // sp->partition[g->sfc[i]+1]-1] - /**/ // are all the particles in voxel - /**/ // with local index i, while: - /**/ // sp->p[ sp->partition[ j ]: - /**/ // sp->partition[ j+1 ] ] - /**/ // are all the particles in voxel - /**/ // with space filling curve index j. - /**/ // Note: SFC NOT IN USE RIGHT NOW THUS - /**/ // g->sfc[i]=i ABOVE. - - grid_t * g; // Underlying grid - species_id id; // Unique identifier for a species - struct species *next; // Next species in the list -} species_t; +//----------------------------------------------------------------------------// +// Choose between using AoSoA or AoS data layout for the particles. +//----------------------------------------------------------------------------// + +#include "species_advance_aos.h" + +//----------------------------------------------------------------------------// +// Declare methods. +//----------------------------------------------------------------------------// BEGIN_C_DECLS @@ -121,8 +50,8 @@ species_t * species( const char * name, float q, float m, - int max_local_np, - int max_local_nm, + size_t max_local_np, + size_t max_local_nm, int sort_interval, int sort_out_of_place, grid_t * g ); @@ -136,13 +65,21 @@ species( const char * name, void sort_p( species_t * RESTRICT sp ); +void +sort_p_pipeline( species_t * sp ); + // In advance_p.cxx void -advance_p( /**/ species_t * RESTRICT sp, - /**/ accumulator_array_t * RESTRICT aa, +advance_p( species_t * RESTRICT sp, + accumulator_array_t * RESTRICT aa, const interpolator_array_t * RESTRICT ia ); +void +advance_p_pipeline( species_t * RESTRICT sp, + accumulator_array_t * RESTRICT aa, + const interpolator_array_t * RESTRICT ia ); + // In center_p.cxx // This does a half advance field advance and a half Boris rotate on @@ -151,9 +88,13 @@ advance_p( /**/ species_t * RESTRICT sp, // the time step. void -center_p( /**/ species_t * RESTRICT sp, +center_p( species_t * RESTRICT sp, const interpolator_array_t * RESTRICT ia ); +void +center_p_pipeline( species_t * RESTRICT sp, + const interpolator_array_t * RESTRICT ia ); + // In uncenter_p.cxx // This is the inverse of center_p. Thus, particles with r and u at @@ -161,9 +102,13 @@ center_p( /**/ species_t * RESTRICT sp, // step stale. void -uncenter_p( /**/ species_t * RESTRICT sp, +uncenter_p( species_t * RESTRICT sp, const interpolator_array_t * RESTRICT ia ); +void +uncenter_p_pipeline( species_t * RESTRICT sp, + const interpolator_array_t * RESTRICT ia ); + // In energy.cxx // This computes the kinetic energy stored in the particles. The @@ -171,26 +116,30 @@ uncenter_p( /**/ species_t * RESTRICT sp, // result. 
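For orientation, energy_p (implemented in energy.cxx, which this hunk touches only through its declaration) reports the particles' kinetic energy. With the normalized momentum u = gamma*v/c stored in particle_t, the per-particle quantity takes the usual relativistic form; the snippet below is a sketch of that quantity, not a quote of the implementation, and omits the code's unit constants:

#include <math.h>

/* Sketch of the per-particle kinetic energy, not the actual energy.cxx code. */
static double
kinetic_energy( float ux, float uy, float uz, float w, float m )
{
  double usq   = (double)ux*ux + (double)uy*uy + (double)uz*uz;
  double gamma = sqrt( 1.0 + usq );

  /* m*(gamma - 1) written as m*usq/(gamma + 1) to stay accurate when usq
     is small; multiply by the particle weight w to get the species total. */
  return (double)w*(double)m*usq/( gamma + 1.0 );
}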
double -energy_p( const species_t * RESTRICT sp, +energy_p( const species_t * RESTRICT sp, const interpolator_array_t * RESTRICT ia ); +double +energy_p_pipeline( const species_t * RESTRICT sp, + const interpolator_array_t * RESTRICT ia ); + // In rho_p.cxx void -accumulate_rho_p( /**/ field_array_t * RESTRICT fa, - const species_t * RESTRICT sp ); +accumulate_rho_p( field_array_t * RESTRICT fa, + const species_t * RESTRICT sp ); void -accumulate_rhob( field_t * RESTRICT ALIGNED(128) f, +accumulate_rhob( field_t * RESTRICT ALIGNED(128) f, const particle_t * RESTRICT ALIGNED(32) p, - const grid_t * RESTRICT g, - const float qsp ); + const grid_t * RESTRICT g, + const float qsp ); // In hydro_p.c void -accumulate_hydro_p( /**/ hydro_array_t * RESTRICT ha, - const species_t * RESTRICT sp, +accumulate_hydro_p( hydro_array_t * RESTRICT ha, + const species_t * RESTRICT sp, const interpolator_array_t * RESTRICT ia ); // In move_p.cxx diff --git a/src/species_advance/species_advance_aos.h b/src/species_advance/species_advance_aos.h new file mode 100644 index 00000000..3e1af9ad --- /dev/null +++ b/src/species_advance/species_advance_aos.h @@ -0,0 +1,96 @@ +/* + * Written by: + * Kevin J. Bowers, Ph.D. + * Plasma Physics Group (X-1) + * Applied Physics Division + * Los Alamos National Lab + * March/April 2004 - Original version (data structures based on earlier + * V4PIC versions) + * + */ + +#ifndef _species_advance_aos_h_ +#define _species_advance_aos_h_ + +typedef int32_t species_id; // Must be 32-bit wide for particle_injector_t + +// FIXME: Eventually particle_t (definitely) and their other formats +// (maybe) should be opaque and specific to a particular +// species_advance implementation + +typedef struct particle { + float dx, dy, dz; // Particle position in cell coordinates (on [-1,1]) + int32_t i; // Voxel containing the particle. Note that + /**/ // particles awaiting processing by boundary_p + /**/ // have actually set this to 8*voxel + face where + /**/ // face is the index of the face they interacted + /**/ // with (on 0:5). This limits the local number of + /**/ // voxels to 2^28 but emitter handling already + /**/ // has a stricter limit on this (2^26). + float ux, uy, uz; // Particle normalized momentum + float w; // Particle weight (number of physical particles) +} particle_t; + +// WARNING: FUNCTIONS THAT USE A PARTICLE_MOVER ASSUME THAT EVERYBODY +// WHO USES THAT PARTICLE MOVER WILL HAVE ACCESS TO PARTICLE ARRAY + +typedef struct particle_mover { + float dispx, dispy, dispz; // Displacement of particle + int32_t i; // Index of the particle to move +} particle_mover_t; + +// NOTE: THE LAYOUT OF A PARTICLE_INJECTOR _MUST_ BE COMPATIBLE WITH +// THE CONCATENATION OF A PARTICLE_T AND A PARTICLE_MOVER! 
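The compatibility requirement in the note above is a byte-layout statement: particle_injector_t (defined next) must begin with the same fields, at the same offsets, as a particle_t immediately followed by a particle_mover_t, so that one injector can be written as the concatenation of the two. Compile-time checks along these lines would capture it; this is a sketch, not part of the diff:

#include <stddef.h>

/* Sketch: checks of the layout compatibility stated in the note above. */
_Static_assert( sizeof(particle_injector_t) ==
                sizeof(particle_t) + sizeof(particle_mover_t),
                "injector must be exactly a particle plus a mover" );
_Static_assert( offsetof(particle_injector_t, dispx) == sizeof(particle_t),
                "mover part of the injector must start where the particle part ends" );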
+ +typedef struct particle_injector { + float dx, dy, dz; // Particle position in cell coords (on [-1,1]) + int32_t i; // Index of cell containing the particle + float ux, uy, uz; // Particle normalized momentum + float w; // Particle weight (number of physical particles) + float dispx, dispy, dispz; // Displacement of particle + species_id sp_id; // Species of particle +} particle_injector_t; + +typedef struct species { + char * name; // Species name + float q; // Species particle charge + float m; // Species particle rest mass + + int np, max_np; // Number and max local particles + particle_t * ALIGNED(128) p; // Array of particles for the species + + int nm, max_nm; // Number and max local movers in use + particle_mover_t * ALIGNED(128) pm; // Particle movers + + int64_t last_sorted; // Step when the particles were last + // sorted. + int sort_interval; // How often to sort the species + int sort_out_of_place; // Sort method + int * ALIGNED(128) partition; // Static array indexed 0: + /**/ // (nx+2)*(ny+2)*(nz+2). Each value + /**/ // corresponds to the associated particle + /**/ // array index of the first particle in + /**/ // the cell. Array is allocated and + /**/ // values computed in sort_p. Purpose is + /**/ // for implementing collision models + /**/ // This is given in terms of the + /**/ // underlying's grids space filling + /**/ // curve indexing. Thus, immediately + /**/ // after a sort: + /**/ // sp->p[sp->partition[g->sfc[i] ]: + /**/ // sp->partition[g->sfc[i]+1]-1] + /**/ // are all the particles in voxel + /**/ // with local index i, while: + /**/ // sp->p[ sp->partition[ j ]: + /**/ // sp->partition[ j+1 ] ] + /**/ // are all the particles in voxel + /**/ // with space filling curve index j. + /**/ // Note: SFC NOT IN USE RIGHT NOW THUS + /**/ // g->sfc[i]=i ABOVE. 
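The partition comment above describes a CSR-style index: after sort_p, partition[v] and partition[v+1] bound the slice of sp->p holding the particles of voxel v (the space-filling curve is currently the identity, so v is just the local voxel index). A collision model would walk it roughly as follows; illustrative sketch only:

/* Illustrative sketch of visiting one voxel's particles after sort_p. */
static void
for_each_particle_in_voxel( species_t * sp, int v )
{
  for( int j = sp->partition[v]; j < sp->partition[v+1]; j++ )
  {
    particle_t * p = &sp->p[j];
    /* ... apply a collision model, tally a diagnostic, etc. on *p ... */
    (void)p;
  }
}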
+ + grid_t * g; // Underlying grid + species_id id; // Unique identifier for a species + struct species *next; // Next species in the list +} species_t; + +#endif // _species_advance_aos_h_ diff --git a/src/species_advance/standard/advance_p.cc b/src/species_advance/standard/advance_p.cc index cd20e6d1..b892bd88 100644 --- a/src/species_advance/standard/advance_p.cc +++ b/src/species_advance/standard/advance_p.cc @@ -1,454 +1,19 @@ -// FIXME: PARTICLE MOVERS NEED TO BE OVERALLOCATED IN STRUCTORS TO -// ACCOUNT FOR SPLITTING THE MOVER ARRAY BETWEEN HOST AND PIPELINES - #define IN_spa -#define HAS_V4_PIPELINE -#include "spa_private.h" - -void -advance_p_pipeline( advance_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - particle_t * ALIGNED(128) p0 = args->p0; - accumulator_t * ALIGNED(128) a0 = args->a0; - const interpolator_t * ALIGNED(128) f0 = args->f0; - const grid_t * g = args->g; - - particle_t * ALIGNED(32) p; - particle_mover_t * ALIGNED(16) pm; - const interpolator_t * ALIGNED(16) f; - float * ALIGNED(16) a; - - const float qdt_2mc = args->qdt_2mc; - const float cdt_dx = args->cdt_dx; - const float cdt_dy = args->cdt_dy; - const float cdt_dz = args->cdt_dz; - const float qsp = args->qsp; - const float one = 1.; - const float one_third = 1./3.; - const float two_fifteenths = 2./15.; - - float dx, dy, dz, ux, uy, uz, q; - float hax, hay, haz, cbx, cby, cbz; - float v0, v1, v2, v3, v4, v5; - - int itmp, ii, n, nm, max_nm; - - DECLARE_ALIGNED_ARRAY( particle_mover_t, 16, local_pm, 1 ); - - // Determine which quads of particles quads this pipeline processes - - DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, itmp, n ); - p = args->p0 + itmp; - - // Determine which movers are reserved for this pipeline - // Movers (16 bytes) should be reserved for pipelines in at least - // multiples of 8 such that the set of particle movers reserved for - // a pipeline is 128-byte aligned and a multiple of 128-byte in - // size. The host is guaranteed to get enough movers to process its - // particles with this allocation. 
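The multiple-of-8 rule in the comment above is just the mover size: particle_mover_t is four 4-byte fields, so eight movers span exactly 128 bytes, the alignment the particle and accumulator arrays are declared with. A one-line check makes the arithmetic explicit (sketch only):

_Static_assert( 8*sizeof(particle_mover_t) == 128,
                "8 movers are assumed to span one 128-byte aligned chunk" );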
- - max_nm = args->max_nm - (args->np&15); - if( max_nm<0 ) max_nm = 0; - DISTRIBUTE( max_nm, 8, pipeline_rank, n_pipeline, itmp, max_nm ); - if( pipeline_rank==n_pipeline ) max_nm = args->max_nm - itmp; - pm = args->pm + itmp; - nm = 0; - itmp = 0; - - // Determine which accumulator array to use - // The host gets the first accumulator array - - if( pipeline_rank!=n_pipeline ) - a0 += (1+pipeline_rank)* - POW2_CEIL((args->nx+2)*(args->ny+2)*(args->nz+2),2); - - // Process particles for this pipeline - - for(;n;n--,p++) { - dx = p->dx; // Load position - dy = p->dy; - dz = p->dz; - ii = p->i; - f = f0 + ii; // Interpolate E - hax = qdt_2mc*( ( f->ex + dy*f->dexdy ) + - dz*( f->dexdz + dy*f->d2exdydz ) ); - hay = qdt_2mc*( ( f->ey + dz*f->deydz ) + - dx*( f->deydx + dz*f->d2eydzdx ) ); - haz = qdt_2mc*( ( f->ez + dx*f->dezdx ) + - dy*( f->dezdy + dx*f->d2ezdxdy ) ); - cbx = f->cbx + dx*f->dcbxdx; // Interpolate B - cby = f->cby + dy*f->dcbydy; - cbz = f->cbz + dz*f->dcbzdz; - ux = p->ux; // Load momentum - uy = p->uy; - uz = p->uz; - q = p->w; - ux += hax; // Half advance E - uy += hay; - uz += haz; - v0 = qdt_2mc/sqrtf(one + (ux*ux + (uy*uy + uz*uz))); - /**/ // Boris - scalars - v1 = cbx*cbx + (cby*cby + cbz*cbz); - v2 = (v0*v0)*v1; - v3 = v0*(one+v2*(one_third+v2*two_fifteenths)); - v4 = v3/(one+v1*(v3*v3)); - v4 += v4; - v0 = ux + v3*( uy*cbz - uz*cby ); // Boris - uprime - v1 = uy + v3*( uz*cbx - ux*cbz ); - v2 = uz + v3*( ux*cby - uy*cbx ); - ux += v4*( v1*cbz - v2*cby ); // Boris - rotation - uy += v4*( v2*cbx - v0*cbz ); - uz += v4*( v0*cby - v1*cbx ); - ux += hax; // Half advance E - uy += hay; - uz += haz; - p->ux = ux; // Store momentum - p->uy = uy; - p->uz = uz; - v0 = one/sqrtf(one + (ux*ux+ (uy*uy + uz*uz))); - /**/ // Get norm displacement - ux *= cdt_dx; - uy *= cdt_dy; - uz *= cdt_dz; - ux *= v0; - uy *= v0; - uz *= v0; - v0 = dx + ux; // Streak midpoint (inbnds) - v1 = dy + uy; - v2 = dz + uz; - v3 = v0 + ux; // New position - v4 = v1 + uy; - v5 = v2 + uz; - - // FIXME-KJB: COULD SHORT CIRCUIT ACCUMULATION IN THE CASE WHERE QSP==0! - if( v3<=one && v4<=one && v5<=one && // Check if inbnds - -v3<=one && -v4<=one && -v5<=one ) { - // Common case (inbnds). 
Note: accumulator values are 4 times - // the total physical charge that passed through the appropriate - // current quadrant in a time-step +#include "../species_advance.h" - q *= qsp; - p->dx = v3; // Store new position - p->dy = v4; - p->dz = v5; - dx = v0; // Streak midpoint - dy = v1; - dz = v2; - v5 = q*ux*uy*uz*one_third; // Compute correction - a = (float *)( a0 + ii ); // Get accumulator - -# define ACCUMULATE_J(X,Y,Z,offset) \ - v4 = q*u##X; /* v2 = q ux */ \ - v1 = v4*d##Y; /* v1 = q ux dy */ \ - v0 = v4-v1; /* v0 = q ux (1-dy) */ \ - v1 += v4; /* v1 = q ux (1+dy) */ \ - v4 = one+d##Z; /* v4 = 1+dz */ \ - v2 = v0*v4; /* v2 = q ux (1-dy)(1+dz) */ \ - v3 = v1*v4; /* v3 = q ux (1+dy)(1+dz) */ \ - v4 = one-d##Z; /* v4 = 1-dz */ \ - v0 *= v4; /* v0 = q ux (1-dy)(1-dz) */ \ - v1 *= v4; /* v1 = q ux (1+dy)(1-dz) */ \ - v0 += v5; /* v0 = q ux [ (1-dy)(1-dz) + uy*uz/3 ] */ \ - v1 -= v5; /* v1 = q ux [ (1+dy)(1-dz) - uy*uz/3 ] */ \ - v2 -= v5; /* v2 = q ux [ (1-dy)(1+dz) - uy*uz/3 ] */ \ - v3 += v5; /* v3 = q ux [ (1+dy)(1+dz) + uy*uz/3 ] */ \ - a[offset+0] += v0; \ - a[offset+1] += v1; \ - a[offset+2] += v2; \ - a[offset+3] += v3 - - ACCUMULATE_J( x,y,z, 0 ); - ACCUMULATE_J( y,z,x, 4 ); - ACCUMULATE_J( z,x,y, 8 ); - -# undef ACCUMULATE_J - - } else { // Unlikely - local_pm->dispx = ux; - local_pm->dispy = uy; - local_pm->dispz = uz; - local_pm->i = p - p0; - - if( move_p( p0, local_pm, a0, g, qsp ) ) { // Unlikely - if( nmseg[pipeline_rank].pm = pm; - args->seg[pipeline_rank].max_nm = max_nm; - args->seg[pipeline_rank].nm = nm; - args->seg[pipeline_rank].n_ignored = itmp; -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) - -using namespace v4; +//----------------------------------------------------------------------------// +// Top level function to select and call particle advance function using the +// desired particle advance abstraction. Currently, the only abstraction +// available is the pipeline abstraction. +//----------------------------------------------------------------------------// void -advance_p_pipeline_v4( advance_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - particle_t * ALIGNED(128) p0 = args->p0; - accumulator_t * ALIGNED(128) a0 = args->a0; - const interpolator_t * ALIGNED(128) f0 = args->f0; - const grid_t * g = args->g; - - particle_t * ALIGNED(128) p; - particle_mover_t * ALIGNED(16) pm; - float * ALIGNED(16) vp0; - float * ALIGNED(16) vp1; - float * ALIGNED(16) vp2; - float * ALIGNED(16) vp3; - - const v4float qdt_2mc(args->qdt_2mc); - const v4float cdt_dx(args->cdt_dx); - const v4float cdt_dy(args->cdt_dy); - const v4float cdt_dz(args->cdt_dz); - const v4float qsp(args->qsp); - const v4float one(1.); - const v4float one_third(1./3.); - const v4float two_fifteenths(2./15.); - const v4float neg_one(-1.); - - const float _qsp = args->qsp; - - v4float dx, dy, dz, ux, uy, uz, q; - v4float hax, hay, haz, cbx, cby, cbz; - v4float v0, v1, v2, v3, v4, v5; - v4int ii, outbnd; - - int itmp, nq, nm, max_nm; - - DECLARE_ALIGNED_ARRAY( particle_mover_t, 16, local_pm, 1 ); - - // Determine which quads of particle quads this pipeline processes - - DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, itmp, nq ); - p = args->p0 + itmp; - nq>>=2; - - // Determine which movers are reserved for this pipeline. - // Movers (16 bytes) should be reserved for pipelines in at least - // multiples of 8 such that the set of particle movers reserved for - // a pipeline is 128-byte aligned and a multiple of 128-byte in - // size. 
The host is guaranteed to get enough movers to process its - // particles with this allocation. - - max_nm = args->max_nm - (args->np&15); - if( max_nm<0 ) max_nm = 0; - DISTRIBUTE( max_nm, 8, pipeline_rank, n_pipeline, itmp, max_nm ); - if( pipeline_rank==n_pipeline ) max_nm = args->max_nm - itmp; - pm = args->pm + itmp; - nm = 0; - itmp = 0; - - // Determine which accumulator array to use - // The host gets the first accumulator array - - a0 += (1+pipeline_rank)* - POW2_CEIL((args->nx+2)*(args->ny+2)*(args->nz+2),2); - - // Process the particle quads for this pipeline - - for( ; nq; nq--, p+=4 ) { - load_4x4_tr(&p[0].dx,&p[1].dx,&p[2].dx,&p[3].dx,dx,dy,dz,ii); - - // Interpolate fields - vp0 = (float * ALIGNED(16))(f0 + ii(0)); - vp1 = (float * ALIGNED(16))(f0 + ii(1)); - vp2 = (float * ALIGNED(16))(f0 + ii(2)); - vp3 = (float * ALIGNED(16))(f0 + ii(3)); - load_4x4_tr(vp0, vp1, vp2, vp3, hax,v0,v1,v2); hax = qdt_2mc*fma( fma( v2, dy, v1 ), dz, fma( v0, dy, hax ) ); - load_4x4_tr(vp0+4,vp1+4,vp2+4,vp3+4,hay,v3,v4,v5); hay = qdt_2mc*fma( fma( v5, dz, v4 ), dx, fma( v3, dz, hay ) ); - load_4x4_tr(vp0+8,vp1+8,vp2+8,vp3+8,haz,v0,v1,v2); haz = qdt_2mc*fma( fma( v2, dx, v1 ), dy, fma( v0, dx, haz ) ); - load_4x4_tr(vp0+12,vp1+12,vp2+12,vp3+12,cbx,v3,cby,v4); cbx = fma( v3, dx, cbx ); - /**/ cby = fma( v4, dy, cby ); - load_4x2_tr(vp0+16,vp1+16,vp2+16,vp3+16,cbz,v5); cbz = fma( v5, dz, cbz ); - - // Update momentum - // If you are willing to eat a 5-10% performance hit, - // v0 = qdt_2mc/sqrt(blah) is a few ulps more accurate (but still - // quite in the noise numerically) for cyclotron frequencies - // approaching the nyquist frequency. - - load_4x4_tr(&p[0].ux,&p[1].ux,&p[2].ux,&p[3].ux,ux,uy,uz,q); - ux += hax; - uy += hay; - uz += haz; - v0 = qdt_2mc*rsqrt( one + fma( ux,ux, fma( uy,uy, uz*uz ) ) ); - v1 = fma( cbx,cbx, fma( cby,cby, cbz*cbz ) ); - v2 = (v0*v0)*v1; - v3 = v0*fma( fma( two_fifteenths, v2, one_third ), v2, one ); - v4 = v3*rcp(fma( v3*v3, v1, one )); - v4 += v4; - v0 = fma( fms( uy,cbz, uz*cby ), v3, ux ); - v1 = fma( fms( uz,cbx, ux*cbz ), v3, uy ); - v2 = fma( fms( ux,cby, uy*cbx ), v3, uz ); - ux = fma( fms( v1,cbz, v2*cby ), v4, ux ); - uy = fma( fms( v2,cbx, v0*cbz ), v4, uy ); - uz = fma( fms( v0,cby, v1*cbx ), v4, uz ); - ux += hax; - uy += hay; - uz += haz; - store_4x4_tr(ux,uy,uz,q,&p[0].ux,&p[1].ux,&p[2].ux,&p[3].ux); - - // Update the position of inbnd particles - v0 = rsqrt( one + fma( ux,ux, fma( uy,uy, uz*uz ) ) ); - ux *= cdt_dx; - uy *= cdt_dy; - uz *= cdt_dz; - ux *= v0; - uy *= v0; - uz *= v0; // ux,uy,uz are normalized displ (relative to cell size) - v0 = dx + ux; - v1 = dy + uy; - v2 = dz + uz; // New particle midpoint - v3 = v0 + ux; - v4 = v1 + uy; - v5 = v2 + uz; // New particle position - outbnd = (v3>one) | (v3one) | (v4one) | (v5dispx = ux(N); \ - local_pm->dispy = uy(N); \ - local_pm->dispz = uz(N); \ - local_pm->i = (p - p0) + N; \ - if( move_p( p0, local_pm, a0, g, _qsp ) ) { /* Unlikely */ \ - if( nmseg[pipeline_rank].pm = pm; - args->seg[pipeline_rank].max_nm = max_nm; - args->seg[pipeline_rank].nm = nm; - args->seg[pipeline_rank].n_ignored = itmp; -} - -#endif - -void -advance_p( /**/ species_t * RESTRICT sp, - /**/ accumulator_array_t * RESTRICT aa, - const interpolator_array_t * RESTRICT ia ) { - DECLARE_ALIGNED_ARRAY( advance_p_pipeline_args_t, 128, args, 1 ); - DECLARE_ALIGNED_ARRAY( particle_mover_seg_t, 128, seg, MAX_PIPELINE+1 ); - int rank; - - if( !sp || !aa || !ia || sp->g!=aa->g || sp->g!=ia->g ) - ERROR(( "Bad args" )); - - args->p0 
= sp->p; - args->pm = sp->pm; - args->a0 = aa->a; - args->f0 = ia->i; - args->seg = seg; - args->g = sp->g; - - args->qdt_2mc = (sp->q*sp->g->dt)/(2*sp->m*sp->g->cvac); - args->cdt_dx = sp->g->cvac*sp->g->dt*sp->g->rdx; - args->cdt_dy = sp->g->cvac*sp->g->dt*sp->g->rdy; - args->cdt_dz = sp->g->cvac*sp->g->dt*sp->g->rdz; - args->qsp = sp->q; - - args->np = sp->np; - args->max_nm = sp->max_nm; - args->nx = sp->g->nx; - args->ny = sp->g->ny; - args->nz = sp->g->nz; - - // Have the host processor do the last incomplete bundle if necessary. - // Note: This is overlapped with the pipelined processing. As such, - // it uses an entire accumulator. Reserving an entire accumulator - // for the host processor to handle at most 15 particles is wasteful - // of memory. It is anticipated that it may be useful at some point - // in the future have pipelines accumulating currents while the host - // processor is doing other more substantive work (e.g. accumulating - // currents from particles received from neighboring nodes). - // However, it is worth reconsidering this at some point in the - // future. - - EXEC_PIPELINES( advance_p, args, 0 ); - WAIT_PIPELINES(); - - // FIXME: HIDEOUS HACK UNTIL BETTER PARTICLE MOVER SEMANTICS - // INSTALLED FOR DEALING WITH PIPELINES. COMPACT THE PARTICLE - // MOVERS TO ELIMINATE HOLES FROM THE PIPELINING. - - sp->nm = 0; - for( rank=0; rank<=N_PIPELINE; rank++ ) { - if( args->seg[rank].n_ignored ) - WARNING(( "Pipeline %i ran out of storage for %i movers", - rank, args->seg[rank].n_ignored )); - if( sp->pm+sp->nm != args->seg[rank].pm ) - MOVE( sp->pm+sp->nm, args->seg[rank].pm, args->seg[rank].nm ); - sp->nm += args->seg[rank].nm; - } +advance_p( species_t * RESTRICT sp, + accumulator_array_t * RESTRICT aa, + const interpolator_array_t * RESTRICT ia ) +{ + // Once more options are available, this should be conditionally executed + // based on user choice. 
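// One possible shape for that future selection, purely illustrative; the enum
// value, the choice variable, and any non-pipeline backend named here are
// hypothetical and are not defined by this change:
//
//   switch( particle_advance_choice ) {          // hypothetical user option
//     case PARTICLE_ADVANCE_PIPELINE:            // the only backend today
//     default:
//       advance_p_pipeline( sp, aa, ia );
//       break;
//   }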
+ advance_p_pipeline( sp, aa, ia ); } diff --git a/src/species_advance/standard/center_p.cc b/src/species_advance/standard/center_p.cc index 9cb6a144..d9fba87b 100644 --- a/src/species_advance/standard/center_p.cc +++ b/src/species_advance/standard/center_p.cc @@ -1,166 +1,18 @@ #define IN_spa -#define HAS_V4_PIPELINE -#include "spa_private.h" -void -center_p_pipeline( center_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - const interpolator_t * ALIGNED(128) f0 = args->f0; - - particle_t * ALIGNED(32) p; - const interpolator_t * ALIGNED(16) f; - - const float qdt_2mc = args->qdt_2mc; - const float qdt_4mc = 0.5*args->qdt_2mc; // For half Boris rotate - const float one = 1.; - const float one_third = 1./3.; - const float two_fifteenths = 2./15.; - - float dx, dy, dz, ux, uy, uz; - float hax, hay, haz, cbx, cby, cbz; - float v0, v1, v2, v3, v4; +#include "../species_advance.h" - int first, ii, n; - - // Determine which particle quads this pipeline processes - - DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, first, n ); - p = args->p0 + first; - - // Process particles for this pipeline - - for(;n;n--,p++) { - dx = p->dx; // Load position - dy = p->dy; - dz = p->dz; - ii = p->i; - f = f0 + ii; // Interpolate E - hax = qdt_2mc*( ( f->ex + dy*f->dexdy ) + - dz*( f->dexdz + dy*f->d2exdydz ) ); - hay = qdt_2mc*( ( f->ey + dz*f->deydz ) + - dx*( f->deydx + dz*f->d2eydzdx ) ); - haz = qdt_2mc*( ( f->ez + dx*f->dezdx ) + - dy*( f->dezdy + dx*f->d2ezdxdy ) ); - cbx = f->cbx + dx*f->dcbxdx; // Interpolate B - cby = f->cby + dy*f->dcbydy; - cbz = f->cbz + dz*f->dcbzdz; - ux = p->ux; // Load momentum - uy = p->uy; - uz = p->uz; - ux += hax; // Half advance E - uy += hay; - uz += haz; - v0 = qdt_4mc/(float)sqrt(one + (ux*ux + (uy*uy + uz*uz))); - /**/ // Boris - scalars - v1 = cbx*cbx + (cby*cby + cbz*cbz); - v2 = (v0*v0)*v1; - v3 = v0*(one+v2*(one_third+v2*two_fifteenths)); - v4 = v3/(one+v1*(v3*v3)); - v4 += v4; - v0 = ux + v3*( uy*cbz - uz*cby ); // Boris - uprime - v1 = uy + v3*( uz*cbx - ux*cbz ); - v2 = uz + v3*( ux*cby - uy*cbx ); - ux += v4*( v1*cbz - v2*cby ); // Boris - rotation - uy += v4*( v2*cbx - v0*cbz ); - uz += v4*( v0*cby - v1*cbx ); - p->ux = ux; // Store momentum - p->uy = uy; - p->uz = uz; - } -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) - -using namespace v4; +//----------------------------------------------------------------------------// +// Top level function to select and call particle center function using the +// desired particle center abstraction. Currently, the only abstraction +// available is the pipeline abstraction. 
+//----------------------------------------------------------------------------// void -center_p_pipeline_v4( center_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - const interpolator_t * ALIGNED(128) f0 = args->f0; - - particle_t * ALIGNED(128) p; - const float * ALIGNED(16) vp0; - const float * ALIGNED(16) vp1; - const float * ALIGNED(16) vp2; - const float * ALIGNED(16) vp3; - - const v4float qdt_2mc( args->qdt_2mc); - const v4float qdt_4mc(0.5*args->qdt_2mc); // For half Boris rotate - const v4float one(1.); - const v4float one_third(1./3.); - const v4float two_fifteenths(2./15.); - - v4float dx, dy, dz, ux, uy, uz, q; - v4float hax, hay, haz, cbx, cby, cbz; - v4float v0, v1, v2, v3, v4, v5; - v4int ii; - - int itmp, nq; - - // Determine which particle quads this pipeline processes - - DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, itmp, nq ); - p = args->p0 + itmp; - nq >>= 2; - - // Process the particle quads for this pipeline - - for( ; nq; nq--, p+=4 ) { - load_4x4_tr(&p[0].dx,&p[1].dx,&p[2].dx,&p[3].dx,dx,dy,dz,ii); - - // Interpolate fields - vp0 = (const float * ALIGNED(16))(f0 + ii(0)); - vp1 = (const float * ALIGNED(16))(f0 + ii(1)); - vp2 = (const float * ALIGNED(16))(f0 + ii(2)); - vp3 = (const float * ALIGNED(16))(f0 + ii(3)); - load_4x4_tr(vp0, vp1, vp2, vp3, hax,v0,v1,v2); hax = qdt_2mc*fma( fma( dy, v2, v1 ), dz, fma( dy, v0, hax ) ); - load_4x4_tr(vp0+4,vp1+4,vp2+4,vp3+4,hay,v3,v4,v5); hay = qdt_2mc*fma( fma( dz, v5, v4 ), dx, fma( dz, v3, hay ) ); - load_4x4_tr(vp0+8,vp1+8,vp2+8,vp3+8,haz,v0,v1,v2); haz = qdt_2mc*fma( fma( dx, v2, v1 ), dy, fma( dx, v0, haz ) ); - load_4x4_tr(vp0+12,vp1+12,vp2+12,vp3+12,cbx,v3,cby,v4); cbx = fma( v3, dx, cbx ); - /**/ cby = fma( v4, dy, cby ); - load_4x2_tr(vp0+16,vp1+16,vp2+16,vp3+16,cbz,v5); cbz = fma( v5, dz, cbz ); - - // Update momentum - load_4x4_tr(&p[0].ux,&p[1].ux,&p[2].ux,&p[3].ux,ux,uy,uz,q); - /**/ // Could use load_4x3_tr - ux += hax; - uy += hay; - uz += haz; - v0 = qdt_4mc*rsqrt( one + fma( ux,ux, fma( uy,uy, uz*uz ) ) ); - v1 = fma( cbx,cbx, fma( cby,cby, cbz*cbz ) ); - v2 = (v0*v0)*v1; - v3 = v0*fma( v2, fma( v2, two_fifteenths, one_third ), one ); - v4 = v3*rcp( fma( v3*v3, v1, one ) ); v4 += v4; - v0 = fma( fms( uy,cbz, uz*cby ), v3, ux ); - v1 = fma( fms( uz,cbx, ux*cbz ), v3, uy ); - v2 = fma( fms( ux,cby, uy*cbx ), v3, uz ); - ux = fma( fms( v1,cbz, v2*cby ), v4, ux ); - uy = fma( fms( v2,cbx, v0*cbz ), v4, uy ); - uz = fma( fms( v0,cby, v1*cbx ), v4, uz ); - store_4x4_tr(ux,uy,uz,q,&p[0].ux,&p[1].ux,&p[2].ux,&p[3].ux); - /**/ // Could use store_4x3_tr - } -} - -#endif - -void -center_p( /**/ species_t * RESTRICT sp, - const interpolator_array_t * RESTRICT ia ) { - DECLARE_ALIGNED_ARRAY( center_p_pipeline_args_t, 128, args, 1 ); - - if( !sp || !ia || sp->g!=ia->g ) ERROR(( "Bad args" )); - - // Have the pipelines do the bulk of particles in quads and have the - // host do the final incomplete quad. - - args->p0 = sp->p; - args->f0 = ia->i; - args->qdt_2mc = (sp->q*sp->g->dt)/(2*sp->m*sp->g->cvac); - args->np = sp->np; - - EXEC_PIPELINES( center_p, args, 0 ); - WAIT_PIPELINES(); +center_p( species_t * RESTRICT sp, + const interpolator_array_t * RESTRICT ia ) +{ + // Once more options are available, this should be conditionally executed + // based on user choice. 
+ center_p_pipeline( sp, ia ); } diff --git a/src/species_advance/standard/energy_p.cc b/src/species_advance/standard/energy_p.cc index 9c8eba8d..1e0dbfc7 100644 --- a/src/species_advance/standard/energy_p.cc +++ b/src/species_advance/standard/energy_p.cc @@ -1,147 +1,22 @@ #define IN_spa -#define HAS_V4_PIPELINE -#include "spa_private.h" -// This function calculates kinetic energy, normalized by c^2. -void -energy_p_pipeline( energy_p_pipeline_args_t * RESTRICT args, - int pipeline_rank, - int n_pipeline ) { - const interpolator_t * RESTRICT ALIGNED(128) f = args->f; - const particle_t * RESTRICT ALIGNED(32) p = args->p; - const float qdt_2mc = args->qdt_2mc; - const float msp = args->msp; - const float one = 1; +#include "../species_advance.h" - float dx, dy, dz; - float v0, v1, v2; - - double en = 0; - - int i, n, n0, n1; - - // Determine which particles this pipeline processes - - DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, n0, n1 ); n1 += n0; - - // Process particles quads for this pipeline - - for( n=n0; nen[pipeline_rank] = en; -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) - -using namespace v4; - -void -energy_p_pipeline_v4( energy_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - const interpolator_t * RESTRICT ALIGNED(128) f = args->f; - const particle_t * RESTRICT ALIGNED(128) p = args->p; - - const float * RESTRICT ALIGNED(16) vp0; - const float * RESTRICT ALIGNED(16) vp1; - const float * RESTRICT ALIGNED(16) vp2; - const float * RESTRICT ALIGNED(16) vp3; - - const v4float qdt_2mc(args->qdt_2mc); - const v4float msp(args->msp); - const v4float one(1.); - - v4float dx, dy, dz; - v4float ex, ey, ez; - v4float v0, v1, v2, w; - v4int i; - - double en0 = 0, en1 = 0, en2 = 0, en3 = 0; - - int n0, nq; - - // Determine which particle quads this pipeline processes - - DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, n0, nq ); - p += n0; - nq >>= 2; - - // Process the particle quads for this pipeline - - for( ; nq; nq--, p+=4 ) { - load_4x4_tr(&p[0].dx,&p[1].dx,&p[2].dx,&p[3].dx,dx,dy,dz,i); - - // Interpolate fields - - vp0 = (float *)(f + i(0)); - vp1 = (float *)(f + i(1)); - vp2 = (float *)(f + i(2)); - vp3 = (float *)(f + i(3)); - load_4x4_tr(vp0, vp1, vp2, vp3, ex,v0,v1,v2); ex = fma( fma( dy, v2, v1 ), dz, fma( dy, v0, ex ) ); - load_4x4_tr(vp0+4,vp1+4,vp2+4,vp3+4,ey,v0,v1,v2); ey = fma( fma( dz, v2, v1 ), dx, fma( dz, v0, ey ) ); - load_4x4_tr(vp0+8,vp1+8,vp2+8,vp3+8,ez,v0,v1,v2); ez = fma( fma( dx, v2, v1 ), dy, fma( dx, v0, ez ) ); - - // Update momentum to half step - // (note Boris rotation does not change energy so it is unnecessary) - - load_4x4_tr(&p[0].ux,&p[1].ux,&p[2].ux,&p[3].ux,v0,v1,v2,w); - v0 = fma( ex, qdt_2mc, v0 ); - v1 = fma( ey, qdt_2mc, v1 ); - v2 = fma( ez, qdt_2mc, v2 ); - - // Accumulate energy - - v0 = fma( v0,v0, fma( v1,v1, v2*v2 ) ); - v0 = (msp * w) * (v0 / (one + sqrt(one + v0))); - en0 += (double)v0(0); - en1 += (double)v0(1); - en2 += (double)v0(2); - en3 += (double)v0(3); - } - - args->en[pipeline_rank] = en0 + en1 + en2 + en3; -} - -#endif +//----------------------------------------------------------------------------// +// Top level function to select and call particle energy function using the +// desired particle energy abstraction. Currently, the only abstraction +// available is the pipeline abstraction. 
+//----------------------------------------------------------------------------// double -energy_p( const species_t * RESTRICT sp, - const interpolator_array_t * RESTRICT ia ) { - DECLARE_ALIGNED_ARRAY( energy_p_pipeline_args_t, 128, args, 1 ); - DECLARE_ALIGNED_ARRAY( double, 128, en, MAX_PIPELINE+1 ); - double local, global; - int rank; - - if( !sp || !ia || sp->g!=ia->g ) ERROR(( "Bad args" )); - - // Have the pipelines do the bulk of particles in quads and have the - // host do the final incomplete quad. - - args->p = sp->p; - args->f = ia->i; - args->en = en; - args->qdt_2mc = (sp->q*sp->g->dt)/(2*sp->m*sp->g->cvac); - args->msp = sp->m; - args->np = sp->np; +energy_p( const species_t * RESTRICT sp, + const interpolator_array_t * RESTRICT ia ) +{ + double energy_particles; - EXEC_PIPELINES( energy_p, args, 0 ); - WAIT_PIPELINES(); + // Once more options are available, this should be conditionally executed + // based on user choice. + energy_particles = energy_p_pipeline( sp, ia ); - local = 0; for( rank=0; rank<=N_PIPELINE; rank++ ) local += en[rank]; - mp_allsum_d( &local, &global, 1 ); - return global*((double)sp->g->cvac*(double)sp->g->cvac); + return energy_particles; } diff --git a/src/species_advance/standard/hydro_p.c b/src/species_advance/standard/hydro_p.c index b4fe18df..f85a79a2 100644 --- a/src/species_advance/standard/hydro_p.c +++ b/src/species_advance/standard/hydro_p.c @@ -12,7 +12,8 @@ */ #define IN_spa -#include "spa_private.h" + +#include "../species_advance.h" // accumulate_hydro_p adds the hydrodynamic fields associated with the // supplied particle_list to the hydro array. Trilinear interpolation diff --git a/src/species_advance/standard/move_p.cc b/src/species_advance/standard/move_p.cc index c6e838ef..dfba3785 100644 --- a/src/species_advance/standard/move_p.cc +++ b/src/species_advance/standard/move_p.cc @@ -1,5 +1,6 @@ #define IN_spa -#include "spa_private.h" + +#include "../species_advance.h" // move_p moves the particle m->p by m->dispx, m->dispy, m->dispz // depositing particle current as it goes. If the particle was moved @@ -55,9 +56,9 @@ move_p( particle_t * RESTRICT ALIGNED(128) p, q = v4float(qsp)*splat<3>(u); // q = p_q, p_q, p_q, D/C q3 = v4float(1.f/3.f)*q; // q3 = p_q/3, p_q/3, p_q/3, D/C - dr = shuffle<0,1,2,2>( dr ); // dr = p_ddx, p_ddy, p_ddz, D/C + dr = shuffle<0,1,2,2>( dr ); // dr = p_ddx, p_ddy, p_ddz, D/C r = shuffle<0,1,2,2>( r ); // r = p_dx, p_dy, p_dz, D/C - + for(;;) { // At this point: @@ -82,14 +83,14 @@ move_p( particle_t * RESTRICT ALIGNED(128) p, // Likewise, due to speed of light limitations, generally dr // cannot get much larger than 1 or so and the numerator, if not // zero, can generally never be smaller than FLT_EPS/2. Thus, - // likewise, the divide will never underflow either. + // likewise, the divide will never underflow either. - // FIXME: THIS COULD PROBABLY BE DONE EVEN FASTER + // FIXME: THIS COULD PROBABLY BE DONE EVEN FASTER sgn_dr = copysign( one, dr ); v0 = copysign( tiny, dr ); store_4x1( (sgn_dr-r) / ((dr+dr)+v0), stack_vf ); /**/ type = 3; f0 = 1; - f1 = stack_vf[0]; if( f1rangel ); r = toggle_bits( bits, r ); } @@ -237,27 +238,27 @@ move_p( particle_t * ALIGNED(128) p0, s_dispy = pm->dispy; s_dispz = pm->dispz; - s_dir[0] = (s_dispx>0) ? 1 : -1; - s_dir[1] = (s_dispy>0) ? 1 : -1; - s_dir[2] = (s_dispz>0) ? 1 : -1; - + s_dir[0] = (s_dispx>0.0f) ? 1.0f : -1.0f; + s_dir[1] = (s_dispy>0.0f) ? 1.0f : -1.0f; + s_dir[2] = (s_dispz>0.0f) ? 
1.0f : -1.0f; + // Compute the twice the fractional distance to each potential // streak/cell face intersection. - v0 = (s_dispx==0) ? 3.4e38 : (s_dir[0]-s_midx)/s_dispx; - v1 = (s_dispy==0) ? 3.4e38 : (s_dir[1]-s_midy)/s_dispy; - v2 = (s_dispz==0) ? 3.4e38 : (s_dir[2]-s_midz)/s_dispz; + v0 = (s_dispx==0.0f) ? 3.4e38f : (s_dir[0]-s_midx)/s_dispx; + v1 = (s_dispy==0.0f) ? 3.4e38f : (s_dir[1]-s_midy)/s_dispy; + v2 = (s_dispz==0.0f) ? 3.4e38f : (s_dir[2]-s_midz)/s_dispz; // Determine the fractional length and axis of current streak. The // streak ends on either the first face intersected by the // particle track or at the end of the particle track. - // + // // axis 0,1 or 2 ... streak ends on a x,y or z-face respectively // axis 3 ... streak ends at end of the particle track - /**/ v3=2, axis=3; - if(v00 ) face += 3; neighbor = g->neighbor[ 6*p->i + face ]; - + if( UNLIKELY( neighbor==reflect_particles ) ) { // Hit a reflecting boundary condition. Reflect the particle // momentum and remaining displacement and keep moving the @@ -342,7 +343,7 @@ move_p( particle_t * ALIGNED(128) p0, // Crossed into a normal voxel. Update the voxel index, convert the // particle coordinate system and keep moving the particle. - + p->i = neighbor - g->rangel; // Compute local index of neighbor /**/ // Note: neighbor - g->rangel < 2^31 / 6 (&(p->dx))[axis] = -v0; // Convert coordinate system diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline.cc b/src/species_advance/standard/pipeline/advance_p_pipeline.cc new file mode 100644 index 00000000..a222d76f --- /dev/null +++ b/src/species_advance/standard/pipeline/advance_p_pipeline.cc @@ -0,0 +1,314 @@ +// FIXME: PARTICLE MOVERS NEED TO BE OVERALLOCATED IN STRUCTORS TO +// ACCOUNT FOR SPLITTING THE MOVER ARRAY BETWEEN HOST AND PIPELINES + +#define IN_spa + +#define HAS_V4_PIPELINE +#define HAS_V8_PIPELINE +#define HAS_V16_PIPELINE + +#include "spa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +//----------------------------------------------------------------------------// +// Reference implementation for an advance_p pipeline function which does not +// make use of explicit calls to vector intrinsic functions. +//----------------------------------------------------------------------------// + +void +advance_p_pipeline_scalar( advance_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + particle_t * ALIGNED(128) p0 = args->p0; + accumulator_t * ALIGNED(128) a0 = args->a0; + const interpolator_t * ALIGNED(128) f0 = args->f0; + const grid_t * g = args->g; + + particle_t * ALIGNED(32) p; + particle_mover_t * ALIGNED(16) pm; + const interpolator_t * ALIGNED(16) f; + float * ALIGNED(16) a; + + const float qdt_2mc = args->qdt_2mc; + const float cdt_dx = args->cdt_dx; + const float cdt_dy = args->cdt_dy; + const float cdt_dz = args->cdt_dz; + const float qsp = args->qsp; + const float one = 1.0; + const float one_third = 1.0/3.0; + const float two_fifteenths = 2.0/15.0; + + float dx, dy, dz, ux, uy, uz, q; + float hax, hay, haz, cbx, cby, cbz; + float v0, v1, v2, v3, v4, v5; + int ii; + + int itmp, n, nm, max_nm; + + DECLARE_ALIGNED_ARRAY( particle_mover_t, 16, local_pm, 1 ); + + // Determine which quads of particles quads this pipeline processes. + + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, itmp, n ); + + p = args->p0 + itmp; + + // Determine which movers are reserved for this pipeline. 
+ // Movers (16 bytes) should be reserved for pipelines in at least + // multiples of 8 such that the set of particle movers reserved for + // a pipeline is 128-byte aligned and a multiple of 128-byte in + // size. The host is guaranteed to get enough movers to process its + // particles with this allocation. + + max_nm = args->max_nm - ( args->np&15 ); + + if ( max_nm < 0 ) max_nm = 0; + + DISTRIBUTE( max_nm, 8, pipeline_rank, n_pipeline, itmp, max_nm ); + + if ( pipeline_rank == n_pipeline ) max_nm = args->max_nm - itmp; + + pm = args->pm + itmp; + nm = 0; + itmp = 0; + + // Determine which accumulator array to use + // The host gets the first accumulator array. + + if ( pipeline_rank != n_pipeline ) + a0 += ( 1 + pipeline_rank ) * + POW2_CEIL( (args->nx+2)*(args->ny+2)*(args->nz+2), 2 ); + + // Process particles for this pipeline. + + for( ; n; n--, p++ ) + { + dx = p->dx; // Load position + dy = p->dy; + dz = p->dz; + ii = p->i; + + f = f0 + ii; // Interpolate E + + hax = qdt_2mc*( ( f->ex + dy*f->dexdy ) + + dz*( f->dexdz + dy*f->d2exdydz ) ); + + hay = qdt_2mc*( ( f->ey + dz*f->deydz ) + + dx*( f->deydx + dz*f->d2eydzdx ) ); + + haz = qdt_2mc*( ( f->ez + dx*f->dezdx ) + + dy*( f->dezdy + dx*f->d2ezdxdy ) ); + + cbx = f->cbx + dx*f->dcbxdx; // Interpolate B + cby = f->cby + dy*f->dcbydy; + cbz = f->cbz + dz*f->dcbzdz; + + ux = p->ux; // Load momentum + uy = p->uy; + uz = p->uz; + q = p->w; + + ux += hax; // Half advance E + uy += hay; + uz += haz; + + v0 = qdt_2mc / sqrtf( one + ( ux*ux + ( uy*uy + uz*uz ) ) ); + + // Boris - scalars + v1 = cbx*cbx + ( cby*cby + cbz*cbz ); + v2 = ( v0*v0 ) * v1; + v3 = v0 * ( one + v2 * ( one_third + v2 * two_fifteenths ) ); + v4 = v3 / ( one + v1 * ( v3 * v3 ) ); + v4 += v4; + + v0 = ux + v3*( uy*cbz - uz*cby ); // Boris - uprime + v1 = uy + v3*( uz*cbx - ux*cbz ); + v2 = uz + v3*( ux*cby - uy*cbx ); + + ux += v4*( v1*cbz - v2*cby ); // Boris - rotation + uy += v4*( v2*cbx - v0*cbz ); + uz += v4*( v0*cby - v1*cbx ); + + ux += hax; // Half advance E + uy += hay; + uz += haz; + + p->ux = ux; // Store momentum + p->uy = uy; + p->uz = uz; + + v0 = one / sqrtf( one + ( ux*ux+ ( uy*uy + uz*uz ) ) ); + // Get norm displacement + + ux *= cdt_dx; + uy *= cdt_dy; + uz *= cdt_dz; + + ux *= v0; + uy *= v0; + uz *= v0; + + v0 = dx + ux; // Streak midpoint (inbnds) + v1 = dy + uy; + v2 = dz + uz; + + v3 = v0 + ux; // New position + v4 = v1 + uy; + v5 = v2 + uz; + + // FIXME-KJB: COULD SHORT CIRCUIT ACCUMULATION IN THE CASE WHERE QSP==0! + if ( v3 <= one && v4 <= one && v5 <= one && // Check if inbnds + -v3 <= one && -v4 <= one && -v5 <= one ) + { + // Common case (inbnds). Note: accumulator values are 4 times + // the total physical charge that passed through the appropriate + // current quadrant in a time-step. 
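// Aside, not part of this change: the factor of 4 follows from the bilinear
// weights used below. For any dy, dz in [-1,1],
//
//   (1-dy)*(1-dz) + (1+dy)*(1-dz) + (1-dy)*(1+dz) + (1+dy)*(1+dz) = 4,
//
// and the +/- q*ux*uy*uz/3 charge conservation corrections cancel pairwise,
// so the four Jx entries deposited for one particle sum to 4*q*ux (likewise
// for Jy and Jz). The compensating factor of 1/4 is presumably applied when
// the accumulators are later unloaded onto the field grid.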
+ + q *= qsp; + + p->dx = v3; // Store new position + p->dy = v4; + p->dz = v5; + + dx = v0; // Streak midpoint + dy = v1; + dz = v2; + + v5 = q*ux*uy*uz*one_third; // Compute correction + + a = (float *)( a0 + ii ); // Get accumulator + +# define ACCUMULATE_J(X,Y,Z,offset) \ + v4 = q*u##X; /* v2 = q ux */ \ + v1 = v4*d##Y; /* v1 = q ux dy */ \ + v0 = v4-v1; /* v0 = q ux (1-dy) */ \ + v1 += v4; /* v1 = q ux (1+dy) */ \ + v4 = one+d##Z; /* v4 = 1+dz */ \ + v2 = v0*v4; /* v2 = q ux (1-dy)(1+dz) */ \ + v3 = v1*v4; /* v3 = q ux (1+dy)(1+dz) */ \ + v4 = one-d##Z; /* v4 = 1-dz */ \ + v0 *= v4; /* v0 = q ux (1-dy)(1-dz) */ \ + v1 *= v4; /* v1 = q ux (1+dy)(1-dz) */ \ + v0 += v5; /* v0 = q ux [ (1-dy)(1-dz) + uy*uz/3 ] */ \ + v1 -= v5; /* v1 = q ux [ (1+dy)(1-dz) - uy*uz/3 ] */ \ + v2 -= v5; /* v2 = q ux [ (1-dy)(1+dz) - uy*uz/3 ] */ \ + v3 += v5; /* v3 = q ux [ (1+dy)(1+dz) + uy*uz/3 ] */ \ + a[offset+0] += v0; \ + a[offset+1] += v1; \ + a[offset+2] += v2; \ + a[offset+3] += v3 + + ACCUMULATE_J( x, y, z, 0 ); + ACCUMULATE_J( y, z, x, 4 ); + ACCUMULATE_J( z, x, y, 8 ); + +# undef ACCUMULATE_J + } + + else // Unlikely + { + local_pm->dispx = ux; + local_pm->dispy = uy; + local_pm->dispz = uz; + + local_pm->i = p - p0; + + if ( move_p( p0, local_pm, a0, g, qsp ) ) // Unlikely + { + if ( nm < max_nm ) + { + pm[nm++] = local_pm[0]; + } + + else + { + itmp++; // Unlikely + } + } + } + } + + args->seg[pipeline_rank].pm = pm; + args->seg[pipeline_rank].max_nm = max_nm; + args->seg[pipeline_rank].nm = nm; + args->seg[pipeline_rank].n_ignored = itmp; +} + +//----------------------------------------------------------------------------// +// Top level function to select and call the proper advance_p pipeline +// function. +//----------------------------------------------------------------------------// + +void +advance_p_pipeline( species_t * RESTRICT sp, + accumulator_array_t * RESTRICT aa, + const interpolator_array_t * RESTRICT ia ) +{ + DECLARE_ALIGNED_ARRAY( advance_p_pipeline_args_t, 128, args, 1 ); + + DECLARE_ALIGNED_ARRAY( particle_mover_seg_t, 128, seg, MAX_PIPELINE + 1 ); + + int rank; + + if ( !sp || !aa || !ia || sp->g != aa->g || sp->g != ia->g ) + { + ERROR( ( "Bad args" ) ); + } + + args->p0 = sp->p; + args->pm = sp->pm; + args->a0 = aa->a; + args->f0 = ia->i; + args->seg = seg; + args->g = sp->g; + + args->qdt_2mc = (sp->q*sp->g->dt)/(2*sp->m*sp->g->cvac); + args->cdt_dx = sp->g->cvac*sp->g->dt*sp->g->rdx; + args->cdt_dy = sp->g->cvac*sp->g->dt*sp->g->rdy; + args->cdt_dz = sp->g->cvac*sp->g->dt*sp->g->rdz; + args->qsp = sp->q; + + args->np = sp->np; + args->max_nm = sp->max_nm; + args->nx = sp->g->nx; + args->ny = sp->g->ny; + args->nz = sp->g->nz; + + // Have the host processor do the last incomplete bundle if necessary. + // Note: This is overlapped with the pipelined processing. As such, + // it uses an entire accumulator. Reserving an entire accumulator + // for the host processor to handle at most 15 particles is wasteful + // of memory. It is anticipated that it may be useful at some point + // in the future have pipelines accumulating currents while the host + // processor is doing other more substantive work (e.g. accumulating + // currents from particles received from neighboring nodes). + // However, it is worth reconsidering this at some point in the + // future. + + EXEC_PIPELINES( advance_p, args, 0 ); + + WAIT_PIPELINES(); + + // FIXME: HIDEOUS HACK UNTIL BETTER PARTICLE MOVER SEMANTICS + // INSTALLED FOR DEALING WITH PIPELINES. 
COMPACT THE PARTICLE + // MOVERS TO ELIMINATE HOLES FROM THE PIPELINING. + + sp->nm = 0; + for( rank = 0; rank <= N_PIPELINE; rank++ ) + { + if ( args->seg[rank].n_ignored ) + { + WARNING( ( "Pipeline %i ran out of storage for %i movers", + rank, args->seg[rank].n_ignored ) ); + } + + if ( sp->pm + sp->nm != args->seg[rank].pm ) + { + MOVE( sp->pm + sp->nm, args->seg[rank].pm, args->seg[rank].nm ); + } + + sp->nm += args->seg[rank].nm; + } +} diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline_v16.cc b/src/species_advance/standard/pipeline/advance_p_pipeline_v16.cc new file mode 100644 index 00000000..bc152588 --- /dev/null +++ b/src/species_advance/standard/pipeline/advance_p_pipeline_v16.cc @@ -0,0 +1,419 @@ +#define IN_spa + +#include "spa_private.h" + +#if defined(V16_ACCELERATION) + +using namespace v16; + +//----------------------------------------------------------------------------// +// Method 4 +//----------------------------------------------------------------------------// +// This method processes 16 particles at a time instead of 32. +//----------------------------------------------------------------------------// +// This method processes the particles in the same order as the reference +// implementation and gives good reproducibility. This is achieved using +// modified load_16x8_tr_p and store_16x8_tr_p functions which load or store +// the particle data in the correct order in a single step instead of using +// two steps. +//----------------------------------------------------------------------------// + +void +advance_p_pipeline_v16( advance_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + particle_t * ALIGNED(128) p0 = args->p0; + accumulator_t * ALIGNED(128) a0 = args->a0; + const interpolator_t * ALIGNED(128) f0 = args->f0; + const grid_t * g = args->g; + + particle_t * ALIGNED(128) p; + particle_mover_t * ALIGNED(16) pm; + + float * ALIGNED(64) vp00; + float * ALIGNED(64) vp01; + float * ALIGNED(64) vp02; + float * ALIGNED(64) vp03; + float * ALIGNED(64) vp04; + float * ALIGNED(64) vp05; + float * ALIGNED(64) vp06; + float * ALIGNED(64) vp07; + float * ALIGNED(64) vp08; + float * ALIGNED(64) vp09; + float * ALIGNED(64) vp10; + float * ALIGNED(64) vp11; + float * ALIGNED(64) vp12; + float * ALIGNED(64) vp13; + float * ALIGNED(64) vp14; + float * ALIGNED(64) vp15; + + // Basic constants. + const v16float qdt_2mc(args->qdt_2mc); + const v16float cdt_dx(args->cdt_dx); + const v16float cdt_dy(args->cdt_dy); + const v16float cdt_dz(args->cdt_dz); + const v16float qsp(args->qsp); + const v16float one(1.0); + const v16float one_third(1.0/3.0); + const v16float two_fifteenths(2.0/15.0); + const v16float neg_one(-1.0); + + const float _qsp = args->qsp; + + v16float dx, dy, dz, ux, uy, uz, q; + v16float hax, hay, haz, cbx, cby, cbz; + v16float v00, v01, v02, v03, v04, v05, v06, v07; + v16float v08, v09, v10, v11, v12, v13, v14, v15; + v16int ii, outbnd; + + int itmp, nq, nm, max_nm; + + DECLARE_ALIGNED_ARRAY( particle_mover_t, 16, local_pm, 1 ); + + // Determine which blocks of particle quads this pipeline processes. + + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, itmp, nq ); + + p = args->p0 + itmp; + + nq >>= 4; + + // Determine which movers are reserved for this pipeline. + // Movers (16 bytes) should be reserved for pipelines in at least + // multiples of 8 such that the set of particle movers reserved for + // a pipeline is 128-byte aligned and a multiple of 128-byte in + // size. 
The host is guaranteed to get enough movers to process its + // particles with this allocation. + + max_nm = args->max_nm - ( args->np&15 ); + + if ( max_nm < 0 ) max_nm = 0; + + DISTRIBUTE( max_nm, 8, pipeline_rank, n_pipeline, itmp, max_nm ); + + if ( pipeline_rank == n_pipeline ) max_nm = args->max_nm - itmp; + + pm = args->pm + itmp; + nm = 0; + itmp = 0; + + // Determine which accumulator array to use. + // The host gets the first accumulator array. + + a0 += ( 1 + pipeline_rank ) * + POW2_CEIL( (args->nx+2)*(args->ny+2)*(args->nz+2), 2 ); + + // Process the particle blocks for this pipeline. + + for( ; nq; nq--, p+=16 ) + { + //-------------------------------------------------------------------------- + // Load particle data. + //-------------------------------------------------------------------------- + load_16x8_tr_p( &p[ 0].dx, &p[ 2].dx, &p[ 4].dx, &p[ 6].dx, + &p[ 8].dx, &p[10].dx, &p[12].dx, &p[14].dx, + dx, dy, dz, ii, ux, uy, uz, q ); + + //-------------------------------------------------------------------------- + // Set field interpolation pointers. + //-------------------------------------------------------------------------- + vp00 = ( float * ALIGNED(64) ) ( f0 + ii( 0) ); + vp01 = ( float * ALIGNED(64) ) ( f0 + ii( 1) ); + vp02 = ( float * ALIGNED(64) ) ( f0 + ii( 2) ); + vp03 = ( float * ALIGNED(64) ) ( f0 + ii( 3) ); + vp04 = ( float * ALIGNED(64) ) ( f0 + ii( 4) ); + vp05 = ( float * ALIGNED(64) ) ( f0 + ii( 5) ); + vp06 = ( float * ALIGNED(64) ) ( f0 + ii( 6) ); + vp07 = ( float * ALIGNED(64) ) ( f0 + ii( 7) ); + vp08 = ( float * ALIGNED(64) ) ( f0 + ii( 8) ); + vp09 = ( float * ALIGNED(64) ) ( f0 + ii( 9) ); + vp10 = ( float * ALIGNED(64) ) ( f0 + ii(10) ); + vp11 = ( float * ALIGNED(64) ) ( f0 + ii(11) ); + vp12 = ( float * ALIGNED(64) ) ( f0 + ii(12) ); + vp13 = ( float * ALIGNED(64) ) ( f0 + ii(13) ); + vp14 = ( float * ALIGNED(64) ) ( f0 + ii(14) ); + vp15 = ( float * ALIGNED(64) ) ( f0 + ii(15) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_16x16_tr( vp00, vp01, vp02, vp03, + vp04, vp05, vp06, vp07, + vp08, vp09, vp10, vp11, + vp12, vp13, vp14, vp15, + hax, v00, v01, v02, hay, v03, v04, v05, + haz, v06, v07, v08, cbx, v09, cby, v10 ); + + hax = qdt_2mc*fma( fma( v02, dy, v01 ), dz, fma( v00, dy, hax ) ); + + hay = qdt_2mc*fma( fma( v05, dz, v04 ), dx, fma( v03, dz, hay ) ); + + haz = qdt_2mc*fma( fma( v08, dx, v07 ), dy, fma( v06, dx, haz ) ); + + cbx = fma( v09, dx, cbx ); + + cby = fma( v10, dy, cby ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles, final. + //-------------------------------------------------------------------------- + load_16x2_tr( vp00+16, vp01+16, vp02+16, vp03+16, + vp04+16, vp05+16, vp06+16, vp07+16, + vp08+16, vp09+16, vp10+16, vp11+16, + vp12+16, vp13+16, vp14+16, vp15+16, + cbz, v05 ); + + cbz = fma( v05, dz, cbz ); + + //-------------------------------------------------------------------------- + // Update momentum. + //-------------------------------------------------------------------------- + // For a 5-10% performance hit, v00 = qdt_2mc/sqrt(blah) is a few ulps more + // accurate (but still quite in the noise numerically) for cyclotron + // frequencies approaching the nyquist frequency. 
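// Reading note, not part of this change: the block below is the relativistic
// Boris rotation. As I read it, v00 = q*dt/(2*m*c*gamma), x = v00*|cB| is
// half the rotation angle for the step, v03 approximates tan(x)/|cB| via the
// series tan(x) ~ x*( 1 + x^2/3 + 2*x^4/15 ) (v02 = x^2), and
// v04 = 2*v03/( 1 + |cB|^2*v03^2 ) is the usual 2*t/(1 + t^2) Boris factor.
// The sqrt-instead-of-rsqrt alternative mentioned above only perturbs the
// last few ulps of this angle.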
+ //-------------------------------------------------------------------------- + + ux += hax; + uy += hay; + uz += haz; + + v00 = qdt_2mc*rsqrt( one + fma( ux, ux, fma( uy, uy, uz*uz ) ) ); + v01 = fma( cbx, cbx, fma( cby, cby, cbz*cbz ) ); + v02 = (v00*v00)*v01; + v03 = v00*fma( fma( two_fifteenths, v02, one_third ), v02, one ); + v04 = v03*rcp( fma( v03*v03, v01, one ) ); + v04 += v04; + + v00 = fma( fms( uy, cbz, uz*cby ), v03, ux ); + v01 = fma( fms( uz, cbx, ux*cbz ), v03, uy ); + v02 = fma( fms( ux, cby, uy*cbx ), v03, uz ); + + ux = fma( fms( v01, cbz, v02*cby ), v04, ux ); + uy = fma( fms( v02, cbx, v00*cbz ), v04, uy ); + uz = fma( fms( v00, cby, v01*cbx ), v04, uz ); + + ux += hax; + uy += hay; + uz += haz; + + // Store ux, uy, uz in v06, v07, v08 so particle velocity store can be done + // later with the particle positions. + v06 = ux; + v07 = uy; + v08 = uz; + + //-------------------------------------------------------------------------- + // Update the position of in bound particles. + //-------------------------------------------------------------------------- + v00 = rsqrt( one + fma( ux, ux, fma( uy, uy, uz*uz ) ) ); + + ux *= cdt_dx; + uy *= cdt_dy; + uz *= cdt_dz; + + ux *= v00; + uy *= v00; + uz *= v00; // ux,uy,uz are normalized displ (relative to cell size) + + v00 = dx + ux; + v01 = dy + uy; + v02 = dz + uz; // New particle midpoint + + v03 = v00 + ux; + v04 = v01 + uy; + v05 = v02 + uz; // New particle position + + //-------------------------------------------------------------------------- + // Determine which particles are out of bounds. + //-------------------------------------------------------------------------- + outbnd = ( v03 > one ) | ( v03 < neg_one ) | + ( v04 > one ) | ( v04 < neg_one ) | + ( v05 > one ) | ( v05 < neg_one ); + + v03 = merge( outbnd, dx, v03 ); // Do not update outbnd particles + v04 = merge( outbnd, dy, v04 ); + v05 = merge( outbnd, dz, v05 ); + + //-------------------------------------------------------------------------- + // Store particle data, final. + //-------------------------------------------------------------------------- + store_16x8_tr_p( v03, v04, v05, ii, v06, v07, v08, q, + &p[ 0].dx, &p[ 2].dx, &p[ 4].dx, &p[ 6].dx, + &p[ 8].dx, &p[10].dx, &p[12].dx, &p[14].dx ); + + // Accumulate current of inbnd particles. + // Note: accumulator values are 4 times the total physical charge that + // passed through the appropriate current quadrant in a time-step. + q = czero( outbnd, q*qsp ); // Do not accumulate outbnd particles + + dx = v00; // Streak midpoint (valid for inbnd only) + dy = v01; + dz = v02; + + v13 = q*ux*uy*uz*one_third; // Charge conservation correction + + //-------------------------------------------------------------------------- + // Set current density accumulation pointers. 
+ //-------------------------------------------------------------------------- + vp00 = ( float * ALIGNED(64) ) ( a0 + ii( 0) ); + vp01 = ( float * ALIGNED(64) ) ( a0 + ii( 1) ); + vp02 = ( float * ALIGNED(64) ) ( a0 + ii( 2) ); + vp03 = ( float * ALIGNED(64) ) ( a0 + ii( 3) ); + vp04 = ( float * ALIGNED(64) ) ( a0 + ii( 4) ); + vp05 = ( float * ALIGNED(64) ) ( a0 + ii( 5) ); + vp06 = ( float * ALIGNED(64) ) ( a0 + ii( 6) ); + vp07 = ( float * ALIGNED(64) ) ( a0 + ii( 7) ); + vp08 = ( float * ALIGNED(64) ) ( a0 + ii( 8) ); + vp09 = ( float * ALIGNED(64) ) ( a0 + ii( 9) ); + vp10 = ( float * ALIGNED(64) ) ( a0 + ii(10) ); + vp11 = ( float * ALIGNED(64) ) ( a0 + ii(11) ); + vp12 = ( float * ALIGNED(64) ) ( a0 + ii(12) ); + vp13 = ( float * ALIGNED(64) ) ( a0 + ii(13) ); + vp14 = ( float * ALIGNED(64) ) ( a0 + ii(14) ); + vp15 = ( float * ALIGNED(64) ) ( a0 + ii(15) ); + + //-------------------------------------------------------------------------- + // Accumulate current density. + //-------------------------------------------------------------------------- + // Accumulate Jx for 16 particles into the v0-v3 vectors. + v12 = q*ux; // v12 = q ux + v01 = v12*dy; // v01 = q ux dy + v00 = v12-v01; // v00 = q ux (1-dy) + v01 += v12; // v01 = q ux (1+dy) + v12 = one+dz; // v12 = 1+dz + v02 = v00*v12; // v02 = q ux (1-dy)(1+dz) + v03 = v01*v12; // v03 = q ux (1+dy)(1+dz) + v12 = one-dz; // v12 = 1-dz + v00 *= v12; // v00 = q ux (1-dy)(1-dz) + v01 *= v12; // v01 = q ux (1+dy)(1-dz) + v00 += v13; // v00 = q ux [ (1-dy)(1-dz) + uy*uz/3 ] + v01 -= v13; // v01 = q ux [ (1+dy)(1-dz) - uy*uz/3 ] + v02 -= v13; // v02 = q ux [ (1-dy)(1+dz) - uy*uz/3 ] + v03 += v13; // v03 = q ux [ (1+dy)(1+dz) + uy*uz/3 ] + + // Accumulate Jy for 16 particles into the v4-v7 vectors. + v12 = q*uy; // v12 = q uy + v05 = v12*dz; // v05 = q uy dz + v04 = v12-v05; // v04 = q uy (1-dz) + v05 += v12; // v05 = q uy (1+dz) + v12 = one+dx; // v12 = 1+dx + v06 = v04*v12; // v06 = q uy (1-dz)(1+dx) + v07 = v05*v12; // v07 = q uy (1+dz)(1+dx) + v12 = one-dx; // v12 = 1-dx + v04 *= v12; // v04 = q uy (1-dz)(1-dx) + v05 *= v12; // v05 = q uy (1+dz)(1-dx) + v04 += v13; // v04 = q uy [ (1-dz)(1-dx) + ux*uz/3 ] + v05 -= v13; // v05 = q uy [ (1+dz)(1-dx) - ux*uz/3 ] + v06 -= v13; // v06 = q uy [ (1-dz)(1+dx) - ux*uz/3 ] + v07 += v13; // v07 = q uy [ (1+dz)(1+dx) + ux*uz/3 ] + + // Accumulate Jz for 16 particles into the v8-v11 vectors. + v12 = q*uz; // v12 = q uz + v09 = v12*dx; // v09 = q uz dx + v08 = v12-v09; // v08 = q uz (1-dx) + v09 += v12; // v09 = q uz (1+dx) + v12 = one+dy; // v12 = 1+dy + v10 = v08*v12; // v10 = q uz (1-dx)(1+dy) + v11 = v09*v12; // v11 = q uz (1+dx)(1+dy) + v12 = one-dy; // v12 = 1-dy + v08 *= v12; // v08 = q uz (1-dx)(1-dy) + v09 *= v12; // v09 = q uz (1+dx)(1-dy) + v08 += v13; // v08 = q uz [ (1-dx)(1-dy) + ux*uy/3 ] + v09 -= v13; // v09 = q uz [ (1+dx)(1-dy) - ux*uy/3 ] + v10 -= v13; // v10 = q uz [ (1-dx)(1+dy) - ux*uy/3 ] + v11 += v13; // v11 = q uz [ (1+dx)(1+dy) + ux*uy/3 ] + + // Zero the v12-v15 vectors prior to transposing the data. + v12 = 0.0; + v13 = 0.0; + v14 = 0.0; + v15 = 0.0; + + // Transpose the data in vectors v0-v15 so it can be added into the + // accumulator arrays using vector operations. + transpose( v00, v01, v02, v03, v04, v05, v06, v07, + v08, v09, v10, v11, v12, v13, v14, v15 ); + + // Add the contributions to Jx, Jy and Jz from 16 particles into the + // accumulator arrays for Jx, Jy and Jz. 
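// Layout note, not part of this change: before the transpose, each of v00-v11
// holds one accumulator slot (4 Jx, then 4 Jy, then 4 Jz entries) for all 16
// particles, and v12-v15 are zeroed padding. After the 16x16 transpose, each
// vector instead holds the 16 consecutive floats headed for a single
// particle's accumulator, so one increment_16x1() per particle adds its 12
// contributions (plus 4 zero pads, assuming the accumulator struct is padded
// out to 64 bytes) with a single vector operation.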
+ increment_16x1( vp00, v00 ); + increment_16x1( vp01, v01 ); + increment_16x1( vp02, v02 ); + increment_16x1( vp03, v03 ); + increment_16x1( vp04, v04 ); + increment_16x1( vp05, v05 ); + increment_16x1( vp06, v06 ); + increment_16x1( vp07, v07 ); + increment_16x1( vp08, v08 ); + increment_16x1( vp09, v09 ); + increment_16x1( vp10, v10 ); + increment_16x1( vp11, v11 ); + increment_16x1( vp12, v12 ); + increment_16x1( vp13, v13 ); + increment_16x1( vp14, v14 ); + increment_16x1( vp15, v15 ); + + //-------------------------------------------------------------------------- + // Update position and accumulate current density for out of bounds + // particles. + //-------------------------------------------------------------------------- + +# define MOVE_OUTBND(N) \ + if ( outbnd(N) ) /* Unlikely */ \ + { \ + local_pm->dispx = ux(N); \ + local_pm->dispy = uy(N); \ + local_pm->dispz = uz(N); \ + local_pm->i = ( p - p0 ) + N; \ + if ( move_p( p0, local_pm, a0, g, _qsp ) ) /* Unlikely */ \ + { \ + if ( nm < max_nm ) \ + { \ + v4::copy_4x1( &pm[nm++], local_pm ); \ + } \ + else /* Unlikely */ \ + { \ + itmp++; \ + } \ + } \ + } + + MOVE_OUTBND( 0); + MOVE_OUTBND( 1); + MOVE_OUTBND( 2); + MOVE_OUTBND( 3); + MOVE_OUTBND( 4); + MOVE_OUTBND( 5); + MOVE_OUTBND( 6); + MOVE_OUTBND( 7); + MOVE_OUTBND( 8); + MOVE_OUTBND( 9); + MOVE_OUTBND(10); + MOVE_OUTBND(11); + MOVE_OUTBND(12); + MOVE_OUTBND(13); + MOVE_OUTBND(14); + MOVE_OUTBND(15); + +# undef MOVE_OUTBND + } + + args->seg[pipeline_rank].pm = pm; + args->seg[pipeline_rank].max_nm = max_nm; + args->seg[pipeline_rank].nm = nm; + args->seg[pipeline_rank].n_ignored = itmp; +} + +#else + +void +advance_p_pipeline_v16( advance_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v16 implementation. + ERROR( ( "No advance_p_pipeline_v16 implementation." ) ); +} + +#endif diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline_v4.cc b/src/species_advance/standard/pipeline/advance_p_pipeline_v4.cc new file mode 100644 index 00000000..4e23770e --- /dev/null +++ b/src/species_advance/standard/pipeline/advance_p_pipeline_v4.cc @@ -0,0 +1,322 @@ +#define IN_spa + +#include "spa_private.h" + +#if defined(V4_ACCELERATION) + +using namespace v4; + +void +advance_p_pipeline_v4( advance_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + particle_t * ALIGNED(128) p0 = args->p0; + accumulator_t * ALIGNED(128) a0 = args->a0; + const interpolator_t * ALIGNED(128) f0 = args->f0; + const grid_t * g = args->g; + + particle_t * ALIGNED(128) p; + particle_mover_t * ALIGNED(16) pm; + + float * ALIGNED(16) vp00; + float * ALIGNED(16) vp01; + float * ALIGNED(16) vp02; + float * ALIGNED(16) vp03; + + // Basic constants. + const v4float qdt_2mc(args->qdt_2mc); + const v4float cdt_dx(args->cdt_dx); + const v4float cdt_dy(args->cdt_dy); + const v4float cdt_dz(args->cdt_dz); + const v4float qsp(args->qsp); + const v4float one(1.0); + const v4float one_third(1.0/3.0); + const v4float two_fifteenths(2.0/15.0); + const v4float neg_one(-1.0); + + const float _qsp = args->qsp; + + v4float dx, dy, dz, ux, uy, uz, q; + v4float hax, hay, haz, cbx, cby, cbz; + v4float v00, v01, v02, v03, v04, v05; + v4int ii, outbnd; + + int itmp, nq, nm, max_nm; + + DECLARE_ALIGNED_ARRAY( particle_mover_t, 16, local_pm, 1 ); + + // Determine which quads of particle quads this pipeline processes. 
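// Reading note, not part of this change: as I read it, DISTRIBUTE hands this
// pipeline a contiguous block of particles starting at index itmp, sized in
// multiples of the 16-particle bundle, with any final partial bundle left to
// the host rank. The nq >>= 2 below then re-expresses that count as a number
// of 4-particle quads for the vector loop.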
+ + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, itmp, nq ); + + p = args->p0 + itmp; + + nq >>= 2; + + // Determine which movers are reserved for this pipeline. + // Movers (16 bytes) should be reserved for pipelines in at least + // multiples of 8 such that the set of particle movers reserved for + // a pipeline is 128-byte aligned and a multiple of 128-byte in + // size. The host is guaranteed to get enough movers to process its + // particles with this allocation. + + max_nm = args->max_nm - ( args->np&15 ); + + if ( max_nm < 0 ) max_nm = 0; + + DISTRIBUTE( max_nm, 8, pipeline_rank, n_pipeline, itmp, max_nm ); + + if ( pipeline_rank == n_pipeline ) max_nm = args->max_nm - itmp; + + pm = args->pm + itmp; + nm = 0; + itmp = 0; + + // Determine which accumulator array to use. + // The host gets the first accumulator array. + + a0 += ( 1 + pipeline_rank ) * + POW2_CEIL( (args->nx+2)*(args->ny+2)*(args->nz+2), 2 ); + + // Process the particle blocks for this pipeline. + + for( ; nq; nq--, p+=4 ) + { + //-------------------------------------------------------------------------- + // Load particle data. + //-------------------------------------------------------------------------- + load_4x4_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, + dx, dy, dz, ii ); + + //-------------------------------------------------------------------------- + // Set field interpolation pointers. + //-------------------------------------------------------------------------- + vp00 = ( float * ALIGNED(16) ) ( f0 + ii( 0) ); + vp01 = ( float * ALIGNED(16) ) ( f0 + ii( 1) ); + vp02 = ( float * ALIGNED(16) ) ( f0 + ii( 2) ); + vp03 = ( float * ALIGNED(16) ) ( f0 + ii( 3) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_4x4_tr( vp00, vp01, vp02, vp03, + hax, v00, v01, v02 ); + + hax = qdt_2mc*fma( fma( v02, dy, v01 ), dz, fma( v00, dy, hax ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_4x4_tr( vp00+4, vp01+4, vp02+4, vp03+4, + hay, v03, v04, v05 ); + + hay = qdt_2mc*fma( fma( v05, dz, v04 ), dx, fma( v03, dz, hay ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_4x4_tr( vp00+8, vp01+8, vp02+8, vp03+8, + haz, v00, v01, v02 ); + + haz = qdt_2mc*fma( fma( v02, dx, v01 ), dy, fma( v00, dx, haz ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_4x4_tr( vp00+12, vp01+12, vp02+12, vp03+12, + cbx, v03, cby, v04 ); + + cbx = fma( v03, dx, cbx ); + + cby = fma( v04, dy, cby ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles, final. + //-------------------------------------------------------------------------- + load_4x2_tr( vp00+16, vp01+16, vp02+16, vp03+16, + cbz, v05 ); + + cbz = fma( v05, dz, cbz ); + + //-------------------------------------------------------------------------- + // Load particle data. 
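// Reading note, not part of this change: load_4x4_tr below gathers the
// { ux, uy, uz, w } quartet from four consecutive array-of-structs particles
// and transposes them into one SIMD vector per field, so the momentum update
// that follows operates on all four particles at once; the same pattern was
// used above for { dx, dy, dz, i } and for the interpolator coefficients.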
+ //-------------------------------------------------------------------------- + load_4x4_tr( &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux, + ux, uy, uz, q ); + + //-------------------------------------------------------------------------- + // Update momentum. + //-------------------------------------------------------------------------- + // For a 5-10% performance hit, v00 = qdt_2mc/sqrt(blah) is a few ulps more + // accurate (but still quite in the noise numerically) for cyclotron + // frequencies approaching the nyquist frequency. + //-------------------------------------------------------------------------- + + ux += hax; + uy += hay; + uz += haz; + + v00 = qdt_2mc*rsqrt( one + fma( ux, ux, fma( uy, uy, uz*uz ) ) ); + v01 = fma( cbx, cbx, fma( cby, cby, cbz*cbz ) ); + v02 = (v00*v00)*v01; + v03 = v00*fma( fma( two_fifteenths, v02, one_third ), v02, one ); + v04 = v03*rcp( fma( v03*v03, v01, one ) ); + v04 += v04; + + v00 = fma( fms( uy, cbz, uz*cby ), v03, ux ); + v01 = fma( fms( uz, cbx, ux*cbz ), v03, uy ); + v02 = fma( fms( ux, cby, uy*cbx ), v03, uz ); + + ux = fma( fms( v01, cbz, v02*cby ), v04, ux ); + uy = fma( fms( v02, cbx, v00*cbz ), v04, uy ); + uz = fma( fms( v00, cby, v01*cbx ), v04, uz ); + + ux += hax; + uy += hay; + uz += haz; + + //-------------------------------------------------------------------------- + // Store particle data. + //-------------------------------------------------------------------------- + store_4x4_tr( ux, uy, uz, q, + &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux ); + + //-------------------------------------------------------------------------- + // Update the position of in bound particles. + //-------------------------------------------------------------------------- + v00 = rsqrt( one + fma( ux, ux, fma( uy, uy, uz*uz ) ) ); + + ux *= cdt_dx; + uy *= cdt_dy; + uz *= cdt_dz; + + ux *= v00; + uy *= v00; + uz *= v00; // ux,uy,uz are normalized displ (relative to cell size) + + v00 = dx + ux; + v01 = dy + uy; + v02 = dz + uz; // New particle midpoint + + v03 = v00 + ux; + v04 = v01 + uy; + v05 = v02 + uz; // New particle position + + //-------------------------------------------------------------------------- + // Determine which particles are out of bounds. + //-------------------------------------------------------------------------- + outbnd = ( v03 > one ) | ( v03 < neg_one ) | + ( v04 > one ) | ( v04 < neg_one ) | + ( v05 > one ) | ( v05 < neg_one ); + + v03 = merge( outbnd, dx, v03 ); // Do not update outbnd particles + v04 = merge( outbnd, dy, v04 ); + v05 = merge( outbnd, dz, v05 ); + + //-------------------------------------------------------------------------- + // Store particle data, final. + //-------------------------------------------------------------------------- + store_4x4_tr( v03, v04, v05, ii, + &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx ); + + // Accumulate current of inbnd particles. + // Note: accumulator values are 4 times the total physical charge that + // passed through the appropriate current quadrant in a time-step. + q = czero( outbnd, q*qsp ); // Do not accumulate outbnd particles + + dx = v00; // Streak midpoint (valid for inbnd only) + dy = v01; + dz = v02; + + v05 = q*ux*uy*uz*one_third; // Charge conservation correction + + //-------------------------------------------------------------------------- + // Set current density accumulation pointers. 
+ //-------------------------------------------------------------------------- + vp00 = ( float * ALIGNED(16) ) ( a0 + ii( 0) ); + vp01 = ( float * ALIGNED(16) ) ( a0 + ii( 1) ); + vp02 = ( float * ALIGNED(16) ) ( a0 + ii( 2) ); + vp03 = ( float * ALIGNED(16) ) ( a0 + ii( 3) ); + + //-------------------------------------------------------------------------- + // Accumulate current density. + //-------------------------------------------------------------------------- +# define ACCUMULATE_J(X,Y,Z,offset) \ + v04 = q*u##X; /* v04 = q ux */ \ + v01 = v04*d##Y; /* v01 = q ux dy */ \ + v00 = v04-v01; /* v00 = q ux (1-dy) */ \ + v01 += v04; /* v01 = q ux (1+dy) */ \ + v04 = one+d##Z; /* v04 = 1+dz */ \ + v02 = v00*v04; /* v02 = q ux (1-dy)(1+dz) */ \ + v03 = v01*v04; /* v03 = q ux (1+dy)(1+dz) */ \ + v04 = one-d##Z; /* v04 = 1-dz */ \ + v00 *= v04; /* v00 = q ux (1-dy)(1-dz) */ \ + v01 *= v04; /* v01 = q ux (1+dy)(1-dz) */ \ + v00 += v05; /* v00 = q ux [ (1-dy)(1-dz) + uy*uz/3 ] */ \ + v01 -= v05; /* v01 = q ux [ (1+dy)(1-dz) - uy*uz/3 ] */ \ + v02 -= v05; /* v02 = q ux [ (1-dy)(1+dz) - uy*uz/3 ] */ \ + v03 += v05; /* v03 = q ux [ (1+dy)(1+dz) + uy*uz/3 ] */ \ + transpose( v00, v01, v02, v03 ); \ + increment_4x1( vp00+offset, v00 ); \ + increment_4x1( vp01+offset, v01 ); \ + increment_4x1( vp02+offset, v02 ); \ + increment_4x1( vp03+offset, v03 ) + + ACCUMULATE_J( x, y, z, 0 ); + ACCUMULATE_J( y, z, x, 4 ); + ACCUMULATE_J( z, x, y, 8 ); + +# undef ACCUMULATE_J + + //-------------------------------------------------------------------------- + // Update position and accumulate current density for out of bounds + // particles. + //-------------------------------------------------------------------------- + +# define MOVE_OUTBND(N) \ + if ( outbnd(N) ) /* Unlikely */ \ + { \ + local_pm->dispx = ux(N); \ + local_pm->dispy = uy(N); \ + local_pm->dispz = uz(N); \ + local_pm->i = ( p - p0 ) + N; \ + if ( move_p( p0, local_pm, a0, g, _qsp ) ) /* Unlikely */ \ + { \ + if ( nm < max_nm ) \ + { \ + copy_4x1( &pm[nm++], local_pm ); \ + } \ + else /* Unlikely */ \ + { \ + itmp++; \ + } \ + } \ + } + + MOVE_OUTBND( 0); + MOVE_OUTBND( 1); + MOVE_OUTBND( 2); + MOVE_OUTBND( 3); + +# undef MOVE_OUTBND + } + + args->seg[pipeline_rank].pm = pm; + args->seg[pipeline_rank].max_nm = max_nm; + args->seg[pipeline_rank].nm = nm; + args->seg[pipeline_rank].n_ignored = itmp; +} + +#else + +void +advance_p_pipeline_v4( advance_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v4 implementation. + ERROR( ( "No advance_p_pipeline_v4 implementation." 
) ); +} + +#endif diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline_v8.cc b/src/species_advance/standard/pipeline/advance_p_pipeline_v8.cc new file mode 100644 index 00000000..84ed3916 --- /dev/null +++ b/src/species_advance/standard/pipeline/advance_p_pipeline_v8.cc @@ -0,0 +1,395 @@ +#define IN_spa + +#include "spa_private.h" + +#if defined(V8_ACCELERATION) + +using namespace v8; + +void +advance_p_pipeline_v8( advance_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + particle_t * ALIGNED(128) p0 = args->p0; + accumulator_t * ALIGNED(128) a0 = args->a0; + const interpolator_t * ALIGNED(128) f0 = args->f0; + const grid_t * g = args->g; + + particle_t * ALIGNED(128) p; + particle_mover_t * ALIGNED(16) pm; + + float * ALIGNED(32) vp00; + float * ALIGNED(32) vp01; + float * ALIGNED(32) vp02; + float * ALIGNED(32) vp03; + float * ALIGNED(32) vp04; + float * ALIGNED(32) vp05; + float * ALIGNED(32) vp06; + float * ALIGNED(32) vp07; + + // Basic constants. + const v8float qdt_2mc(args->qdt_2mc); + const v8float cdt_dx(args->cdt_dx); + const v8float cdt_dy(args->cdt_dy); + const v8float cdt_dz(args->cdt_dz); + const v8float qsp(args->qsp); + const v8float one(1.0); + const v8float one_third(1.0/3.0); + const v8float two_fifteenths(2.0/15.0); + const v8float neg_one(-1.0); + + const float _qsp = args->qsp; + + v8float dx, dy, dz, ux, uy, uz, q; + v8float hax, hay, haz, cbx, cby, cbz; + v8float v00, v01, v02, v03, v04, v05, v06, v07, v08, v09; + v8int ii, outbnd; + + int itmp, nq, nm, max_nm; + + DECLARE_ALIGNED_ARRAY( particle_mover_t, 16, local_pm, 1 ); + + // Determine which quads of particle quads this pipeline processes. + + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, itmp, nq ); + + p = args->p0 + itmp; + + nq >>= 3; + + // Determine which movers are reserved for this pipeline. + // Movers (16 bytes) should be reserved for pipelines in at least + // multiples of 8 such that the set of particle movers reserved for + // a pipeline is 128-byte aligned and a multiple of 128-byte in + // size. The host is guaranteed to get enough movers to process its + // particles with this allocation. + + max_nm = args->max_nm - ( args->np&15 ); + + if ( max_nm < 0 ) max_nm = 0; + + DISTRIBUTE( max_nm, 8, pipeline_rank, n_pipeline, itmp, max_nm ); + + if ( pipeline_rank == n_pipeline ) max_nm = args->max_nm - itmp; + + pm = args->pm + itmp; + nm = 0; + itmp = 0; + + // Determine which accumulator array to use. + // The host gets the first accumulator array. + + a0 += ( 1 + pipeline_rank ) * + POW2_CEIL( (args->nx+2)*(args->ny+2)*(args->nz+2), 2 ); + + // Process the particle blocks for this pipeline. + + for( ; nq; nq--, p+=8 ) + { + //-------------------------------------------------------------------------- + // Load particle data. + //-------------------------------------------------------------------------- + load_8x8_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, + &p[4].dx, &p[5].dx, &p[6].dx, &p[7].dx, + dx, dy, dz, ii, ux, uy, uz, q ); + + //-------------------------------------------------------------------------- + // Set field interpolation pointers. 
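+    // ii holds the voxel index of each of the eight particles; f0 + ii(n) is
+    // reinterpreted as a flat float array so that the 18 interpolation
+    // coefficients actually used (E with its cross derivatives, then B with
+    // its gradients) can be gathered by the transposed loads below.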
+ //-------------------------------------------------------------------------- + vp00 = ( float * ALIGNED(32) ) ( f0 + ii( 0) ); + vp01 = ( float * ALIGNED(32) ) ( f0 + ii( 1) ); + vp02 = ( float * ALIGNED(32) ) ( f0 + ii( 2) ); + vp03 = ( float * ALIGNED(32) ) ( f0 + ii( 3) ); + vp04 = ( float * ALIGNED(32) ) ( f0 + ii( 4) ); + vp05 = ( float * ALIGNED(32) ) ( f0 + ii( 5) ); + vp06 = ( float * ALIGNED(32) ) ( f0 + ii( 6) ); + vp07 = ( float * ALIGNED(32) ) ( f0 + ii( 7) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_8x8_tr( vp00, vp01, vp02, vp03, + vp04, vp05, vp06, vp07, + hax, v00, v01, v02, hay, v03, v04, v05 ); + + hax = qdt_2mc*fma( fma( v02, dy, v01 ), dz, fma( v00, dy, hax ) ); + + hay = qdt_2mc*fma( fma( v05, dz, v04 ), dx, fma( v03, dz, hay ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_8x8_tr( vp00+8, vp01+8, vp02+8, vp03+8, + vp04+8, vp05+8, vp06+8, vp07+8, + haz, v00, v01, v02, cbx, v03, cby, v04 ); + + haz = qdt_2mc*fma( fma( v02, dx, v01 ), dy, fma( v00, dx, haz ) ); + + cbx = fma( v03, dx, cbx ); + + cby = fma( v04, dy, cby ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles, final. + //-------------------------------------------------------------------------- + load_8x2_tr( vp00+16, vp01+16, vp02+16, vp03+16, + vp04+16, vp05+16, vp06+16, vp07+16, + cbz, v05 ); + + cbz = fma( v05, dz, cbz ); + + //-------------------------------------------------------------------------- + // Update momentum. + //-------------------------------------------------------------------------- + // For a 5-10% performance hit, v00 = qdt_2mc/sqrt(blah) is a few ulps more + // accurate (but still quite in the noise numerically) for cyclotron + // frequencies approaching the nyquist frequency. + //-------------------------------------------------------------------------- + + ux += hax; + uy += hay; + uz += haz; + + v00 = qdt_2mc*rsqrt( one + fma( ux, ux, fma( uy, uy, uz*uz ) ) ); + v01 = fma( cbx, cbx, fma( cby, cby, cbz*cbz ) ); + v02 = (v00*v00)*v01; + v03 = v00*fma( fma( two_fifteenths, v02, one_third ), v02, one ); + v04 = v03*rcp( fma( v03*v03, v01, one ) ); + v04 += v04; + + v00 = fma( fms( uy, cbz, uz*cby ), v03, ux ); + v01 = fma( fms( uz, cbx, ux*cbz ), v03, uy ); + v02 = fma( fms( ux, cby, uy*cbx ), v03, uz ); + + ux = fma( fms( v01, cbz, v02*cby ), v04, ux ); + uy = fma( fms( v02, cbx, v00*cbz ), v04, uy ); + uz = fma( fms( v00, cby, v01*cbx ), v04, uz ); + + ux += hax; + uy += hay; + uz += haz; + + // Store ux, uy, uz in v06, v07, v08 so particle velocity store can be done + // later with the particle positions. + v06 = ux; + v07 = uy; + v08 = uz; + + //-------------------------------------------------------------------------- + // Update the position of in bound particles. 
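+    // In cell-normalized coordinates the half-step displacement is
+    //   disp = (u/gamma) * c*dt/dcell  with  gamma = sqrt( 1 + u.u ),
+    // evaluated below with rsqrt.  v00..v02 become the streak midpoint
+    // (old position + disp) and v03..v05 the new position
+    // (old position + 2*disp); any coordinate outside [-1,1] flags the
+    // particle as out of bounds for the mover pass.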
+ //-------------------------------------------------------------------------- + v00 = rsqrt( one + fma( ux, ux, fma( uy, uy, uz*uz ) ) ); + + ux *= cdt_dx; + uy *= cdt_dy; + uz *= cdt_dz; + + ux *= v00; + uy *= v00; + uz *= v00; // ux,uy,uz are normalized displ (relative to cell size) + + v00 = dx + ux; + v01 = dy + uy; + v02 = dz + uz; // New particle midpoint + + v03 = v00 + ux; + v04 = v01 + uy; + v05 = v02 + uz; // New particle position + + //-------------------------------------------------------------------------- + // Determine which particles are out of bounds. + //-------------------------------------------------------------------------- + outbnd = ( v03 > one ) | ( v03 < neg_one ) | + ( v04 > one ) | ( v04 < neg_one ) | + ( v05 > one ) | ( v05 < neg_one ); + + v03 = merge( outbnd, dx, v03 ); // Do not update outbnd particles + v04 = merge( outbnd, dy, v04 ); + v05 = merge( outbnd, dz, v05 ); + + //-------------------------------------------------------------------------- + // Store particle data, final. + //-------------------------------------------------------------------------- + store_8x8_tr( v03, v04, v05, ii, v06, v07, v08, q, + &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, + &p[4].dx, &p[5].dx, &p[6].dx, &p[7].dx ); + + // Accumulate current of inbnd particles. + // Note: accumulator values are 4 times the total physical charge that + // passed through the appropriate current quadrant in a time-step. + q = czero( outbnd, q*qsp ); // Do not accumulate outbnd particles + + dx = v00; // Streak midpoint (valid for inbnd only) + dy = v01; + dz = v02; + + v09 = q*ux*uy*uz*one_third; // Charge conservation correction + + //-------------------------------------------------------------------------- + // Set current density accumulation pointers. + //-------------------------------------------------------------------------- + vp00 = ( float * ALIGNED(32) ) ( a0 + ii( 0) ); + vp01 = ( float * ALIGNED(32) ) ( a0 + ii( 1) ); + vp02 = ( float * ALIGNED(32) ) ( a0 + ii( 2) ); + vp03 = ( float * ALIGNED(32) ) ( a0 + ii( 3) ); + vp04 = ( float * ALIGNED(32) ) ( a0 + ii( 4) ); + vp05 = ( float * ALIGNED(32) ) ( a0 + ii( 5) ); + vp06 = ( float * ALIGNED(32) ) ( a0 + ii( 6) ); + vp07 = ( float * ALIGNED(32) ) ( a0 + ii( 7) ); + + //-------------------------------------------------------------------------- + // Accumulate current density. 
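+    // For each component the deposit is q*u times the four bilinear weights
+    //   ( 1 -/+ dY )*( 1 -/+ dZ )
+    // evaluated at the streak midpoint, with the q*ux*uy*uz/3 term (v09)
+    // added or subtracted as the charge conservation correction, exactly as
+    // spelled out in the macro comments below.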
+ //-------------------------------------------------------------------------- +# define ACCUMULATE_JX(X,Y,Z) \ + v08 = q*u##X; /* v08 = q ux */ \ + v01 = v08*d##Y; /* v01 = q ux dy */ \ + v00 = v08-v01; /* v00 = q ux (1-dy) */ \ + v01 += v08; /* v01 = q ux (1+dy) */ \ + v08 = one+d##Z; /* v08 = 1+dz */ \ + v02 = v00*v08; /* v02 = q ux (1-dy)(1+dz) */ \ + v03 = v01*v08; /* v03 = q ux (1+dy)(1+dz) */ \ + v08 = one-d##Z; /* v08 = 1-dz */ \ + v00 *= v08; /* v00 = q ux (1-dy)(1-dz) */ \ + v01 *= v08; /* v01 = q ux (1+dy)(1-dz) */ \ + v00 += v09; /* v00 = q ux [ (1-dy)(1-dz) + uy*uz/3 ] */ \ + v01 -= v09; /* v01 = q ux [ (1+dy)(1-dz) - uy*uz/3 ] */ \ + v02 -= v09; /* v02 = q ux [ (1-dy)(1+dz) - uy*uz/3 ] */ \ + v03 += v09; /* v03 = q ux [ (1+dy)(1+dz) + uy*uz/3 ] */ + +# define ACCUMULATE_JY(X,Y,Z) \ + v08 = q*u##X; /* v08 = q ux */ \ + v05 = v08*d##Y; /* v05 = q ux dy */ \ + v04 = v08-v05; /* v04 = q ux (1-dy) */ \ + v05 += v08; /* v05 = q ux (1+dy) */ \ + v08 = one+d##Z; /* v08 = 1+dz */ \ + v06 = v04*v08; /* v06 = q ux (1-dy)(1+dz) */ \ + v07 = v05*v08; /* v07 = q ux (1+dy)(1+dz) */ \ + v08 = one-d##Z; /* v08 = 1-dz */ \ + v04 *= v08; /* v04 = q ux (1-dy)(1-dz) */ \ + v05 *= v08; /* v05 = q ux (1+dy)(1-dz) */ \ + v04 += v09; /* v04 = q ux [ (1-dy)(1-dz) + uy*uz/3 ] */ \ + v05 -= v09; /* v05 = q ux [ (1+dy)(1-dz) - uy*uz/3 ] */ \ + v06 -= v09; /* v06 = q ux [ (1-dy)(1+dz) - uy*uz/3 ] */ \ + v07 += v09; /* v07 = q ux [ (1+dy)(1+dz) + uy*uz/3 ] */ + +# define ACCUMULATE_JZ(X,Y,Z) \ + v08 = q*u##X; /* v08 = q ux */ \ + v01 = v08*d##Y; /* v01 = q ux dy */ \ + v00 = v08-v01; /* v00 = q ux (1-dy) */ \ + v01 += v08; /* v01 = q ux (1+dy) */ \ + v08 = one+d##Z; /* v08 = 1+dz */ \ + v02 = v00*v08; /* v02 = q ux (1-dy)(1+dz) */ \ + v03 = v01*v08; /* v03 = q ux (1+dy)(1+dz) */ \ + v08 = one-d##Z; /* v08 = 1-dz */ \ + v00 *= v08; /* v00 = q ux (1-dy)(1-dz) */ \ + v01 *= v08; /* v01 = q ux (1+dy)(1-dz) */ \ + v00 += v09; /* v00 = q ux [ (1-dy)(1-dz) + uy*uz/3 ] */ \ + v01 -= v09; /* v01 = q ux [ (1+dy)(1-dz) - uy*uz/3 ] */ \ + v02 -= v09; /* v02 = q ux [ (1-dy)(1+dz) - uy*uz/3 ] */ \ + v03 += v09; /* v03 = q ux [ (1+dy)(1+dz) + uy*uz/3 ] */ + + // Accumulate Jx for 8 particles into the v00-v03 vectors. + ACCUMULATE_JX( x, y, z ); + + // Accumulate Jy for 8 particles into the v04-v07 vectors. + ACCUMULATE_JY( y, z, x ); + + // Transpose the data in vectors v00-v07 so it can be added into the + // accumulator arrays using vector operations. + transpose( v00, v01, v02, v03, v04, v05, v06, v07 ); + + // Add the contributions to Jx and Jy from 8 particles into the + // accumulator arrays for Jx and Jy. + increment_8x1( vp00, v00 ); + increment_8x1( vp01, v01 ); + increment_8x1( vp02, v02 ); + increment_8x1( vp03, v03 ); + increment_8x1( vp04, v04 ); + increment_8x1( vp05, v05 ); + increment_8x1( vp06, v06 ); + increment_8x1( vp07, v07 ); + + // Accumulate Jz for 8 particles into the v00-v03 vectors. + ACCUMULATE_JZ( z, x, y ); + + // Zero the v04-v07 vectors prior to transposing the data. + v04 = 0.0; + v05 = 0.0; + v06 = 0.0; + v07 = 0.0; + + // Transpose the data in vectors v00-v07 so it can be added into the + // accumulator arrays using vector operations. + transpose( v00, v01, v02, v03, v04, v05, v06, v07 ); + + // Add the contributions to Jz from 8 particles into the accumulator + // arrays for Jz. 
+ increment_8x1( vp00 + 8, v00 ); + increment_8x1( vp01 + 8, v01 ); + increment_8x1( vp02 + 8, v02 ); + increment_8x1( vp03 + 8, v03 ); + increment_8x1( vp04 + 8, v04 ); + increment_8x1( vp05 + 8, v05 ); + increment_8x1( vp06 + 8, v06 ); + increment_8x1( vp07 + 8, v07 ); + +# undef ACCUMULATE_JX +# undef ACCUMULATE_JY +# undef ACCUMULATE_JZ + + //-------------------------------------------------------------------------- + // Update position and accumulate current density for out of bounds + // particles. + //-------------------------------------------------------------------------- + +# define MOVE_OUTBND(N) \ + if ( outbnd(N) ) /* Unlikely */ \ + { \ + local_pm->dispx = ux(N); \ + local_pm->dispy = uy(N); \ + local_pm->dispz = uz(N); \ + local_pm->i = ( p - p0 ) + N; \ + if ( move_p( p0, local_pm, a0, g, _qsp ) ) /* Unlikely */ \ + { \ + if ( nm < max_nm ) \ + { \ + v4::copy_4x1( &pm[nm++], local_pm ); \ + } \ + else /* Unlikely */ \ + { \ + itmp++; \ + } \ + } \ + } + + MOVE_OUTBND( 0); + MOVE_OUTBND( 1); + MOVE_OUTBND( 2); + MOVE_OUTBND( 3); + MOVE_OUTBND( 4); + MOVE_OUTBND( 5); + MOVE_OUTBND( 6); + MOVE_OUTBND( 7); + +# undef MOVE_OUTBND + } + + args->seg[pipeline_rank].pm = pm; + args->seg[pipeline_rank].max_nm = max_nm; + args->seg[pipeline_rank].nm = nm; + args->seg[pipeline_rank].n_ignored = itmp; +} + +#else + +void +advance_p_pipeline_v8( advance_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v8 implementation. + ERROR( ( "No advance_p_pipeline_v8 implementation." ) ); +} + +#endif diff --git a/src/species_advance/standard/pipeline/center_p_pipeline.cc b/src/species_advance/standard/pipeline/center_p_pipeline.cc new file mode 100644 index 00000000..64b8ea32 --- /dev/null +++ b/src/species_advance/standard/pipeline/center_p_pipeline.cc @@ -0,0 +1,128 @@ +#define IN_spa + +#define HAS_V4_PIPELINE +#define HAS_V8_PIPELINE +#define HAS_V16_PIPELINE + +#include "spa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +//----------------------------------------------------------------------------// +// Reference implementation for a center_p pipeline function which does not +// make use of explicit calls to vector intrinsic functions. +//----------------------------------------------------------------------------// + +void +center_p_pipeline_scalar( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + const interpolator_t * ALIGNED(128) f0 = args->f0; + + particle_t * ALIGNED(32) p; + + const interpolator_t * ALIGNED(16) f; + + const float qdt_2mc = args->qdt_2mc; + const float qdt_4mc = 0.5*args->qdt_2mc; // For half Boris rotate + const float one = 1.0; + const float one_third = 1.0/3.0; + const float two_fifteenths = 2.0/15.0; + + float dx, dy, dz, ux, uy, uz; + float hax, hay, haz, cbx, cby, cbz; + float v0, v1, v2, v3, v4; + int ii; + + int first, n; + + // Determine which particles this pipeline processes. + + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, first, n ); + + p = args->p0 + first; + + // Process particles for this pipeline. 
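+  // For each particle: interpolate E and B at the particle position, apply a
+  // half acceleration from E (qdt_2mc) followed by a half-angle Boris
+  // rotation from B (qdt_4mc), and store the updated momentum.  Positions
+  // are left untouched; this pass only recenters the momenta.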
+ + for( ; n; n--, p++ ) + { + dx = p->dx; // Load position + dy = p->dy; + dz = p->dz; + ii = p->i; + + f = f0 + ii; // Interpolate E + + hax = qdt_2mc*( ( f->ex + dy*f->dexdy ) + + dz*( f->dexdz + dy*f->d2exdydz ) ); + + hay = qdt_2mc*( ( f->ey + dz*f->deydz ) + + dx*( f->deydx + dz*f->d2eydzdx ) ); + + haz = qdt_2mc*( ( f->ez + dx*f->dezdx ) + + dy*( f->dezdy + dx*f->d2ezdxdy ) ); + + cbx = f->cbx + dx*f->dcbxdx; // Interpolate B + cby = f->cby + dy*f->dcbydy; + cbz = f->cbz + dz*f->dcbzdz; + + ux = p->ux; // Load momentum + uy = p->uy; + uz = p->uz; + + ux += hax; // Half advance E + uy += hay; + uz += haz; + + v0 = qdt_4mc/(float)sqrt(one + (ux*ux + (uy*uy + uz*uz))); + /**/ // Boris - scalars + v1 = cbx*cbx + (cby*cby + cbz*cbz); + v2 = (v0*v0)*v1; + v3 = v0*(one+v2*(one_third+v2*two_fifteenths)); + v4 = v3/(one+v1*(v3*v3)); + v4 += v4; + + v0 = ux + v3*( uy*cbz - uz*cby ); // Boris - uprime + v1 = uy + v3*( uz*cbx - ux*cbz ); + v2 = uz + v3*( ux*cby - uy*cbx ); + + ux += v4*( v1*cbz - v2*cby ); // Boris - rotation + uy += v4*( v2*cbx - v0*cbz ); + uz += v4*( v0*cby - v1*cbx ); + + p->ux = ux; // Store momentum + p->uy = uy; + p->uz = uz; + } +} + +//----------------------------------------------------------------------------// +// Top level function to select and call the proper center_p pipeline +// function. +//----------------------------------------------------------------------------// + +void +center_p_pipeline( species_t * RESTRICT sp, + const interpolator_array_t * RESTRICT ia ) +{ + DECLARE_ALIGNED_ARRAY( center_p_pipeline_args_t, 128, args, 1 ); + + if ( !sp || + !ia || + sp->g != ia->g ) + { + ERROR( ( "Bad args" ) ); + } + + // Have the pipelines do the bulk of particles in blocks and have the + // host do the final incomplete block. 
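+  // EXEC_PIPELINES( center_p, ... ) below launches the pipeline workers;
+  // given the HAS_V4/V8/V16_PIPELINE defines at the top of this file, the
+  // vector variants are presumably selected when the matching
+  // V*_ACCELERATION option is enabled, with the scalar version above used
+  // otherwise.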
+ + args->p0 = sp->p; + args->f0 = ia->i; + args->qdt_2mc = (sp->q*sp->g->dt)/(2*sp->m*sp->g->cvac); + args->np = sp->np; + + EXEC_PIPELINES( center_p, args, 0 ); + WAIT_PIPELINES(); +} diff --git a/src/species_advance/standard/pipeline/center_p_pipeline_v16.cc b/src/species_advance/standard/pipeline/center_p_pipeline_v16.cc new file mode 100644 index 00000000..bced2e23 --- /dev/null +++ b/src/species_advance/standard/pipeline/center_p_pipeline_v16.cc @@ -0,0 +1,160 @@ +#define IN_spa + +#include "spa_private.h" + +#if defined(V16_ACCELERATION) + +using namespace v16; + +void +center_p_pipeline_v16( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + const interpolator_t * ALIGNED(128) f0 = args->f0; + + particle_t * ALIGNED(128) p; + + const float * ALIGNED(64) vp00; + const float * ALIGNED(64) vp01; + const float * ALIGNED(64) vp02; + const float * ALIGNED(64) vp03; + const float * ALIGNED(64) vp04; + const float * ALIGNED(64) vp05; + const float * ALIGNED(64) vp06; + const float * ALIGNED(64) vp07; + const float * ALIGNED(64) vp08; + const float * ALIGNED(64) vp09; + const float * ALIGNED(64) vp10; + const float * ALIGNED(64) vp11; + const float * ALIGNED(64) vp12; + const float * ALIGNED(64) vp13; + const float * ALIGNED(64) vp14; + const float * ALIGNED(64) vp15; + + const v16float qdt_2mc( args->qdt_2mc); + const v16float qdt_4mc(0.5*args->qdt_2mc); // For half Boris rotate + const v16float one(1.0); + const v16float one_third(1.0/3.0); + const v16float two_fifteenths(2.0/15.0); + + v16float dx, dy, dz, ux, uy, uz, q; + v16float hax, hay, haz, cbx, cby, cbz; + v16float v00, v01, v02, v03, v04, v05, v06, v07, v08, v09, v10; + v16int ii; + + int itmp, nq; + + // Determine which particle quads this pipeline processes. + + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, itmp, nq ); + + p = args->p0 + itmp; + + nq >>= 4; + + // Process the particle quads for this pipeline. + + for( ; nq; nq--, p+=16 ) + { + //-------------------------------------------------------------------------- + // Load particle position data. + //-------------------------------------------------------------------------- + load_16x8_tr_p( &p[ 0].dx, &p[ 2].dx, &p[ 4].dx, &p[ 6].dx, + &p[ 8].dx, &p[10].dx, &p[12].dx, &p[14].dx, + dx, dy, dz, ii, ux, uy, uz, q ); + + //-------------------------------------------------------------------------- + // Set field interpolation pointers. + //-------------------------------------------------------------------------- + vp00 = ( float * ALIGNED(64) ) ( f0 + ii( 0) ); + vp01 = ( float * ALIGNED(64) ) ( f0 + ii( 1) ); + vp02 = ( float * ALIGNED(64) ) ( f0 + ii( 2) ); + vp03 = ( float * ALIGNED(64) ) ( f0 + ii( 3) ); + vp04 = ( float * ALIGNED(64) ) ( f0 + ii( 4) ); + vp05 = ( float * ALIGNED(64) ) ( f0 + ii( 5) ); + vp06 = ( float * ALIGNED(64) ) ( f0 + ii( 6) ); + vp07 = ( float * ALIGNED(64) ) ( f0 + ii( 7) ); + vp08 = ( float * ALIGNED(64) ) ( f0 + ii( 8) ); + vp09 = ( float * ALIGNED(64) ) ( f0 + ii( 9) ); + vp10 = ( float * ALIGNED(64) ) ( f0 + ii(10) ); + vp11 = ( float * ALIGNED(64) ) ( f0 + ii(11) ); + vp12 = ( float * ALIGNED(64) ) ( f0 + ii(12) ); + vp13 = ( float * ALIGNED(64) ) ( f0 + ii(13) ); + vp14 = ( float * ALIGNED(64) ) ( f0 + ii(14) ); + vp15 = ( float * ALIGNED(64) ) ( f0 + ii(15) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. 
+ //-------------------------------------------------------------------------- + load_16x16_tr( vp00, vp01, vp02, vp03, + vp04, vp05, vp06, vp07, + vp08, vp09, vp10, vp11, + vp12, vp13, vp14, vp15, + hax, v00, v01, v02, hay, v03, v04, v05, + haz, v06, v07, v08, cbx, v09, cby, v10 ); + + hax = qdt_2mc*fma( fma( dy, v02, v01 ), dz, fma( dy, v00, hax ) ); + + hay = qdt_2mc*fma( fma( dz, v05, v04 ), dx, fma( dz, v03, hay ) ); + + haz = qdt_2mc*fma( fma( dx, v08, v07 ), dy, fma( dx, v06, haz ) ); + + cbx = fma( v09, dx, cbx ); + cby = fma( v10, dy, cby ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles, final. + //-------------------------------------------------------------------------- + load_16x2_tr( vp00+16, vp01+16, vp02+16, vp03+16, + vp04+16, vp05+16, vp06+16, vp07+16, + vp08+16, vp09+16, vp10+16, vp11+16, + vp12+16, vp13+16, vp14+16, vp15+16, + cbz, v05 ); + + cbz = fma( v05, dz, cbz ); + + //-------------------------------------------------------------------------- + // Update momentum. + //-------------------------------------------------------------------------- + ux += hax; + uy += hay; + uz += haz; + + v00 = qdt_4mc * rsqrt( one + fma( ux, ux, fma( uy, uy, uz * uz ) ) ); + v01 = fma( cbx, cbx, fma( cby, cby, cbz * cbz ) ); + v02 = ( v00 * v00 ) * v01; + v03 = v00 * fma( v02, fma( v02, two_fifteenths, one_third ), one ); + v04 = v03 * rcp( fma( v03 * v03, v01, one ) ); + v04 += v04; + + v00 = fma( fms( uy, cbz, uz * cby ), v03, ux ); + v01 = fma( fms( uz, cbx, ux * cbz ), v03, uy ); + v02 = fma( fms( ux, cby, uy * cbx ), v03, uz ); + + ux = fma( fms( v01, cbz, v02 * cby ), v04, ux ); + uy = fma( fms( v02, cbx, v00 * cbz ), v04, uy ); + uz = fma( fms( v00, cby, v01 * cbx ), v04, uz ); + + //-------------------------------------------------------------------------- + // Store particle momentum data. Could use store_16x4_tr_p or + // store_16x3_tr_p. + //-------------------------------------------------------------------------- + store_16x8_tr_p( dx, dy, dz, ii, ux, uy, uz, q, + &p[ 0].dx, &p[ 2].dx, &p[ 4].dx, &p[ 6].dx, + &p[ 8].dx, &p[10].dx, &p[12].dx, &p[14].dx ); + } +} + +#else + +void +center_p_pipeline_v16( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v16 implementation. + ERROR( ( "No center_p_pipeline_v16 implementation." ) ); +} + +#endif diff --git a/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc b/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc new file mode 100644 index 00000000..dc6d5e18 --- /dev/null +++ b/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc @@ -0,0 +1,150 @@ +#define IN_spa + +#include "spa_private.h" + +#if defined(V4_ACCELERATION) + +using namespace v4; + +void +center_p_pipeline_v4( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + const interpolator_t * ALIGNED(128) f0 = args->f0; + + particle_t * ALIGNED(128) p; + + const float * ALIGNED(16) vp00; + const float * ALIGNED(16) vp01; + const float * ALIGNED(16) vp02; + const float * ALIGNED(16) vp03; + + const v4float qdt_2mc( args->qdt_2mc); + const v4float qdt_4mc(0.5*args->qdt_2mc); // For half Boris rotate. + const v4float one(1.0); + const v4float one_third(1.0/3.0); + const v4float two_fifteenths(2.0/15.0); + + v4float dx, dy, dz, ux, uy, uz, q; + v4float hax, hay, haz, cbx, cby, cbz; + v4float v00, v01, v02, v03, v04, v05; + v4int ii; + + int itmp, nq; + + // Determine which particle blocks this pipeline processes. 
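+  // DISTRIBUTE below hands this pipeline a contiguous slice of the particle
+  // array (in multiples of 16 here): itmp receives the index of its first
+  // particle and nq its particle count.  nq >>= 2 then converts that count
+  // into the number of 4-particle SIMD blocks handled by the loop.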
+ + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, itmp, nq ); + + p = args->p0 + itmp; + + nq >>= 2; + + // Process the particle blocks for this pipeline. + + for( ; nq; nq--, p+=4 ) + { + //-------------------------------------------------------------------------- + // Load particle position data. + //-------------------------------------------------------------------------- + load_4x4_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, + dx, dy, dz, ii ); + + //-------------------------------------------------------------------------- + // Set field interpolation pointers. + //-------------------------------------------------------------------------- + vp00 = ( const float * ALIGNED(16) ) ( f0 + ii(0) ); + vp01 = ( const float * ALIGNED(16) ) ( f0 + ii(1) ); + vp02 = ( const float * ALIGNED(16) ) ( f0 + ii(2) ); + vp03 = ( const float * ALIGNED(16) ) ( f0 + ii(3) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_4x4_tr( vp00, vp01, vp02, vp03, + hax, v00, v01, v02 ); + + hax = qdt_2mc*fma( fma( dy, v02, v01 ), dz, fma( dy, v00, hax ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_4x4_tr( vp00+4, vp01+4, vp02+4, vp03+4, + hay, v03, v04, v05 ); + + hay = qdt_2mc*fma( fma( dz, v05, v04 ), dx, fma( dz, v03, hay ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_4x4_tr( vp00+8, vp01+8, vp02+8, vp03+8, + haz, v00, v01, v02 ); + + haz = qdt_2mc*fma( fma( dx, v02, v01 ), dy, fma( dx, v00, haz ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_4x4_tr( vp00+12, vp01+12, vp02+12, vp03+12, + cbx, v03, cby, v04 ); + + cbx = fma( v03, dx, cbx ); + cby = fma( v04, dy, cby ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles, final. + //-------------------------------------------------------------------------- + load_4x2_tr( vp00+16, vp01+16, vp02+16, vp03+16, + cbz, v05 ); + + cbz = fma( v05, dz, cbz ); + + //-------------------------------------------------------------------------- + // Load particle momentum data. Could use load_4x3_tr. + //-------------------------------------------------------------------------- + load_4x4_tr( &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux, + ux, uy, uz, q ); + + //-------------------------------------------------------------------------- + // Update momentum. 
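+  // Half-angle Boris rotation: with theta = qdt_4mc*|B|/gamma, v02 = theta^2
+  // and the polynomial fma( v02, fma( v02, two_fifteenths, one_third ), one )
+  // is the series for tan(theta)/theta, so v03 ~ tan(theta)/|B| and
+  // v04 ~ sin(2*theta)/|B|.  v00..v02 then form u' and the last three lines
+  // complete the rotation, mirroring the scalar Boris code in
+  // center_p_pipeline.cc.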
+ //-------------------------------------------------------------------------- + ux += hax; + uy += hay; + uz += haz; + + v00 = qdt_4mc * rsqrt( one + fma( ux, ux, fma( uy, uy, uz * uz ) ) ); + v01 = fma( cbx, cbx, fma( cby, cby, cbz * cbz ) ); + v02 = ( v00 * v00 ) * v01; + v03 = v00 * fma( v02, fma( v02, two_fifteenths, one_third ), one ); + v04 = v03 * rcp( fma( v03 * v03, v01, one ) ); + v04 += v04; + + v00 = fma( fms( uy, cbz, uz * cby ), v03, ux ); + v01 = fma( fms( uz, cbx, ux * cbz ), v03, uy ); + v02 = fma( fms( ux, cby, uy * cbx ), v03, uz ); + + ux = fma( fms( v01, cbz, v02 * cby ), v04, ux ); + uy = fma( fms( v02, cbx, v00 * cbz ), v04, uy ); + uz = fma( fms( v00, cby, v01 * cbx ), v04, uz ); + + //-------------------------------------------------------------------------- + // Store particle momentum data. Could use store_4x3_tr. + //-------------------------------------------------------------------------- + store_4x4_tr( ux, uy, uz, q, + &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux ); + } +} + +#else + +void +center_p_pipeline_v4( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v4 implementation. + ERROR( ( "No center_p_pipeline_v4 implementation." ) ); +} + +#endif diff --git a/src/species_advance/standard/pipeline/center_p_pipeline_v8.cc b/src/species_advance/standard/pipeline/center_p_pipeline_v8.cc new file mode 100644 index 00000000..33aaf868 --- /dev/null +++ b/src/species_advance/standard/pipeline/center_p_pipeline_v8.cc @@ -0,0 +1,166 @@ +#define IN_spa + +#include "spa_private.h" + +#if defined(V8_ACCELERATION) + +using namespace v8; + +void +center_p_pipeline_v8( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + const interpolator_t * ALIGNED(128) f0 = args->f0; + + particle_t * ALIGNED(128) p; + + const float * ALIGNED(32) vp00; + const float * ALIGNED(32) vp01; + const float * ALIGNED(32) vp02; + const float * ALIGNED(32) vp03; + const float * ALIGNED(32) vp04; + const float * ALIGNED(32) vp05; + const float * ALIGNED(32) vp06; + const float * ALIGNED(32) vp07; + + const v8float qdt_2mc( args->qdt_2mc); + const v8float qdt_4mc(0.5*args->qdt_2mc); // For half Boris rotate + const v8float one(1.0); + const v8float one_third(1.0/3.0); + const v8float two_fifteenths(2.0/15.0); + + v8float dx, dy, dz, ux, uy, uz, q; + v8float hax, hay, haz, cbx, cby, cbz; + v8float v00, v01, v02, v03, v04, v05; + v8int ii; + + int itmp, nq; + + // Determine which particle blocks this pipeline processes. + + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, itmp, nq ); + + p = args->p0 + itmp; + + nq >>= 3; + + // Process the particle blocks for this pipeline. + + for( ; nq; nq--, p+=8 ) + { + //-------------------------------------------------------------------------- + // Load particle position data. + //-------------------------------------------------------------------------- + load_8x4_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, + &p[4].dx, &p[5].dx, &p[6].dx, &p[7].dx, + dx, dy, dz, ii ); + + //-------------------------------------------------------------------------- + // Set field interpolation pointers. 
+ //-------------------------------------------------------------------------- + vp00 = ( const float * ALIGNED(32) ) ( f0 + ii(0) ); + vp01 = ( const float * ALIGNED(32) ) ( f0 + ii(1) ); + vp02 = ( const float * ALIGNED(32) ) ( f0 + ii(2) ); + vp03 = ( const float * ALIGNED(32) ) ( f0 + ii(3) ); + vp04 = ( const float * ALIGNED(32) ) ( f0 + ii(4) ); + vp05 = ( const float * ALIGNED(32) ) ( f0 + ii(5) ); + vp06 = ( const float * ALIGNED(32) ) ( f0 + ii(6) ); + vp07 = ( const float * ALIGNED(32) ) ( f0 + ii(7) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_8x4_tr( vp00, vp01, vp02, vp03, + vp04, vp05, vp06, vp07, + hax, v00, v01, v02 ); + + hax = qdt_2mc*fma( fma( dy, v02, v01 ), dz, fma( dy, v00, hax ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_8x4_tr( vp00+4, vp01+4, vp02+4, vp03+4, + vp04+4, vp05+4, vp06+4, vp07+4, + hay, v03, v04, v05 ); + + hay = qdt_2mc*fma( fma( dz, v05, v04 ), dx, fma( dz, v03, hay ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_8x4_tr( vp00+8, vp01+8, vp02+8, vp03+8, + vp04+8, vp05+8, vp06+8, vp07+8, + haz, v00, v01, v02 ); + + haz = qdt_2mc*fma( fma( dx, v02, v01 ), dy, fma( dx, v00, haz ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_8x4_tr( vp00+12, vp01+12, vp02+12, vp03+12, + vp04+12, vp05+12, vp06+12, vp07+12, + cbx, v03, cby, v04 ); + + cbx = fma( v03, dx, cbx ); + cby = fma( v04, dy, cby ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles, final. + //-------------------------------------------------------------------------- + load_8x2_tr( vp00+16, vp01+16, vp02+16, vp03+16, + vp04+16, vp05+16, vp06+16, vp07+16, + cbz, v05 ); + + cbz = fma( v05, dz, cbz ); + + //-------------------------------------------------------------------------- + // Load particle momentum data. Could use load_8x3_tr. + //-------------------------------------------------------------------------- + load_8x4_tr( &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux, + &p[4].ux, &p[5].ux, &p[6].ux, &p[7].ux, + ux, uy, uz, q ); + + //-------------------------------------------------------------------------- + // Update momentum. 
+ //-------------------------------------------------------------------------- + ux += hax; + uy += hay; + uz += haz; + + v00 = qdt_4mc * rsqrt( one + fma( ux, ux, fma( uy, uy, uz * uz ) ) ); + v01 = fma( cbx, cbx, fma( cby, cby, cbz * cbz ) ); + v02 = ( v00 * v00 ) * v01; + v03 = v00 * fma( v02, fma( v02, two_fifteenths, one_third ), one ); + v04 = v03 * rcp( fma( v03 * v03, v01, one ) ); + v04 += v04; + + v00 = fma( fms( uy, cbz, uz * cby ), v03, ux ); + v01 = fma( fms( uz, cbx, ux * cbz ), v03, uy ); + v02 = fma( fms( ux, cby, uy * cbx ), v03, uz ); + + ux = fma( fms( v01, cbz, v02 * cby ), v04, ux ); + uy = fma( fms( v02, cbx, v00 * cbz ), v04, uy ); + uz = fma( fms( v00, cby, v01 * cbx ), v04, uz ); + + //-------------------------------------------------------------------------- + // Store particle momentum data. Could use store_8x3_tr. + //-------------------------------------------------------------------------- + store_8x4_tr( ux, uy, uz, q, + &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux, + &p[4].ux, &p[5].ux, &p[6].ux, &p[7].ux ); + } +} + +#else + +void +center_p_pipeline_v8( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v8 implementation. + ERROR( ( "No center_p_pipeline_v8 implementation." ) ); +} + +#endif diff --git a/src/species_advance/standard/pipeline/energy_p_pipeline.cc b/src/species_advance/standard/pipeline/energy_p_pipeline.cc new file mode 100644 index 00000000..458b3cba --- /dev/null +++ b/src/species_advance/standard/pipeline/energy_p_pipeline.cc @@ -0,0 +1,115 @@ +#define IN_spa + +#define HAS_V4_PIPELINE +#define HAS_V8_PIPELINE +#define HAS_V16_PIPELINE + +#include "spa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +//----------------------------------------------------------------------------// +// Reference implementation for an energy_p pipeline function which does not +// make use of explicit calls to vector intrinsic functions. This function +// calculates kinetic energy, normalized by c^2. +//----------------------------------------------------------------------------// + +void +energy_p_pipeline_scalar( energy_p_pipeline_args_t * RESTRICT args, + int pipeline_rank, + int n_pipeline ) +{ + const interpolator_t * RESTRICT ALIGNED(128) f = args->f; + const particle_t * RESTRICT ALIGNED(32) p = args->p; + + const float qdt_2mc = args->qdt_2mc; + const float msp = args->msp; + const float one = 1.0; + + float dx, dy, dz; + float v0, v1, v2; + + double en = 0.0; + + int i, n, n0, n1; + + // Determine which particles this pipeline processes. + + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, n0, n1 ); + + n1 += n0; + + // Process particles quads for this pipeline. + + for( n = n0; n < n1; n++ ) + { + dx = p[n].dx; + dy = p[n].dy; + dz = p[n].dz; + i = p[n].i; + + v0 = p[n].ux + qdt_2mc*( ( f[i].ex + dy*f[i].dexdy ) + + dz*( f[i].dexdz + dy*f[i].d2exdydz ) ); + + v1 = p[n].uy + qdt_2mc*( ( f[i].ey + dz*f[i].deydz ) + + dx*( f[i].deydx + dz*f[i].d2eydzdx ) ); + + v2 = p[n].uz + qdt_2mc*( ( f[i].ez + dx*f[i].dezdx ) + + dy*( f[i].dezdy + dx*f[i].d2ezdxdy ) ); + + v0 = v0*v0 + v1*v1 + v2*v2; + + v0 = (msp * p[n].w) * (v0 / (one + sqrtf(one + v0))); + + en += ( double ) v0; + } + + args->en[pipeline_rank] = en; +} + +//----------------------------------------------------------------------------// +// Top level function to select and call the proper energy_p pipeline +// function. 
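+// The per-particle contribution accumulated above is msp * w * (gamma - 1),
+// since u.u / ( 1 + sqrt( 1 + u.u ) ) = gamma - 1.  The pipeline partial
+// sums are added over threads, reduced across ranks with mp_allsum_d, and
+// the c^2 factor is applied to the global total before it is returned.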
+//----------------------------------------------------------------------------// + +double +energy_p_pipeline( const species_t * RESTRICT sp, + const interpolator_array_t * RESTRICT ia ) +{ + DECLARE_ALIGNED_ARRAY( energy_p_pipeline_args_t, 128, args, 1 ); + + DECLARE_ALIGNED_ARRAY( double, 128, en, MAX_PIPELINE+1 ); + + double local, global; + int rank; + + if ( !sp || !ia || sp->g != ia->g ) + { + ERROR( ( "Bad args" ) ); + } + + // Have the pipelines do the bulk of particles in blocks and have the + // host do the final incomplete block. + + args->p = sp->p; + args->f = ia->i; + args->en = en; + args->qdt_2mc = (sp->q*sp->g->dt)/(2*sp->m*sp->g->cvac); + args->msp = sp->m; + args->np = sp->np; + + EXEC_PIPELINES( energy_p, args, 0 ); + + WAIT_PIPELINES(); + + local = 0.0; + for( rank = 0; rank <= N_PIPELINE; rank++ ) + { + local += en[rank]; + } + + mp_allsum_d( &local, &global, 1 ); + + return global * ( ( double ) sp->g->cvac * + ( double ) sp->g->cvac ); +} diff --git a/src/species_advance/standard/pipeline/energy_p_pipeline_v16.cc b/src/species_advance/standard/pipeline/energy_p_pipeline_v16.cc new file mode 100644 index 00000000..0e1cac12 --- /dev/null +++ b/src/species_advance/standard/pipeline/energy_p_pipeline_v16.cc @@ -0,0 +1,189 @@ +#define IN_spa + +#include "spa_private.h" + +#if defined(V16_ACCELERATION) + +using namespace v16; + +void +energy_p_pipeline_v16( energy_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + const interpolator_t * RESTRICT ALIGNED(128) f = args->f; + const particle_t * RESTRICT ALIGNED(128) p = args->p; + + const float * RESTRICT ALIGNED(64) vp00; + const float * RESTRICT ALIGNED(64) vp01; + const float * RESTRICT ALIGNED(64) vp02; + const float * RESTRICT ALIGNED(64) vp03; + const float * RESTRICT ALIGNED(64) vp04; + const float * RESTRICT ALIGNED(64) vp05; + const float * RESTRICT ALIGNED(64) vp06; + const float * RESTRICT ALIGNED(64) vp07; + const float * RESTRICT ALIGNED(64) vp08; + const float * RESTRICT ALIGNED(64) vp09; + const float * RESTRICT ALIGNED(64) vp10; + const float * RESTRICT ALIGNED(64) vp11; + const float * RESTRICT ALIGNED(64) vp12; + const float * RESTRICT ALIGNED(64) vp13; + const float * RESTRICT ALIGNED(64) vp14; + const float * RESTRICT ALIGNED(64) vp15; + + const v16float qdt_2mc(args->qdt_2mc); + const v16float msp(args->msp); + const v16float one(1.0); + + v16float dx, dy, dz; + v16float ex, ey, ez; + v16float v00, v01, v02, w; + v16int i; + + double en00 = 0.0, en01 = 0.0, en02 = 0.0, en03 = 0.0; + double en04 = 0.0, en05 = 0.0, en06 = 0.0, en07 = 0.0; + double en08 = 0.0, en09 = 0.0, en10 = 0.0, en11 = 0.0; + double en12 = 0.0, en13 = 0.0, en14 = 0.0, en15 = 0.0; + + int n0, nq; + + // Determine which particle blocks this pipeline processes. + + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, n0, nq ); + + p += n0; + + nq >>= 4; + + // Process the particle blocks for this pipeline. + + for( ; nq; nq--, p+=16 ) + { + //-------------------------------------------------------------------------- + // Load particle position data. + //-------------------------------------------------------------------------- + load_16x4_tr( &p[ 0].dx, &p[ 1].dx, &p[ 2].dx, &p[ 3].dx, + &p[ 4].dx, &p[ 5].dx, &p[ 6].dx, &p[ 7].dx, + &p[ 8].dx, &p[ 9].dx, &p[10].dx, &p[11].dx, + &p[12].dx, &p[13].dx, &p[14].dx, &p[15].dx, + dx, dy, dz, i ); + + //-------------------------------------------------------------------------- + // Set field interpolation pointers. 
+ //-------------------------------------------------------------------------- + vp00 = ( float * ) ( f + i( 0) ); + vp01 = ( float * ) ( f + i( 1) ); + vp02 = ( float * ) ( f + i( 2) ); + vp03 = ( float * ) ( f + i( 3) ); + vp04 = ( float * ) ( f + i( 4) ); + vp05 = ( float * ) ( f + i( 5) ); + vp06 = ( float * ) ( f + i( 6) ); + vp07 = ( float * ) ( f + i( 7) ); + vp08 = ( float * ) ( f + i( 8) ); + vp09 = ( float * ) ( f + i( 9) ); + vp10 = ( float * ) ( f + i(10) ); + vp11 = ( float * ) ( f + i(11) ); + vp12 = ( float * ) ( f + i(12) ); + vp13 = ( float * ) ( f + i(13) ); + vp14 = ( float * ) ( f + i(14) ); + vp15 = ( float * ) ( f + i(15) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_16x4_tr( vp00, vp01, vp02, vp03, + vp04, vp05, vp06, vp07, + vp08, vp09, vp10, vp11, + vp12, vp13, vp14, vp15, + ex, v00, v01, v02 ); + + ex = fma( fma( dy, v02, v01 ), dz, fma( dy, v00, ex ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_16x4_tr( vp00+4, vp01+4, vp02+4, vp03+4, + vp04+4, vp05+4, vp06+4, vp07+4, + vp08+4, vp09+4, vp10+4, vp11+4, + vp12+4, vp13+4, vp14+4, vp15+4, + ey, v00, v01, v02 ); + + ey = fma( fma( dz, v02, v01 ), dx, fma( dz, v00, ey ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_16x4_tr( vp00+8, vp01+8, vp02+8, vp03+8, + vp04+8, vp05+8, vp06+8, vp07+8, + vp08+8, vp09+8, vp10+8, vp11+8, + vp12+8, vp13+8, vp14+8, vp15+8, + ez, v00, v01, v02 ); + + ez = fma( fma( dx, v02, v01 ), dy, fma( dx, v00, ez ) ); + + //-------------------------------------------------------------------------- + // Load particle momentum data. + //-------------------------------------------------------------------------- + load_16x4_tr( &p[ 0].ux, &p[ 1].ux, &p[ 2].ux, &p[ 3].ux, + &p[ 4].ux, &p[ 5].ux, &p[ 6].ux, &p[ 7].ux, + &p[ 8].ux, &p[ 9].ux, &p[10].ux, &p[11].ux, + &p[12].ux, &p[13].ux, &p[14].ux, &p[15].ux, + v00, v01, v02, w ); + + //-------------------------------------------------------------------------- + // Update momentum to half step. Note that Boris rotation does not change + // energy and thus is not necessary. + //-------------------------------------------------------------------------- + v00 = fma( ex, qdt_2mc, v00 ); + v01 = fma( ey, qdt_2mc, v01 ); + v02 = fma( ez, qdt_2mc, v02 ); + + //-------------------------------------------------------------------------- + // Calculate kinetic energy of particles. + //-------------------------------------------------------------------------- + v00 = fma( v00, v00, fma( v01, v01, v02 * v02 ) ); + + v00 = ( msp * w ) * ( v00 / ( one + sqrt( one + v00 ) ) ); + + //-------------------------------------------------------------------------- + // Accumulate energy for each vector element. 
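+    // Each of the 16 lanes is widened to double and added into its own
+    // scalar accumulator (en00..en15); the per-pipeline total is combined
+    // below and handed back through args->en[pipeline_rank] for the caller
+    // to reduce.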
+ //-------------------------------------------------------------------------- + en00 += ( double ) v00( 0); + en01 += ( double ) v00( 1); + en02 += ( double ) v00( 2); + en03 += ( double ) v00( 3); + en04 += ( double ) v00( 4); + en05 += ( double ) v00( 5); + en06 += ( double ) v00( 6); + en07 += ( double ) v00( 7); + en08 += ( double ) v00( 8); + en09 += ( double ) v00( 9); + en10 += ( double ) v00(10); + en11 += ( double ) v00(11); + en12 += ( double ) v00(12); + en13 += ( double ) v00(13); + en14 += ( double ) v00(14); + en15 += ( double ) v00(15); + } + + //-------------------------------------------------------------------------- + // Accumulate energy for each rank or thread. + //-------------------------------------------------------------------------- + args->en[pipeline_rank] = en00 + en01 + en02 + en03 + + en04 + en05 + en06 + en07 + + en08 + en09 + en10 + en11 + + en12 + en13 + en14 + en15; +} + +#else + +void +energy_p_pipeline_v16( energy_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v16 implementation. + ERROR( ( "No energy_p_pipeline_v16 implementation." ) ); +} + +#endif diff --git a/src/species_advance/standard/pipeline/energy_p_pipeline_v4.cc b/src/species_advance/standard/pipeline/energy_p_pipeline_v4.cc new file mode 100644 index 00000000..8d077986 --- /dev/null +++ b/src/species_advance/standard/pipeline/energy_p_pipeline_v4.cc @@ -0,0 +1,132 @@ +#define IN_spa + +#include "spa_private.h" + +#if defined(V4_ACCELERATION) + +using namespace v4; + +void +energy_p_pipeline_v4( energy_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + const interpolator_t * RESTRICT ALIGNED(128) f = args->f; + const particle_t * RESTRICT ALIGNED(128) p = args->p; + + const float * RESTRICT ALIGNED(16) vp00; + const float * RESTRICT ALIGNED(16) vp01; + const float * RESTRICT ALIGNED(16) vp02; + const float * RESTRICT ALIGNED(16) vp03; + + const v4float qdt_2mc(args->qdt_2mc); + const v4float msp(args->msp); + const v4float one(1.0); + + v4float dx, dy, dz; + v4float ex, ey, ez; + v4float v00, v01, v02, w; + v4int i; + + double en00 = 0.0, en01 = 0.0, en02 = 0.0, en03 = 0.0; + + int n0, nq; + + // Determine which particle blocks this pipeline processes. + + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, n0, nq ); + + p += n0; + + nq >>= 2; + + // Process the particle blocks for this pipeline. + + for( ; nq; nq--, p+=4 ) + { + //-------------------------------------------------------------------------- + // Load particle position data. + //-------------------------------------------------------------------------- + load_4x4_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, + dx, dy, dz, i ); + + //-------------------------------------------------------------------------- + // Set field interpolation pointers. + //-------------------------------------------------------------------------- + vp00 = ( float * ) ( f + i(0) ); + vp01 = ( float * ) ( f + i(1) ); + vp02 = ( float * ) ( f + i(2) ); + vp03 = ( float * ) ( f + i(3) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_4x4_tr( vp00, vp01, vp02, vp03, + ex, v00, v01, v02 ); + + ex = fma( fma( dy, v02, v01 ), dz, fma( dy, v00, ex ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. 
+ //-------------------------------------------------------------------------- + load_4x4_tr( vp00+4, vp01+4, vp02+4, vp03+4, + ey, v00, v01, v02); + + ey = fma( fma( dz, v02, v01 ), dx, fma( dz, v00, ey ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_4x4_tr( vp00+8, vp01+8, vp02+8, vp03+8, + ez, v00, v01, v02); + + ez = fma( fma( dx, v02, v01 ), dy, fma( dx, v00, ez ) ); + + //-------------------------------------------------------------------------- + // Load particle momentum data. + //-------------------------------------------------------------------------- + load_4x4_tr( &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux, + v00, v01, v02, w ); + + //-------------------------------------------------------------------------- + // Update momentum to half step. Note that Boris rotation does not change + // energy and thus is not necessary. + //-------------------------------------------------------------------------- + v00 = fma( ex, qdt_2mc, v00 ); + v01 = fma( ey, qdt_2mc, v01 ); + v02 = fma( ez, qdt_2mc, v02 ); + + //-------------------------------------------------------------------------- + // Calculate kinetic energy of particles. + //-------------------------------------------------------------------------- + v00 = fma( v00, v00, fma( v01, v01, v02 * v02 ) ); + + v00 = ( msp * w ) * ( v00 / ( one + sqrt( one + v00 ) ) ); + + //-------------------------------------------------------------------------- + // Accumulate energy for each vector element. + //-------------------------------------------------------------------------- + en00 += ( double ) v00(0); + en01 += ( double ) v00(1); + en02 += ( double ) v00(2); + en03 += ( double ) v00(3); + } + + //-------------------------------------------------------------------------- + // Accumulate energy for each rank or thread. + //-------------------------------------------------------------------------- + args->en[pipeline_rank] = en00 + en01 + en02 + en03; +} + +#else + +void +energy_p_pipeline_v4( energy_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v4 implementation. + ERROR( ( "No energy_p_pipeline_v4 implementation." 
) ); +} + +#endif diff --git a/src/species_advance/standard/pipeline/energy_p_pipeline_v8.cc b/src/species_advance/standard/pipeline/energy_p_pipeline_v8.cc new file mode 100644 index 00000000..9189286c --- /dev/null +++ b/src/species_advance/standard/pipeline/energy_p_pipeline_v8.cc @@ -0,0 +1,151 @@ +#define IN_spa + +#include "spa_private.h" + +#if defined(V8_ACCELERATION) + +using namespace v8; + +void +energy_p_pipeline_v8( energy_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + const interpolator_t * RESTRICT ALIGNED(128) f = args->f; + const particle_t * RESTRICT ALIGNED(128) p = args->p; + + const float * RESTRICT ALIGNED(32) vp00; + const float * RESTRICT ALIGNED(32) vp01; + const float * RESTRICT ALIGNED(32) vp02; + const float * RESTRICT ALIGNED(32) vp03; + const float * RESTRICT ALIGNED(32) vp04; + const float * RESTRICT ALIGNED(32) vp05; + const float * RESTRICT ALIGNED(32) vp06; + const float * RESTRICT ALIGNED(32) vp07; + + const v8float qdt_2mc(args->qdt_2mc); + const v8float msp(args->msp); + const v8float one(1.0); + + v8float dx, dy, dz; + v8float ex, ey, ez; + v8float v00, v01, v02, w; + v8int i; + + double en00 = 0.0, en01 = 0.0, en02 = 0.0, en03 = 0.0; + double en04 = 0.0, en05 = 0.0, en06 = 0.0, en07 = 0.0; + + int n0, nq; + + // Determine which particle blocks this pipeline processes. + + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, n0, nq ); + + p += n0; + + nq >>= 3; + + // Process the particle blocks for this pipeline. + + for( ; nq; nq--, p+=8 ) + { + //-------------------------------------------------------------------------- + // Load particle position data. + //-------------------------------------------------------------------------- + load_8x4_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, + &p[4].dx, &p[5].dx, &p[6].dx, &p[7].dx, + dx, dy, dz, i ); + + //-------------------------------------------------------------------------- + // Set field interpolation pointers. + //-------------------------------------------------------------------------- + vp00 = ( float * ) ( f + i(0) ); + vp01 = ( float * ) ( f + i(1) ); + vp02 = ( float * ) ( f + i(2) ); + vp03 = ( float * ) ( f + i(3) ); + vp04 = ( float * ) ( f + i(4) ); + vp05 = ( float * ) ( f + i(5) ); + vp06 = ( float * ) ( f + i(6) ); + vp07 = ( float * ) ( f + i(7) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_8x4_tr( vp00, vp01, vp02, vp03, + vp04, vp05, vp06, vp07, + ex, v00, v01, v02 ); + + ex = fma( fma( dy, v02, v01 ), dz, fma( dy, v00, ex ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_8x4_tr( vp00+4, vp01+4, vp02+4, vp03+4, + vp04+4, vp05+4, vp06+4, vp07+4, + ey, v00, v01, v02 ); + + ey = fma( fma( dz, v02, v01 ), dx, fma( dz, v00, ey ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_8x4_tr( vp00+8, vp01+8, vp02+8, vp03+8, + vp04+8, vp05+8, vp06+8, vp07+8, + ez, v00, v01, v02 ); + + ez = fma( fma( dx, v02, v01 ), dy, fma( dx, v00, ez ) ); + + //-------------------------------------------------------------------------- + // Load particle momentum data. 
+ //-------------------------------------------------------------------------- + load_8x4_tr( &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux, + &p[4].ux, &p[5].ux, &p[6].ux, &p[7].ux, + v00, v01, v02, w ); + + //-------------------------------------------------------------------------- + // Update momentum to half step. Note that Boris rotation does not change + // energy and thus is not necessary. + //-------------------------------------------------------------------------- + v00 = fma( ex, qdt_2mc, v00 ); + v01 = fma( ey, qdt_2mc, v01 ); + v02 = fma( ez, qdt_2mc, v02 ); + + //-------------------------------------------------------------------------- + // Calculate kinetic energy of particles. + //-------------------------------------------------------------------------- + v00 = fma( v00, v00, fma( v01, v01, v02 * v02 ) ); + + v00 = ( msp * w ) * ( v00 / ( one + sqrt( one + v00 ) ) ); + + //-------------------------------------------------------------------------- + // Accumulate energy for each vector element. + //-------------------------------------------------------------------------- + en00 += ( double ) v00(0); + en01 += ( double ) v00(1); + en02 += ( double ) v00(2); + en03 += ( double ) v00(3); + en04 += ( double ) v00(4); + en05 += ( double ) v00(5); + en06 += ( double ) v00(6); + en07 += ( double ) v00(7); + } + + //-------------------------------------------------------------------------- + // Accumulate energy for each rank or thread. + //-------------------------------------------------------------------------- + args->en[pipeline_rank] = en00 + en01 + en02 + en03 + + en04 + en05 + en06 + en07; +} + +#else + +void +energy_p_pipeline_v8( energy_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v8 implementation. + ERROR( ( "No energy_p_pipeline_v8 implementation." ) ); +} + +#endif diff --git a/src/species_advance/standard/pipeline/sort_p_pipeline.c b/src/species_advance/standard/pipeline/sort_p_pipeline.c new file mode 100644 index 00000000..bafd2dcb --- /dev/null +++ b/src/species_advance/standard/pipeline/sort_p_pipeline.c @@ -0,0 +1,371 @@ +//============================================================================// +// Written by: +// Kevin J. Bowers, Ph.D. +// Plasma Physics Group (X-1) +// Applied Physics Division +// Los Alamos National Lab +// March/April 2004 - Revised and extened from earlier V4PIC versions. +//============================================================================// + +#define IN_spa + +#include "spa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +//----------------------------------------------------------------------------// +// This is the new thread parallel version of the particle sort. +//----------------------------------------------------------------------------// + +#if defined( __SSE__ ) +#include "xmmintrin.h" +#endif + +// FIXME: HOOK UP IN-PLACE / OUT-PLACE OPTIONS AGAIN. + +//----------------------------------------------------------------------------// +// +//----------------------------------------------------------------------------// + +void +coarse_count_pipeline_scalar( sort_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + const particle_t * RESTRICT ALIGNED(128) p_src = args->p; + + int i, i1; + + int n_subsort = args->n_subsort; + int vl = args->vl; + int vh = args->vh; + int cp_stride = POW2_CEIL( n_subsort, 4 ); + + // On pipeline stack to avoid cache hot spots. + int count[256]; + + // No straggler cleanup needed. 
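+  // As in the other pipelines, the caller also invokes this function for the
+  // host with pipeline_rank == n_pipeline; because DISTRIBUTE is used with a
+  // block size of 1 here, no straggler work is left over for that rank, so
+  // it returns immediately.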
+ if ( pipeline_rank == n_pipeline ) + { + return; + } + + if ( n_subsort > 256 ) + { + ERROR( ( "n_subsort too large." ) ); + } + + DISTRIBUTE( args->n, 1, pipeline_rank, n_pipeline, i, i1 ); + + i1 += i; + + // Clear the local coarse count. + CLEAR( count, n_subsort ); + + // Local coarse count the input particles. + for( ; i < i1; i++ ) + { + count[ V2P( p_src[i].i, n_subsort, vl, vh ) ]++; + } + + // Copy local coarse count to output. + COPY( args->coarse_partition + cp_stride*pipeline_rank, + count, + n_subsort ); +} + +//----------------------------------------------------------------------------// +// +//----------------------------------------------------------------------------// + +void +coarse_sort_pipeline_scalar( sort_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + const particle_t * RESTRICT ALIGNED(128) p_src = args->p; + /**/ particle_t * RESTRICT ALIGNED(128) p_dst = args->aux_p; + + int i, i1; + int n_subsort = args->n_subsort; + int vl = args->vl; + int vh = args->vh; + int cp_stride = POW2_CEIL( n_subsort, 4 ); + int j; + + // On pipeline stack to avoid cache hot spots and to allow reuse of coarse + // partitioning for fine sort stage. + int next[ 256 ]; + + // No straggler cleanup needed. + if ( pipeline_rank == n_pipeline ) + { + return; + } + + if ( n_subsort > 256 ) + { + ERROR( ( "n_subsort too large." ) ); + } + + DISTRIBUTE( args->n, 1, pipeline_rank, n_pipeline, i, i1 ); + + i1 += i; + + // Load the local coarse partitioning into next. + COPY( next, + args->coarse_partition + cp_stride*pipeline_rank, + n_subsort ); + + // Copy particles into aux array in coarse sorted order. + for( ; i < i1; i++ ) + { + j = next[ V2P( p_src[i].i, n_subsort, vl, vh ) ]++; + +# if defined( __SSE__ ) + + _mm_store_ps( &p_dst[j].dx, _mm_load_ps( &p_src[i].dx ) ); + _mm_store_ps( &p_dst[j].ux, _mm_load_ps( &p_src[i].ux ) ); + +# else + + p_dst[j] = p_src[i]; + +# endif + } +} + +//----------------------------------------------------------------------------// +// +//----------------------------------------------------------------------------// + +void +subsort_pipeline_scalar( sort_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + const particle_t * RESTRICT ALIGNED(128) p_src = args->aux_p; + /**/ particle_t * RESTRICT ALIGNED(128) p_dst = args->p; + + int i0, i1, v0, v1, i, j, v, sum, count; + + int subsort; + + int n_subsort = args->n_subsort; + + int * RESTRICT ALIGNED(128) partition = args->partition; + int * RESTRICT ALIGNED(128) next = args->next; + + // No straggler cleanup needed. + if ( pipeline_rank == n_pipeline ) + { + return; + } + + for( subsort = pipeline_rank; subsort < n_subsort; subsort += n_pipeline ) + { + // This subsort sorts particles in [i0,i1) in the aux array. These + // particles are in voxels [v0,v1). + i0 = args->coarse_partition[ subsort ]; + i1 = args->coarse_partition[ subsort+1 ]; + + v0 = P2V( subsort, n_subsort, args->vl, args->vh ); + v1 = P2V( subsort+1, n_subsort, args->vl, args->vh ); + + // Clear fine grained count. + CLEAR( &next[v0], v1 - v0 ); + + // Fine grained count. + for( i = i0; i < i1; i++ ) + { + next[ p_src[i].i ]++; + } + + // Compute the partitioning. + sum = i0; + for( v = v0; v < v1; v++ ) + { + count = next[v]; + next[v] = sum; + partition[v] = sum; + sum += count; + } + // All subsorts who write this agree. + partition[v1] = sum; + + // Local fine grained sort. 
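Both the coarse stage and this fine-grained subsort are passes of a counting sort: count keys into buckets, convert the counts into an exclusive prefix sum (the partitioning), then scatter each element to next[key]++. V2P and P2V map voxels in the live range [vl, vh] onto subsort buckets and back. A minimal standalone example of the same count / prefix-sum / scatter pattern:

/* Minimal illustration of the count / exclusive-prefix-sum / scatter
   pattern used by both the coarse sort and the fine-grained subsort.
   Keys are assumed to lie in [0, n_keys). */
static void
counting_sort_sketch( const int * key, int n, int n_keys,
                      int * order /* output: element indices in sorted order */ )
{
  int next[ 256 ];   /* assumes n_keys <= 256, like the pipelines above */
  int i, k, sum, count;

  for( k = 0; k < n_keys; k++ ) next[k] = 0;

  for( i = 0; i < n; i++ ) next[ key[i] ]++;               /* count       */

  sum = 0;
  for( k = 0; k < n_keys; k++ )                            /* prefix sum  */
  {
    count   = next[k];
    next[k] = sum;
    sum    += count;
  }

  for( i = 0; i < n; i++ ) order[ next[ key[i] ]++ ] = i;  /* scatter     */
}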
+ for( i = i0; i < i1; i++ ) + { + v = p_src[i].i; + j = next[v]++; + +# if defined( __SSE__ ) + + _mm_store_ps( &p_dst[j].dx, _mm_load_ps( &p_src[i].dx ) ); + _mm_store_ps( &p_dst[j].ux, _mm_load_ps( &p_src[i].ux ) ); + +# else + + p_dst[j] = p_src[i]; + +# endif + } + } +} + +//----------------------------------------------------------------------------// +// +//----------------------------------------------------------------------------// + +void +sort_p_pipeline( species_t * sp ) +{ + if ( !sp ) + { + ERROR( ( "Bad args" ) ); + } + + sp->last_sorted = sp->g->step; + + static char * ALIGNED(128) scratch = NULL; + static size_t max_scratch = 0; + + size_t sz_scratch; + + particle_t * RESTRICT ALIGNED(128) p = sp->p; + particle_t * RESTRICT ALIGNED(128) aux_p; + + int n_particle = sp->np; + + int * RESTRICT ALIGNED(128) partition = sp->partition; + int * RESTRICT ALIGNED(128) next; + + int vl = VOXEL( 1, + 1, + 1, + sp->g->nx, + sp->g->ny, + sp->g->nz ); + + int vh = VOXEL( sp->g->nx, + sp->g->ny, + sp->g->nz, + sp->g->nx, + sp->g->ny, + sp->g->nz ); + + int n_voxel = sp->g->nv; + + int * RESTRICT ALIGNED(128) coarse_partition; + + int n_pipeline = N_PIPELINE; + int n_subsort = N_PIPELINE; + + int cp_stride = POW2_CEIL( n_subsort, 4 ); + + int i, pipeline_rank, subsort, count, sum; + + DECLARE_ALIGNED_ARRAY( sort_p_pipeline_args_t, 128, args, 1 ); + + // Ensure enough scratch space is allocated for the sorting. + sz_scratch = ( sizeof( *p ) * n_particle + + 128 + + sizeof( *partition ) * n_voxel + + 128 + + sizeof( *coarse_partition ) * ( cp_stride * n_pipeline + 1 ) ); + + if ( sz_scratch > max_scratch ) + { + FREE_ALIGNED( scratch ); + + MALLOC_ALIGNED( scratch, sz_scratch, 128 ); + + max_scratch = sz_scratch; + } + + aux_p = ALIGN_PTR( particle_t, scratch, 128 ); + next = ALIGN_PTR( int, aux_p + n_particle, 128 ); + coarse_partition = ALIGN_PTR( int, next + n_voxel, 128 ); + + // Setup pipeline arguments. + args->p = p; + args->aux_p = aux_p; + args->coarse_partition = coarse_partition; + args->next = next; + args->partition = partition; + args->n = n_particle; + args->n_subsort = n_subsort; + args->vl = vl; + args->vh = vh; + args->n_voxel = n_voxel; + + if ( n_subsort != 1 ) + { + // Do the coarse count. + EXEC_PIPELINES( coarse_count, args, 0 ); + + WAIT_PIPELINES(); + + // Convert the coarse count into a coarse partitioning. + sum = 0; + for( subsort = 0; subsort < n_subsort; subsort++ ) + { + for( pipeline_rank = 0; pipeline_rank < n_pipeline; pipeline_rank++ ) + { + i = subsort + cp_stride * pipeline_rank; + count = coarse_partition[i]; + coarse_partition[i] = sum; + sum += count; + } + } + + // Do the coarse sort. + EXEC_PIPELINES( coarse_sort, args, 0 ); + + WAIT_PIPELINES(); + + // Convert the coarse partitioning used during the coarse sort into the + // partitioning of the particle list by subsort pipelines. + coarse_partition[ n_subsort ] = n_particle; + + // Do fine grained subsorts. While the fine grained subsorts are + // executing, clear the ghost parts of the partitioning array. + EXEC_PIPELINES( subsort, args, 0 ); + + CLEAR( partition, vl ); + + for( i = vh + 1; i < n_voxel; i++ ) + { + partition[i] = n_particle; + } + + WAIT_PIPELINES(); + } + + else + { + // Just do the subsort when single threaded. We need to hack the aux + // arrays and what not to make it look like coarse sorting was done to + // the subsort pipeline. 
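The scratch block allocated above is carved into the aux particle array, the fine-grained next array, and the coarse partition table with ALIGN_PTR rather than three separate allocations, which lets the buffer persist across steps and limits heap fragmentation. A sketch of that sub-allocation pattern, assuming ALIGN_PTR rounds a pointer up to the requested power-of-two alignment:

#include <stdint.h>

/* Illustrative only: round p up to the next multiple of align
   (align must be a power of two).  This mirrors carving one persistent
   scratch buffer into several 128-byte aligned arrays. */
static void *
align_up( void * p, uintptr_t align )
{
  uintptr_t u = (uintptr_t)p;
  return (void *)( ( u + align - 1 ) & ~( align - 1 ) );
}

/* Hypothetical usage with the names from the sort above:
     aux_p            = align_up( scratch,            128 );
     next             = align_up( aux_p + n_particle, 128 );
     coarse_partition = align_up( next  + n_voxel,    128 );  */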
+ coarse_partition[0] = 0; + coarse_partition[1] = n_particle; + + args->p = aux_p; + args->aux_p = p; + + subsort_pipeline_scalar( args, 0, 1 ); + + CLEAR( partition, vl ); + + for( i = vh + 1; i < n_voxel; i++ ) + { + partition[i] = n_particle; + } + + // Results ended up in the wrong place as a result of the ugly hack above. + // Copy it to the right place and undo the above hack. FIXME: IF WILLING + // TO MOVE SP->P AROUND AND DO MORE MALLOCS PER STEP I.E. HEAP + // FRAGMENTATION, COULD AVOID THIS COPY. + COPY( p, aux_p, n_particle ); + } +} diff --git a/src/species_advance/standard/spa_private.h b/src/species_advance/standard/pipeline/spa_private.h similarity index 58% rename from src/species_advance/standard/spa_private.h rename to src/species_advance/standard/pipeline/spa_private.h index a2a11e67..ed95881e 100644 --- a/src/species_advance/standard/spa_private.h +++ b/src/species_advance/standard/pipeline/spa_private.h @@ -5,13 +5,13 @@ #error "Do not include spa_private.h; include species_advance.h" #endif -#include "../species_advance.h" +#include "../../species_advance.h" /////////////////////////////////////////////////////////////////////////////// // advance_p_pipeline interface -typedef struct particle_mover_seg { - +typedef struct particle_mover_seg +{ MEM_PTR( particle_mover_t, 16 ) pm; // First mover in segment int max_nm; // Maximum number of movers int nm; // Number of movers used @@ -21,8 +21,8 @@ typedef struct particle_mover_seg { } particle_mover_seg_t; -typedef struct advance_p_pipeline_args { - +typedef struct advance_p_pipeline_args +{ MEM_PTR( particle_t, 128 ) p0; // Particle array MEM_PTR( particle_mover_t, 128 ) pm; // Particle mover array MEM_PTR( accumulator_t, 128 ) a0; // Accumulator arrays @@ -46,13 +46,33 @@ typedef struct advance_p_pipeline_args { } advance_p_pipeline_args_t; -PROTOTYPE_PIPELINE( advance_p, advance_p_pipeline_args_t ); +// PROTOTYPE_PIPELINE( advance_p, advance_p_pipeline_args_t ); + +void +advance_p_pipeline_scalar( advance_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +advance_p_pipeline_v4( advance_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +advance_p_pipeline_v8( advance_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +advance_p_pipeline_v16( advance_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); /////////////////////////////////////////////////////////////////////////////// // center_p_pipeline and uncenter_p_pipeline interface -typedef struct center_p_pipeline_args { - +typedef struct center_p_pipeline_args +{ MEM_PTR( particle_t, 128 ) p0; // Particle array MEM_PTR( const interpolator_t, 128 ) f0; // Interpolator array float qdt_2mc; // Particle/field coupling @@ -62,14 +82,54 @@ typedef struct center_p_pipeline_args { } center_p_pipeline_args_t; -PROTOTYPE_PIPELINE( center_p, center_p_pipeline_args_t ); -PROTOTYPE_PIPELINE( uncenter_p, center_p_pipeline_args_t ); +// PROTOTYPE_PIPELINE( center_p, center_p_pipeline_args_t ); + +void +center_p_pipeline_scalar( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +center_p_pipeline_v4( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +center_p_pipeline_v8( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); +void +center_p_pipeline_v16( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +// PROTOTYPE_PIPELINE( uncenter_p, center_p_pipeline_args_t ); + +void 
+uncenter_p_pipeline_scalar( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +uncenter_p_pipeline_v8( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +uncenter_p_pipeline_v16( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); /////////////////////////////////////////////////////////////////////////////// // energy_p_pipeline interface -typedef struct energy_p_pipeline_args { - +typedef struct energy_p_pipeline_args +{ MEM_PTR( const particle_t, 128 ) p; // Particle array MEM_PTR( const interpolator_t, 128 ) f; // Interpolator array MEM_PTR( double, 128 ) en; // Return values @@ -81,7 +141,27 @@ typedef struct energy_p_pipeline_args { } energy_p_pipeline_args_t; -PROTOTYPE_PIPELINE( energy_p, energy_p_pipeline_args_t ); +// PROTOTYPE_PIPELINE( energy_p, energy_p_pipeline_args_t ); + +void +energy_p_pipeline_scalar( energy_p_pipeline_args_t * RESTRICT args, + int pipeline_rank, + int n_pipeline ); + +void +energy_p_pipeline_v4( energy_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +energy_p_pipeline_v8( energy_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +energy_p_pipeline_v16( energy_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); /////////////////////////////////////////////////////////////////////////////// // sort_p_pipeline interface @@ -112,8 +192,8 @@ PROTOTYPE_PIPELINE( energy_p, energy_p_pipeline_args_t ); // FIXME: safe to remove? enum { max_subsort_voxel = 26624 }; -typedef struct sort_p_pipeline_args { - +typedef struct sort_p_pipeline_args +{ MEM_PTR( particle_t, 128 ) p; // Particles (0:n-1) MEM_PTR( particle_t, 128 ) aux_p; // Aux particle atorage (0:n-1) MEM_PTR( int, 128 ) coarse_partition; // Coarse partition storage @@ -129,8 +209,23 @@ typedef struct sort_p_pipeline_args { } sort_p_pipeline_args_t; -PROTOTYPE_PIPELINE( coarse_count, sort_p_pipeline_args_t ); -PROTOTYPE_PIPELINE( coarse_sort, sort_p_pipeline_args_t ); -PROTOTYPE_PIPELINE( subsort, sort_p_pipeline_args_t ); +// PROTOTYPE_PIPELINE( coarse_count, sort_p_pipeline_args_t ); +// PROTOTYPE_PIPELINE( coarse_sort, sort_p_pipeline_args_t ); +// PROTOTYPE_PIPELINE( subsort, sort_p_pipeline_args_t ); + +void +coarse_count_pipeline_scalar( sort_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +coarse_sort_pipeline_scalar( sort_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); + +void +subsort_pipeline_scalar( sort_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ); #endif // _spa_private_h_ diff --git a/src/species_advance/standard/pipeline/uncenter_p_pipeline.cc b/src/species_advance/standard/pipeline/uncenter_p_pipeline.cc new file mode 100644 index 00000000..f3b6d442 --- /dev/null +++ b/src/species_advance/standard/pipeline/uncenter_p_pipeline.cc @@ -0,0 +1,129 @@ +#define IN_spa + +#define HAS_V4_PIPELINE +#define HAS_V8_PIPELINE +#define HAS_V16_PIPELINE + +#include "spa_private.h" + +#include "../../../util/pipelines/pipelines_exec.h" + +//----------------------------------------------------------------------------// +// Reference implementation for an uncenter_p pipeline function which does not +// make use of explicit calls to vector intrinsic functions. 
+//----------------------------------------------------------------------------// + +void +uncenter_p_pipeline_scalar( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + const interpolator_t * ALIGNED(128) f0 = args->f0; + + particle_t * ALIGNED(32) p; + + const interpolator_t * ALIGNED(16) f; + + const float qdt_2mc = -args->qdt_2mc; // For backward half advance + const float qdt_4mc = -0.5*args->qdt_2mc; // For backward half rotate + const float one = 1.0; + const float one_third = 1.0/3.0; + const float two_fifteenths = 2.0/15.0; + + float dx, dy, dz, ux, uy, uz; + float hax, hay, haz, cbx, cby, cbz; + float v0, v1, v2, v3, v4; + int ii; + + int first, n; + + // Determine which particles this pipeline processes. + + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, first, n ); + + p = args->p0 + first; + + // Process particles for this pipeline. + + for( ; n; n--, p++ ) + { + dx = p->dx; // Load position + dy = p->dy; + dz = p->dz; + ii = p->i; + + f = f0 + ii; // Interpolate E + + hax = qdt_2mc*( ( f->ex + dy*f->dexdy ) + + dz*( f->dexdz + dy*f->d2exdydz ) ); + + hay = qdt_2mc*( ( f->ey + dz*f->deydz ) + + dx*( f->deydx + dz*f->d2eydzdx ) ); + + haz = qdt_2mc*( ( f->ez + dx*f->dezdx ) + + dy*( f->dezdy + dx*f->d2ezdxdy ) ); + + cbx = f->cbx + dx*f->dcbxdx; // Interpolate B + cby = f->cby + dy*f->dcbydy; + cbz = f->cbz + dz*f->dcbzdz; + + ux = p->ux; // Load momentum + uy = p->uy; + uz = p->uz; + + v0 = qdt_4mc/(float)sqrt(one + (ux*ux + (uy*uy + uz*uz))); + /**/ // Boris - scalars + v1 = cbx*cbx + (cby*cby + cbz*cbz); + v2 = (v0*v0)*v1; + v3 = v0*(one+v2*(one_third+v2*two_fifteenths)); + v4 = v3/(one+v1*(v3*v3)); + v4 += v4; + + v0 = ux + v3*( uy*cbz - uz*cby ); // Boris - uprime + v1 = uy + v3*( uz*cbx - ux*cbz ); + v2 = uz + v3*( ux*cby - uy*cbx ); + + ux += v4*( v1*cbz - v2*cby ); // Boris - rotation + uy += v4*( v2*cbx - v0*cbz ); + uz += v4*( v0*cby - v1*cbx ); + + ux += hax; // Half advance E + uy += hay; + uz += haz; + + p->ux = ux; // Store momentum + p->uy = uy; + p->uz = uz; + } +} + +//----------------------------------------------------------------------------// +// Top level function to select and call the proper uncenter_p pipeline +// function. +//----------------------------------------------------------------------------// + +void +uncenter_p_pipeline( species_t * RESTRICT sp, + const interpolator_array_t * RESTRICT ia ) +{ + DECLARE_ALIGNED_ARRAY( center_p_pipeline_args_t, 128, args, 1 ); + + if ( !sp || + !ia || + sp->g != ia->g ) + { + ERROR( ( "Bad args" ) ); + } + + // Have the pipelines do the bulk of particles in blocks and have the + // host do the final incomplete block. 
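In the scalar pipeline above, v3 and v4 are the angle-corrected Boris rotation factors: with x = qdt_4mc*|cb|/gamma, the polynomial one + v2*(one_third + v2*two_fifteenths) is the truncated Taylor series of tan(x)/x, so v3 approximates tan(x)/|cb| and v4 = 2*v3/(1 + |cb|^2*v3^2) approximates sin(2x)/|cb|; the two cross products then rotate the momentum about the magnetic field by 2x, the half-step gyration angle, without changing its magnitude. A small standalone check of that series (illustrative only):

#include <math.h>
#include <stdio.h>

/* Compare the truncated series 1 + x^2/3 + 2*x^4/15 used above with
   tan(x)/x over the small rotation angles it is meant for. */
int
main( void )
{
  for( double x = 0.0; x <= 0.5; x += 0.1 )
  {
    double series = 1.0 + x*x*( 1.0/3.0 + x*x*( 2.0/15.0 ) );
    double exact  = ( x > 0.0 ) ? tan( x )/x : 1.0;

    printf( "x=%4.2f  series=%.6f  tan(x)/x=%.6f\n", x, series, exact );
  }

  return 0;
}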
+ + args->p0 = sp->p; + args->f0 = ia->i; + args->qdt_2mc = (sp->q*sp->g->dt)/(2*sp->m*sp->g->cvac); + args->np = sp->np; + + EXEC_PIPELINES( uncenter_p, args, 0 ); + + WAIT_PIPELINES(); +} diff --git a/src/species_advance/standard/pipeline/uncenter_p_pipeline_v16.cc b/src/species_advance/standard/pipeline/uncenter_p_pipeline_v16.cc new file mode 100644 index 00000000..e99ee084 --- /dev/null +++ b/src/species_advance/standard/pipeline/uncenter_p_pipeline_v16.cc @@ -0,0 +1,161 @@ +#define IN_spa + +#include "spa_private.h" + +#if defined(V16_ACCELERATION) + +using namespace v16; + +void +uncenter_p_pipeline_v16( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + const interpolator_t * ALIGNED(128) f0 = args->f0; + + particle_t * ALIGNED(128) p; + + const float * ALIGNED(64) vp00; + const float * ALIGNED(64) vp01; + const float * ALIGNED(64) vp02; + const float * ALIGNED(64) vp03; + const float * ALIGNED(64) vp04; + const float * ALIGNED(64) vp05; + const float * ALIGNED(64) vp06; + const float * ALIGNED(64) vp07; + const float * ALIGNED(64) vp08; + const float * ALIGNED(64) vp09; + const float * ALIGNED(64) vp10; + const float * ALIGNED(64) vp11; + const float * ALIGNED(64) vp12; + const float * ALIGNED(64) vp13; + const float * ALIGNED(64) vp14; + const float * ALIGNED(64) vp15; + + const v16float qdt_2mc( -args->qdt_2mc); // For backward half advance. + const v16float qdt_4mc(-0.5*args->qdt_2mc); // For backward half Boris rotate. + const v16float one(1.0); + const v16float one_third(1.0/3.0); + const v16float two_fifteenths(2.0/15.0); + + v16float dx, dy, dz, ux, uy, uz, q; + v16float hax, hay, haz, cbx, cby, cbz; + v16float v00, v01, v02, v03, v04, v05, v06, v07, v08, v09, v10; + v16int ii; + + int first, nq; + + // Determine which particle blocks this pipeline processes. + + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, first, nq ); + + p = args->p0 + first; + + nq >>= 4; + + // Process the particle blocks for this pipeline. + + for( ; nq; nq--, p+=16 ) + { + //-------------------------------------------------------------------------- + // Load particle data. + //-------------------------------------------------------------------------- + load_16x8_tr_p( &p[ 0].dx, &p[ 2].dx, &p[ 4].dx, &p[ 6].dx, + &p[ 8].dx, &p[10].dx, &p[12].dx, &p[14].dx, + dx, dy, dz, ii, ux, uy, uz, q ); + + //-------------------------------------------------------------------------- + // Set field interpolation pointers. 
+ //-------------------------------------------------------------------------- + vp00 = ( const float * ALIGNED(64) ) ( f0 + ii( 0) ); + vp01 = ( const float * ALIGNED(64) ) ( f0 + ii( 1) ); + vp02 = ( const float * ALIGNED(64) ) ( f0 + ii( 2) ); + vp03 = ( const float * ALIGNED(64) ) ( f0 + ii( 3) ); + vp04 = ( const float * ALIGNED(64) ) ( f0 + ii( 4) ); + vp05 = ( const float * ALIGNED(64) ) ( f0 + ii( 5) ); + vp06 = ( const float * ALIGNED(64) ) ( f0 + ii( 6) ); + vp07 = ( const float * ALIGNED(64) ) ( f0 + ii( 7) ); + vp08 = ( const float * ALIGNED(64) ) ( f0 + ii( 8) ); + vp09 = ( const float * ALIGNED(64) ) ( f0 + ii( 9) ); + vp10 = ( const float * ALIGNED(64) ) ( f0 + ii(10) ); + vp11 = ( const float * ALIGNED(64) ) ( f0 + ii(11) ); + vp12 = ( const float * ALIGNED(64) ) ( f0 + ii(12) ); + vp13 = ( const float * ALIGNED(64) ) ( f0 + ii(13) ); + vp14 = ( const float * ALIGNED(64) ) ( f0 + ii(14) ); + vp15 = ( const float * ALIGNED(64) ) ( f0 + ii(15) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_16x16_tr( vp00, vp01, vp02, vp03, + vp04, vp05, vp06, vp07, + vp08, vp09, vp10, vp11, + vp12, vp13, vp14, vp15, + hax, v00, v01, v02, hay, v03, v04, v05, + haz, v06, v07, v08, cbx, v09, cby, v10 ); + + hax = qdt_2mc*fma( fma( dy, v02, v01 ), dz, fma( dy, v00, hax ) ); + + hay = qdt_2mc*fma( fma( dz, v05, v04 ), dx, fma( dz, v03, hay ) ); + + haz = qdt_2mc*fma( fma( dx, v08, v07 ), dy, fma( dx, v06, haz ) ); + + cbx = fma( v09, dx, cbx ); + + cby = fma( v10, dy, cby ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles, final. + //-------------------------------------------------------------------------- + load_16x2_tr( vp00+16, vp01+16, vp02+16, vp03+16, + vp04+16, vp05+16, vp06+16, vp07+16, + vp08+16, vp09+16, vp10+16, vp11+16, + vp12+16, vp13+16, vp14+16, vp15+16, + cbz, v05 ); + + cbz = fma( v05, dz, cbz ); + + //-------------------------------------------------------------------------- + // Update momentum. + //-------------------------------------------------------------------------- + v00 = qdt_4mc * rsqrt( one + fma( ux, ux, fma( uy, uy, uz * uz ) ) ); + v01 = fma( cbx, cbx, fma( cby, cby, cbz * cbz ) ); + v02 = ( v00 * v00 ) * v01; + v03 = v00 * fma( v02, fma( v02, two_fifteenths, one_third ), one ); + v04 = v03 * rcp( fma( v03 * v03, v01, one ) ); + v04 += v04; + + v00 = fma( fms( uy, cbz, uz * cby ), v03, ux ); + v01 = fma( fms( uz, cbx, ux * cbz ), v03, uy ); + v02 = fma( fms( ux, cby, uy * cbx ), v03, uz ); + + ux = fma( fms( v01, cbz, v02 * cby ), v04, ux ); + uy = fma( fms( v02, cbx, v00 * cbz ), v04, uy ); + uz = fma( fms( v00, cby, v01 * cbx ), v04, uz ); + + ux += hax; + uy += hay; + uz += haz; + + //-------------------------------------------------------------------------- + // Store particle momentum data. Could use store_16x4_tr_p or + // store_16x3_tr_p. + //-------------------------------------------------------------------------- + store_16x8_tr_p( dx, dy, dz, ii, ux, uy, uz, q, + &p[ 0].dx, &p[ 2].dx, &p[ 4].dx, &p[ 6].dx, + &p[ 8].dx, &p[10].dx, &p[12].dx, &p[14].dx ); + } +} + +#else + +void +uncenter_p_pipeline_v16( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v16 implementation. + ERROR( ( "No uncenter_p_pipeline_v16 implementation." 
) ); +} + +#endif diff --git a/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc b/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc new file mode 100644 index 00000000..908fedec --- /dev/null +++ b/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc @@ -0,0 +1,150 @@ +#define IN_spa + +#include "spa_private.h" + +#if defined(V4_ACCELERATION) + +using namespace v4; + +void +uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + const interpolator_t * ALIGNED(128) f0 = args->f0; + + particle_t * ALIGNED(128) p; + + const float * ALIGNED(16) vp00; + const float * ALIGNED(16) vp01; + const float * ALIGNED(16) vp02; + const float * ALIGNED(16) vp03; + + const v4float qdt_2mc( -args->qdt_2mc); // For backward half advance. + const v4float qdt_4mc(-0.5*args->qdt_2mc); // For backward half Boris rotate. + const v4float one(1.0); + const v4float one_third(1.0/3.0); + const v4float two_fifteenths(2.0/15.0); + + v4float dx, dy, dz, ux, uy, uz, q; + v4float hax, hay, haz, cbx, cby, cbz; + v4float v00, v01, v02, v03, v04, v05; + v4int ii; + + int first, nq; + + // Determine which particle quads this pipeline processes. + + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, first, nq ); + + p = args->p0 + first; + + nq >>= 2; + + // Process the particle quads for this pipeline. + + for( ; nq; nq--, p+=4 ) + { + //-------------------------------------------------------------------------- + // Load particle position data. + //-------------------------------------------------------------------------- + load_4x4_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, + dx, dy, dz, ii ); + + //-------------------------------------------------------------------------- + // Set field interpolation pointers. + //-------------------------------------------------------------------------- + vp00 = ( const float * ALIGNED(16) ) ( f0 + ii(0) ); + vp01 = ( const float * ALIGNED(16) ) ( f0 + ii(1) ); + vp02 = ( const float * ALIGNED(16) ) ( f0 + ii(2) ); + vp03 = ( const float * ALIGNED(16) ) ( f0 + ii(3) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_4x4_tr( vp00, vp01, vp02, vp03, + hax, v00, v01, v02 ); + + hax = qdt_2mc*fma( fma( dy, v02, v01 ), dz, fma( dy, v00, hax ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_4x4_tr( vp00+4, vp01+4, vp02+4, vp03+4, + hay, v03, v04, v05 ); + + hay = qdt_2mc*fma( fma( dz, v05, v04 ), dx, fma( dz, v03, hay ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_4x4_tr( vp00+8, vp01+8, vp02+8, vp03+8, + haz, v00, v01, v02 ); + + haz = qdt_2mc*fma( fma( dx, v02, v01 ), dy, fma( dx, v00, haz ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. 
+ //-------------------------------------------------------------------------- + load_4x4_tr( vp00+12, vp01+12, vp02+12, vp03+12, + cbx, v03, cby, v04 ); + + cbx = fma( v03, dx, cbx ); + cby = fma( v04, dy, cby ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles, final. + //-------------------------------------------------------------------------- + load_4x2_tr( vp00+16, vp01+16, vp02+16, vp03+16, + cbz, v05 ); + + cbz = fma( v05, dz, cbz ); + + //-------------------------------------------------------------------------- + // Load particle momentum data. Could use load_4x3_tr. + //-------------------------------------------------------------------------- + load_4x4_tr( &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux, + ux, uy, uz, q ); + + //-------------------------------------------------------------------------- + // Update momentum. + //-------------------------------------------------------------------------- + v00 = qdt_4mc * rsqrt( one + fma( ux, ux, fma( uy, uy, uz * uz ) ) ); + v01 = fma( cbx, cbx, fma( cby, cby, cbz * cbz ) ); + v02 = ( v00 * v00 ) * v01; + v03 = v00 * fma( v02, fma( v02, two_fifteenths, one_third ), one ); + v04 = v03 * rcp( fma( v03 * v03, v01, one ) ); + v04 += v04; + + v00 = fma( fms( uy, cbz, uz * cby ), v03, ux ); + v01 = fma( fms( uz, cbx, ux * cbz ), v03, uy ); + v02 = fma( fms( ux, cby, uy * cbx ), v03, uz ); + + ux = fma( fms( v01, cbz, v02 * cby ), v04, ux ); + uy = fma( fms( v02, cbx, v00 * cbz ), v04, uy ); + uz = fma( fms( v00, cby, v01 * cbx ), v04, uz ); + + ux += hax; + uy += hay; + uz += haz; + + //-------------------------------------------------------------------------- + // Store particle data. Could use store_4x3_tr. + //-------------------------------------------------------------------------- + store_4x4_tr( ux, uy, uz, q, + &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux ); + } +} + +#else + +void +uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v4 implementation. + ERROR( ( "No uncenter_p_pipeline_v4 implementation." ) ); +} + +#endif diff --git a/src/species_advance/standard/pipeline/uncenter_p_pipeline_v8.cc b/src/species_advance/standard/pipeline/uncenter_p_pipeline_v8.cc new file mode 100644 index 00000000..f0a7a3e8 --- /dev/null +++ b/src/species_advance/standard/pipeline/uncenter_p_pipeline_v8.cc @@ -0,0 +1,163 @@ +#define IN_spa + +#include "spa_private.h" + +#if defined(V8_ACCELERATION) + +using namespace v8; + +void +uncenter_p_pipeline_v8( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + const interpolator_t * ALIGNED(128) f0 = args->f0; + + particle_t * ALIGNED(128) p; + + const float * ALIGNED(32) vp00; + const float * ALIGNED(32) vp01; + const float * ALIGNED(32) vp02; + const float * ALIGNED(32) vp03; + const float * ALIGNED(32) vp04; + const float * ALIGNED(32) vp05; + const float * ALIGNED(32) vp06; + const float * ALIGNED(32) vp07; + + const v8float qdt_2mc( -args->qdt_2mc); // For backward half advance. + const v8float qdt_4mc(-0.5*args->qdt_2mc); // For backward half Boris rotate. + const v8float one(1.0); + const v8float one_third(1.0/3.0); + const v8float two_fifteenths(2.0/15.0); + + v8float dx, dy, dz, ux, uy, uz, q; + v8float hax, hay, haz, cbx, cby, cbz; + v8float v00, v01, v02, v03, v04, v05; + v8int ii; + + int first, nq; + + // Determine which particle quads this pipeline processes. 
+ + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, first, nq ); + + p = args->p0 + first; + + nq >>= 3; + + // Process the particle blocks for this pipeline. + + for( ; nq; nq--, p+=8 ) + { + //-------------------------------------------------------------------------- + // Load particle data. + //-------------------------------------------------------------------------- + load_8x4_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, + &p[4].dx, &p[5].dx, &p[6].dx, &p[7].dx, + dx, dy, dz, ii ); + + //-------------------------------------------------------------------------- + // Set field interpolation pointers. + //-------------------------------------------------------------------------- + vp00 = ( const float * ALIGNED(32) ) ( f0 + ii(0) ); + vp01 = ( const float * ALIGNED(32) ) ( f0 + ii(1) ); + vp02 = ( const float * ALIGNED(32) ) ( f0 + ii(2) ); + vp03 = ( const float * ALIGNED(32) ) ( f0 + ii(3) ); + vp04 = ( const float * ALIGNED(32) ) ( f0 + ii(4) ); + vp05 = ( const float * ALIGNED(32) ) ( f0 + ii(5) ); + vp06 = ( const float * ALIGNED(32) ) ( f0 + ii(6) ); + vp07 = ( const float * ALIGNED(32) ) ( f0 + ii(7) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_8x4_tr( vp00, vp01, vp02, vp03, + vp04, vp05, vp06, vp07, + hax, v00, v01, v02 ); + + hax = qdt_2mc*fma( fma( dy, v02, v01 ), dz, fma( dy, v00, hax ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_8x4_tr( vp00+4, vp01+4, vp02+4, vp03+4, + vp04+4, vp05+4, vp06+4, vp07+4, + hay, v03, v04, v05 ); + + hay = qdt_2mc*fma( fma( dz, v05, v04 ), dx, fma( dz, v03, hay ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_8x4_tr( vp00+8, vp01+8, vp02+8, vp03+8, + vp04+8, vp05+8, vp06+8, vp07+8, + haz, v00, v01, v02 ); + + haz = qdt_2mc*fma( fma( dx, v02, v01 ), dy, fma( dx, v00, haz ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_8x4_tr( vp00+12, vp01+12, vp02+12, vp03+12, + vp04+12, vp05+12, vp06+12, vp07+12, + cbx, v03, cby, v04 ); + + cbx = fma( v03, dx, cbx ); + cby = fma( v04, dy, cby ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles, final. + //-------------------------------------------------------------------------- + load_8x2_tr( vp00+16, vp01+16, vp02+16, vp03+16, + vp04+16, vp05+16, vp06+16, vp07+16, + cbz, v05 ); + + cbz = fma( v05, dz, cbz ); + + //-------------------------------------------------------------------------- + // Load particle data. Could use load_8x3_tr. + //-------------------------------------------------------------------------- + load_8x4_tr( &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux, + &p[4].ux, &p[5].ux, &p[6].ux, &p[7].ux, + ux, uy, uz, q ); + + //-------------------------------------------------------------------------- + // Update momentum. 
+ //-------------------------------------------------------------------------- + v00 = qdt_4mc * rsqrt( one + fma( ux, ux, fma( uy, uy, uz * uz ) ) ); + v01 = fma( cbx, cbx, fma( cby, cby, cbz * cbz ) ); + v02 = ( v00 * v00 ) * v01; + v03 = v00 * fma( v02, fma( v02, two_fifteenths, one_third ), one ); + v04 = v03 * rcp( fma( v03 * v03, v01, one ) ); + v04 += v04; + v00 = fma( fms( uy, cbz, uz * cby ), v03, ux ); + v01 = fma( fms( uz, cbx, ux * cbz ), v03, uy ); + v02 = fma( fms( ux, cby, uy * cbx ), v03, uz ); + ux = fma( fms( v01, cbz, v02 * cby ), v04, ux ); + uy = fma( fms( v02, cbx, v00 * cbz ), v04, uy ); + uz = fma( fms( v00, cby, v01 * cbx ), v04, uz ); + ux += hax; + uy += hay; + uz += haz; + + //-------------------------------------------------------------------------- + // Store particle data. Could use store_8x3_tr. + //-------------------------------------------------------------------------- + store_8x4_tr( ux, uy, uz, q, + &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux, + &p[4].ux, &p[5].ux, &p[6].ux, &p[7].ux ); + } +} + +#else + +void +uncenter_p_pipeline_v8( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + // No v8 implementation. + ERROR( ( "No uncenter_p_pipeline_v8 implementation." ) ); +} + +#endif diff --git a/src/species_advance/standard/rho_p.cc b/src/species_advance/standard/rho_p.cc index 2ad6800d..629badc1 100644 --- a/src/species_advance/standard/rho_p.cc +++ b/src/species_advance/standard/rho_p.cc @@ -9,7 +9,8 @@ */ #define IN_spa -#include "spa_private.h" + +#include "../species_advance.h" // accumulate_rho_p adds the charge density associated with the // supplied particle array to the rhof of the fields. Trilinear @@ -208,4 +209,3 @@ accumulate_rhob( field_t * RESTRICT ALIGNED(128) f, # endif } - diff --git a/src/species_advance/standard/sort_p.c b/src/species_advance/standard/sort_p.c index 236309a1..3464011a 100644 --- a/src/species_advance/standard/sort_p.c +++ b/src/species_advance/standard/sort_p.c @@ -1,319 +1,165 @@ +//============================================================================// +// Written by: +// Kevin J. Bowers, Ph.D. +// Plasma Physics Group (X-1) +// Applied Physics Division +// Los Alamos National Lab +// March/April 2004 - Revised and extened from earlier V4PIC versions. +//============================================================================// + #define IN_spa -#include "spa_private.h" -// FIXME: HOOK UP IN-PLACE / OUT-PLACE OPTIONS AGAIN +#include "../species_advance.h" -// FIXME: ALTIVEC ACCELERATE! -#if defined(__SSE__) -#include "xmmintrin.h" -#endif +//----------------------------------------------------------------------------// +// This is the legacy thread serial version of the particle sort. +//----------------------------------------------------------------------------// -void -coarse_count_pipeline( sort_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - const particle_t * RESTRICT ALIGNED(128) p_src = args->p; - int i, i1, n_subsort = args->n_subsort, vl = args->vl, vh = args->vh; - int cp_stride = POW2_CEIL( n_subsort, 4 ); - - int count[256]; // On pipe stack to avoid cache hot spots - - if( pipeline_rank==n_pipeline ) return; // No straggler cleanup needed - if( n_subsort>256 ) ERROR(( "n_subsort too large." 
)); - - DISTRIBUTE( args->n, 1, pipeline_rank, n_pipeline, i, i1 ); i1 += i; - - // Clear the local coarse count - CLEAR( count, n_subsort ); - - // Local coarse count the input particles - for( ; icoarse_partition + cp_stride*pipeline_rank, count, n_subsort ); -} +#if defined(VPIC_USE_LEGACY_SORT) -void -coarse_sort_pipeline( sort_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - const particle_t * RESTRICT ALIGNED(128) p_src = args->p; - /**/ particle_t * RESTRICT ALIGNED(128) p_dst = args->aux_p; - int i, i1, n_subsort = args->n_subsort, vl = args->vl, vh = args->vh; - int cp_stride = POW2_CEIL( n_subsort, 4 ); - int j; - - int next[ 256 ]; // On pipeline stack to avoid cache hot spots and to - // allow reuse of coarse partitioning for fine sort - // stage. - - if( pipeline_rank==n_pipeline ) return; // No straggler cleanup needed - if( n_subsort>256 ) ERROR(( "n_subsort too large." )); - - DISTRIBUTE( args->n, 1, pipeline_rank, n_pipeline, i, i1 ); i1 += i; - - // Load the local coarse partitioning into next - COPY( next, args->coarse_partition + cp_stride*pipeline_rank, n_subsort ); - - // Copy particles into aux array in coarse sorted order - for( ; iaux_p; - /**/ particle_t * RESTRICT ALIGNED(128) p_dst = args->p; - int i0, i1, v0, v1, i, j, v, sum, count; - int subsort, n_subsort = args->n_subsort; - - int * RESTRICT ALIGNED(128) partition = args->partition; - int * RESTRICT ALIGNED(128) next = args->next; - - if( pipeline_rank==n_pipeline ) return; // No straggler cleanup needed - - for( subsort=pipeline_rank; subsortcoarse_partition[subsort ]; - i1 = args->coarse_partition[subsort+1]; - v0 = P2V( subsort, n_subsort, args->vl, args->vh ); - v1 = P2V( subsort+1, n_subsort, args->vl, args->vh ); - - // Clear fine grained count - CLEAR( &next[v0], v1-v0 ); - - // Fine grained count - for( i=i0; ilast_sorted = sp->g->step; - - static char * ALIGNED(128) scratch = NULL; - static size_t max_scratch = 0; - size_t sz_scratch; - - particle_t * RESTRICT ALIGNED(128) p = sp->p; - particle_t * RESTRICT ALIGNED(128) aux_p; - int n_particle = sp->np; - - int * RESTRICT ALIGNED(128) partition = sp->partition; - int * RESTRICT ALIGNED(128) next; - int vl = VOXEL(1,1,1, sp->g->nx,sp->g->ny,sp->g->nz); - int vh = VOXEL(sp->g->nx,sp->g->ny,sp->g->nz, sp->g->nx,sp->g->ny,sp->g->nz); - int n_voxel = sp->g->nv; - - int * RESTRICT ALIGNED(128) coarse_partition; - int n_pipeline = N_PIPELINE; - int n_subsort = N_PIPELINE; - int cp_stride = POW2_CEIL( n_subsort, 4 ); - - int i, pipeline_rank, subsort, count, sum; - - DECLARE_ALIGNED_ARRAY( sort_p_pipeline_args_t, 128, args, 1 ); - - // Insure enough scratch space is allocated for the sorting - sz_scratch = ( sizeof(*p)*n_particle + 128 + - sizeof(*partition)*n_voxel + 128 + - sizeof(*coarse_partition)*(cp_stride*n_pipeline+1) ); - if( sz_scratch > max_scratch ) { - FREE_ALIGNED( scratch ); - MALLOC_ALIGNED( scratch, sz_scratch, 128 ); - max_scratch = sz_scratch; - } - aux_p = ALIGN_PTR( particle_t, scratch, 128 ); - next = ALIGN_PTR( int, aux_p + n_particle, 128 ); - coarse_partition = ALIGN_PTR( int, next + n_voxel, 128 ); - - // Setup pipeline arguments - args->p = p; - args->aux_p = aux_p; - args->coarse_partition = coarse_partition; - args->next = next; - args->partition = partition; - args->n = n_particle; - args->n_subsort = n_subsort; - args->vl = vl; - args->vh = vh; - args->n_voxel = n_voxel; - - if( n_subsort!=1 ) { - - // Do the coarse count - EXEC_PIPELINES( coarse_count, args, 0 ); - WAIT_PIPELINES(); - - // Convert the 
coarse count into a coarse partitioning - sum = 0; - for( subsort=0; subsortp = aux_p; - args->aux_p = p; - subsort_pipeline( args, 0, 1 ); - CLEAR( partition, vl ); - for( i=vh+1; iP AROUND AND DO MORE - // MALLOCS PER STEP (I.E. HEAP FRAGMENTATION), COULD AVOID THIS - // COPY. - COPY( p, aux_p, n_particle ); - - } -} - -#if 0 // In-place, single threaded legacy version - -void -sort_p( species_t * sp ) { - if( !sp ) ERROR(( "Bad args" )); sp->last_sorted = sp->g->step; particle_t * ALIGNED(128) p = sp->p; - //const int32_t * RESTRICT ALIGNED(128) sfc = g->sfc; + const int np = sp->np; const int nc = sp->g->nv; - const int nc1 = nc+1; + const int nc1 = nc + 1; + int * RESTRICT ALIGNED(128) partition = sp->partition; static int * RESTRICT ALIGNED(128) next = NULL; + static int max_nc1 = 0; int i, j; - if( np==0 ) return; // Do not need to sort + // Do not need to sort. + if ( np == 0 ) + return; - // Allocate the sorting intermediate - // Making this into a static is done to avoid heap shredding + // Allocate the sorting intermediate. Making this into a static is done to + // avoid heap shredding. - if( max_nc1sort_out_of_place ) { + // Convert the count to a partitioning and save a copy in next. + j = 0; + for( i = 0; i < nc1; i++ ) + { + partition[i] = j; + j += next[i]; + next[i] = partition[i]; + } - // Throw down the particle array in order + if ( sp->sort_out_of_place ) + { + // Throw down the particle array in order. - particle_t * ALIGNED(128) new_p; - const particle_t * RESTRICT ALIGNED(32) in_p; - /**/ particle_t * RESTRICT ALIGNED(32) out_p; + /**/ particle_t * ALIGNED(128) new_p; + const particle_t * RESTRICT ALIGNED( 32) in_p; + /**/ particle_t * RESTRICT ALIGNED( 32) out_p; MALLOC_ALIGNED( new_p, sp->max_np, 128 ); in_p = sp->p; out_p = new_p; - //for( i=0; ip ); + sp->p = new_p; + } - } else { + else + { + // Run sort cycles until the list is sorted. - // Run sort cycles until the list is sorted + particle_t save_p; + particle_t * ALIGNED(32) src; + particle_t * ALIGNED(32) dest; - particle_t save_p, * ALIGNED(32) src, * ALIGNED(32) dest; + i = 0; + while( i < nc ) + { + if ( next[i] >= partition[i+1] ) + { + i++; + } - i=0; - while( i=partition[i+1] ) i++; - else { + else + { src = &p[ next[i] ]; - for(;;) { - //dest = &p[ next[ sfc[ src->i ] ]++ ]; + + for( ; ; ) + { dest = &p[ next[ src->i ]++ ]; - if( src==dest ) break; + + if ( src == dest ) break; + save_p = *dest; *dest = *src; *src = save_p; } } - } } } +//----------------------------------------------------------------------------// +// This is the new thread parallel version of the particle sort. +//----------------------------------------------------------------------------// + +#else + +//----------------------------------------------------------------------------// +// Top level function to select and call the proper sort_p function using the +// desired particle sort abstraction. Currently, the only abstraction +// available is the pipeline abstraction. +//----------------------------------------------------------------------------// + +void +sort_p( species_t * sp ) +{ + if ( !sp ) + { + ERROR( ( "Bad args" ) ); + } + + // Conditionally execute this when more abstractions are available. 
+ sort_p_pipeline( sp ); +} + #endif diff --git a/src/species_advance/standard/uncenter_p.cc b/src/species_advance/standard/uncenter_p.cc index 8364a20a..b664b01e 100644 --- a/src/species_advance/standard/uncenter_p.cc +++ b/src/species_advance/standard/uncenter_p.cc @@ -1,166 +1,18 @@ #define IN_spa -#define HAS_V4_PIPELINE -#include "spa_private.h" -void -uncenter_p_pipeline( center_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - const interpolator_t * ALIGNED(128) f0 = args->f0; - - particle_t * ALIGNED(32) p; - const interpolator_t * ALIGNED(16) f; - - const float qdt_2mc = -args->qdt_2mc; // For backward half advance - const float qdt_4mc = -0.5*args->qdt_2mc; // For backward half rotate - const float one = 1.; - const float one_third = 1./3.; - const float two_fifteenths = 2./15.; - - float dx, dy, dz, ux, uy, uz; - float hax, hay, haz, cbx, cby, cbz; - float v0, v1, v2, v3, v4; +#include "../species_advance.h" - int first, ii, n; - - // Determine which particles this pipeline processes - - DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, first, n ); - p = args->p0 + first; - - // Process particles for this pipeline - - for(;n;n--,p++) { - dx = p->dx; // Load position - dy = p->dy; - dz = p->dz; - ii = p->i; - f = f0 + ii; // Interpolate E - hax = qdt_2mc*( ( f->ex + dy*f->dexdy ) + - dz*( f->dexdz + dy*f->d2exdydz ) ); - hay = qdt_2mc*( ( f->ey + dz*f->deydz ) + - dx*( f->deydx + dz*f->d2eydzdx ) ); - haz = qdt_2mc*( ( f->ez + dx*f->dezdx ) + - dy*( f->dezdy + dx*f->d2ezdxdy ) ); - cbx = f->cbx + dx*f->dcbxdx; // Interpolate B - cby = f->cby + dy*f->dcbydy; - cbz = f->cbz + dz*f->dcbzdz; - ux = p->ux; // Load momentum - uy = p->uy; - uz = p->uz; - v0 = qdt_4mc/(float)sqrt(one + (ux*ux + (uy*uy + uz*uz))); - /**/ // Boris - scalars - v1 = cbx*cbx + (cby*cby + cbz*cbz); - v2 = (v0*v0)*v1; - v3 = v0*(one+v2*(one_third+v2*two_fifteenths)); - v4 = v3/(one+v1*(v3*v3)); - v4 += v4; - v0 = ux + v3*( uy*cbz - uz*cby ); // Boris - uprime - v1 = uy + v3*( uz*cbx - ux*cbz ); - v2 = uz + v3*( ux*cby - uy*cbx ); - ux += v4*( v1*cbz - v2*cby ); // Boris - rotation - uy += v4*( v2*cbx - v0*cbz ); - uz += v4*( v0*cby - v1*cbx ); - ux += hax; // Half advance E - uy += hay; - uz += haz; - p->ux = ux; // Store momentum - p->uy = uy; - p->uz = uz; - } -} - -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) - -using namespace v4; +//----------------------------------------------------------------------------// +// Top level function to select and call particle uncenter function using the +// desired particle center abstraction. Currently, the only abstraction +// available is the pipeline abstraction. 
+//----------------------------------------------------------------------------// void -uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) { - const interpolator_t * ALIGNED(128) f0 = args->f0; - - particle_t * ALIGNED(128) p; - const float * ALIGNED(16) vp0; - const float * ALIGNED(16) vp1; - const float * ALIGNED(16) vp2; - const float * ALIGNED(16) vp3; - - const v4float qdt_2mc( -args->qdt_2mc); // For backward half advance - const v4float qdt_4mc(-0.5*args->qdt_2mc); // For backward half Boris rotate - const v4float one(1.); - const v4float one_third(1./3.); - const v4float two_fifteenths(2./15.); - - v4float dx, dy, dz, ux, uy, uz, q; - v4float hax, hay, haz, cbx, cby, cbz; - v4float v0, v1, v2, v3, v4, v5; - v4int ii; - - int first, nq; - - // Determine which particle quads this pipeline processes - - DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, first, nq ); - p = args->p0 + first; - nq >>= 2; - - // Process the particle quads for this pipeline - - for( ; nq; nq--, p+=4 ) { - load_4x4_tr(&p[0].dx,&p[1].dx,&p[2].dx,&p[3].dx,dx,dy,dz,ii); - - // Interpolate fields - vp0 = (const float * ALIGNED(16))(f0 + ii(0)); - vp1 = (const float * ALIGNED(16))(f0 + ii(1)); - vp2 = (const float * ALIGNED(16))(f0 + ii(2)); - vp3 = (const float * ALIGNED(16))(f0 + ii(3)); - load_4x4_tr(vp0, vp1, vp2, vp3, hax,v0,v1,v2); hax = qdt_2mc*fma( fma( dy, v2, v1 ), dz, fma( dy, v0, hax ) ); - load_4x4_tr(vp0+4,vp1+4,vp2+4,vp3+4,hay,v3,v4,v5); hay = qdt_2mc*fma( fma( dz, v5, v4 ), dx, fma( dz, v3, hay ) ); - load_4x4_tr(vp0+8,vp1+8,vp2+8,vp3+8,haz,v0,v1,v2); haz = qdt_2mc*fma( fma( dx, v2, v1 ), dy, fma( dx, v0, haz ) ); - load_4x4_tr(vp0+12,vp1+12,vp2+12,vp3+12,cbx,v3,cby,v4); cbx = fma( v3, dx, cbx ); - /**/ cby = fma( v4, dy, cby ); - load_4x2_tr(vp0+16,vp1+16,vp2+16,vp3+16,cbz,v5); cbz = fma( v5, dz, cbz ); - - // Update momentum - load_4x4_tr(&p[0].ux,&p[1].ux,&p[2].ux,&p[3].ux,ux,uy,uz,q); - /**/ // Could use load_4x3_tr - v0 = qdt_4mc*rsqrt( one + fma( ux,ux, fma( uy,uy, uz*uz ) ) ); - v1 = fma( cbx,cbx, fma( cby,cby, cbz*cbz ) ); - v2 = (v0*v0)*v1; - v3 = v0*fma( v2, fma( v2, two_fifteenths, one_third ), one ); - v4 = v3*rcp( fma( v3*v3, v1, one ) ); v4 += v4; - v0 = fma( fms( uy,cbz, uz*cby ), v3, ux ); - v1 = fma( fms( uz,cbx, ux*cbz ), v3, uy ); - v2 = fma( fms( ux,cby, uy*cbx ), v3, uz ); - ux = fma( fms( v1,cbz, v2*cby ), v4, ux ); - uy = fma( fms( v2,cbx, v0*cbz ), v4, uy ); - uz = fma( fms( v0,cby, v1*cbx ), v4, uz ); - ux += hax; - uy += hay; - uz += haz; - store_4x4_tr(ux,uy,uz,q,&p[0].ux,&p[1].ux,&p[2].ux,&p[3].ux); - /**/ // Could use store_4x3_tr - } -} - -#endif - -void -uncenter_p( /**/ species_t * RESTRICT sp, - const interpolator_array_t * RESTRICT ia ) { - DECLARE_ALIGNED_ARRAY( center_p_pipeline_args_t, 128, args, 1 ); - - if( !sp || !ia || sp->g!=ia->g ) ERROR(( "Bad args" )); - - // Have the pipelines do the bulk of particles in quads and have the - // host do the final incomplete quad. - - args->p0 = sp->p; - args->f0 = ia->i; - args->qdt_2mc = (sp->q*sp->g->dt)/(2*sp->m*sp->g->cvac); - args->np = sp->np; - - EXEC_PIPELINES( uncenter_p, args, 0 ); - WAIT_PIPELINES(); +uncenter_p( species_t * RESTRICT sp, + const interpolator_array_t * RESTRICT ia ) +{ + // Once more options are available, this should be conditionally executed + // based on user choice. 
+ uncenter_p_pipeline( sp, ia ); } diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt new file mode 100644 index 00000000..c5b2c1e4 --- /dev/null +++ b/src/util/CMakeLists.txt @@ -0,0 +1,105 @@ +#-----------------------------------------------------------------------------~# +# Copyright (c) 2014 Los Alamos National Security, LLC +# All rights reserved. +#-----------------------------------------------------------------------------~# + +set(util_HEADERS + bitfield.h + checksum.h + swap.h + system.h + util.h + util_base.h + checkpt/checkpt.h + checkpt/checkpt_io.h + checkpt/checkpt_private.h + io/FileIO.h + io/FileIOData.h + io/FileUtils.h + io/P2PIOPolicy.h + io/P2PUtilsPolicy.h + io/StandardIOPolicy.h + io/StandardUtilsPolicy.h + mp/DMPPolicy.h + mp/mp.h + mp/MPWrapper.h + mp/RelayPolicy.h + pipelines/pipelines.h + pipelines/pipelines_openmp.h + pipelines/pipelines_pthreads.h + profile/profile.h + rng/drandn_table.h + rng/frandn_table.h + rng/rng.h + rng/rng_private.h + v4/v4.h + v4/v4_altivec.h + v4/v4_portable.h + v4/v4_sse.h + v4/v4_avx.h + v4/v4_avx2.h + v8/v8.h + v8/v8_avx.h + v8/v8_avx2.h + v8/v8_portable.h + v16/v16.h + v16/v16_avx512.h + v16/v16_portable.h + PARENT_SCOPE +) + +set(util_SOURCES + checkpt/checkpt.c + checkpt/checkpt_io.cc + mp/mp.cc + boot.c + util_base.c + pipelines/pipelines_serial.c + pipelines/pipelines_thread.c + pipelines/pipelines_helper.c + profile/profile.c + rng/drandn_table.c + rng/frandn_table.c + rng/rng.c + rng/rng_pool.c + PARENT_SCOPE +) + +if(ENABLE_UNIT_TESTS) + +# TODO: reenable unit tests +# if(USE_V4) +# cinch_add_unit(v4 +# SOURCES v4/test/v4.cc) +# endif(USE_V4) + +# if(USE_V8) +# cinch_add_unit(v8 +# SOURCES v8/test/v8.cc) +# endif(USE_V8) + +# if(USE_V16) +# cinch_add_unit(v16 +# SOURCES v16/test/v16.cc) +# endif(USE_V16) + +# cinch_add_unit(rng +# SOURCES rng/test/rng.cc +# LIBRARIES vpic dl +# POLICY VPIC +# THREADS 1) + +# cinch_add_unit(casename +# SOURCES testfile.cc +# LIBRARIES list +# INCLUDES list +# POLICY MPI +# THREADS 1 2 4 +# ) + +endif(ENABLE_UNIT_TESTS) + +#----------------------------------------------------------------------------~-# +# Formatting options for vim. +# vim: set tabstop=2 shiftwidth=2 expandtab : +#----------------------------------------------------------------------------~-# diff --git a/src/util/boot.c b/src/util/boot.c index 974bb21d..473fe5d5 100644 --- a/src/util/boot.c +++ b/src/util/boot.c @@ -24,13 +24,24 @@ boot_services( int * pargc, // FIXME: The thread utilities should take responsibility for // thread-core affinity instead of leaving this to chance. - serial.boot( pargc, pargv ); - thread.boot( pargc, pargv ); - // Boot up the communications layer - // See note above about thread-core-affinity - boot_mp( pargc, pargv ); +#if defined(VPIC_USE_PTHREADS) + #if defined(VPIC_SWAP_MPI_PTHREAD_INIT) + boot_mp( pargc, pargv ); // Boot communication layer first. + serial.boot( pargc, pargv ); + thread.boot( pargc, pargv ); + #else + serial.boot( pargc, pargv ); + thread.boot( pargc, pargv ); + boot_mp( pargc, pargv ); // Boot communication layer last. + #endif + +#elif defined(VPIC_USE_OPENMP) + boot_mp( pargc, pargv ); // Boot communication layer first. 
+ omp_helper.boot( pargc, pargv ); + +#endif // Set the boot_timestamp @@ -42,11 +53,26 @@ boot_services( int * pargc, // This operates in reverse order from boot_services void -halt_services( void ) { +halt_services( void ) +{ _boot_timestamp = 0; + +#if defined(VPIC_USE_PTHREADS) + #if defined(VPIC_SWAP_MPI_PTHREAD_INIT) + thread.halt(); + serial.halt(); + halt_mp(); + #else + halt_mp(); + thread.halt(); + serial.halt(); + #endif + +#elif defined(VPIC_USE_OPENMP) halt_mp(); - thread.halt(); - serial.halt(); + +#endif + halt_checkpt(); } diff --git a/src/util/checkpt/checkpt.c b/src/util/checkpt/checkpt.c index e60d7eac..7ed1d32a 100644 --- a/src/util/checkpt/checkpt.c +++ b/src/util/checkpt/checkpt.c @@ -165,11 +165,17 @@ object_ptr( size_t id ) { return node ? node->obj : NULL; } -void -register_object( void * obj, - checkpt_func_t checkpt_func, - restore_func_t restore_func, - reanimate_func_t reanimate_func ) { +// TODO: It's common to pass a ** in here, can we change the interface to just +// be a regular pointer? I think we should avoid void casts where possible.. +// (but I accept checkpoint could possibly be re-written) +void register_object( + void* obj, + checkpt_func_t checkpt_func, + restore_func_t restore_func, + reanimate_func_t reanimate_func +) +{ + registry_t * node, * prev; /* Check input args */ diff --git a/src/util/pipelines/pipelines.h b/src/util/pipelines/pipelines.h index 8b37ee47..f4259f70 100644 --- a/src/util/pipelines/pipelines.h +++ b/src/util/pipelines/pipelines.h @@ -1,126 +1,57 @@ #ifndef _pipelines_h_ #define _pipelines_h_ -#include "../util_base.h" - -enum { MAX_PIPELINE = 64 }; - -// A pipeline function takes a pointer to arguments for the pipeline -// and a integer which gives the rank of the pipeline and the total -// number of pipelines dispatched. - -typedef void -(*pipeline_func_t)( void * args, - int pipeline_rank, - int n_pipeline ); - -/////////////////////////////////////////////////////////////////////////////// +//----------------------------------------------------------------------------// +// Include some stuff that is common to both Pthreads and OpenMP. +//----------------------------------------------------------------------------// -#if defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) - - // Use thread dispatcher on the v4 pipeline - // Caller will do straggler cleanup with scalar pipeline - -# define N_PIPELINE thread.n_pipeline -# define EXEC_PIPELINES(name,args,str) \ - thread.dispatch( (pipeline_func_t)name##_pipeline_v4, \ - args, sizeof(*args), str ); \ - name##_pipeline( args+str*N_PIPELINE, N_PIPELINE, N_PIPELINE ) -# define WAIT_PIPELINES() thread.wait() - -# define PROTOTYPE_PIPELINE( name, args_t ) \ - void \ - name##_pipeline_v4( args_t * args, \ - int pipeline_rank, \ - int n_pipeline ); \ - \ - void \ - name##_pipeline( args_t * args, \ - int pipeline_rank, \ - int n_pipeline ) - -# define PAD_STRUCT( sz ) - -#else - - // Use thread dispatcher on the scalar pipeline - // Caller will do straggler cleanup with scalar pipeline - -# define N_PIPELINE thread.n_pipeline -# define EXEC_PIPELINES(name,args,str) \ - thread.dispatch( (pipeline_func_t)name##_pipeline, \ - args, sizeof(*args), str ); \ - name##_pipeline( args+str*N_PIPELINE, N_PIPELINE, N_PIPELINE ) -# define WAIT_PIPELINES() thread.wait() +#include "../util_base.h" -# define PROTOTYPE_PIPELINE( name, args_t ) \ - void \ - name##_pipeline( args_t * args, \ - int pipeline_rank, \ - int n_pipeline ) +enum { MAX_PIPELINE = 272 }; +// Is this even related to pipelines. 
Maybe this should be in util_base.h. # define PAD_STRUCT( sz ) -#endif - -/////////////////////////////////////////////////////////////////////////////// +//----------------------------------------------------------------------------// +// Make sure that pipelines_pthreads.h and pipelines_openmp.h can only be +// included via this header file. +//----------------------------------------------------------------------------// -typedef struct pipeline_dispatcher { +#define THREAD_REROUTE - // n_pipelines indicates the number of pipelines currently running. - // Technically, this should be read only for users! +//----------------------------------------------------------------------------// +// If using Pthreads, include pipelines_pthreads.h. +//----------------------------------------------------------------------------// - int n_pipeline; +#if defined(VPIC_USE_PTHREADS) - // boot creates the number of pipelines requested (in the command - // line args). Generally, this is number of cores on a node if - // using symmetric multiprocessing or the number of pipeline - // processors if using heterogeneous multiprocessing. +#include "pipelines_pthreads.h" - void - (*boot)( int * pargc, - char *** pargv ); +//----------------------------------------------------------------------------// +// If using OpenMP, include pipelines_openmp.h. +//----------------------------------------------------------------------------// - // halt destroys all the resources used by the dispatcher created - // in boot. +#elif defined(VPIC_USE_OPENMP) - void (*halt)( void ); +#include "pipelines_openmp.h" - // dispatch begins executing the given pipeline function on all the - // pipelines. - // - // pipeline is the pipeline function to execute on the pipelines. - // - // args is an array of arguments to pass to each pipeline. - // - // sz gives the byte size of an element of the argument - // array. - // - // str gives the element stride between elements of the argument - // array. Pass 0 if you want all pipelines to get the same - // arguments. - // - // If the pipeline functions do not take arguments, use NULL for - // args and 0 for sz and str - - void - (*dispatch)( pipeline_func_t pipeline, - void * args, - int sz, - int str ); +//----------------------------------------------------------------------------// +// I wonder if VPIC will actually run without a threading model. +//----------------------------------------------------------------------------// - // wait waits for the previous dispatch to complete. +#else - void - (*wait)( void ); +// Need to figure out how to handle this case. -} pipeline_dispatcher_t; +#error "VPIC_USE_OPENMP or VPIC_USE_PTHREADS must be specified" -BEGIN_C_DECLS +#endif -extern pipeline_dispatcher_t serial; // For debugging purposes -extern pipeline_dispatcher_t thread; +//----------------------------------------------------------------------------// +// Make sure that pipelines_pthreads.h and pipelines_openmp.h can only be +// included via this header file. 
+//----------------------------------------------------------------------------// -END_C_DECLS +#undef THREAD_REROUTE #endif // _pipelines_h_ diff --git a/src/util/pipelines/pipelines_exec.h b/src/util/pipelines/pipelines_exec.h new file mode 100644 index 00000000..58736032 --- /dev/null +++ b/src/util/pipelines/pipelines_exec.h @@ -0,0 +1,55 @@ +#ifndef _pipelines_exec_h_ +#define _pipelines_exec_h_ + +//----------------------------------------------------------------------------// +// Include some stuff that is common to both Pthreads and OpenMP. +//----------------------------------------------------------------------------// + +#include "pipelines.h" + +#include "../v4/v4.h" +#include "../v8/v8.h" +#include "../v16/v16.h" + +//----------------------------------------------------------------------------// +// Make sure that pipelines_exec_pth.h and pipelines_exec_omp.h can only be +// included via this header file. +//----------------------------------------------------------------------------// + +#define THREAD_REROUTE + +//----------------------------------------------------------------------------// +// If using Pthreads, include pipelines_exec_pth.h. +//----------------------------------------------------------------------------// + +#if defined(VPIC_USE_PTHREADS) + +#include "pipelines_exec_pth.h" + +//----------------------------------------------------------------------------// +// If using OpenMP, include pipelines_exec_omp.h. +//----------------------------------------------------------------------------// + +#elif defined(VPIC_USE_OPENMP) + +#include "pipelines_exec_omp.h" + +//----------------------------------------------------------------------------// +// I wonder if VPIC will actually run without a threading model. +//----------------------------------------------------------------------------// + +#else + +// Need to figure out how to handle this case. + +#error "VPIC_USE_OPENMP or VPIC_USE_PTHREADS must be specified" + +#endif + +//----------------------------------------------------------------------------// +// Undefine local macros. +//----------------------------------------------------------------------------// + +#undef THREAD_REROUTE + +#endif // _pipelines_exec_h_ diff --git a/src/util/pipelines/pipelines_exec_omp.h b/src/util/pipelines/pipelines_exec_omp.h new file mode 100644 index 00000000..bbd24a87 --- /dev/null +++ b/src/util/pipelines/pipelines_exec_omp.h @@ -0,0 +1,97 @@ +#ifndef _pipelines_exec_omp_h_ +#define _pipelines_exec_omp_h_ + +#ifndef THREAD_REROUTE +#error "Do not include pipelines_exec_omp.h directly; use pipelines_exec.h" +#endif + +//----------------------------------------------------------------------------// +// Generic macros that are used for all cases of vector acceleration as well +// as the standard case that does not use vector acceleration. +//----------------------------------------------------------------------------// + +// TODO: this could be removed as VPIC (elsewhere) knows how to stringify from a +// macro +#define TOSTRING( a ) #a //convert pragma directives to string + +#define WAIT_PIPELINES() _Pragma( TOSTRING( omp barrier ) ) + +//----------------------------------------------------------------------------// +// Macro defines to support v16 simd vector acceleration. Uses thread +// dispatcher on the v16 pipeline and the caller does straggler cleanup with +// the scalar pipeline. 
+//----------------------------------------------------------------------------// + +#if defined(V16_ACCELERATION) && defined(HAS_V16_PIPELINE) + +# define EXEC_PIPELINES(name, args, str) \ + _Pragma( TOSTRING( omp parallel num_threads(N_PIPELINE) shared(args) ) ) \ + { \ + _Pragma( TOSTRING( omp for ) ) \ + for( int id = 0; id < N_PIPELINE; id++ ) \ + { \ + name##_pipeline_v16( args+id*sizeof(*args)*str, id, N_PIPELINE ); \ + } \ + } \ + name##_pipeline_scalar( args+str*N_PIPELINE, N_PIPELINE, N_PIPELINE ); + +//----------------------------------------------------------------------------// +// Macro defines to support v8 simd vector acceleration. Uses thread +// dispatcher on the v8 pipeline and the caller does straggler cleanup with +// the scalar pipeline. +//----------------------------------------------------------------------------// + +#elif defined(V8_ACCELERATION) && defined(HAS_V8_PIPELINE) + +# define EXEC_PIPELINES(name, args, str) \ + _Pragma( TOSTRING( omp parallel num_threads(N_PIPELINE) shared(args) ) ) \ + { \ + _Pragma( TOSTRING( omp for ) ) \ + for( int id = 0; id < N_PIPELINE; id++ ) \ + { \ + name##_pipeline_v8( args+id*sizeof(*args)*str, id, N_PIPELINE ); \ + } \ + } \ + name##_pipeline_scalar( args+str*N_PIPELINE, N_PIPELINE, N_PIPELINE ); + +//----------------------------------------------------------------------------// +// Macro defines to support v4 simd vector acceleration. Uses thread +// dispatcher on the v4 pipeline and the caller does straggler cleanup with +// the scalar pipeline. +//----------------------------------------------------------------------------// + +#elif defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) + +# define EXEC_PIPELINES(name, args, str) \ + _Pragma( TOSTRING( omp parallel num_threads(N_PIPELINE) shared(args) ) ) \ + { \ + _Pragma( TOSTRING( omp for ) ) \ + for( int id = 0; id < N_PIPELINE; id++ ) \ + { \ + name##_pipeline_v4( args+id*sizeof(*args)*str, id, N_PIPELINE ); \ + } \ + } \ + name##_pipeline_scalar( args+str*N_PIPELINE, N_PIPELINE, N_PIPELINE ); + +//----------------------------------------------------------------------------// +// Macro defines to support the standard implementation which does not use +// explicit simd vectorization. Uses thread dispatcher on the scalar pipeline +// and the caller does straggler cleanup with the scalar pipeline. +//----------------------------------------------------------------------------// + +#else + +# define EXEC_PIPELINES(name, args, str) \ + _Pragma( TOSTRING( omp parallel num_threads(N_PIPELINE) shared(args) ) ) \ + { \ + _Pragma( TOSTRING( omp for ) ) \ + for( int id = 0; id < N_PIPELINE; id++ ) \ + { \ + name##_pipeline_scalar( args+id*sizeof(*args)*str, id, N_PIPELINE ); \ + } \ + } \ + name##_pipeline_scalar( args+str*N_PIPELINE, N_PIPELINE, N_PIPELINE ); + +#endif + +#endif // _pipelines_exec_omp_h_ diff --git a/src/util/pipelines/pipelines_exec_pth.h b/src/util/pipelines/pipelines_exec_pth.h new file mode 100644 index 00000000..ecfa0910 --- /dev/null +++ b/src/util/pipelines/pipelines_exec_pth.h @@ -0,0 +1,69 @@ +#ifndef _pipelines_exec_pth_h_ +#define _pipelines_exec_pth_h_ + +#ifndef THREAD_REROUTE +#error "Do not include pipelines_exec_pth.h directly; use pipelines_exec.h." +#endif + +//----------------------------------------------------------------------------// +// Generic macros that are used for all cases of vector acceleration as well +// as the standard case that does not use vector acceleration. 
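// A minimal, self-contained sketch of the control flow the OpenMP
// EXEC_PIPELINES macros above expand to. Every name here (demo_args_t,
// demo_pipeline_scalar, DEMO_N_PIPELINE) is invented for the illustration;
// this is not VPIC's macro, only the same shape: N_PIPELINE OpenMP threads
// each run the kernel on their own argument block, then the calling thread
// runs the kernel once more as rank N_PIPELINE on the trailing block (the
// "straggler cleanup"). Compile with e.g. g++ -fopenmp.

#include <cstdio>
#include <vector>

namespace {

constexpr int DEMO_N_PIPELINE = 4;   // stands in for omp_helper.n_pipeline

struct demo_args_t
{
  double * out;                      // slice owned by one pipeline
  int      n;                        // elements in that slice
};

// Stands in for a name##_pipeline_scalar kernel.
void demo_pipeline_scalar( demo_args_t * args, int rank, int n_pipeline )
{
  for( int i = 0; i < args->n; i++ ) args->out[i] *= 2.0;
  std::printf( "pipeline %d of %d scaled %d elements\n", rank, n_pipeline, args->n );
}

} // namespace

int main()
{
  std::vector<double> data( 18, 1.0 );

  // One argument block per worker pipeline plus a trailing block for the
  // caller, i.e. str = 1 in the macro's terms.
  demo_args_t args[DEMO_N_PIPELINE + 1];
  const int n     = static_cast<int>( data.size() );
  const int chunk = n / ( DEMO_N_PIPELINE + 1 );

  for( int id = 0; id <= DEMO_N_PIPELINE; id++ )
  {
    args[id].out = data.data() + id * chunk;
    args[id].n   = ( id == DEMO_N_PIPELINE ) ? n - DEMO_N_PIPELINE * chunk : chunk;
  }

  // EXEC_PIPELINES(demo, args, 1) expands to roughly this:
  #pragma omp parallel num_threads(DEMO_N_PIPELINE) shared(args)
  {
    #pragma omp for
    for( int id = 0; id < DEMO_N_PIPELINE; id++ )
      demo_pipeline_scalar( &args[id], id, DEMO_N_PIPELINE );
  }

  // ... and the caller mops up the stragglers as rank N_PIPELINE.
  demo_pipeline_scalar( &args[DEMO_N_PIPELINE], DEMO_N_PIPELINE, DEMO_N_PIPELINE );

  // WAIT_PIPELINES() is an omp barrier in the real macros; here the implicit
  // barrier at the end of the parallel region already plays that role.
  return 0;
}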
+//----------------------------------------------------------------------------// + +# define WAIT_PIPELINES() thread.wait() + +//----------------------------------------------------------------------------// +// Macro defines to support v16 simd vector acceleration. Uses thread +// dispatcher on the v16 pipeline and the caller does straggler cleanup with +// the scalar pipeline. +//----------------------------------------------------------------------------// + +#if defined(V16_ACCELERATION) && defined(HAS_V16_PIPELINE) + +# define EXEC_PIPELINES(name,args,str) \ + thread.dispatch( (pipeline_func_t)name##_pipeline_v16, \ + args, sizeof(*args), str ); \ + name##_pipeline_scalar( args+str*N_PIPELINE, N_PIPELINE, N_PIPELINE ) + +//----------------------------------------------------------------------------// +// Macro defines to support v8 simd vector acceleration. Uses thread +// dispatcher on the v8 pipeline and the caller does straggler cleanup with +// the scalar pipeline. +//----------------------------------------------------------------------------// + +#elif defined(V8_ACCELERATION) && defined(HAS_V8_PIPELINE) + +# define EXEC_PIPELINES(name,args,str) \ + thread.dispatch( (pipeline_func_t)name##_pipeline_v8, \ + args, sizeof(*args), str ); \ + name##_pipeline_scalar( args+str*N_PIPELINE, N_PIPELINE, N_PIPELINE ) + +//----------------------------------------------------------------------------// +// Macro defines to support v4 simd vector acceleration. Uses thread +// dispatcher on the v4 pipeline and the caller does straggler cleanup with +// the scalar pipeline. +//----------------------------------------------------------------------------// + +#elif defined(V4_ACCELERATION) && defined(HAS_V4_PIPELINE) + +# define EXEC_PIPELINES(name,args,str) \ + thread.dispatch( (pipeline_func_t)name##_pipeline_v4, \ + args, sizeof(*args), str ); \ + name##_pipeline_scalar( args+str*N_PIPELINE, N_PIPELINE, N_PIPELINE ) + +//----------------------------------------------------------------------------// +// Macro defines to support the standard implementation which does not use +// explicit simd vectorization. Uses thread dispatcher on the scalar pipeline +// and the caller does straggler cleanup with the scalar pipeline. 
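// The thread.dispatch() calls above hand the dispatcher a pipeline function,
// an argument array, the byte size of one array element, and an element
// stride. The sketch below, with invented names (toy_dispatch, demo_args_t,
// demo_pipeline_scalar), shows how those arguments determine which block each
// pipeline rank sees; it runs the ranks serially on the calling thread (in
// the spirit of the debugging 'serial' dispatcher) rather than on a real
// thread pool, so only the indexing arithmetic should be read as faithful.

#include <cstdio>

namespace {

// Mirrors the pipeline_func_t signature used by the dispatcher.
typedef void (*pipeline_func_t)( void * args, int pipeline_rank, int n_pipeline );

// Rank r is given the argument block at byte offset r * sz * str; str = 0
// therefore makes every rank see the same block.
void toy_dispatch( pipeline_func_t func, void * args, int sz, int str, int n_pipeline )
{
  for( int rank = 0; rank < n_pipeline; rank++ )
    func( static_cast<char *>( args ) + rank * sz * str, rank, n_pipeline );
}

struct demo_args_t { int value; };

// Takes void * exactly as the dispatcher will call it.
void demo_pipeline_scalar( void * vargs, int rank, int n_pipeline )
{
  demo_args_t * args = static_cast<demo_args_t *>( vargs );
  std::printf( "rank %d of %d sees value %d\n", rank, n_pipeline, args->value );
}

} // namespace

int main()
{
  const int n_pipeline = 4;

  // n_pipeline worker blocks plus one trailing block for the caller.
  demo_args_t args[n_pipeline + 1];
  for( int i = 0; i <= n_pipeline; i++ ) args[i].value = 100 + i;

  // Mirrors EXEC_PIPELINES(demo, args, 1): ranks 0..3 get args[0..3] ...
  toy_dispatch( demo_pipeline_scalar, args, sizeof(*args), 1, n_pipeline );

  // ... and the caller handles the trailing block as rank n_pipeline.
  demo_pipeline_scalar( args + n_pipeline, n_pipeline, n_pipeline );

  return 0;
}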
+//----------------------------------------------------------------------------// + +#else + +# define EXEC_PIPELINES(name,args,str) \ + thread.dispatch( (pipeline_func_t)name##_pipeline_scalar, \ + args, sizeof(*args), str ); \ + name##_pipeline_scalar( args+str*N_PIPELINE, N_PIPELINE, N_PIPELINE ) + +#endif + +#endif // _pipelines_exec_pth_h_ diff --git a/src/util/pipelines/pipelines_helper.c b/src/util/pipelines/pipelines_helper.c new file mode 100644 index 00000000..60c6d808 --- /dev/null +++ b/src/util/pipelines/pipelines_helper.c @@ -0,0 +1,48 @@ +#include "pipelines.h" + +#if defined(VPIC_USE_OPENMP) + +#include <omp.h> + +// omp_boot - mimics thread_boot - see 'pipelines_thread.c' + +static void +omp_boot( int * pargc, + char *** pargv ) +{ + int n_pipeline; + + if ( omp_helper.n_pipeline != 0 ) ERROR(( "OMP container has already booted" )); + + n_pipeline = strip_cmdline_int( pargc, pargv, "--tpp", 1 ); + + omp_set_num_threads( n_pipeline ); + + if ( n_pipeline < 1 || n_pipeline > MAX_PIPELINE ) + ERROR(( "Invalid number of pipelines requested (%i)", n_pipeline )); + + //initialize dispatch_to_host + int dispatch_to_host = strip_cmdline_int( pargc, pargv, "--dispatch_to_host", 1 ); + + //assign our helper values + omp_helper.n_pipeline = n_pipeline; + omp_helper.dispatch_to_host = dispatch_to_host; +} + +/* +static void +omp_util( int * pargc, + char *** pargv ) +{ + const char * f_dump = strip_cmdline_string( pargc, pargv, "--dump-fields", "none.txt" ); + const char * e_dump = strip_cmdline_string( pargc, pargv, "--dump-energies", "none.txt" ); +} +*/ + +omp_container_t omp_helper = { + 0, + 0, + omp_boot +}; + +#endif diff --git a/src/util/pipelines/pipelines_openmp.h b/src/util/pipelines/pipelines_openmp.h new file mode 100644 index 00000000..d76429d2 --- /dev/null +++ b/src/util/pipelines/pipelines_openmp.h @@ -0,0 +1,45 @@ +#ifndef _pipelines_openmp_h_ +#define _pipelines_openmp_h_ + +#ifndef THREAD_REROUTE +#error "Do not include pipelines_openmp.h directly; use pipelines.h" +#endif + +// #include "../util_base.h" + +#include <omp.h> + +//----------------------------------------------------------------------------// +// Generic macros that are used for all cases of vector acceleration as well +// as the standard case that does not use vector acceleration. +//----------------------------------------------------------------------------// + +#define N_PIPELINE omp_helper.n_pipeline + +//----------------------------------------------------------------------------// +// A container object to mimic 'serial' and 'thread' objects. Currently just +// useful for avoiding global var 'n_pipeline'. +//----------------------------------------------------------------------------// + +typedef struct omp_container +{ + int n_pipeline; + int dispatch_to_host; + + //const char * f_dump; + //const char * e_dump; + + //boot gets the number of pipelines from the cmd line + //and passes it on to the EXEC_PIPELINES macro eventually + void + (*boot)( int * pargc, + char *** pargv ); +} omp_container_t; + +BEGIN_C_DECLS + +extern omp_container_t omp_helper; + +END_C_DECLS + +#endif // _pipelines_openmp_h_ diff --git a/src/util/pipelines/pipelines_pthreads.h b/src/util/pipelines/pipelines_pthreads.h new file mode 100644 index 00000000..9d90936e --- /dev/null +++ b/src/util/pipelines/pipelines_pthreads.h @@ -0,0 +1,86 @@ +#ifndef _pipelines_pthreads_h_ +#define _pipelines_pthreads_h_ + +#ifndef THREAD_REROUTE +#error "Do not include pipelines_pthreads.h directly; use pipelines.h."
+#endif + +//----------------------------------------------------------------------------// +// A pipeline function takes a pointer to arguments for the pipeline and an +// integer which gives the rank of the pipeline and the total number of +// pipelines dispatched. +//----------------------------------------------------------------------------// + +typedef void +(*pipeline_func_t)( void * args, + int pipeline_rank, + int n_pipeline ); + +//----------------------------------------------------------------------------// +// Generic macros that are used for all cases of vector acceleration as well +// as the standard case that does not use vector acceleration. +//----------------------------------------------------------------------------// + +# define N_PIPELINE thread.n_pipeline + +//////////////////////////////////////////////////////////////////////////////// + +typedef struct pipeline_dispatcher +{ + // n_pipelines indicates the number of pipelines currently running. + // Technically, this should be read only for users! + + int n_pipeline; + + // boot creates the number of pipelines requested (in the command + // line args). Generally, this is number of cores on a node if + // using symmetric multiprocessing or the number of pipeline + // processors if using heterogeneous multiprocessing. + + void + (*boot)( int * pargc, + char *** pargv ); + + // halt destroys all the resources used by the dispatcher created + // in boot. + + void (*halt)( void ); + + // dispatch begins executing the given pipeline function on all the + // pipelines. + // + // pipeline is the pipeline function to execute on the pipelines. + // + // args is an array of arguments to pass to each pipeline. + // + // sz gives the byte size of an element of the argument + // array. + // + // str gives the element stride between elements of the argument + // array. Pass 0 if you want all pipelines to get the same + // arguments. + // + // If the pipeline functions do not take arguments, use NULL for + // args and 0 for sz and str + + void + (*dispatch)( pipeline_func_t pipeline, + void * args, + int sz, + int str ); + + // wait waits for the previous dispatch to complete. + + void + (*wait)( void ); + +} pipeline_dispatcher_t; + +BEGIN_C_DECLS + +extern pipeline_dispatcher_t serial; // For debugging purposes +extern pipeline_dispatcher_t thread; + +END_C_DECLS + +#endif // _pipelines_pthreads_h_ diff --git a/src/util/pipelines/pipelines_serial.c b/src/util/pipelines/pipelines_serial.c index 34638748..a2a5f2cd 100644 --- a/src/util/pipelines/pipelines_serial.c +++ b/src/util/pipelines/pipelines_serial.c @@ -1,5 +1,7 @@ #include "pipelines.h" // For util_base.h, datatypes and prototypes +#if defined(VPIC_USE_PTHREADS) + static int Busy = 0; /*****************************************************************************/ @@ -71,3 +73,4 @@ pipeline_dispatcher_t serial = { serial_wait // wait }; +#endif diff --git a/src/util/pipelines/pipelines_thread.c b/src/util/pipelines/pipelines_thread.c index 0edf20be..875b42dc 100644 --- a/src/util/pipelines/pipelines_thread.c +++ b/src/util/pipelines/pipelines_thread.c @@ -37,10 +37,12 @@ // (?) Signal blocking in pipelines // (?) Timeouts in thread_halt, thread_boot (spin wait) -#include - #include "pipelines.h" +#if defined(VPIC_USE_PTHREADS) + +#include + static void * pipeline_mgr( void *_id ); @@ -175,11 +177,7 @@ thread_boot( int * pargc, // Attempt to detect if any old-style arguments exist, and if so warn the user. 
detect_old_style_arguments(pargc, pargv); -# if defined(CELL_PPU_BUILD) - n_pipeline = strip_cmdline_int( pargc, pargv, "--tpp", 2 ); -# else n_pipeline = strip_cmdline_int( pargc, pargv, "--tpp", 1 ); -# endif Dispatch_To_Host = strip_cmdline_int( pargc, pargv, "--dispatch_to_host", 1 ); if( n_pipeline<1 || n_pipeline>MAX_PIPELINE ) @@ -577,3 +575,4 @@ pipeline_dispatcher_t thread = { thread_wait // wait }; +#endif diff --git a/src/util/profile/profile.c b/src/util/profile/profile.c index 4dfdf7d0..743c2d34 100644 --- a/src/util/profile/profile.c +++ b/src/util/profile/profile.c @@ -20,15 +20,27 @@ update_profile( int dump ) { sum_total += p->t_total; } - if( dump ) { + if( dump ) + { + #if defined(VPIC_PRINT_MORE_DIGITS) + log_printf( "\n" // 8901234567890123456 | xxx% x.xxxe+xx x.xe+xx x.xxxe+xx | xxx% x.xxxe+xx x.xe+xx x.xxxe+xx + " | Since Last Update | Since Last Restore\n" + " Operation | Pct Time Count Per | Pct Time Count Per\n" + "---------------------------+----------------------------------+----------------------------------\n" ); + #else log_printf( "\n" // 8901234567890123456 | xxx% x.xe+xx x.xe+xx x.xe+xx | xxx% x.xe+xx x.xe+xx x.xe+xx " | Since Last Update | Since Last Restore\n" " Operation | Pct Time Count Per | Pct Time Count Per\n" "---------------------------+------------------------------+------------------------------\n" ); + #endif for( p=profile_internal_use_only; p->name; p++ ) { if( p->n==0 && p->n_total==0 ) continue; + #if defined(VPIC_PRINT_MORE_DIGITS) + log_printf( "%26.26s | % 3d%% %.3e %.1e %.3e | % 3d%% %.3e %.1e %.3e\n", + #else log_printf( "%26.26s | % 3d%% %.1e %.1e %.1e | % 3d%% %.1e %.1e %.1e\n", + #endif p->name, (int)( 100.*p->t/sum + 0.5 ), p->t, (double)p->n, diff --git a/src/util/util.h b/src/util/util.h index 9ac9d68c..bfd97fe3 100644 --- a/src/util/util.h +++ b/src/util/util.h @@ -4,7 +4,9 @@ // Expose all public functionality in util. The below includes bring // in util_base.h and other low level includes automatically. -#include "v4/v4.h" // Must be first (FIXME: REALLY?) +#include "v4/v4.h" +#include "v8/v8.h" +#include "v16/v16.h" #include "checkpt/checkpt.h" #include "mp/mp.h" #include "rng/rng.h" diff --git a/src/util/util_base.c b/src/util/util_base.c index 97b5827b..a9aa6129 100644 --- a/src/util/util_base.c +++ b/src/util/util_base.c @@ -97,7 +97,7 @@ void detect_old_style_arguments(int* pargc, char *** pargv) prefix_keys[0] = "-tpp"; prefix_keys[1] = "-restore"; - match_keys[0] = "restart"; + match_keys[0] = "-restart"; char* arg = (*pargv)[i]; @@ -140,16 +140,16 @@ void detect_old_style_arguments(int* pargc, char *** pargv) } } - // Check for "=" (equals) + // Check for "=" (equals) // TODO: Add an option to make this an error or a warning - if (string_contains(arg, "=")) - { + if (string_contains(arg, "=")) + { const int NUM_WARN_REPEAT = 5; for (j = 0; j < NUM_WARN_REPEAT; j++) { WARNING(( "Arguments contains '=', is this intentional? (use a space)" )); } - } + } } diff --git a/src/util/util_base.h b/src/util/util_base.h index d7bc0c03..bc9db329 100644 --- a/src/util/util_base.h +++ b/src/util/util_base.h @@ -123,8 +123,8 @@ typedef struct collective collective_t; // scope of context in which it is declared. Note: This macro is // really two statements (there is no way to bundle the macro into one // semantic statement linguistically without defeating the purpose of -// the macro). Thus, it is not as robust as it could be. Thus, sure -// any usage of this macro occurs in contexts where two back-to-back +// the macro). 
Thus, it is not as robust as it could be. Thus, make +// sure any usage of this macro occurs in contexts where two back-to-back // statments in the same context would be in the same scope. That is: // // if(...) { DECLARE_ALIGNED_ARRAY(type,align,name,count); ... } // OKAY! diff --git a/src/util/v16/test/v16.cc b/src/util/v16/test/v16.cc new file mode 100644 index 00000000..df3516cb --- /dev/null +++ b/src/util/v16/test/v16.cc @@ -0,0 +1,955 @@ +#include + +#include "../../util.h" +#include "../v16.h" + +#include + +using namespace v16; + +TEST(v16, test_transpose) +{ + v16int a00( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ); + v16int a01( 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ); + v16int a02( 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47 ); + v16int a03( 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 ); + v16int a04( 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 ); + v16int a05( 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95 ); + v16int a06( 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111 ); + v16int a07( 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 ); + v16int a08( 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143 ); + v16int a09( 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159 ); + v16int a10( 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175 ); + v16int a11( 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191 ); + v16int a12( 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207 ); + v16int a13( 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223 ); + v16int a14( 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239 ); + v16int a15( 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255 ); + + transpose( a00, a01, a02, a03, a04, a05, a06, a07, + a08, a09, a10, a11, a12, a13, a14, a15 ); + + ASSERT_FALSE( any( a00 != v16int( 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 ) ) || + any( a01 != v16int( 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241 ) ) || + any( a02 != v16int( 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242 ) ) || + any( a03 != v16int( 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243 ) ) || + any( a04 != v16int( 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244 ) ) || + any( a05 != v16int( 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245 ) ) || + any( a06 != v16int( 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246 ) ) || + any( a07 != v16int( 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247 ) ) || + any( a08 != v16int( 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248 ) ) || + any( a09 != v16int( 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249 ) ) || + any( a10 != v16int( 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250 ) ) || + any( a11 != v16int( 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251 ) ) || + any( a12 != v16int( 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252 ) ) || + any( a13 != v16int( 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253 ) ) || + any( a14 != v16int( 
14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254 ) ) || + any( a15 != v16int( 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255 ) ) ); +} + +TEST(v16, test_load_16x16_tr) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 256 ); + + v16int a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15; + + int i; + + for( i=0; i < 256; i++ ) mem[i] = i; + + load_16x16_tr( mem, mem+ 16, mem+ 32, mem+ 48, + mem+ 64, mem+ 80, mem+ 96, mem+112, + mem+128, mem+144, mem+160, mem+176, + mem+192, mem+208, mem+224, mem+240, + a00, a01, a02, a03, a04, a05, a06, a07, + a08, a09, a10, a11, a12, a13, a14, a15 ); + + for( i=0; i < 256; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a00 != v16int( 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 ) ) || + any( a01 != v16int( 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241 ) ) || + any( a02 != v16int( 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242 ) ) || + any( a03 != v16int( 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243 ) ) || + any( a04 != v16int( 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244 ) ) || + any( a05 != v16int( 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245 ) ) || + any( a06 != v16int( 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246 ) ) || + any( a07 != v16int( 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247 ) ) || + any( a08 != v16int( 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248 ) ) || + any( a09 != v16int( 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249 ) ) || + any( a10 != v16int( 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250 ) ) || + any( a11 != v16int( 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251 ) ) || + any( a12 != v16int( 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252 ) ) || + any( a13 != v16int( 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253 ) ) || + any( a14 != v16int( 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254 ) ) || + any( a15 != v16int( 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255 ) ) || + i != 256 ); +} + +TEST(v16, test_load_16x8_tr_p) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 128 ); + + v16int a00, a01, a02, a03, a04, a05, a06, a07; + + int i; + + for( i=0; i < 128; i++ ) mem[i] = i; + + load_16x8_tr_p( mem, mem+ 16, mem+ 32, mem+ 48, + mem+ 64, mem+ 80, mem+ 96, mem+112, + a00, a01, a02, a03, a04, a05, a06, a07 ); + + for( i=0; i < 128; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a00 != v16int( 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120 ) ) || + any( a01 != v16int( 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121 ) ) || + any( a02 != v16int( 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122 ) ) || + any( a03 != v16int( 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123 ) ) || + any( a04 != v16int( 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124 ) ) || + any( a05 != v16int( 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125 ) ) || + any( a06 != v16int( 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126 ) ) || + any( a07 != v16int( 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127 ) ) || + i != 128 ); +} + +TEST(v16, 
test_load_16x16_tr_p) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 256 ); + + v16int a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15; + + int i; + + for( i=0; i < 256; i++ ) mem[i] = i; + + load_16x16_tr_p( mem, mem+ 16, mem+ 32, mem+ 48, + mem+ 64, mem+ 80, mem+ 96, mem+112, + mem+128, mem+144, mem+160, mem+176, + mem+192, mem+208, mem+224, mem+240, + a00, a01, a02, a03, a04, a05, a06, a07, + a08, a09, a10, a11, a12, a13, a14, a15 ); + + for( i=0; i < 256; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a00 != v16int( 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120 ) ) || + any( a01 != v16int( 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121 ) ) || + any( a02 != v16int( 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122 ) ) || + any( a03 != v16int( 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123 ) ) || + any( a04 != v16int( 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124 ) ) || + any( a05 != v16int( 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125 ) ) || + any( a06 != v16int( 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126 ) ) || + any( a07 != v16int( 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127 ) ) || + any( a08 != v16int( 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248 ) ) || + any( a09 != v16int( 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249 ) ) || + any( a10 != v16int( 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250 ) ) || + any( a11 != v16int( 131, 139, 147, 155, 163, 171, 179, 187, 195, 203, 211, 219, 227, 235, 243, 251 ) ) || + any( a12 != v16int( 132, 140, 148, 156, 164, 172, 180, 188, 196, 204, 212, 220, 228, 236, 244, 252 ) ) || + any( a13 != v16int( 133, 141, 149, 157, 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253 ) ) || + any( a14 != v16int( 134, 142, 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254 ) ) || + any( a15 != v16int( 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, 255 ) ) || + i != 256 ); +} + +TEST(v16, test_store_16x16_tr) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 256 ); + + v16int a00( 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 ); + v16int a01( 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241 ); + v16int a02( 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242 ); + v16int a03( 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243 ); + v16int a04( 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244 ); + v16int a05( 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245 ); + v16int a06( 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246 ); + v16int a07( 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247 ); + v16int a08( 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248 ); + v16int a09( 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249 ); + v16int a10( 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250 ); + v16int a11( 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251 ); + v16int a12( 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252 ); + v16int a13( 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253 ); + v16int a14( 14, 30, 46, 62, 78, 94, 110, 126, 
142, 158, 174, 190, 206, 222, 238, 254 ); + v16int a15( 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255 ); + + int i; + + for( i=0; i < 256; i++ ) mem[i] = 0; + + store_16x16_tr( a00, a01, a02, a03, a04, a05, a06, a07, + a08, a09, a10, a11, a12, a13, a14, a15, + mem, mem+ 16, mem+ 32, mem+ 48, + mem+ 64, mem+ 80, mem+ 96, mem+112, + mem+128, mem+144, mem+160, mem+176, + mem+192, mem+208, mem+224, mem+240 ); + + for( i=0; i < 256; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a00 != v16int( 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240 ) ) || + any( a01 != v16int( 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241 ) ) || + any( a02 != v16int( 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242 ) ) || + any( a03 != v16int( 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243 ) ) || + any( a04 != v16int( 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244 ) ) || + any( a05 != v16int( 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245 ) ) || + any( a06 != v16int( 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246 ) ) || + any( a07 != v16int( 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247 ) ) || + any( a08 != v16int( 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248 ) ) || + any( a09 != v16int( 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249 ) ) || + any( a10 != v16int( 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250 ) ) || + any( a11 != v16int( 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251 ) ) || + any( a12 != v16int( 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252 ) ) || + any( a13 != v16int( 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253 ) ) || + any( a14 != v16int( 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254 ) ) || + any( a15 != v16int( 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255 ) ) || + i != 256 ); +} + +TEST(v16, test_store_16x8_tr_p) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 128 ); + + v16int a00( 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120 ); + v16int a01( 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121 ); + v16int a02( 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122 ); + v16int a03( 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123 ); + v16int a04( 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124 ); + v16int a05( 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125 ); + v16int a06( 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126 ); + v16int a07( 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127 ); + + int i; + + for( i=0; i < 128; i++ ) mem[i] = 0; + + store_16x8_tr_p( a00, a01, a02, a03, a04, a05, a06, a07, + mem, mem+ 16, mem+ 32, mem+ 48, + mem+ 64, mem+ 80, mem+ 96, mem+112 ); + + for( i=0; i < 128; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a00 != v16int( 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120 ) ) || + any( a01 != v16int( 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121 ) ) || + any( a02 != v16int( 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122 ) ) || + any( a03 != v16int( 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123 ) ) || + any( a04 != v16int( 4, 12, 20, 28, 
36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124 ) ) || + any( a05 != v16int( 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125 ) ) || + any( a06 != v16int( 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126 ) ) || + any( a07 != v16int( 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127 ) ) || + i != 128 ); +} + +TEST(v16, test_store_16x16_tr_p) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 256 ); + + v16int a00( 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120 ); + v16int a01( 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121 ); + v16int a02( 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122 ); + v16int a03( 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123 ); + v16int a04( 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124 ); + v16int a05( 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125 ); + v16int a06( 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126 ); + v16int a07( 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127 ); + v16int a08( 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248 ); + v16int a09( 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249 ); + v16int a10( 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250 ); + v16int a11( 131, 139, 147, 155, 163, 171, 179, 187, 195, 203, 211, 219, 227, 235, 243, 251 ); + v16int a12( 132, 140, 148, 156, 164, 172, 180, 188, 196, 204, 212, 220, 228, 236, 244, 252 ); + v16int a13( 133, 141, 149, 157, 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253 ); + v16int a14( 134, 142, 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254 ); + v16int a15( 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, 255 ); + + int i; + + for( i=0; i < 256; i++ ) mem[i] = 0; + + store_16x16_tr_p( a00, a01, a02, a03, a04, a05, a06, a07, + a08, a09, a10, a11, a12, a13, a14, a15, + mem, mem+ 16, mem+ 32, mem+ 48, + mem+ 64, mem+ 80, mem+ 96, mem+112, + mem+128, mem+144, mem+160, mem+176, + mem+192, mem+208, mem+224, mem+240 ); + + for( i=0; i < 256; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a00 != v16int( 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120 ) ) || + any( a01 != v16int( 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121 ) ) || + any( a02 != v16int( 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122 ) ) || + any( a03 != v16int( 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123 ) ) || + any( a04 != v16int( 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124 ) ) || + any( a05 != v16int( 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125 ) ) || + any( a06 != v16int( 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126 ) ) || + any( a07 != v16int( 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127 ) ) || + any( a08 != v16int( 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248 ) ) || + any( a09 != v16int( 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249 ) ) || + any( a10 != v16int( 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250 ) ) || + any( a11 != v16int( 131, 139, 147, 155, 163, 171, 179, 187, 195, 203, 211, 219, 227, 235, 243, 251 ) ) || + any( a12 != v16int( 132, 140, 148, 156, 164, 172, 180, 188, 196, 204, 212, 220, 228, 236, 244, 252 ) ) || + any( a13 
!= v16int( 133, 141, 149, 157, 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253 ) ) || + any( a14 != v16int( 134, 142, 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254 ) ) || + any( a15 != v16int( 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, 255 ) ) || + i != 256 ); +} + +//----------------------------------------------------------------------------// + +#if 0 + +TEST(v16, test_any) +{ + v16int a; + int i; + + for( i=0; i < 256; i++ ) + { + a[0] = i&1, a[1] = i&2, a[2] = i&4, a[3] = i&8; + a[4] = i&16, a[5] = i&32, a[6] = i&64, a[7] = i&128; + + ASSERT_FALSE( ( i>0 && !any(a) ) || ( i==0 && any(a) ) ); + } +} + +TEST(v16, test_all) +{ + v16int a; + int i; + for( i=0; i < 256; i++ ) + { + a[0] = i&1, a[1] = i&2, a[2] = i&4, a[3] = i&8; + a[4] = i&16, a[5] = i&32, a[6] = i&64, a[7] = i&128; + + ASSERT_FALSE( ( i < 255 && all(a) ) || ( i == 255 && !all(a) ) ); + } +} + +TEST(v16, test_splat) +{ + v16int a( 1, 2, 3, 4, 5, 6, 7, 8); + v16int b( 9,10,11,12,13,14,15,16); + v16int c(17,18,19,20,21,22,23,24); + v16int d(25,26,27,28,29,30,31,32); + v16int e(33,34,35,36,37,38,39,40); + v16int f(41,42,43,44,45,46,47,48); + v16int g(49,50,51,52,53,54,55,56); + v16int h(57,58,59,60,61,62,63,64); + v16int i(65,66,67,68,69,70,71,72); + + b = splat<0>(a); + c = splat<1>(a); + d = splat<2>(a); + e = splat<3>(a); + f = splat<4>(a); + g = splat<5>(a); + h = splat<6>(a); + i = splat<7>(a); + + ASSERT_FALSE( any(a!=v16int(1,2,3,4,5,6,7,8)) || + any(b!=v16int(1,1,1,1,1,1,1,1)) || + any(c!=v16int(2,2,2,2,2,2,2,2)) || + any(d!=v16int(3,3,3,3,3,3,3,3)) || + any(e!=v16int(4,4,4,4,4,4,4,4)) || + any(f!=v16int(5,5,5,5,5,5,5,5)) || + any(g!=v16int(6,6,6,6,6,6,6,6)) || + any(h!=v16int(7,7,7,7,7,7,7,7)) || + any(i!=v16int(8,8,8,8,8,8,8,8)) ); +} + +TEST(v16, test_shuffle) +{ + v16int a( 0, 1, 2, 3, 4, 5, 6, 7); + v16int b( 9,10,11,12,13,14,15,16); + v16int c(17,18,19,20,21,22,23,24); + v16int d(25,26,27,28,29,30,31,32); + v16int e(33,34,35,36,37,38,39,40); + v16int f(41,42,43,44,45,46,47,48); + v16int g(49,50,51,52,53,54,55,56); + v16int h(57,58,59,60,61,62,63,64); + v16int i(65,66,67,68,69,70,71,72); + + b = shuffle<1,2,3,4,5,6,7,0>(a); + c = shuffle<2,3,4,5,6,7,0,1>(a); + d = shuffle<3,4,5,6,7,0,1,2>(a); + e = shuffle<4,5,6,7,0,1,2,3>(a); + f = shuffle<5,6,7,0,1,2,3,4>(a); + g = shuffle<6,7,0,1,2,3,4,5>(a); + h = shuffle<7,0,1,2,3,4,5,6>(a); + i = shuffle<7,6,5,4,3,2,1,0>(a); + + ASSERT_FALSE( any(a!=v16int(0,1,2,3,4,5,6,7)) || + any(b!=v16int(1,2,3,4,5,6,7,0)) || + any(c!=v16int(2,3,4,5,6,7,0,1)) || + any(d!=v16int(3,4,5,6,7,0,1,2)) || + any(e!=v16int(4,5,6,7,0,1,2,3)) || + any(f!=v16int(5,6,7,0,1,2,3,4)) || + any(g!=v16int(6,7,0,1,2,3,4,5)) || + any(h!=v16int(7,0,1,2,3,4,5,6)) || + any(i!=v16int(7,6,5,4,3,2,1,0)) ); +} + +// #endif + +TEST(v16, test_swap) +{ + v16int a( 1, 2, 3, 4, 5, 6, 7, 8); + v16int b( 9,10,11,12,13,14,15,16); + + swap(a,b); + + ASSERT_FALSE( any( a != v16int( 9,10,11,12,13,14,15,16) ) || + any( b != v16int( 1, 2, 3, 4, 5, 6, 7, 8) ) ); +} + +TEST(v16, test_load_8x1) +{ + DECLARE_ALIGNED_ARRAY( int, 32, mem, 32 ); + + v16int a0(1,0,0,0,0,0,0,0); + v16int a1(0,0,0,0,0,0,0,0); + v16int a2(0,0,0,0,0,0,0,0); + v16int a3(0,0,0,0,0,0,0,0); + + int i; + + for( i=0; i < 32; i++ ) mem[i] = i; + + load_8x1( mem, a0 ); + load_8x1( mem+8, a1 ); + load_8x1( mem+16, a2 ); + load_8x1( mem+24, a3 ); + + for( i=0; i < 32; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v16int( 0, 1, 2, 3, 4, 5, 6, 7) ) || + any( a1 != v16int( 8, 
9,10,11,12,13,14,15) ) || + any( a2 != v16int(16,17,18,19,20,21,22,23) ) || + any( a3 != v16int(24,25,26,27,28,29,30,31) ) || + i != 32 ); +} + +TEST(v16, test_store_8x1) +{ + DECLARE_ALIGNED_ARRAY( int, 32, mem, 32 ); + + v16int a0( 0, 1, 2, 3, 4, 5, 6, 7); + v16int a1( 8, 9,10,11,12,13,14,15); + v16int a2(16,17,18,19,20,21,22,23); + v16int a3(24,25,26,27,28,29,30,31); + + int i; + + for( i=0; i < 32; i++ ) mem[i] = 0; + + store_8x1( a0, mem ); + store_8x1( a1, mem + 8 ); + store_8x1( a2, mem + 16 ); + store_8x1( a3, mem + 24 ); + + for( i=0; i < 32; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v16int( 0, 1, 2, 3, 4, 5, 6, 7) ) || + any( a1 != v16int( 8, 9,10,11,12,13,14,15) ) || + any( a2 != v16int(16,17,18,19,20,21,22,23) ) || + any( a3 != v16int(24,25,26,27,28,29,30,31) ) || + i != 32 ); +} + +TEST(v16, test_stream_8x1) +{ + DECLARE_ALIGNED_ARRAY( int, 32, mem, 32 ); + + v16int a0( 0, 1, 2, 3, 4, 5, 6, 7); + v16int a1( 8, 9,10,11,12,13,14,15); + v16int a2(16,17,18,19,20,21,22,23); + v16int a3(24,25,26,27,28,29,30,31); + + int i; + + for( i=0; i < 32; i++ ) mem[i] = 0; + + stream_8x1( a0, mem ); + stream_8x1( a1, mem + 8 ); + stream_8x1( a2, mem + 16 ); + stream_8x1( a3, mem + 24 ); + + for( i=0; i < 32; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v16int( 0, 1, 2, 3, 4, 5, 6, 7) ) || + any( a1 != v16int( 8, 9,10,11,12,13,14,15) ) || + any( a2 != v16int(16,17,18,19,20,21,22,23) ) || + any( a3 != v16int(24,25,26,27,28,29,30,31) ) || + i != 32 ); +} + +TEST(v16, test_clear_8x1) +{ + v16float vmem[4]; float * mem = (float *)vmem; + + int i; + + for(i=0; i < 32; i++) mem[i] = i; + + clear_8x1( mem + 16 ); + clear_8x1( mem + 24 ); + + for(i=16; i < 32; i++) mem[i] += i; + + for(i=0; i < 32; i++) if( mem[i] != i ) break; + + ASSERT_FALSE( i != 32 ); +} + +TEST(v16, test_copy_8x1) +{ + DECLARE_ALIGNED_ARRAY( int, 32, mem, 32 ); + + int i; + + for( i=0; i < 16; i++ ) mem[i] = i; + + copy_8x1( mem + 16, mem ); + copy_8x1( mem + 24, mem + 8 ); + + for( i=16; i < 32; i++ ) mem[i] += 16; + + for( i=0; i < 32; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( i != 32 ); +} + +TEST(v16, test_swap_8x1) +{ + DECLARE_ALIGNED_ARRAY( int, 32, mem, 32 ); + + int i; + + for( i=0; i < 16; i++ ) mem[i] = i; + + copy_8x1( mem + 24, mem ); + copy_8x1( mem + 16, mem + 8 ); + + for( i=16; i < 32; i++ ) mem[i] += 16; + + swap_8x1( mem + 16, mem + 24 ); + + for( i=0; i < 32; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( i != 32 ); +} + +TEST(v16, test_load_8x1_tr) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v16int a0, a1, a2, a3, a4, a5, a6, a7; + + int i; + + for( i=0; i < 64; i++ ) mem[i] = i; + + load_8x1_tr( mem, mem+8, mem+16, mem+24, mem+32, mem+40, mem+48, mem+56, a0 ); + load_8x1_tr( mem+1, mem+9, mem+17, mem+25, mem+33, mem+41, mem+49, mem+57, a1 ); + load_8x1_tr( mem+2, mem+10, mem+18, mem+26, mem+34, mem+42, mem+50, mem+58, a2 ); + load_8x1_tr( mem+3, mem+11, mem+19, mem+27, mem+35, mem+43, mem+51, mem+59, a3 ); + load_8x1_tr( mem+4, mem+12, mem+20, mem+28, mem+36, mem+44, mem+52, mem+60, a4 ); + load_8x1_tr( mem+5, mem+13, mem+21, mem+29, mem+37, mem+45, mem+53, mem+61, a5 ); + load_8x1_tr( mem+6, mem+14, mem+22, mem+30, mem+38, mem+46, mem+54, mem+62, a6 ); + load_8x1_tr( mem+7, mem+15, mem+23, mem+31, mem+39, mem+47, mem+55, mem+63, a7 ); + + for( i=0; i < 64; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v16int(0, 8,16,24,32,40,48,56) ) || + any( a1 != v16int(1, 9,17,25,33,41,49,57) ) || + any( a2 != v16int(2,10,18,26,34,42,50,58) ) || + any( a3 
!= v16int(3,11,19,27,35,43,51,59) ) || + any( a4 != v16int(4,12,20,28,36,44,52,60) ) || + any( a5 != v16int(5,13,21,29,37,45,53,61) ) || + any( a6 != v16int(6,14,22,30,38,46,54,62) ) || + any( a7 != v16int(7,15,23,31,39,47,55,63) ) || + i != 64 ); +} + +TEST(v16, test_load_8x2_tr) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v16int a0, a1, a2, a3, a4, a5, a6, a7; + + int i; + + for( i=0; i < 64; i++ ) mem[i] = i; + + load_8x2_tr( mem, mem+8, mem+16, mem+24, mem+32, mem+40, mem+48, mem+56, a0, a1 ); + load_8x2_tr( mem+2, mem+10, mem+18, mem+26, mem+34, mem+42, mem+50, mem+58, a2, a3 ); + load_8x2_tr( mem+4, mem+12, mem+20, mem+28, mem+36, mem+44, mem+52, mem+60, a4, a5 ); + load_8x2_tr( mem+6, mem+14, mem+22, mem+30, mem+38, mem+46, mem+54, mem+62, a6, a7 ); + + for( i=0; i < 64; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v16int(0, 8,16,24,32,40,48,56) ) || + any( a1 != v16int(1, 9,17,25,33,41,49,57) ) || + any( a2 != v16int(2,10,18,26,34,42,50,58) ) || + any( a3 != v16int(3,11,19,27,35,43,51,59) ) || + any( a4 != v16int(4,12,20,28,36,44,52,60) ) || + any( a5 != v16int(5,13,21,29,37,45,53,61) ) || + any( a6 != v16int(6,14,22,30,38,46,54,62) ) || + any( a7 != v16int(7,15,23,31,39,47,55,63) ) || + i != 64 ); +} + +TEST(v16, test_load_8x2_tr_a) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v16int a0, a1, a2, a3, a4, a5, a6, a7; + + int i; + + for( i=0; i < 64; i++ ) mem[i] = i; + + load_8x2_tr( mem, mem+2, mem+4, mem+6, mem+8, mem+10, mem+12, mem+14, a0, a1 ); + load_8x2_tr( mem+16, mem+18, mem+20, mem+22, mem+24, mem+26, mem+28, mem+30, a2, a3 ); + load_8x2_tr( mem+32, mem+34, mem+36, mem+38, mem+40, mem+42, mem+44, mem+46, a4, a5 ); + load_8x2_tr( mem+48, mem+50, mem+52, mem+54, mem+56, mem+58, mem+60, mem+62, a6, a7 ); + + for( i=0; i < 64; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v16int( 0, 2, 4, 6, 8,10,12,14) ) || + any( a1 != v16int( 1, 3, 5, 7, 9,11,13,15) ) || + any( a2 != v16int(16,18,20,22,24,26,28,30) ) || + any( a3 != v16int(17,19,21,23,25,27,29,31) ) || + any( a4 != v16int(32,34,36,38,40,42,44,46) ) || + any( a5 != v16int(33,35,37,39,41,43,45,47) ) || + any( a6 != v16int(48,50,52,54,56,58,60,62) ) || + any( a7 != v16int(49,51,53,55,57,59,61,63) ) || + i != 64 ); +} + +TEST(v16, test_load_8x3_tr) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v16int a0, a1, a2; + + int i; + + for( i=0; i < 64; i++ ) mem[i] = i; + + load_8x3_tr( mem, mem+8, mem+16, mem+24, mem+32, mem+40, mem+48, mem+56, + a0, a1, a2 ); + + for( i=0; i < 64; i++ ) if( mem[i]!=i ) break; + + ASSERT_FALSE( any( a0 != v16int(0, 8,16,24,32,40,48,56) ) || + any( a1 != v16int(1, 9,17,25,33,41,49,57) ) || + any( a2 != v16int(2,10,18,26,34,42,50,58) ) || + i != 64 ); +} + +TEST(v16, test_load_8x4_tr) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v16int a0, a1, a2, a3; + + int i; + + for( i=0; i < 64; i++ ) mem[i] = i; + + load_8x4_tr( mem, mem+8, mem+16, mem+24, mem+32, mem+40, mem+48, mem+56, + a0, a1, a2, a3 ); + + for( i=0; i < 64; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v16int(0, 8,16,24,32,40,48,56) ) || + any( a1 != v16int(1, 9,17,25,33,41,49,57) ) || + any( a2 != v16int(2,10,18,26,34,42,50,58) ) || + any( a3 != v16int(3,11,19,27,35,43,51,59) ) || + i != 64 ); +} + +TEST(v16, test_load_8x4_tr_a) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v16int a0, a1, a2, a3; + + int i; + + for( i=0; i < 64; i++ ) mem[i] = i; + + load_8x4_tr( mem, mem+4, mem+8, mem+12, mem+16, mem+20, mem+24, mem+28, + a0, a1, a2, a3 ); + + for( i=0; i < 64; i++ 
) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v16int(0,4, 8,12,16,20,24,28) ) || + any( a1 != v16int(1,5, 9,13,17,21,25,29) ) || + any( a2 != v16int(2,6,10,14,18,22,26,30) ) || + any( a3 != v16int(3,7,11,15,19,23,27,31) ) || + i != 64 ); +} + +TEST(v16, test_store_8x1_tr) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v16int a0(0, 8,16,24,32,40,48,56); + v16int a1(1, 9,17,25,33,41,49,57); + v16int a2(2,10,18,26,34,42,50,58); + v16int a3(3,11,19,27,35,43,51,59); + v16int a4(4,12,20,28,36,44,52,60); + v16int a5(5,13,21,29,37,45,53,61); + v16int a6(6,14,22,30,38,46,54,62); + v16int a7(7,15,23,31,39,47,55,63); + + int i; + + for( i=0; i < 64; i++ ) mem[i] = 0; + + store_8x1_tr( a0, mem, mem+ 8, mem+16, mem+24, mem+32, mem+40, mem+48, mem+56 ); + store_8x1_tr( a1, mem+1, mem+ 9, mem+17, mem+25, mem+33, mem+41, mem+49, mem+57 ); + store_8x1_tr( a2, mem+2, mem+10, mem+18, mem+26, mem+34, mem+42, mem+50, mem+58 ); + store_8x1_tr( a3, mem+3, mem+11, mem+19, mem+27, mem+35, mem+43, mem+51, mem+59 ); + store_8x1_tr( a4, mem+4, mem+12, mem+20, mem+28, mem+36, mem+44, mem+52, mem+60 ); + store_8x1_tr( a5, mem+5, mem+13, mem+21, mem+29, mem+37, mem+45, mem+53, mem+61 ); + store_8x1_tr( a6, mem+6, mem+14, mem+22, mem+30, mem+38, mem+46, mem+54, mem+62 ); + store_8x1_tr( a7, mem+7, mem+15, mem+23, mem+31, mem+39, mem+47, mem+55, mem+63 ); + + for( i=0; i < 64; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v16int(0, 8,16,24,32,40,48,56) ) || + any( a1 != v16int(1, 9,17,25,33,41,49,57) ) || + any( a2 != v16int(2,10,18,26,34,42,50,58) ) || + any( a3 != v16int(3,11,19,27,35,43,51,59) ) || + any( a4 != v16int(4,12,20,28,36,44,52,60) ) || + any( a5 != v16int(5,13,21,29,37,45,53,61) ) || + any( a6 != v16int(6,14,22,30,38,46,54,62) ) || + any( a7 != v16int(7,15,23,31,39,47,55,63) ) || + i != 64 ); +} + +TEST(v16, test_store_8x2_tr) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v16int a0(0, 8,16,24,32,40,48,56); + v16int a1(1, 9,17,25,33,41,49,57); + v16int a2(2,10,18,26,34,42,50,58); + v16int a3(3,11,19,27,35,43,51,59); + v16int a4(4,12,20,28,36,44,52,60); + v16int a5(5,13,21,29,37,45,53,61); + v16int a6(6,14,22,30,38,46,54,62); + v16int a7(7,15,23,31,39,47,55,63); + + int i; + + for( i=0; i < 64; i++ ) mem[i] = 0; + + store_8x2_tr( a0, a1, mem, mem+ 8, mem+16, mem+24, mem+32, mem+40, mem+48, mem+56 ); + store_8x2_tr( a2, a3, mem+2, mem+10, mem+18, mem+26, mem+34, mem+42, mem+50, mem+58 ); + store_8x2_tr( a4, a5, mem+4, mem+12, mem+20, mem+28, mem+36, mem+44, mem+52, mem+60 ); + store_8x2_tr( a6, a7, mem+6, mem+14, mem+22, mem+30, mem+38, mem+46, mem+54, mem+62 ); + + for( i=0; i < 64; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v16int(0, 8,16,24,32,40,48,56) ) || + any( a1 != v16int(1, 9,17,25,33,41,49,57) ) || + any( a2 != v16int(2,10,18,26,34,42,50,58) ) || + any( a3 != v16int(3,11,19,27,35,43,51,59) ) || + any( a4 != v16int(4,12,20,28,36,44,52,60) ) || + any( a5 != v16int(5,13,21,29,37,45,53,61) ) || + any( a6 != v16int(6,14,22,30,38,46,54,62) ) || + any( a7 != v16int(7,15,23,31,39,47,55,63) ) || + i != 64 ); +} + +TEST(v16, test_store_8x2_tr_a) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v16int a0( 0, 2, 4, 6, 8,10,12,14); + v16int a1( 1, 3, 5, 7, 9,11,13,15); + v16int a2(16,18,20,22,24,26,28,30); + v16int a3(17,19,21,23,25,27,29,31); + v16int a4(32,34,36,38,40,42,44,46); + v16int a5(33,35,37,39,41,43,45,47); + v16int a6(48,50,52,54,56,58,60,62); + v16int a7(49,51,53,55,57,59,61,63); + + int i; + + for( i=0; i < 64; i++ ) mem[i] = 0; + + 
store_8x2_tr( a0, a1, mem, mem+ 2, mem+ 4, mem+ 6, mem+ 8, mem+10, mem+12, mem+14 ); + store_8x2_tr( a2, a3, mem+16, mem+18, mem+20, mem+22, mem+24, mem+26, mem+28, mem+30 ); + store_8x2_tr( a4, a5, mem+32, mem+34, mem+36, mem+38, mem+40, mem+42, mem+44, mem+46 ); + store_8x2_tr( a6, a7, mem+48, mem+50, mem+52, mem+54, mem+56, mem+58, mem+60, mem+62 ); + + for( i=0; i < 64; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v16int( 0, 2, 4, 6, 8,10,12,14) ) || + any( a1 != v16int( 1, 3, 5, 7, 9,11,13,15) ) || + any( a2 != v16int(16,18,20,22,24,26,28,30) ) || + any( a3 != v16int(17,19,21,23,25,27,29,31) ) || + any( a4 != v16int(32,34,36,38,40,42,44,46) ) || + any( a5 != v16int(33,35,37,39,41,43,45,47) ) || + any( a6 != v16int(48,50,52,54,56,58,60,62) ) || + any( a7 != v16int(49,51,53,55,57,59,61,63) ) || + i != 64 ); +} + +TEST(v16, test_store_8x3_tr) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v16int a0(0, 8,16,24,32,40,48,56); + v16int a1(1, 9,17,25,33,41,49,57); + v16int a2(2,10,18,26,34,42,50,58); + v16int a3(3,11,19,27,35,43,51,59); + v16int a4(4,12,20,28,36,44,52,60); + v16int a5(5,13,21,29,37,45,53,61); + v16int a6(6,14,22,30,38,46,54,62); + v16int a7(7,15,23,31,39,47,55,63); + + int i, j; + + for( i=0; i < 64; i++ ) mem[i] = 0; + + store_8x3_tr( a0, a1, a2, + mem, mem+8, mem+16, mem+24, mem+32, mem+40, mem+48, mem+56 ); + + j = 0; + for( i=0; i < 8; i++ ) + { + if( ( i < 3 && mem[i] != i ) || + ( i >= 3 && mem[i] != 0 ) ) + break; + else + j++; + + if( ( i < 3 && mem[i+ 8] != i+ 8 ) || + ( i >= 3 && mem[i+ 8] != 0 ) ) + break; + else + j++; + + if( ( i < 3 && mem[i+16] != i+16 ) || + ( i >= 3 && mem[i+16] != 0 ) ) + break; + else + j++; + + if( ( i < 3 && mem[i+24] != i+24 ) || + ( i >= 3 && mem[i+24] != 0 ) ) + break; + else + j++; + + if( ( i < 3 && mem[i+32] != i+32 ) || + ( i >= 3 && mem[i+32] != 0 ) ) + break; + else + j++; + + if( ( i < 3 && mem[i+40] != i+40 ) || + ( i >= 3 && mem[i+40] != 0 ) ) + break; + else + j++; + + if( ( i < 3 && mem[i+48] != i+48 ) || + ( i >= 3 && mem[i+48] != 0 ) ) + break; + else + j++; + + if( ( i < 3 && mem[i+56] != i+56 ) || + ( i >= 3 && mem[i+56] != 0 ) ) + break; + else + j++; + } + + ASSERT_FALSE( any( a0 != v16int(0, 8,16,24,32,40,48,56) ) || + any( a1 != v16int(1, 9,17,25,33,41,49,57) ) || + any( a2 != v16int(2,10,18,26,34,42,50,58) ) || + any( a3 != v16int(3,11,19,27,35,43,51,59) ) || + any( a4 != v16int(4,12,20,28,36,44,52,60) ) || + any( a5 != v16int(5,13,21,29,37,45,53,61) ) || + any( a6 != v16int(6,14,22,30,38,46,54,62) ) || + any( a7 != v16int(7,15,23,31,39,47,55,63) ) || + j != 64 ); +} + +TEST(v16, test_store_8x4_tr) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v16int a0(0, 8,16,24,32,40,48,56); + v16int a1(1, 9,17,25,33,41,49,57); + v16int a2(2,10,18,26,34,42,50,58); + v16int a3(3,11,19,27,35,43,51,59); + v16int a4(4,12,20,28,36,44,52,60); + v16int a5(5,13,21,29,37,45,53,61); + v16int a6(6,14,22,30,38,46,54,62); + v16int a7(7,15,23,31,39,47,55,63); + + int i; + + for( i=0; i < 64; i++ ) mem[i] = 0; + + store_8x4_tr( a0, a1, a2, a3, + mem, mem+ 8, mem+16, mem+24, mem+32, mem+40, mem+48, mem+56 ); + + store_8x4_tr( a4, a5, a6, a7, + mem+4, mem+12, mem+20, mem+28, mem+36, mem+44, mem+52, mem+60 ); + + for( i=0; i < 64; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v16int(0, 8,16,24,32,40,48,56) ) || + any( a1 != v16int(1, 9,17,25,33,41,49,57) ) || + any( a2 != v16int(2,10,18,26,34,42,50,58) ) || + any( a3 != v16int(3,11,19,27,35,43,51,59) ) || + any( a4 != v16int(4,12,20,28,36,44,52,60) ) || + 
any( a5 != v16int(5,13,21,29,37,45,53,61) ) || + any( a6 != v16int(6,14,22,30,38,46,54,62) ) || + any( a7 != v16int(7,15,23,31,39,47,55,63) ) || + i != 64 ); +} + +TEST(v16, test_store_8x4_tr_a) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v16int a0( 0, 4, 8,12,16,20,24,28); + v16int a1( 1, 5, 9,13,17,21,25,29); + v16int a2( 2, 6,10,14,18,22,26,30); + v16int a3( 3, 7,11,15,19,23,27,31); + v16int a4(32,36,40,44,48,52,56,60); + v16int a5(33,37,41,45,49,53,57,61); + v16int a6(34,38,42,46,50,54,58,62); + v16int a7(35,39,43,47,51,55,59,63); + + int i; + + for( i=0; i < 64; i++ ) mem[i] = 0; + + store_8x4_tr( a0, a1, a2, a3, + mem, mem+ 4, mem+ 8, mem+12, mem+16, mem+20, mem+24, mem+28 ); + + store_8x4_tr( a4, a5, a6, a7, + mem+32, mem+36, mem+40, mem+44, mem+48, mem+52, mem+56, mem+60 ); + + for( i=0; i < 64; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v16int( 0, 4, 8,12,16,20,24,28) ) || + any( a1 != v16int( 1, 5, 9,13,17,21,25,29) ) || + any( a2 != v16int( 2, 6,10,14,18,22,26,30) ) || + any( a3 != v16int( 3, 7,11,15,19,23,27,31) ) || + any( a4 != v16int(32,36,40,44,48,52,56,60) ) || + any( a5 != v16int(33,37,41,45,49,53,57,61) ) || + any( a6 != v16int(34,38,42,46,50,54,58,62) ) || + any( a7 != v16int(35,39,43,47,51,55,59,63) ) || + i != 64 ); +} + +#endif diff --git a/src/util/v16/v16.h b/src/util/v16/v16.h new file mode 100644 index 00000000..0bf52264 --- /dev/null +++ b/src/util/v16/v16.h @@ -0,0 +1,14 @@ +#ifndef _v16_h_ +#define _v16_h_ +/* FIXME: STYLE */ +#define IN_v16_h +/* FIXME: SHOULDN'T THIS INCLUDE UTIL_BASE.H? */ +#ifdef __cplusplus +# if defined USE_V16_PORTABLE +# include "v16_portable.h" +# elif defined USE_V16_AVX512 +# include "v16_avx512.h" +# endif +#endif +#undef IN_v16_h +#endif // _v16_h_ diff --git a/src/util/v16/v16_avx512.h b/src/util/v16/v16_avx512.h new file mode 100644 index 00000000..b9331831 --- /dev/null +++ b/src/util/v16/v16_avx512.h @@ -0,0 +1,2867 @@ +#ifndef _v16_avx512_h_ +#define _v16_avx512_h_ + +#ifndef IN_v16_h +#error "Do not include v16_avx512.h directly; use v16.h" +#endif + +#define V16_ACCELERATION +#define V16_AVX512_ACCELERATION + +#include +#include + +#ifndef ALIGNED +#define ALIGNED(n) +#endif + +#define ALWAYS_INLINE __attribute__((always_inline)) + +namespace v16 +{ + class v16; + class v16int; + class v16float; + + //////////////// + // v16 base class + + class v16 + { + friend class v16int; + friend class v16float; + + // v16 miscellaneous friends + + friend inline int any( const v16 &a ) ALWAYS_INLINE; + friend inline int all( const v16 &a ) ALWAYS_INLINE; + + template + friend inline v16 splat( const v16 &a ) ALWAYS_INLINE; + + template + friend inline v16 shuffle( const v16 &a ) ALWAYS_INLINE; + + friend inline void swap( v16 &a, v16 &b ) ALWAYS_INLINE; + friend inline void transpose( v16 &a00, v16 &a01, v16 &a02, v16 &a03, + v16 &a04, v16 &a05, v16 &a06, v16 &a07, + v16 &a08, v16 &a09, v16 &a10, v16 &a11, + v16 &a12, v16 &a13, v16 &a14, v16 &a15 ) ALWAYS_INLINE; + + // v16int miscellaneous friends + + friend inline v16 czero( const v16int &c, const v16 &a ) ALWAYS_INLINE; + friend inline v16 notczero( const v16int &c, const v16 &a ) ALWAYS_INLINE; + friend inline v16 merge( const v16int &c, const v16 &a, const v16 &b ) ALWAYS_INLINE; + + // v16 memory manipulation friends + + friend inline void load_16x1( const void * ALIGNED(64) p, v16 &a ) ALWAYS_INLINE; + friend inline void store_16x1( const v16 &a, void * ALIGNED(64) p ) ALWAYS_INLINE; + friend inline void stream_16x1( const v16 &a, void * ALIGNED(64) p ) 
ALWAYS_INLINE; + friend inline void clear_16x1( void * ALIGNED(64) dst ) ALWAYS_INLINE; + friend inline void copy_16x1( void * ALIGNED(64) dst, + const void * ALIGNED(64) src ) ALWAYS_INLINE; + friend inline void swap_16x1( void * ALIGNED(64) a, void * ALIGNED(64) b ) ALWAYS_INLINE; + + // v16 transposed memory manipulation friends + // Note: Half aligned values are permissible in the 16x2_tr variants. + + friend inline void load_16x1_tr( const void *a00, const void *a01, + const void *a02, const void *a03, + const void *a04, const void *a05, + const void *a06, const void *a07, + const void *a08, const void *a09, + const void *a10, const void *a11, + const void *a12, const void *a13, + const void *a14, const void *a15, + v16 &a ) ALWAYS_INLINE; + friend inline void load_16x2_tr( const void * ALIGNED(8) a00, + const void * ALIGNED(8) a01, + const void * ALIGNED(8) a02, + const void * ALIGNED(8) a03, + const void * ALIGNED(8) a04, + const void * ALIGNED(8) a05, + const void * ALIGNED(8) a06, + const void * ALIGNED(8) a07, + const void * ALIGNED(8) a08, + const void * ALIGNED(8) a09, + const void * ALIGNED(8) a10, + const void * ALIGNED(8) a11, + const void * ALIGNED(8) a12, + const void * ALIGNED(8) a13, + const void * ALIGNED(8) a14, + const void * ALIGNED(8) a15, + v16 &a, v16 &b ) ALWAYS_INLINE; + friend inline void load_16x2_bc( const void * ALIGNED(8) a00, + v16 &a, v16 &b ) ALWAYS_INLINE; + friend inline void load_16x3_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c ) ALWAYS_INLINE; + friend inline void load_16x4_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c, v16 &d ) ALWAYS_INLINE; + friend inline void load_16x8_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c, v16 &d, + v16 &e, v16 &f, v16 &g, v16 &h ) ALWAYS_INLINE; + friend inline void load_16x16_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + 
const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07, + v16 &b08, v16 &b09, v16 &b10, v16 &b11, + v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) ALWAYS_INLINE; + friend inline void load_16x16_bc( const void * ALIGNED(64) a00, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07, + v16 &b08, v16 &b09, v16 &b10, v16 &b11, + v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) ALWAYS_INLINE; + friend inline void load_16x8_tr_p( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + v16 &a, v16 &b, v16 &c, v16 &d, + v16 &e, v16 &f, v16 &g, v16 &h ) ALWAYS_INLINE; + friend inline void load_16x16_tr_p( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07, + v16 &b08, v16 &b09, v16 &b10, v16 &b11, + v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) ALWAYS_INLINE; + + friend inline void store_16x1_tr( const v16 &a, + void *a00, void *a01, void *a02, void *a03, + void *a04, void *a05, void *a06, void *a07, + void *a08, void *a09, void *a10, void *a11, + void *a12, void *a13, void *a14, void *a15 ) ALWAYS_INLINE; + friend inline void store_16x2_tr( const v16 &a, const v16 &b, + void * ALIGNED(8) a00, + void * ALIGNED(8) a01, + void * ALIGNED(8) a02, + void * ALIGNED(8) a03, + void * ALIGNED(8) a04, + void * ALIGNED(8) a05, + void * ALIGNED(8) a06, + void * ALIGNED(8) a07, + void * ALIGNED(8) a08, + void * ALIGNED(8) a09, + void * ALIGNED(8) a10, + void * ALIGNED(8) a11, + void * ALIGNED(8) a12, + void * ALIGNED(8) a13, + void * ALIGNED(8) a14, + void * ALIGNED(8) a15 ) ALWAYS_INLINE; + friend inline void store_16x3_tr( const v16 &a, const v16 &b, const v16 &c, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07, + void * ALIGNED(64) a08, + void * ALIGNED(64) a09, + void * ALIGNED(64) a10, + void * ALIGNED(64) a11, + void * ALIGNED(64) a12, + void * ALIGNED(64) a13, + void * ALIGNED(64) a14, + void * ALIGNED(64) a15 ) ALWAYS_INLINE; + friend inline void store_16x4_tr( const v16 &a, const v16 &b, + const v16 &c, const v16 &d, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07, + void * ALIGNED(64) a08, + void * ALIGNED(64) a09, + void * ALIGNED(64) a10, + void * ALIGNED(64) a11, + void * ALIGNED(64) a12, + void * ALIGNED(64) a13, + void * ALIGNED(64) a14, + void * ALIGNED(64) a15 ) ALWAYS_INLINE; + friend inline void 
store_16x8_tr( const v16 &a, const v16 &b,
+ const v16 &c, const v16 &d,
+ const v16 &e, const v16 &f,
+ const v16 &g, const v16 &h,
+ void * ALIGNED(64) a00,
+ void * ALIGNED(64) a01,
+ void * ALIGNED(64) a02,
+ void * ALIGNED(64) a03,
+ void * ALIGNED(64) a04,
+ void * ALIGNED(64) a05,
+ void * ALIGNED(64) a06,
+ void * ALIGNED(64) a07,
+ void * ALIGNED(64) a08,
+ void * ALIGNED(64) a09,
+ void * ALIGNED(64) a10,
+ void * ALIGNED(64) a11,
+ void * ALIGNED(64) a12,
+ void * ALIGNED(64) a13,
+ void * ALIGNED(64) a14,
+ void * ALIGNED(64) a15 ) ALWAYS_INLINE;
+ friend inline void store_16x16_tr( const v16 &b00, const v16 &b01,
+ const v16 &b02, const v16 &b03,
+ const v16 &b04, const v16 &b05,
+ const v16 &b06, const v16 &b07,
+ const v16 &b08, const v16 &b09,
+ const v16 &b10, const v16 &b11,
+ const v16 &b12, const v16 &b13,
+ const v16 &b14, const v16 &b15,
+ void * ALIGNED(64) a00,
+ void * ALIGNED(64) a01,
+ void * ALIGNED(64) a02,
+ void * ALIGNED(64) a03,
+ void * ALIGNED(64) a04,
+ void * ALIGNED(64) a05,
+ void * ALIGNED(64) a06,
+ void * ALIGNED(64) a07,
+ void * ALIGNED(64) a08,
+ void * ALIGNED(64) a09,
+ void * ALIGNED(64) a10,
+ void * ALIGNED(64) a11,
+ void * ALIGNED(64) a12,
+ void * ALIGNED(64) a13,
+ void * ALIGNED(64) a14,
+ void * ALIGNED(64) a15 ) ALWAYS_INLINE;
+ friend inline void store_16x8_tr_p( const v16 &a, const v16 &b,
+ const v16 &c, const v16 &d,
+ const v16 &e, const v16 &f,
+ const v16 &g, const v16 &h,
+ void * ALIGNED(64) a00,
+ void * ALIGNED(64) a01,
+ void * ALIGNED(64) a02,
+ void * ALIGNED(64) a03,
+ void * ALIGNED(64) a04,
+ void * ALIGNED(64) a05,
+ void * ALIGNED(64) a06,
+ void * ALIGNED(64) a07 ) ALWAYS_INLINE;
+ friend inline void store_16x16_tr_p( const v16 &b00, const v16 &b01,
+ const v16 &b02, const v16 &b03,
+ const v16 &b04, const v16 &b05,
+ const v16 &b06, const v16 &b07,
+ const v16 &b08, const v16 &b09,
+ const v16 &b10, const v16 &b11,
+ const v16 &b12, const v16 &b13,
+ const v16 &b14, const v16 &b15,
+ void * ALIGNED(64) a00,
+ void * ALIGNED(64) a01,
+ void * ALIGNED(64) a02,
+ void * ALIGNED(64) a03,
+ void * ALIGNED(64) a04,
+ void * ALIGNED(64) a05,
+ void * ALIGNED(64) a06,
+ void * ALIGNED(64) a07,
+ void * ALIGNED(64) a08,
+ void * ALIGNED(64) a09,
+ void * ALIGNED(64) a10,
+ void * ALIGNED(64) a11,
+ void * ALIGNED(64) a12,
+ void * ALIGNED(64) a13,
+ void * ALIGNED(64) a14,
+ void * ALIGNED(64) a15 ) ALWAYS_INLINE;
+
+ protected:
+
+ union
+ {
+ int i[16];
+ float f[16];
+ __m512 v;
+ };
+
+ public:
+
+ v16() {} // Default constructor
+
+ v16( const v16 &a ) // Copy constructor
+ {
+ v = a.v;
+ }
+
+ ~v16() {} // Default destructor
+ };
+
+ // v16 miscellaneous functions
+
+ inline int any( const v16 &a )
+ {
+ return a.i[ 0] || a.i[ 1] || a.i[ 2] || a.i[ 3] ||
+ a.i[ 4] || a.i[ 5] || a.i[ 6] || a.i[ 7] ||
+ a.i[ 8] || a.i[ 9] || a.i[10] || a.i[11] ||
+ a.i[12] || a.i[13] || a.i[14] || a.i[15];
+ }
+
+ inline int all( const v16 &a )
+ {
+ return a.i[ 0] && a.i[ 1] && a.i[ 2] && a.i[ 3] &&
+ a.i[ 4] && a.i[ 5] && a.i[ 6] && a.i[ 7] &&
+ a.i[ 8] && a.i[ 9] && a.i[10] && a.i[11] &&
+ a.i[12] && a.i[13] && a.i[14] && a.i[15];
+ }
+
+ template<int n>
+ inline v16 splat( const v16 & a )
+ {
+ v16 b;
+
+ b.v = _mm512_set1_ps( a.v[n] );
+
+ return b;
+ }
+
+ template<int i00, int i01, int i02, int i03, int i04, int i05, int i06, int i07, int i08, int i09, int i10, int i11, int i12, int i13, int i14, int i15>
+ inline v16 shuffle( const v16 & a )
+ {
+ v16 b;
+
+ b.i[ 0] = a.i[i00];
+ b.i[ 1] = a.i[i01];
+ b.i[ 2] = a.i[i02];
+ b.i[ 3] = a.i[i03];
+ b.i[ 4] = a.i[i04];
+ b.i[ 5] = a.i[i05];
+ b.i[ 6] = a.i[i06];
+ b.i[ 7] = a.i[i07];
+ b.i[ 8] = a.i[i08];
+ b.i[ 9] = a.i[i09];
b.i[10] = a.i[i10]; + b.i[11] = a.i[i11]; + b.i[12] = a.i[i12]; + b.i[13] = a.i[i13]; + b.i[14] = a.i[i14]; + b.i[15] = a.i[i15]; + + return b; + } + +# define sw(x,y) x^=y, y^=x, x^=y + + inline void swap( v16 &a, v16 &b ) + { + __m512 a_v = a.v; + + a.v = b.v; + + b.v = a_v; + } + + inline void transpose( v16 &a00, v16 &a01, v16 &a02, v16 &a03, + v16 &a04, v16 &a05, v16 &a06, v16 &a07, + v16 &a08, v16 &a09, v16 &a10, v16 &a11, + v16 &a12, v16 &a13, v16 &a14, v16 &a15 ) + { + __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, t14, t15; + + // Start a00 = 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // a01 = 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + // a02 = 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + // a03 = 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + // a04 = 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 + // a05 = 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 + // a06 = 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 + // a07 = 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 + // a08 = 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 + // a09 = 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 + // a10 = 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 + // a11 = 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 + // a12 = 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 + // a13 = 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 + // a14 = 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 + // a15 = 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 + + t00 = _mm512_unpacklo_ps( a00.v, a01.v ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 + t01 = _mm512_unpackhi_ps( a00.v, a01.v ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 + t02 = _mm512_unpacklo_ps( a02.v, a03.v ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 + t03 = _mm512_unpackhi_ps( a02.v, a03.v ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 + t04 = _mm512_unpacklo_ps( a04.v, a05.v ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 + t05 = _mm512_unpackhi_ps( a04.v, a05.v ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 + t06 = _mm512_unpacklo_ps( a06.v, a07.v ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 + t07 = _mm512_unpackhi_ps( a06.v, a07.v ); // 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127 + t08 = _mm512_unpacklo_ps( a08.v, a09.v ); // 128 144 129 145 132 148 133 149 136 152 137 153 140 156 141 157 + t09 = _mm512_unpackhi_ps( a08.v, a09.v ); // 130 146 131 147 134 150 135 151 138 154 139 155 142 158 143 159 + t10 = _mm512_unpacklo_ps( a10.v, a11.v ); // 160 176 161 177 164 180 165 181 168 184 169 185 172 188 173 189 + t11 = _mm512_unpackhi_ps( a10.v, a11.v ); // 162 178 163 179 166 182 167 183 170 186 171 187 174 190 175 191 + t12 = _mm512_unpacklo_ps( a12.v, a13.v ); // 192 208 193 209 196 212 197 213 200 216 201 217 204 220 205 221 + t13 = _mm512_unpackhi_ps( a12.v, a13.v ); // 194 210 195 211 198 214 199 215 202 218 203 219 206 222 207 223 + t14 = _mm512_unpacklo_ps( a14.v, a15.v ); // 224 240 225 241 228 244 229 245 232 248 233 249 236 252 237 253 + t15 = _mm512_unpackhi_ps( a14.v, a15.v ); // 226 242 227 243 230 246 231 247 234 250 235 251 238 254 239 255 + + a00.v = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 + a01.v = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 + a02.v = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 
1, 0, 1, 0 ) ); // 2 18 34 50 + a03.v = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 + a04.v = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 + a05.v = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 + a06.v = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 + a07.v = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 + a08.v = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 + a09.v = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 + a10.v = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 + a11.v = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 131 147 163 179 + a12.v = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 228 240 + a13.v = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 229 241 + a14.v = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 230 242 + a15.v = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 195 211 231 243 + + t00 = _mm512_shuffle_f32x4( a00.v, a04.v, 0x88 ); // 0 16 32 48 8 24 40 56 64 80 96 112 ... + t01 = _mm512_shuffle_f32x4( a01.v, a05.v, 0x88 ); // 1 17 33 49 ... + t02 = _mm512_shuffle_f32x4( a02.v, a06.v, 0x88 ); // 2 18 34 50 ... + t03 = _mm512_shuffle_f32x4( a03.v, a07.v, 0x88 ); // 3 19 35 51 ... + t04 = _mm512_shuffle_f32x4( a00.v, a04.v, 0xdd ); // 4 20 36 52 ... + t05 = _mm512_shuffle_f32x4( a01.v, a05.v, 0xdd ); // 5 21 37 53 ... + t06 = _mm512_shuffle_f32x4( a02.v, a06.v, 0xdd ); // 6 22 38 54 ... + t07 = _mm512_shuffle_f32x4( a03.v, a07.v, 0xdd ); // 7 23 39 55 ... + t08 = _mm512_shuffle_f32x4( a08.v, a12.v, 0x88 ); // 128 144 160 176 ... + t09 = _mm512_shuffle_f32x4( a09.v, a13.v, 0x88 ); // 129 145 161 177 ... + t10 = _mm512_shuffle_f32x4( a10.v, a14.v, 0x88 ); // 130 146 162 178 ... + t11 = _mm512_shuffle_f32x4( a11.v, a15.v, 0x88 ); // 131 147 163 179 ... + t12 = _mm512_shuffle_f32x4( a08.v, a12.v, 0xdd ); // 132 148 164 180 ... + t13 = _mm512_shuffle_f32x4( a09.v, a13.v, 0xdd ); // 133 149 165 181 ... + t14 = _mm512_shuffle_f32x4( a10.v, a14.v, 0xdd ); // 134 150 166 182 ... + t15 = _mm512_shuffle_f32x4( a11.v, a15.v, 0xdd ); // 135 151 167 183 ... + + a00.v = _mm512_shuffle_f32x4( t00, t08, 0x88 ); // 0 16 32 48 64 80 96 112 ... 240 + a01.v = _mm512_shuffle_f32x4( t01, t09, 0x88 ); // 1 17 33 49 66 81 97 113 ... 241 + a02.v = _mm512_shuffle_f32x4( t02, t10, 0x88 ); // 2 18 34 50 67 82 98 114 ... 242 + a03.v = _mm512_shuffle_f32x4( t03, t11, 0x88 ); // 3 19 35 51 68 83 99 115 ... 243 + a04.v = _mm512_shuffle_f32x4( t04, t12, 0x88 ); // 4 ... + a05.v = _mm512_shuffle_f32x4( t05, t13, 0x88 ); // 5 ... + a06.v = _mm512_shuffle_f32x4( t06, t14, 0x88 ); // 6 ... + a07.v = _mm512_shuffle_f32x4( t07, t15, 0x88 ); // 7 ... + a08.v = _mm512_shuffle_f32x4( t00, t08, 0xdd ); // 8 ... + a09.v = _mm512_shuffle_f32x4( t01, t09, 0xdd ); // 9 ... + a10.v = _mm512_shuffle_f32x4( t02, t10, 0xdd ); // 10 ... + a11.v = _mm512_shuffle_f32x4( t03, t11, 0xdd ); // 11 ... + a12.v = _mm512_shuffle_f32x4( t04, t12, 0xdd ); // 12 ... + a13.v = _mm512_shuffle_f32x4( t05, t13, 0xdd ); // 13 ... + a14.v = _mm512_shuffle_f32x4( t06, t14, 0xdd ); // 14 ... + a15.v = _mm512_shuffle_f32x4( t07, t15, 0xdd ); // 15 31 47 63 79 96 111 127 ... 
255 + } + +# undef sw + + // v16 memory manipulation functions + + inline void load_16x1( const void * ALIGNED(64) p, + v16 &a ) + { + for( int j = 0; j < 16; j++ ) + a.i[j] = ((const int * ALIGNED(64))p)[j]; + } + + inline void store_16x1( const v16 &a, + void * ALIGNED(64) p ) + { + for( int j = 0; j < 16; j++ ) + ((int * ALIGNED(64))p)[j] = a.i[j]; + } + + inline void stream_16x1( const v16 &a, + void * ALIGNED(64) p ) + { + for( int j = 0; j < 16; j++ ) + ((int * ALIGNED(64))p)[j] = a.i[j]; + } + + inline void clear_16x1( void * ALIGNED(64) p ) + { + for( int j = 0; j < 16; j++ ) + ((int * ALIGNED(64))p)[j] = 0; + } + + // FIXME: Ordering semantics + inline void copy_16x1( void * ALIGNED(64) dst, + const void * ALIGNED(64) src ) + { + for( int j = 0; j < 16; j++ ) + ((int * ALIGNED(64))dst)[j] = ((const int * ALIGNED(64))src)[j]; + } + + inline void swap_16x1( void * ALIGNED(64) a, + void * ALIGNED(64) b ) + { + int t; + + for( int j = 0; j < 16; j++ ) + { + t = ((int * ALIGNED(64))a)[j]; + ((int * ALIGNED(64))a)[j] = ((int * ALIGNED(64))b)[j]; + ((int * ALIGNED(64))b)[j] = t; + } + } + + // v16 transposed memory manipulation functions + + inline void load_16x1_tr( const void *a00, const void *a01, + const void *a02, const void *a03, + const void *a04, const void *a05, + const void *a06, const void *a07, + const void *a08, const void *a09, + const void *a10, const void *a11, + const void *a12, const void *a13, + const void *a14, const void *a15, + v16 &a ) + { + a.i[ 0] = ((const int *)a00)[0]; + a.i[ 1] = ((const int *)a01)[0]; + a.i[ 2] = ((const int *)a02)[0]; + a.i[ 3] = ((const int *)a03)[0]; + a.i[ 4] = ((const int *)a04)[0]; + a.i[ 5] = ((const int *)a05)[0]; + a.i[ 6] = ((const int *)a06)[0]; + a.i[ 7] = ((const int *)a07)[0]; + a.i[ 8] = ((const int *)a08)[0]; + a.i[ 9] = ((const int *)a09)[0]; + a.i[10] = ((const int *)a10)[0]; + a.i[11] = ((const int *)a11)[0]; + a.i[12] = ((const int *)a12)[0]; + a.i[13] = ((const int *)a13)[0]; + a.i[14] = ((const int *)a14)[0]; + a.i[15] = ((const int *)a15)[0]; + } + + inline void load_16x2_tr( const void * ALIGNED(8) a00, + const void * ALIGNED(8) a01, + const void * ALIGNED(8) a02, + const void * ALIGNED(8) a03, + const void * ALIGNED(8) a04, + const void * ALIGNED(8) a05, + const void * ALIGNED(8) a06, + const void * ALIGNED(8) a07, + const void * ALIGNED(8) a08, + const void * ALIGNED(8) a09, + const void * ALIGNED(8) a10, + const void * ALIGNED(8) a11, + const void * ALIGNED(8) a12, + const void * ALIGNED(8) a13, + const void * ALIGNED(8) a14, + const void * ALIGNED(8) a15, + v16 &b00, v16 &b01 ) + { + __m512 t00, t01, t02, t04, t06, t08, t09, t10, t12, t14; + __m512 u00, u01, u02, u03, u04, u05, u06, u07, u08, u09, u10, u11, u12, u13, u14, u15; + + u00 = _mm512_load_ps( (const float *)a00 ); // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + u01 = _mm512_load_ps( (const float *)a01 ); // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + u02 = _mm512_load_ps( (const float *)a02 ); // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + u03 = _mm512_load_ps( (const float *)a03 ); // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + u04 = _mm512_load_ps( (const float *)a04 ); // 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 + u05 = _mm512_load_ps( (const float *)a05 ); // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 + u06 = _mm512_load_ps( (const float *)a06 ); // 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 + u07 = _mm512_load_ps( (const float *)a07 ); // 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 + u08 
= _mm512_load_ps( (const float *)a08 ); // 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 + u09 = _mm512_load_ps( (const float *)a09 ); // 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 + u10 = _mm512_load_ps( (const float *)a10 ); // 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 + u11 = _mm512_load_ps( (const float *)a11 ); // 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 + u12 = _mm512_load_ps( (const float *)a12 ); // 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 + u13 = _mm512_load_ps( (const float *)a13 ); // 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 + u14 = _mm512_load_ps( (const float *)a14 ); // 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 + u15 = _mm512_load_ps( (const float *)a15 ); // 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 + + t00 = _mm512_unpacklo_ps( u00, u01 ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 + t02 = _mm512_unpacklo_ps( u02, u03 ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 + t04 = _mm512_unpacklo_ps( u04, u05 ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 + t06 = _mm512_unpacklo_ps( u06, u07 ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 + t08 = _mm512_unpacklo_ps( u08, u09 ); // 128 144 129 145 132 148 133 149 136 152 137 153 140 156 141 157 + t10 = _mm512_unpacklo_ps( u10, u11 ); // 160 176 161 177 164 180 165 181 168 184 169 185 172 188 173 189 + t12 = _mm512_unpacklo_ps( u12, u13 ); // 192 208 193 209 196 212 197 213 200 216 201 217 204 220 205 221 + t14 = _mm512_unpacklo_ps( u14, u15 ); // 224 240 225 241 228 244 229 245 232 248 233 249 236 252 237 253 + + u00 = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 + u01 = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61 + u04 = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 104 120 76 92 108 124 + u05 = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 105 121 77 93 109 125 + u08 = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 132 148 164 180 136 152 168 184 140 156 172 188 + u09 = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 133 149 165 181 137 153 169 185 141 157 173 189 + u12 = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 224 240 196 212 228 244 200 216 232 248 204 220 236 252 + u13 = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 225 241 197 213 229 245 201 217 233 249 205 221 237 253 + + t00 = _mm512_shuffle_f32x4( u00, u04, 0x88 ); // 0 16 32 48 8 24 40 56 64 80 96 112 72 88 104 120 + t01 = _mm512_shuffle_f32x4( u01, u05, 0x88 ); // 1 17 33 49 9 25 41 57 65 81 97 113 73 89 105 121 + t08 = _mm512_shuffle_f32x4( u08, u12, 0x88 ); // 128 144 160 176 136 152 168 184 192 208 224 240 200 216 232 248 + t09 = _mm512_shuffle_f32x4( u09, u13, 0x88 ); // 129 145 161 177 137 153 169 185 193 209 225 241 201 217 233 249 + + b00.v = _mm512_shuffle_f32x4( t00, t08, 0x88 ); // 0 16 32 48 64 80 96 112 128 144 160 176 192 208 224 240 + b01.v = _mm512_shuffle_f32x4( t01, t09, 0x88 ); // 1 17 33 49 66 81 97 113 129 145 161 177 193 209 225 241 + } + + inline void load_16x2_bc( const void * ALIGNED(64) a00, + v16 &b00, v16 &b01 ) + { + __m512 t00; + + t00 = _mm512_load_ps( (const float *)a00 ); + + b00.v = _mm512_set1_ps( t00[0] ); + 
b01.v = _mm512_set1_ps( t00[1] ); + } + + inline void load_16x3_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &b00, v16 &b01, v16 &b02 ) + { + __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, t14, t15; + __m512 u00, u01, u02, u03, u04, u05, u06, u07, u08, u09, u10, u11, u12, u13, u14, u15; + + u00 = _mm512_load_ps( (const float *)a00 ); // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + u01 = _mm512_load_ps( (const float *)a01 ); // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + u02 = _mm512_load_ps( (const float *)a02 ); // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + u03 = _mm512_load_ps( (const float *)a03 ); // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + u04 = _mm512_load_ps( (const float *)a04 ); // 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 + u05 = _mm512_load_ps( (const float *)a05 ); // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 + u06 = _mm512_load_ps( (const float *)a06 ); // 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 + u07 = _mm512_load_ps( (const float *)a07 ); // 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 + u08 = _mm512_load_ps( (const float *)a08 ); // 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 + u09 = _mm512_load_ps( (const float *)a09 ); // 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 + u10 = _mm512_load_ps( (const float *)a10 ); // 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 + u11 = _mm512_load_ps( (const float *)a11 ); // 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 + u12 = _mm512_load_ps( (const float *)a12 ); // 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 + u13 = _mm512_load_ps( (const float *)a13 ); // 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 + u14 = _mm512_load_ps( (const float *)a14 ); // 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 + u15 = _mm512_load_ps( (const float *)a15 ); // 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 + + t00 = _mm512_unpacklo_ps( u00, u01 ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 + t01 = _mm512_unpackhi_ps( u00, u01 ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 + t02 = _mm512_unpacklo_ps( u02, u03 ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 + t03 = _mm512_unpackhi_ps( u02, u03 ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 + t04 = _mm512_unpacklo_ps( u04, u05 ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 + t05 = _mm512_unpackhi_ps( u04, u05 ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 + t06 = _mm512_unpacklo_ps( u06, u07 ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 + t07 = _mm512_unpackhi_ps( u06, u07 ); // 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127 + t08 = _mm512_unpacklo_ps( u08, u09 ); // 128 144 129 145 132 148 133 149 136 152 137 153 140 156 141 157 + t09 = _mm512_unpackhi_ps( u08, u09 ); // 130 146 131 147 134 150 135 151 138 154 139 155 142 158 143 159 + t10 = _mm512_unpacklo_ps( u10, u11 ); // 160 176 161 177 164 180 165 181 168 184 169 185 172 188 
173 189 + t11 = _mm512_unpackhi_ps( u10, u11 ); // 162 178 163 179 166 182 167 183 170 186 171 187 174 190 175 191 + t12 = _mm512_unpacklo_ps( u12, u13 ); // 192 208 193 209 196 212 197 213 200 216 201 217 204 220 205 221 + t13 = _mm512_unpackhi_ps( u12, u13 ); // 194 210 195 211 198 214 199 215 202 218 203 219 206 222 207 223 + t14 = _mm512_unpacklo_ps( u14, u15 ); // 224 240 225 241 228 244 229 245 232 248 233 249 236 252 237 253 + t15 = _mm512_unpackhi_ps( u14, u15 ); // 226 242 227 243 230 246 231 247 234 250 235 251 238 254 239 255 + + u00 = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 + u01 = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61 + u02 = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 6 22 38 54 10 26 42 58 14 30 46 62 + u04 = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 104 120 76 92 108 124 + u05 = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 105 121 77 93 109 125 + u06 = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 70 86 102 118 74 90 106 122 78 94 110 126 + u08 = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 132 148 164 180 136 152 168 184 140 156 172 188 + u09 = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 133 149 165 181 137 153 169 185 141 157 173 189 + u10 = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 134 150 166 182 138 154 170 186 142 158 174 190 + u12 = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 224 240 196 212 228 244 200 216 232 248 204 220 236 252 + u13 = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 225 241 197 213 229 245 201 217 233 249 205 221 237 253 + u14 = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 226 242 198 214 230 246 202 218 234 250 206 222 238 254 + + t00 = _mm512_shuffle_f32x4( u00, u04, 0x88 ); // 0 16 32 48 8 24 40 56 64 80 96 112 72 88 104 120 + t01 = _mm512_shuffle_f32x4( u01, u05, 0x88 ); // 1 17 33 49 9 25 41 57 65 81 97 113 73 89 105 121 + t02 = _mm512_shuffle_f32x4( u02, u06, 0x88 ); // 2 18 34 50 10 26 42 58 66 82 98 114 74 90 106 122 + t08 = _mm512_shuffle_f32x4( u08, u12, 0x88 ); // 128 144 160 176 136 152 168 184 192 208 224 240 200 216 232 248 + t09 = _mm512_shuffle_f32x4( u09, u13, 0x88 ); // 129 145 161 177 137 153 169 185 193 209 225 241 201 217 233 249 + t10 = _mm512_shuffle_f32x4( u10, u14, 0x88 ); // 130 146 162 178 138 154 170 186 194 210 226 242 202 218 234 250 + + b00.v = _mm512_shuffle_f32x4( t00, t08, 0x88 ); // 0 16 32 48 64 80 96 112 128 144 160 176 192 208 224 240 + b01.v = _mm512_shuffle_f32x4( t01, t09, 0x88 ); // 1 17 33 49 66 81 97 113 129 145 161 177 193 209 225 241 + b02.v = _mm512_shuffle_f32x4( t02, t10, 0x88 ); // 2 18 34 50 67 82 98 114 130 146 162 178 194 210 226 242 + } + + inline void load_16x4_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const 
void * ALIGNED(64) a15, + v16 &b00, v16 &b01, v16 &b02, v16 &b03 ) + { + __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, t14, t15; + __m512 u00, u01, u02, u03, u04, u05, u06, u07, u08, u09, u10, u11, u12, u13, u14, u15; + + u00 = _mm512_load_ps( (const float *)a00 ); // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + u01 = _mm512_load_ps( (const float *)a01 ); // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + u02 = _mm512_load_ps( (const float *)a02 ); // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + u03 = _mm512_load_ps( (const float *)a03 ); // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + u04 = _mm512_load_ps( (const float *)a04 ); // 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 + u05 = _mm512_load_ps( (const float *)a05 ); // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 + u06 = _mm512_load_ps( (const float *)a06 ); // 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 + u07 = _mm512_load_ps( (const float *)a07 ); // 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 + u08 = _mm512_load_ps( (const float *)a08 ); // 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 + u09 = _mm512_load_ps( (const float *)a09 ); // 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 + u10 = _mm512_load_ps( (const float *)a10 ); // 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 + u11 = _mm512_load_ps( (const float *)a11 ); // 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 + u12 = _mm512_load_ps( (const float *)a12 ); // 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 + u13 = _mm512_load_ps( (const float *)a13 ); // 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 + u14 = _mm512_load_ps( (const float *)a14 ); // 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 + u15 = _mm512_load_ps( (const float *)a15 ); // 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 + + t00 = _mm512_unpacklo_ps( u00, u01 ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 + t01 = _mm512_unpackhi_ps( u00, u01 ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 + t02 = _mm512_unpacklo_ps( u02, u03 ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 + t03 = _mm512_unpackhi_ps( u02, u03 ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 + t04 = _mm512_unpacklo_ps( u04, u05 ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 + t05 = _mm512_unpackhi_ps( u04, u05 ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 + t06 = _mm512_unpacklo_ps( u06, u07 ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 + t07 = _mm512_unpackhi_ps( u06, u07 ); // 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127 + t08 = _mm512_unpacklo_ps( u08, u09 ); // 128 144 129 145 132 148 133 149 136 152 137 153 140 156 141 157 + t09 = _mm512_unpackhi_ps( u08, u09 ); // 130 146 131 147 134 150 135 151 138 154 139 155 142 158 143 159 + t10 = _mm512_unpacklo_ps( u10, u11 ); // 160 176 161 177 164 180 165 181 168 184 169 185 172 188 173 189 + t11 = _mm512_unpackhi_ps( u10, u11 ); // 162 178 163 179 166 182 167 183 170 186 171 187 174 190 175 191 + t12 = _mm512_unpacklo_ps( u12, u13 ); // 192 208 193 209 196 212 197 213 200 216 201 217 204 220 205 221 + t13 = _mm512_unpackhi_ps( u12, u13 ); // 194 210 195 211 198 214 199 215 202 218 203 219 206 222 207 223 + t14 = _mm512_unpacklo_ps( u14, u15 ); // 224 240 225 241 228 244 229 245 232 248 233 249 236 252 237 253 + t15 = _mm512_unpackhi_ps( u14, u15 ); // 226 242 227 243 230 246 231 247 234 250 235 251 238 254 239 255 + 
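+ // The unpacklo/unpackhi stage above interleaves pairs of input rows within each
+ // 128-bit lane; the shuffle_ps stage below recombines those pairs so that each
+ // 128-bit lane holds the same element position from four consecutive input rows,
+ // as traced in the trailing element comments.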
+ u00 = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 + u01 = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61 + u02 = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 6 22 38 54 10 26 42 58 14 30 46 62 + u03 = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 7 23 39 55 11 27 43 59 15 31 47 63 + u04 = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 104 120 76 92 108 124 + u05 = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 105 121 77 93 109 125 + u06 = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 70 86 102 118 74 90 106 122 78 94 110 126 + u07 = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 71 87 103 119 75 91 107 123 79 95 111 127 + u08 = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 132 148 164 180 136 152 168 184 140 156 172 188 + u09 = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 133 149 165 181 137 153 169 185 141 157 173 189 + u10 = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 134 150 166 182 138 154 170 186 142 158 174 190 + u11 = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 131 147 163 179 135 151 167 183 139 155 171 187 143 159 175 191 + u12 = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 224 240 196 212 228 244 200 216 232 248 204 220 236 252 + u13 = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 225 241 197 213 229 245 201 217 233 249 205 221 237 253 + u14 = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 226 242 198 214 230 246 202 218 234 250 206 222 238 254 + u15 = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 195 211 227 243 199 215 231 247 203 219 235 251 207 223 239 255 + + t00 = _mm512_shuffle_f32x4( u00, u04, 0x88 ); // 0 16 32 48 8 24 40 56 64 80 96 112 72 88 104 120 + t01 = _mm512_shuffle_f32x4( u01, u05, 0x88 ); // 1 17 33 49 9 25 41 57 65 81 97 113 73 89 105 121 + t02 = _mm512_shuffle_f32x4( u02, u06, 0x88 ); // 2 18 34 50 10 26 42 58 66 82 98 114 74 90 106 122 + t03 = _mm512_shuffle_f32x4( u03, u07, 0x88 ); // 3 19 35 51 11 27 43 59 67 83 99 115 75 91 107 123 + t08 = _mm512_shuffle_f32x4( u08, u12, 0x88 ); // 128 144 160 176 136 152 168 184 192 208 224 240 200 216 232 248 + t09 = _mm512_shuffle_f32x4( u09, u13, 0x88 ); // 129 145 161 177 137 153 169 185 193 209 225 241 201 217 233 249 + t10 = _mm512_shuffle_f32x4( u10, u14, 0x88 ); // 130 146 162 178 138 154 170 186 194 210 226 242 202 218 234 250 + t11 = _mm512_shuffle_f32x4( u11, u15, 0x88 ); // 131 147 163 179 139 155 171 187 195 211 227 243 203 219 235 251 + + b00.v = _mm512_shuffle_f32x4( t00, t08, 0x88 ); // 0 16 32 48 64 80 96 112 128 144 160 176 192 208 224 240 + b01.v = _mm512_shuffle_f32x4( t01, t09, 0x88 ); // 1 17 33 49 66 81 97 113 129 145 161 177 193 209 225 241 + b02.v = _mm512_shuffle_f32x4( t02, t10, 0x88 ); // 2 18 34 50 67 82 98 114 130 146 162 178 194 210 226 242 + b03.v = _mm512_shuffle_f32x4( t03, t11, 0x88 ); // 3 19 35 51 68 83 99 115 131 147 163 179 195 211 227 243 + } + + inline void load_16x8_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const 
void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07 ) + { + __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, t14, t15; + __m512 u00, u01, u02, u03, u04, u05, u06, u07, u08, u09, u10, u11, u12, u13, u14, u15; + + u00 = _mm512_load_ps( (const float *)a00 ); // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + u01 = _mm512_load_ps( (const float *)a01 ); // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + u02 = _mm512_load_ps( (const float *)a02 ); // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + u03 = _mm512_load_ps( (const float *)a03 ); // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + u04 = _mm512_load_ps( (const float *)a04 ); // 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 + u05 = _mm512_load_ps( (const float *)a05 ); // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 + u06 = _mm512_load_ps( (const float *)a06 ); // 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 + u07 = _mm512_load_ps( (const float *)a07 ); // 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 + u08 = _mm512_load_ps( (const float *)a08 ); // 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 + u09 = _mm512_load_ps( (const float *)a09 ); // 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 + u10 = _mm512_load_ps( (const float *)a10 ); // 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 + u11 = _mm512_load_ps( (const float *)a11 ); // 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 + u12 = _mm512_load_ps( (const float *)a12 ); // 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 + u13 = _mm512_load_ps( (const float *)a13 ); // 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 + u14 = _mm512_load_ps( (const float *)a14 ); // 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 + u15 = _mm512_load_ps( (const float *)a15 ); // 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 + + t00 = _mm512_unpacklo_ps( u00, u01 ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 + t01 = _mm512_unpackhi_ps( u00, u01 ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 + t02 = _mm512_unpacklo_ps( u02, u03 ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 + t03 = _mm512_unpackhi_ps( u02, u03 ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 + t04 = _mm512_unpacklo_ps( u04, u05 ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 + t05 = _mm512_unpackhi_ps( u04, u05 ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 + t06 = _mm512_unpacklo_ps( u06, u07 ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 + t07 = _mm512_unpackhi_ps( u06, u07 ); // 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127 + t08 = _mm512_unpacklo_ps( u08, u09 ); // 128 144 129 145 132 148 133 149 136 152 137 153 140 156 141 157 + t09 = _mm512_unpackhi_ps( u08, u09 ); // 130 146 131 147 134 150 135 151 138 154 139 155 142 158 143 159 + t10 = _mm512_unpacklo_ps( u10, u11 ); // 160 176 161 177 164 180 165 181 168 184 169 185 172 188 173 189 + t11 = _mm512_unpackhi_ps( u10, u11 ); // 162 178 163 179 166 182 167 183 170 186 171 187 174 190 175 191 + t12 = _mm512_unpacklo_ps( u12, u13 ); // 192 208 193 209 196 212 197 213 200 216 201 217 204 220 
205 221 + t13 = _mm512_unpackhi_ps( u12, u13 ); // 194 210 195 211 198 214 199 215 202 218 203 219 206 222 207 223 + t14 = _mm512_unpacklo_ps( u14, u15 ); // 224 240 225 241 228 244 229 245 232 248 233 249 236 252 237 253 + t15 = _mm512_unpackhi_ps( u14, u15 ); // 226 242 227 243 230 246 231 247 234 250 235 251 238 254 239 255 + + u00 = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 + u01 = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61 + u02 = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 6 22 38 54 10 26 42 58 14 30 46 62 + u03 = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 7 23 39 55 11 27 43 59 15 31 47 63 + u04 = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 104 120 76 92 108 124 + u05 = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 105 121 77 93 109 125 + u06 = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 70 86 102 118 74 90 106 122 78 94 110 126 + u07 = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 71 87 103 119 75 91 107 123 79 95 111 127 + u08 = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 132 148 164 180 136 152 168 184 140 156 172 188 + u09 = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 133 149 165 181 137 153 169 185 141 157 173 189 + u10 = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 134 150 166 182 138 154 170 186 142 158 174 190 + u11 = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 131 147 163 179 135 151 167 183 139 155 171 187 143 159 175 191 + u12 = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 224 240 196 212 228 244 200 216 232 248 204 220 236 252 + u13 = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 225 241 197 213 229 245 201 217 233 249 205 221 237 253 + u14 = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 226 242 198 214 230 246 202 218 234 250 206 222 238 254 + u15 = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 195 211 227 243 199 215 231 247 203 219 235 251 207 223 239 255 + + t00 = _mm512_shuffle_f32x4( u00, u04, 0x88 ); // 0 16 32 48 8 24 40 56 64 80 96 112 72 88 104 120 + t01 = _mm512_shuffle_f32x4( u01, u05, 0x88 ); // 1 17 33 49 9 25 41 57 65 81 97 113 73 89 105 121 + t02 = _mm512_shuffle_f32x4( u02, u06, 0x88 ); // 2 18 34 50 10 26 42 58 66 82 98 114 74 90 106 122 + t03 = _mm512_shuffle_f32x4( u03, u07, 0x88 ); // 3 19 35 51 11 27 43 59 67 83 99 115 75 91 107 123 + t04 = _mm512_shuffle_f32x4( u00, u04, 0xdd ); // 4 20 36 52 12 28 44 60 68 84 100 116 76 92 108 124 + t05 = _mm512_shuffle_f32x4( u01, u05, 0xdd ); // 5 21 37 53 13 29 45 61 69 85 101 117 77 93 109 125 + t06 = _mm512_shuffle_f32x4( u02, u06, 0xdd ); // 6 22 38 54 14 30 46 62 70 86 102 118 78 94 110 126 + t07 = _mm512_shuffle_f32x4( u03, u07, 0xdd ); // 7 23 39 55 15 31 47 63 71 87 103 119 79 95 111 127 + t08 = _mm512_shuffle_f32x4( u08, u12, 0x88 ); // 128 144 160 176 136 152 168 184 192 208 224 240 200 216 232 248 + t09 = _mm512_shuffle_f32x4( u09, u13, 0x88 ); // 129 145 161 177 137 153 169 185 193 209 225 241 201 217 233 249 + t10 = _mm512_shuffle_f32x4( u10, u14, 0x88 ); // 130 146 162 178 138 154 170 186 194 210 226 242 202 218 234 250 + t11 = _mm512_shuffle_f32x4( u11, u15, 0x88 ); 
// 131 147 163 179 139 155 171 187 195 211 227 243 203 219 235 251 + t12 = _mm512_shuffle_f32x4( u08, u12, 0xdd ); // 132 148 164 180 140 156 172 188 196 212 228 244 204 220 236 252 + t13 = _mm512_shuffle_f32x4( u09, u13, 0xdd ); // 133 149 165 181 141 157 173 189 197 213 229 245 205 221 237 253 + t14 = _mm512_shuffle_f32x4( u10, u14, 0xdd ); // 134 150 166 182 142 158 174 190 198 214 230 246 206 222 238 254 + t15 = _mm512_shuffle_f32x4( u11, u15, 0xdd ); // 135 151 167 183 143 159 175 191 199 215 231 247 207 223 239 255 + + b00.v = _mm512_shuffle_f32x4( t00, t08, 0x88 ); // 0 16 32 48 64 80 96 112 128 144 160 176 192 208 224 240 + b01.v = _mm512_shuffle_f32x4( t01, t09, 0x88 ); // 1 17 33 49 66 81 97 113 129 145 161 177 193 209 225 241 + b02.v = _mm512_shuffle_f32x4( t02, t10, 0x88 ); // 2 18 34 50 67 82 98 114 130 146 162 178 194 210 226 242 + b03.v = _mm512_shuffle_f32x4( t03, t11, 0x88 ); // 3 19 35 51 68 83 99 115 131 147 163 179 195 211 227 243 + b04.v = _mm512_shuffle_f32x4( t04, t12, 0x88 ); // 4 20 36 52 69 84 100 116 132 148 164 180 196 212 228 244 + b05.v = _mm512_shuffle_f32x4( t05, t13, 0x88 ); // 5 21 37 53 70 85 101 117 133 149 165 181 197 213 229 245 + b06.v = _mm512_shuffle_f32x4( t06, t14, 0x88 ); // 6 22 38 54 71 86 102 118 134 150 166 182 198 214 230 246 + b07.v = _mm512_shuffle_f32x4( t07, t15, 0x88 ); // 7 23 39 55 72 87 103 119 135 151 167 183 199 215 231 247 + } + + // This is the reference AVX-512 implementation. + inline void load_16x16_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07, + v16 &b08, v16 &b09, v16 &b10, v16 &b11, + v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) + { + __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, t14, t15; + + b00.v = _mm512_load_ps( (const float *)a00 ); // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + b01.v = _mm512_load_ps( (const float *)a01 ); // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + b02.v = _mm512_load_ps( (const float *)a02 ); // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + b03.v = _mm512_load_ps( (const float *)a03 ); // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + b04.v = _mm512_load_ps( (const float *)a04 ); // 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 + b05.v = _mm512_load_ps( (const float *)a05 ); // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 + b06.v = _mm512_load_ps( (const float *)a06 ); // 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 + b07.v = _mm512_load_ps( (const float *)a07 ); // 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 + b08.v = _mm512_load_ps( (const float *)a08 ); // 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 + b09.v = _mm512_load_ps( (const float *)a09 ); // 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 + b10.v = _mm512_load_ps( (const float *)a10 ); // 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 + b11.v = _mm512_load_ps( (const float *)a11 ); // 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 + b12.v = 
_mm512_load_ps( (const float *)a12 ); // 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 + b13.v = _mm512_load_ps( (const float *)a13 ); // 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 + b14.v = _mm512_load_ps( (const float *)a14 ); // 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 + b15.v = _mm512_load_ps( (const float *)a15 ); // 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 + + t00 = _mm512_unpacklo_ps( b00.v, b01.v ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 + t01 = _mm512_unpackhi_ps( b00.v, b01.v ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 + t02 = _mm512_unpacklo_ps( b02.v, b03.v ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 + t03 = _mm512_unpackhi_ps( b02.v, b03.v ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 + t04 = _mm512_unpacklo_ps( b04.v, b05.v ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 + t05 = _mm512_unpackhi_ps( b04.v, b05.v ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 + t06 = _mm512_unpacklo_ps( b06.v, b07.v ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 + t07 = _mm512_unpackhi_ps( b06.v, b07.v ); // 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127 + t08 = _mm512_unpacklo_ps( b08.v, b09.v ); // 128 144 129 145 132 148 133 149 136 152 137 153 140 156 141 157 + t09 = _mm512_unpackhi_ps( b08.v, b09.v ); // 130 146 131 147 134 150 135 151 138 154 139 155 142 158 143 159 + t10 = _mm512_unpacklo_ps( b10.v, b11.v ); // 160 176 161 177 164 180 165 181 168 184 169 185 172 188 173 189 + t11 = _mm512_unpackhi_ps( b10.v, b11.v ); // 162 178 163 179 166 182 167 183 170 186 171 187 174 190 175 191 + t12 = _mm512_unpacklo_ps( b12.v, b13.v ); // 192 208 193 209 196 212 197 213 200 216 201 217 204 220 205 221 + t13 = _mm512_unpackhi_ps( b12.v, b13.v ); // 194 210 195 211 198 214 199 215 202 218 203 219 206 222 207 223 + t14 = _mm512_unpacklo_ps( b14.v, b15.v ); // 224 240 225 241 228 244 229 245 232 248 233 249 236 252 237 253 + t15 = _mm512_unpackhi_ps( b14.v, b15.v ); // 226 242 227 243 230 246 231 247 234 250 235 251 238 254 239 255 + + b00.v = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 + b01.v = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61 + b02.v = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 6 22 38 54 10 26 42 58 14 30 46 62 + b03.v = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 7 23 39 55 11 27 43 59 15 31 47 63 + b04.v = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 104 120 76 92 108 124 + b05.v = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 105 121 77 93 109 125 + b06.v = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 70 86 102 118 74 90 106 122 78 94 110 126 + b07.v = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 71 87 103 119 75 91 107 123 79 95 111 127 + b08.v = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 132 148 164 180 136 152 168 184 140 156 172 188 + b09.v = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 133 149 165 181 137 153 169 185 141 157 173 189 + b10.v = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 134 150 166 182 138 154 170 186 142 158 174 190 + b11.v = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 3, 2, 
3, 2 ) ); // 131 147 163 179 135 151 167 183 139 155 171 187 143 159 175 191 + b12.v = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 224 240 196 212 228 244 200 216 232 248 204 220 236 252 + b13.v = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 225 241 197 213 229 245 201 217 233 249 205 221 237 253 + b14.v = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 226 242 198 214 230 246 202 218 234 250 206 222 238 254 + b15.v = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 195 211 227 243 199 215 231 247 203 219 235 251 207 223 239 255 + + t00 = _mm512_shuffle_f32x4( b00.v, b04.v, 0x88 ); // 0 16 32 48 8 24 40 56 64 80 96 112 72 88 104 120 + t01 = _mm512_shuffle_f32x4( b01.v, b05.v, 0x88 ); // 1 17 33 49 9 25 41 57 65 81 97 113 73 89 105 121 + t02 = _mm512_shuffle_f32x4( b02.v, b06.v, 0x88 ); // 2 18 34 50 10 26 42 58 66 82 98 114 74 90 106 122 + t03 = _mm512_shuffle_f32x4( b03.v, b07.v, 0x88 ); // 3 19 35 51 11 27 43 59 67 83 99 115 75 91 107 123 + t04 = _mm512_shuffle_f32x4( b00.v, b04.v, 0xdd ); // 4 20 36 52 12 28 44 60 68 84 100 116 76 92 108 124 + t05 = _mm512_shuffle_f32x4( b01.v, b05.v, 0xdd ); // 5 21 37 53 13 29 45 61 69 85 101 117 77 93 109 125 + t06 = _mm512_shuffle_f32x4( b02.v, b06.v, 0xdd ); // 6 22 38 54 14 30 46 62 70 86 102 118 78 94 110 126 + t07 = _mm512_shuffle_f32x4( b03.v, b07.v, 0xdd ); // 7 23 39 55 15 31 47 63 71 87 103 119 79 95 111 127 + t08 = _mm512_shuffle_f32x4( b08.v, b12.v, 0x88 ); // 128 144 160 176 136 152 168 184 192 208 224 240 200 216 232 248 + t09 = _mm512_shuffle_f32x4( b09.v, b13.v, 0x88 ); // 129 145 161 177 137 153 169 185 193 209 225 241 201 217 233 249 + t10 = _mm512_shuffle_f32x4( b10.v, b14.v, 0x88 ); // 130 146 162 178 138 154 170 186 194 210 226 242 202 218 234 250 + t11 = _mm512_shuffle_f32x4( b11.v, b15.v, 0x88 ); // 131 147 163 179 139 155 171 187 195 211 227 243 203 219 235 251 + t12 = _mm512_shuffle_f32x4( b08.v, b12.v, 0xdd ); // 132 148 164 180 140 156 172 188 196 212 228 244 204 220 236 252 + t13 = _mm512_shuffle_f32x4( b09.v, b13.v, 0xdd ); // 133 149 165 181 141 157 173 189 197 213 229 245 205 221 237 253 + t14 = _mm512_shuffle_f32x4( b10.v, b14.v, 0xdd ); // 134 150 166 182 142 158 174 190 198 214 230 246 206 222 238 254 + t15 = _mm512_shuffle_f32x4( b11.v, b15.v, 0xdd ); // 135 151 167 183 143 159 175 191 199 215 231 247 207 223 239 255 + + b00.v = _mm512_shuffle_f32x4( t00, t08, 0x88 ); // 0 16 32 48 64 80 96 112 128 144 160 176 192 208 224 240 + b01.v = _mm512_shuffle_f32x4( t01, t09, 0x88 ); // 1 17 33 49 66 81 97 113 129 145 161 177 193 209 225 241 + b02.v = _mm512_shuffle_f32x4( t02, t10, 0x88 ); // 2 18 34 50 67 82 98 114 130 146 162 178 194 210 226 242 + b03.v = _mm512_shuffle_f32x4( t03, t11, 0x88 ); // 3 19 35 51 68 83 99 115 131 147 163 179 195 211 227 243 + b04.v = _mm512_shuffle_f32x4( t04, t12, 0x88 ); // 4 20 36 52 69 84 100 116 132 148 164 180 196 212 228 244 + b05.v = _mm512_shuffle_f32x4( t05, t13, 0x88 ); // 5 21 37 53 70 85 101 117 133 149 165 181 197 213 229 245 + b06.v = _mm512_shuffle_f32x4( t06, t14, 0x88 ); // 6 22 38 54 71 86 102 118 134 150 166 182 198 214 230 246 + b07.v = _mm512_shuffle_f32x4( t07, t15, 0x88 ); // 7 23 39 55 72 87 103 119 135 151 167 183 199 215 231 247 + b08.v = _mm512_shuffle_f32x4( t00, t08, 0xdd ); // 8 24 40 56 73 88 104 120 136 152 168 184 200 216 232 248 + b09.v = _mm512_shuffle_f32x4( t01, t09, 0xdd ); // 9 25 41 57 74 89 105 121 137 153 169 185 201 217 233 249 + b10.v = _mm512_shuffle_f32x4( t02, t10, 
0xdd ); // 10 26 42 58 75 90 106 122 138 154 170 186 202 218 234 250 + b11.v = _mm512_shuffle_f32x4( t03, t11, 0xdd ); // 11 27 43 59 76 91 107 123 139 155 171 187 203 219 235 251 + b12.v = _mm512_shuffle_f32x4( t04, t12, 0xdd ); // 12 28 44 60 77 92 108 124 140 156 172 188 204 220 236 252 + b13.v = _mm512_shuffle_f32x4( t05, t13, 0xdd ); // 13 29 45 61 78 93 109 125 141 157 173 189 205 221 237 253 + b14.v = _mm512_shuffle_f32x4( t06, t14, 0xdd ); // 14 30 46 62 79 94 110 126 142 158 174 190 206 222 238 254 + b15.v = _mm512_shuffle_f32x4( t07, t15, 0xdd ); // 15 31 47 63 79 95 111 127 143 159 175 191 207 223 239 255 + } + + // This is the reference AVX-512 implementation. + inline void load_16x8_tr_p( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07 ) + { + __m512 t00, t01, t02, t03, t04, t05, t06, t07; + + __m512i idx = _mm512_set_epi32( 15, 11, 14, 10, 13, 9, 12, 8, 7, 3, 6, 2, 5, 1, 4, 0 ); + + b00.v = _mm512_load_ps( (const float *)a00 ); // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + b01.v = _mm512_load_ps( (const float *)a01 ); // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + b02.v = _mm512_load_ps( (const float *)a02 ); // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + b03.v = _mm512_load_ps( (const float *)a03 ); // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + b04.v = _mm512_load_ps( (const float *)a04 ); // 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 + b05.v = _mm512_load_ps( (const float *)a05 ); // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 + b06.v = _mm512_load_ps( (const float *)a06 ); // 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 + b07.v = _mm512_load_ps( (const float *)a07 ); // 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 + + t00 = _mm512_unpacklo_ps( b00.v, b01.v ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 + t01 = _mm512_unpackhi_ps( b00.v, b01.v ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 + t02 = _mm512_unpacklo_ps( b02.v, b03.v ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 + t03 = _mm512_unpackhi_ps( b02.v, b03.v ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 + t04 = _mm512_unpacklo_ps( b04.v, b05.v ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 + t05 = _mm512_unpackhi_ps( b04.v, b05.v ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 + t06 = _mm512_unpacklo_ps( b06.v, b07.v ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 + t07 = _mm512_unpackhi_ps( b06.v, b07.v ); // 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127 + + b00.v = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 + b01.v = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61 + b02.v = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 6 22 38 54 10 26 42 58 14 30 46 62 + b03.v = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 7 23 39 55 11 27 43 59 15 31 47 63 + b04.v = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 104 120 76 92 108 124 + b05.v = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 105 121 77 93 109 125 + b06.v = _mm512_shuffle_ps( t05, t07, 
_MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 70 86 102 118 74 90 106 122 78 94 110 126 + b07.v = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 71 87 103 119 75 91 107 123 79 95 111 127 + + t00 = _mm512_shuffle_f32x4( b00.v, b04.v, 0x88 ); // 0 16 32 48 8 24 40 56 64 80 96 112 72 88 104 120 + t01 = _mm512_shuffle_f32x4( b01.v, b05.v, 0x88 ); // 1 17 33 49 9 25 41 57 65 81 97 113 73 89 105 121 + t02 = _mm512_shuffle_f32x4( b02.v, b06.v, 0x88 ); // 2 18 34 50 10 26 42 58 66 82 98 114 74 90 106 122 + t03 = _mm512_shuffle_f32x4( b03.v, b07.v, 0x88 ); // 3 19 35 51 11 27 43 59 67 83 99 115 75 91 107 123 + t04 = _mm512_shuffle_f32x4( b00.v, b04.v, 0xdd ); // 4 20 36 52 12 28 44 60 68 84 100 116 76 92 108 124 + t05 = _mm512_shuffle_f32x4( b01.v, b05.v, 0xdd ); // 5 21 37 53 13 29 45 61 69 85 101 117 77 93 109 125 + t06 = _mm512_shuffle_f32x4( b02.v, b06.v, 0xdd ); // 6 22 38 54 14 30 46 62 70 86 102 118 78 94 110 126 + t07 = _mm512_shuffle_f32x4( b03.v, b07.v, 0xdd ); // 7 23 39 55 15 31 47 63 71 87 103 119 79 95 111 127 + + b00.v = _mm512_permutexvar_ps( idx, t00 ); // 0 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 + b01.v = _mm512_permutexvar_ps( idx, t01 ); // 1 9 17 25 33 41 49 57 65 73 81 89 97 105 113 121 + b02.v = _mm512_permutexvar_ps( idx, t02 ); // 2 10 18 26 34 42 50 58 66 74 82 90 98 106 114 122 + b03.v = _mm512_permutexvar_ps( idx, t03 ); // 3 11 19 27 35 43 51 59 67 75 83 91 99 107 115 123 + b04.v = _mm512_permutexvar_ps( idx, t04 ); // 4 12 20 28 36 44 52 60 68 76 84 92 100 108 116 124 + b05.v = _mm512_permutexvar_ps( idx, t05 ); // 5 13 21 29 37 45 53 61 69 77 85 93 101 109 117 125 + b06.v = _mm512_permutexvar_ps( idx, t06 ); // 6 14 22 30 38 46 54 62 70 78 86 94 102 110 118 126 + b07.v = _mm512_permutexvar_ps( idx, t07 ); // 7 15 23 31 39 47 55 63 71 79 87 95 103 111 119 127 + } + + // This is the reference AVX-512 implementation. 
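Both load_*_tr_p transposes are built from two primitives: _mm512_shuffle_f32x4, whose immediate selects whole 128-bit lanes (0x88 takes lanes 0 and 2 of each source, 0xdd takes lanes 1 and 3), and _mm512_permutexvar_ps, which gathers r[i] = src[idx[i]]. A minimal scalar model of the two, handy for checking the element-tracking comments, is sketched below; the names and the driver main() are illustrative only and not part of this patch.

#include <array>
#include <cstdio>

using Vec16 = std::array<float,16>;

// Model of _mm512_shuffle_f32x4( a, b, imm ): output 128-bit lanes 0-1 come from
// a and lanes 2-3 from b; each 2-bit field of imm selects the source lane.
// 0x88 = 10 00 10 00b -> a.lane0, a.lane2, b.lane0, b.lane2
// 0xdd = 11 01 11 01b -> a.lane1, a.lane3, b.lane1, b.lane3
static Vec16 shuffle_f32x4_model( const Vec16 &a, const Vec16 &b, unsigned imm )
{
  Vec16 r{};
  for( int lane = 0; lane < 4; lane++ )
  {
    const Vec16 &src = ( lane < 2 ) ? a : b;
    int sel = ( imm >> ( 2*lane ) ) & 3;
    for( int j = 0; j < 4; j++ ) r[ 4*lane + j ] = src[ 4*sel + j ];
  }
  return r;
}

// Model of _mm512_permutexvar_ps( idx, t ): r[i] = t[ idx[i] ].  Note that
// _mm512_set_epi32 lists its arguments from element 15 down to element 0, so the
// idx used above is { 0,4,1,5,2,6,3,7, 8,12,9,13,10,14,11,15 } in memory order.
static Vec16 permutexvar_model( const std::array<int,16> &idx, const Vec16 &t )
{
  Vec16 r{};
  for( int i = 0; i < 16; i++ ) r[i] = t[ idx[i] ];
  return r;
}

int main()
{
  Vec16 a, b;
  for( int i = 0; i < 16; i++ ) { a[i] = float( i ); b[i] = float( i + 16 ); }

  Vec16 lo = shuffle_f32x4_model( a, b, 0x88 );   // 0-3  8-11 16-19 24-27
  Vec16 hi = shuffle_f32x4_model( a, b, 0xdd );   // 4-7 12-15 20-23 28-31

  std::array<int,16> idx = { 0,4,1,5,2,6,3,7, 8,12,9,13,10,14,11,15 };
  Vec16 p = permutexvar_model( idx, lo );         // interleaves the two 128-bit
                                                  // lanes within each 256-bit half
  for( float f : lo ) printf( "%g ", f );  printf( "\n" );
  for( float f : hi ) printf( "%g ", f );  printf( "\n" );
  for( float f : p  ) printf( "%g ", f );  printf( "\n" );
}

load_16x16_tr_p below applies the same shuffle stages to sixteen inputs and finishes with the same permutexvar pass.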
+ inline void load_16x16_tr_p( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07, + v16 &b08, v16 &b09, v16 &b10, v16 &b11, + v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) + { + __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, t14, t15; + + __m512i idx = _mm512_set_epi32( 15, 11, 14, 10, 13, 9, 12, 8, 7, 3, 6, 2, 5, 1, 4, 0 ); + + b00.v = _mm512_load_ps( (const float *)a00 ); // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + b01.v = _mm512_load_ps( (const float *)a01 ); // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + b02.v = _mm512_load_ps( (const float *)a02 ); // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + b03.v = _mm512_load_ps( (const float *)a03 ); // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + b04.v = _mm512_load_ps( (const float *)a04 ); // 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 + b05.v = _mm512_load_ps( (const float *)a05 ); // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 + b06.v = _mm512_load_ps( (const float *)a06 ); // 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 + b07.v = _mm512_load_ps( (const float *)a07 ); // 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 + b08.v = _mm512_load_ps( (const float *)a08 ); // 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 + b09.v = _mm512_load_ps( (const float *)a09 ); // 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 + b10.v = _mm512_load_ps( (const float *)a10 ); // 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 + b11.v = _mm512_load_ps( (const float *)a11 ); // 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 + b12.v = _mm512_load_ps( (const float *)a12 ); // 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 + b13.v = _mm512_load_ps( (const float *)a13 ); // 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 + b14.v = _mm512_load_ps( (const float *)a14 ); // 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 + b15.v = _mm512_load_ps( (const float *)a15 ); // 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 + + t00 = _mm512_unpacklo_ps( b00.v, b01.v ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 + t01 = _mm512_unpackhi_ps( b00.v, b01.v ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 + t02 = _mm512_unpacklo_ps( b02.v, b03.v ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 + t03 = _mm512_unpackhi_ps( b02.v, b03.v ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 + t04 = _mm512_unpacklo_ps( b04.v, b05.v ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 + t05 = _mm512_unpackhi_ps( b04.v, b05.v ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 + t06 = _mm512_unpacklo_ps( b06.v, b07.v ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 + t07 = _mm512_unpackhi_ps( b06.v, b07.v ); // 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127 + t08 = _mm512_unpacklo_ps( b08.v, b09.v ); // 128 144 129 145 132 148 133 149 136 152 137 153 140 156 141 157 + t09 = _mm512_unpackhi_ps( b08.v, 
b09.v ); // 130 146 131 147 134 150 135 151 138 154 139 155 142 158 143 159 + t10 = _mm512_unpacklo_ps( b10.v, b11.v ); // 160 176 161 177 164 180 165 181 168 184 169 185 172 188 173 189 + t11 = _mm512_unpackhi_ps( b10.v, b11.v ); // 162 178 163 179 166 182 167 183 170 186 171 187 174 190 175 191 + t12 = _mm512_unpacklo_ps( b12.v, b13.v ); // 192 208 193 209 196 212 197 213 200 216 201 217 204 220 205 221 + t13 = _mm512_unpackhi_ps( b12.v, b13.v ); // 194 210 195 211 198 214 199 215 202 218 203 219 206 222 207 223 + t14 = _mm512_unpacklo_ps( b14.v, b15.v ); // 224 240 225 241 228 244 229 245 232 248 233 249 236 252 237 253 + t15 = _mm512_unpackhi_ps( b14.v, b15.v ); // 226 242 227 243 230 246 231 247 234 250 235 251 238 254 239 255 + + b00.v = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 + b01.v = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61 + b02.v = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 6 22 38 54 10 26 42 58 14 30 46 62 + b03.v = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 7 23 39 55 11 27 43 59 15 31 47 63 + b04.v = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 68 84 100 116 72 88 104 120 76 92 108 124 + b05.v = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 69 85 101 117 73 89 105 121 77 93 109 125 + b06.v = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 70 86 102 118 74 90 106 122 78 94 110 126 + b07.v = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 71 87 103 119 75 91 107 123 79 95 111 127 + b08.v = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 132 148 164 180 136 152 168 184 140 156 172 188 + b09.v = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 133 149 165 181 137 153 169 185 141 157 173 189 + b10.v = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 134 150 166 182 138 154 170 186 142 158 174 190 + b11.v = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 131 147 163 179 135 151 167 183 139 155 171 187 143 159 175 191 + b12.v = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 224 240 196 212 228 244 200 216 232 248 204 220 236 252 + b13.v = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 225 241 197 213 229 245 201 217 233 249 205 221 237 253 + b14.v = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 226 242 198 214 230 246 202 218 234 250 206 222 238 254 + b15.v = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 195 211 227 243 199 215 231 247 203 219 235 251 207 223 239 255 + + t00 = _mm512_shuffle_f32x4( b00.v, b04.v, 0x88 ); // 0 16 32 48 8 24 40 56 64 80 96 112 72 88 104 120 + t01 = _mm512_shuffle_f32x4( b01.v, b05.v, 0x88 ); // 1 17 33 49 9 25 41 57 65 81 97 113 73 89 105 121 + t02 = _mm512_shuffle_f32x4( b02.v, b06.v, 0x88 ); // 2 18 34 50 10 26 42 58 66 82 98 114 74 90 106 122 + t03 = _mm512_shuffle_f32x4( b03.v, b07.v, 0x88 ); // 3 19 35 51 11 27 43 59 67 83 99 115 75 91 107 123 + t04 = _mm512_shuffle_f32x4( b00.v, b04.v, 0xdd ); // 4 20 36 52 12 28 44 60 68 84 100 116 76 92 108 124 + t05 = _mm512_shuffle_f32x4( b01.v, b05.v, 0xdd ); // 5 21 37 53 13 29 45 61 69 85 101 117 77 93 109 125 + t06 = _mm512_shuffle_f32x4( b02.v, b06.v, 0xdd ); // 6 22 38 54 14 30 46 62 70 86 102 118 78 94 110 126 + t07 = 
_mm512_shuffle_f32x4( b03.v, b07.v, 0xdd ); // 7 23 39 55 15 31 47 63 71 87 103 119 79 95 111 127 + t08 = _mm512_shuffle_f32x4( b08.v, b12.v, 0x88 ); // 128 144 160 176 136 152 168 184 192 208 224 240 200 216 232 248 + t09 = _mm512_shuffle_f32x4( b09.v, b13.v, 0x88 ); // 129 145 161 177 137 153 169 185 193 209 225 241 201 217 233 249 + t10 = _mm512_shuffle_f32x4( b10.v, b14.v, 0x88 ); // 130 146 162 178 138 154 170 186 194 210 226 242 202 218 234 250 + t11 = _mm512_shuffle_f32x4( b11.v, b15.v, 0x88 ); // 131 147 163 179 139 155 171 187 195 211 227 243 203 219 235 251 + t12 = _mm512_shuffle_f32x4( b08.v, b12.v, 0xdd ); // 132 148 164 180 140 156 172 188 196 212 228 244 204 220 236 252 + t13 = _mm512_shuffle_f32x4( b09.v, b13.v, 0xdd ); // 133 149 165 181 141 157 173 189 197 213 229 245 205 221 237 253 + t14 = _mm512_shuffle_f32x4( b10.v, b14.v, 0xdd ); // 134 150 166 182 142 158 174 190 198 214 230 246 206 222 238 254 + t15 = _mm512_shuffle_f32x4( b11.v, b15.v, 0xdd ); // 135 151 167 183 143 159 175 191 199 215 231 247 207 223 239 255 + + b00.v = _mm512_permutexvar_ps( idx, t00 ); // 0 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 + b01.v = _mm512_permutexvar_ps( idx, t01 ); // 1 9 17 25 33 41 49 57 65 73 81 89 97 105 113 121 + b02.v = _mm512_permutexvar_ps( idx, t02 ); // 2 10 18 26 34 42 50 58 66 74 82 90 98 106 114 122 + b03.v = _mm512_permutexvar_ps( idx, t03 ); // 3 11 19 27 35 43 51 59 67 75 83 91 99 107 115 123 + b04.v = _mm512_permutexvar_ps( idx, t04 ); // 4 12 20 28 36 44 52 60 68 76 84 92 100 108 116 124 + b05.v = _mm512_permutexvar_ps( idx, t05 ); // 5 13 21 29 37 45 53 61 69 77 85 93 101 109 117 125 + b06.v = _mm512_permutexvar_ps( idx, t06 ); // 6 14 22 30 38 46 54 62 70 78 86 94 102 110 118 126 + b07.v = _mm512_permutexvar_ps( idx, t07 ); // 7 15 23 31 39 47 55 63 71 79 87 95 103 111 119 127 + b08.v = _mm512_permutexvar_ps( idx, t08 ); // 128 136 144 152 160 168 176 184 192 200 208 216 224 232 240 248 + b09.v = _mm512_permutexvar_ps( idx, t09 ); // 129 137 145 153 161 169 177 185 193 201 209 217 225 233 241 249 + b10.v = _mm512_permutexvar_ps( idx, t10 ); // 130 138 146 154 162 170 178 186 194 202 210 218 226 234 242 250 + b11.v = _mm512_permutexvar_ps( idx, t11 ); // 131 139 147 155 163 171 179 187 195 203 211 219 227 235 243 251 + b12.v = _mm512_permutexvar_ps( idx, t12 ); // 132 140 148 156 164 172 180 188 196 204 212 220 228 236 244 252 + b13.v = _mm512_permutexvar_ps( idx, t13 ); // 133 141 149 157 165 173 181 189 197 205 213 221 229 237 245 253 + b14.v = _mm512_permutexvar_ps( idx, t14 ); // 134 142 150 158 166 174 182 190 198 206 214 222 230 238 246 254 + b15.v = _mm512_permutexvar_ps( idx, t15 ); // 135 143 151 159 167 175 183 191 199 207 215 223 231 239 247 255 + } + + inline void store_16x1_tr( const v16 &a, + void *a00, void *a01, void *a02, void *a03, + void *a04, void *a05, void *a06, void *a07, + void *a08, void *a09, void *a10, void *a11, + void *a12, void *a13, void *a14, void *a15 ) + { + ((int *)a00)[0] = a.i[ 0]; + ((int *)a01)[0] = a.i[ 1]; + ((int *)a02)[0] = a.i[ 2]; + ((int *)a03)[0] = a.i[ 3]; + ((int *)a04)[0] = a.i[ 4]; + ((int *)a05)[0] = a.i[ 5]; + ((int *)a06)[0] = a.i[ 6]; + ((int *)a07)[0] = a.i[ 7]; + ((int *)a08)[0] = a.i[ 8]; + ((int *)a09)[0] = a.i[ 9]; + ((int *)a10)[0] = a.i[10]; + ((int *)a11)[0] = a.i[11]; + ((int *)a12)[0] = a.i[12]; + ((int *)a13)[0] = a.i[13]; + ((int *)a14)[0] = a.i[14]; + ((int *)a15)[0] = a.i[15]; + } + + inline void store_16x2_tr( const v16 &a, const v16 &b, + void * ALIGNED(8) a00, void * ALIGNED(8) a01, + void 
* ALIGNED(8) a02, void * ALIGNED(8) a03, + void * ALIGNED(8) a04, void * ALIGNED(8) a05, + void * ALIGNED(8) a06, void * ALIGNED(8) a07, + void * ALIGNED(8) a08, void * ALIGNED(8) a09, + void * ALIGNED(8) a10, void * ALIGNED(8) a11, + void * ALIGNED(8) a12, void * ALIGNED(8) a13, + void * ALIGNED(8) a14, void * ALIGNED(8) a15 ) + { + ((int * ALIGNED(8))a00)[0] = a.i[ 0]; + ((int * ALIGNED(8))a00)[1] = b.i[ 0]; + + ((int * ALIGNED(8))a01)[0] = a.i[ 1]; + ((int * ALIGNED(8))a01)[1] = b.i[ 1]; + + ((int * ALIGNED(8))a02)[0] = a.i[ 2]; + ((int * ALIGNED(8))a02)[1] = b.i[ 2]; + + ((int * ALIGNED(8))a03)[0] = a.i[ 3]; + ((int * ALIGNED(8))a03)[1] = b.i[ 3]; + + ((int * ALIGNED(8))a04)[0] = a.i[ 4]; + ((int * ALIGNED(8))a04)[1] = b.i[ 4]; + + ((int * ALIGNED(8))a05)[0] = a.i[ 5]; + ((int * ALIGNED(8))a05)[1] = b.i[ 5]; + + ((int * ALIGNED(8))a06)[0] = a.i[ 6]; + ((int * ALIGNED(8))a06)[1] = b.i[ 6]; + + ((int * ALIGNED(8))a07)[0] = a.i[ 7]; + ((int * ALIGNED(8))a07)[1] = b.i[ 7]; + + ((int * ALIGNED(8))a08)[0] = a.i[ 8]; + ((int * ALIGNED(8))a08)[1] = b.i[ 8]; + + ((int * ALIGNED(8))a09)[0] = a.i[ 9]; + ((int * ALIGNED(8))a09)[1] = b.i[ 9]; + + ((int * ALIGNED(8))a10)[0] = a.i[10]; + ((int * ALIGNED(8))a10)[1] = b.i[10]; + + ((int * ALIGNED(8))a11)[0] = a.i[11]; + ((int * ALIGNED(8))a11)[1] = b.i[11]; + + ((int * ALIGNED(8))a12)[0] = a.i[12]; + ((int * ALIGNED(8))a12)[1] = b.i[12]; + + ((int * ALIGNED(8))a13)[0] = a.i[13]; + ((int * ALIGNED(8))a13)[1] = b.i[13]; + + ((int * ALIGNED(8))a14)[0] = a.i[14]; + ((int * ALIGNED(8))a14)[1] = b.i[14]; + + ((int * ALIGNED(8))a15)[0] = a.i[15]; + ((int * ALIGNED(8))a15)[1] = b.i[15]; + } + + inline void store_16x3_tr( const v16 &a, const v16 &b, const v16 &c, + void * ALIGNED(64) a00, void * ALIGNED(64) a01, + void * ALIGNED(64) a02, void * ALIGNED(64) a03, + void * ALIGNED(64) a04, void * ALIGNED(64) a05, + void * ALIGNED(64) a06, void * ALIGNED(64) a07, + void * ALIGNED(64) a08, void * ALIGNED(64) a09, + void * ALIGNED(64) a10, void * ALIGNED(64) a11, + void * ALIGNED(64) a12, void * ALIGNED(64) a13, + void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) + { + ((int * ALIGNED(64))a00)[0] = a.i[ 0]; + ((int * ALIGNED(64))a00)[1] = b.i[ 0]; + ((int * ALIGNED(64))a00)[2] = c.i[ 0]; + + ((int * ALIGNED(64))a01)[0] = a.i[ 1]; + ((int * ALIGNED(64))a01)[1] = b.i[ 1]; + ((int * ALIGNED(64))a01)[2] = c.i[ 1]; + + ((int * ALIGNED(64))a02)[0] = a.i[ 2]; + ((int * ALIGNED(64))a02)[1] = b.i[ 2]; + ((int * ALIGNED(64))a02)[2] = c.i[ 2]; + + ((int * ALIGNED(64))a03)[0] = a.i[ 3]; + ((int * ALIGNED(64))a03)[1] = b.i[ 3]; + ((int * ALIGNED(64))a03)[2] = c.i[ 3]; + + ((int * ALIGNED(64))a04)[0] = a.i[ 4]; + ((int * ALIGNED(64))a04)[1] = b.i[ 4]; + ((int * ALIGNED(64))a04)[2] = c.i[ 4]; + + ((int * ALIGNED(64))a05)[0] = a.i[ 5]; + ((int * ALIGNED(64))a05)[1] = b.i[ 5]; + ((int * ALIGNED(64))a05)[2] = c.i[ 5]; + + ((int * ALIGNED(64))a06)[0] = a.i[ 6]; + ((int * ALIGNED(64))a06)[1] = b.i[ 6]; + ((int * ALIGNED(64))a06)[2] = c.i[ 6]; + + ((int * ALIGNED(64))a07)[0] = a.i[ 7]; + ((int * ALIGNED(64))a07)[1] = b.i[ 7]; + ((int * ALIGNED(64))a07)[2] = c.i[ 7]; + + ((int * ALIGNED(64))a08)[0] = a.i[ 8]; + ((int * ALIGNED(64))a08)[1] = b.i[ 8]; + ((int * ALIGNED(64))a08)[2] = c.i[ 8]; + + ((int * ALIGNED(64))a09)[0] = a.i[ 9]; + ((int * ALIGNED(64))a09)[1] = b.i[ 9]; + ((int * ALIGNED(64))a09)[2] = c.i[ 9]; + + ((int * ALIGNED(64))a10)[0] = a.i[10]; + ((int * ALIGNED(64))a10)[1] = b.i[10]; + ((int * ALIGNED(64))a10)[2] = c.i[10]; + + ((int * ALIGNED(64))a11)[0] = a.i[11]; + ((int * 
ALIGNED(64))a11)[1] = b.i[11]; + ((int * ALIGNED(64))a11)[2] = c.i[11]; + + ((int * ALIGNED(64))a12)[0] = a.i[12]; + ((int * ALIGNED(64))a12)[1] = b.i[12]; + ((int * ALIGNED(64))a12)[2] = c.i[12]; + + ((int * ALIGNED(64))a13)[0] = a.i[13]; + ((int * ALIGNED(64))a13)[1] = b.i[13]; + ((int * ALIGNED(64))a13)[2] = c.i[13]; + + ((int * ALIGNED(64))a14)[0] = a.i[14]; + ((int * ALIGNED(64))a14)[1] = b.i[14]; + ((int * ALIGNED(64))a14)[2] = c.i[14]; + + ((int * ALIGNED(64))a15)[0] = a.i[15]; + ((int * ALIGNED(64))a15)[1] = b.i[15]; + ((int * ALIGNED(64))a15)[2] = c.i[15]; + } + + inline void store_16x4_tr( const v16 &a, const v16 &b, const v16 &c, const v16 &d, + void * ALIGNED(64) a00, void * ALIGNED(64) a01, + void * ALIGNED(64) a02, void * ALIGNED(64) a03, + void * ALIGNED(64) a04, void * ALIGNED(64) a05, + void * ALIGNED(64) a06, void * ALIGNED(64) a07, + void * ALIGNED(64) a08, void * ALIGNED(64) a09, + void * ALIGNED(64) a10, void * ALIGNED(64) a11, + void * ALIGNED(64) a12, void * ALIGNED(64) a13, + void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) + { + ((int * ALIGNED(64))a00)[0] = a.i[ 0]; + ((int * ALIGNED(64))a00)[1] = b.i[ 0]; + ((int * ALIGNED(64))a00)[2] = c.i[ 0]; + ((int * ALIGNED(64))a00)[3] = d.i[ 0]; + + ((int * ALIGNED(64))a01)[0] = a.i[ 1]; + ((int * ALIGNED(64))a01)[1] = b.i[ 1]; + ((int * ALIGNED(64))a01)[2] = c.i[ 1]; + ((int * ALIGNED(64))a01)[3] = d.i[ 1]; + + ((int * ALIGNED(64))a02)[0] = a.i[ 2]; + ((int * ALIGNED(64))a02)[1] = b.i[ 2]; + ((int * ALIGNED(64))a02)[2] = c.i[ 2]; + ((int * ALIGNED(64))a02)[3] = d.i[ 2]; + + ((int * ALIGNED(64))a03)[0] = a.i[ 3]; + ((int * ALIGNED(64))a03)[1] = b.i[ 3]; + ((int * ALIGNED(64))a03)[2] = c.i[ 3]; + ((int * ALIGNED(64))a03)[3] = d.i[ 3]; + + ((int * ALIGNED(64))a04)[0] = a.i[ 4]; + ((int * ALIGNED(64))a04)[1] = b.i[ 4]; + ((int * ALIGNED(64))a04)[2] = c.i[ 4]; + ((int * ALIGNED(64))a04)[3] = d.i[ 4]; + + ((int * ALIGNED(64))a05)[0] = a.i[ 5]; + ((int * ALIGNED(64))a05)[1] = b.i[ 5]; + ((int * ALIGNED(64))a05)[2] = c.i[ 5]; + ((int * ALIGNED(64))a05)[3] = d.i[ 5]; + + ((int * ALIGNED(64))a06)[0] = a.i[ 6]; + ((int * ALIGNED(64))a06)[1] = b.i[ 6]; + ((int * ALIGNED(64))a06)[2] = c.i[ 6]; + ((int * ALIGNED(64))a06)[3] = d.i[ 6]; + + ((int * ALIGNED(64))a07)[0] = a.i[ 7]; + ((int * ALIGNED(64))a07)[1] = b.i[ 7]; + ((int * ALIGNED(64))a07)[2] = c.i[ 7]; + ((int * ALIGNED(64))a07)[3] = d.i[ 7]; + + ((int * ALIGNED(64))a08)[0] = a.i[ 8]; + ((int * ALIGNED(64))a08)[1] = b.i[ 8]; + ((int * ALIGNED(64))a08)[2] = c.i[ 8]; + ((int * ALIGNED(64))a08)[3] = d.i[ 8]; + + ((int * ALIGNED(64))a09)[0] = a.i[ 9]; + ((int * ALIGNED(64))a09)[1] = b.i[ 9]; + ((int * ALIGNED(64))a09)[2] = c.i[ 9]; + ((int * ALIGNED(64))a09)[3] = d.i[ 9]; + + ((int * ALIGNED(64))a10)[0] = a.i[10]; + ((int * ALIGNED(64))a10)[1] = b.i[10]; + ((int * ALIGNED(64))a10)[2] = c.i[10]; + ((int * ALIGNED(64))a10)[3] = d.i[10]; + + ((int * ALIGNED(64))a11)[0] = a.i[11]; + ((int * ALIGNED(64))a11)[1] = b.i[11]; + ((int * ALIGNED(64))a11)[2] = c.i[11]; + ((int * ALIGNED(64))a11)[3] = d.i[11]; + + ((int * ALIGNED(64))a12)[0] = a.i[12]; + ((int * ALIGNED(64))a12)[1] = b.i[12]; + ((int * ALIGNED(64))a12)[2] = c.i[12]; + ((int * ALIGNED(64))a12)[3] = d.i[12]; + + ((int * ALIGNED(64))a13)[0] = a.i[13]; + ((int * ALIGNED(64))a13)[1] = b.i[13]; + ((int * ALIGNED(64))a13)[2] = c.i[13]; + ((int * ALIGNED(64))a13)[3] = d.i[13]; + + ((int * ALIGNED(64))a14)[0] = a.i[14]; + ((int * ALIGNED(64))a14)[1] = b.i[14]; + ((int * ALIGNED(64))a14)[2] = c.i[14]; + ((int * ALIGNED(64))a14)[3] = 
d.i[14]; + + ((int * ALIGNED(64))a15)[0] = a.i[15]; + ((int * ALIGNED(64))a15)[1] = b.i[15]; + ((int * ALIGNED(64))a15)[2] = c.i[15]; + ((int * ALIGNED(64))a15)[3] = d.i[15]; + } + + inline void store_16x8_tr( const v16 &a, const v16 &b, const v16 &c, const v16 &d, + const v16 &e, const v16 &f, const v16 &g, const v16 &h, + void * ALIGNED(64) a00, void * ALIGNED(64) a01, + void * ALIGNED(64) a02, void * ALIGNED(64) a03, + void * ALIGNED(64) a04, void * ALIGNED(64) a05, + void * ALIGNED(64) a06, void * ALIGNED(64) a07, + void * ALIGNED(64) a08, void * ALIGNED(64) a09, + void * ALIGNED(64) a10, void * ALIGNED(64) a11, + void * ALIGNED(64) a12, void * ALIGNED(64) a13, + void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) + { + ((int * ALIGNED(64))a00)[0] = a.i[ 0]; + ((int * ALIGNED(64))a00)[1] = b.i[ 0]; + ((int * ALIGNED(64))a00)[2] = c.i[ 0]; + ((int * ALIGNED(64))a00)[3] = d.i[ 0]; + ((int * ALIGNED(64))a00)[4] = e.i[ 0]; + ((int * ALIGNED(64))a00)[5] = f.i[ 0]; + ((int * ALIGNED(64))a00)[6] = g.i[ 0]; + ((int * ALIGNED(64))a00)[7] = h.i[ 0]; + + ((int * ALIGNED(64))a01)[0] = a.i[ 1]; + ((int * ALIGNED(64))a01)[1] = b.i[ 1]; + ((int * ALIGNED(64))a01)[2] = c.i[ 1]; + ((int * ALIGNED(64))a01)[3] = d.i[ 1]; + ((int * ALIGNED(64))a01)[4] = e.i[ 1]; + ((int * ALIGNED(64))a01)[5] = f.i[ 1]; + ((int * ALIGNED(64))a01)[6] = g.i[ 1]; + ((int * ALIGNED(64))a01)[7] = h.i[ 1]; + + ((int * ALIGNED(64))a02)[0] = a.i[ 2]; + ((int * ALIGNED(64))a02)[1] = b.i[ 2]; + ((int * ALIGNED(64))a02)[2] = c.i[ 2]; + ((int * ALIGNED(64))a02)[3] = d.i[ 2]; + ((int * ALIGNED(64))a02)[4] = e.i[ 2]; + ((int * ALIGNED(64))a02)[5] = f.i[ 2]; + ((int * ALIGNED(64))a02)[6] = g.i[ 2]; + ((int * ALIGNED(64))a02)[7] = h.i[ 2]; + + ((int * ALIGNED(64))a03)[0] = a.i[ 3]; + ((int * ALIGNED(64))a03)[1] = b.i[ 3]; + ((int * ALIGNED(64))a03)[2] = c.i[ 3]; + ((int * ALIGNED(64))a03)[3] = d.i[ 3]; + ((int * ALIGNED(64))a03)[4] = e.i[ 3]; + ((int * ALIGNED(64))a03)[5] = f.i[ 3]; + ((int * ALIGNED(64))a03)[6] = g.i[ 3]; + ((int * ALIGNED(64))a03)[7] = h.i[ 3]; + + ((int * ALIGNED(64))a04)[0] = a.i[ 4]; + ((int * ALIGNED(64))a04)[1] = b.i[ 4]; + ((int * ALIGNED(64))a04)[2] = c.i[ 4]; + ((int * ALIGNED(64))a04)[3] = d.i[ 4]; + ((int * ALIGNED(64))a04)[4] = e.i[ 4]; + ((int * ALIGNED(64))a04)[5] = f.i[ 4]; + ((int * ALIGNED(64))a04)[6] = g.i[ 4]; + ((int * ALIGNED(64))a04)[7] = h.i[ 4]; + + ((int * ALIGNED(64))a05)[0] = a.i[ 5]; + ((int * ALIGNED(64))a05)[1] = b.i[ 5]; + ((int * ALIGNED(64))a05)[2] = c.i[ 5]; + ((int * ALIGNED(64))a05)[3] = d.i[ 5]; + ((int * ALIGNED(64))a05)[4] = e.i[ 5]; + ((int * ALIGNED(64))a05)[5] = f.i[ 5]; + ((int * ALIGNED(64))a05)[6] = g.i[ 5]; + ((int * ALIGNED(64))a05)[7] = h.i[ 5]; + + ((int * ALIGNED(64))a06)[0] = a.i[ 6]; + ((int * ALIGNED(64))a06)[1] = b.i[ 6]; + ((int * ALIGNED(64))a06)[2] = c.i[ 6]; + ((int * ALIGNED(64))a06)[3] = d.i[ 6]; + ((int * ALIGNED(64))a06)[4] = e.i[ 6]; + ((int * ALIGNED(64))a06)[5] = f.i[ 6]; + ((int * ALIGNED(64))a06)[6] = g.i[ 6]; + ((int * ALIGNED(64))a06)[7] = h.i[ 6]; + + ((int * ALIGNED(64))a07)[0] = a.i[ 7]; + ((int * ALIGNED(64))a07)[1] = b.i[ 7]; + ((int * ALIGNED(64))a07)[2] = c.i[ 7]; + ((int * ALIGNED(64))a07)[3] = d.i[ 7]; + ((int * ALIGNED(64))a07)[4] = e.i[ 7]; + ((int * ALIGNED(64))a07)[5] = f.i[ 7]; + ((int * ALIGNED(64))a07)[6] = g.i[ 7]; + ((int * ALIGNED(64))a07)[7] = h.i[ 7]; + + ((int * ALIGNED(64))a08)[0] = a.i[ 8]; + ((int * ALIGNED(64))a08)[1] = b.i[ 8]; + ((int * ALIGNED(64))a08)[2] = c.i[ 8]; + ((int * ALIGNED(64))a08)[3] = d.i[ 8]; + ((int * 
ALIGNED(64))a08)[4] = e.i[ 8]; + ((int * ALIGNED(64))a08)[5] = f.i[ 8]; + ((int * ALIGNED(64))a08)[6] = g.i[ 8]; + ((int * ALIGNED(64))a08)[7] = h.i[ 8]; + + ((int * ALIGNED(64))a09)[0] = a.i[ 9]; + ((int * ALIGNED(64))a09)[1] = b.i[ 9]; + ((int * ALIGNED(64))a09)[2] = c.i[ 9]; + ((int * ALIGNED(64))a09)[3] = d.i[ 9]; + ((int * ALIGNED(64))a09)[4] = e.i[ 9]; + ((int * ALIGNED(64))a09)[5] = f.i[ 9]; + ((int * ALIGNED(64))a09)[6] = g.i[ 9]; + ((int * ALIGNED(64))a09)[7] = h.i[ 9]; + + ((int * ALIGNED(64))a10)[0] = a.i[10]; + ((int * ALIGNED(64))a10)[1] = b.i[10]; + ((int * ALIGNED(64))a10)[2] = c.i[10]; + ((int * ALIGNED(64))a10)[3] = d.i[10]; + ((int * ALIGNED(64))a10)[4] = e.i[10]; + ((int * ALIGNED(64))a10)[5] = f.i[10]; + ((int * ALIGNED(64))a10)[6] = g.i[10]; + ((int * ALIGNED(64))a10)[7] = h.i[10]; + + ((int * ALIGNED(64))a11)[0] = a.i[11]; + ((int * ALIGNED(64))a11)[1] = b.i[11]; + ((int * ALIGNED(64))a11)[2] = c.i[11]; + ((int * ALIGNED(64))a11)[3] = d.i[11]; + ((int * ALIGNED(64))a11)[4] = e.i[11]; + ((int * ALIGNED(64))a11)[5] = f.i[11]; + ((int * ALIGNED(64))a11)[6] = g.i[11]; + ((int * ALIGNED(64))a11)[7] = h.i[11]; + + ((int * ALIGNED(64))a12)[0] = a.i[12]; + ((int * ALIGNED(64))a12)[1] = b.i[12]; + ((int * ALIGNED(64))a12)[2] = c.i[12]; + ((int * ALIGNED(64))a12)[3] = d.i[12]; + ((int * ALIGNED(64))a12)[4] = e.i[12]; + ((int * ALIGNED(64))a12)[5] = f.i[12]; + ((int * ALIGNED(64))a12)[6] = g.i[12]; + ((int * ALIGNED(64))a12)[7] = h.i[12]; + + ((int * ALIGNED(64))a13)[0] = a.i[13]; + ((int * ALIGNED(64))a13)[1] = b.i[13]; + ((int * ALIGNED(64))a13)[2] = c.i[13]; + ((int * ALIGNED(64))a13)[3] = d.i[13]; + ((int * ALIGNED(64))a13)[4] = e.i[13]; + ((int * ALIGNED(64))a13)[5] = f.i[13]; + ((int * ALIGNED(64))a13)[6] = g.i[13]; + ((int * ALIGNED(64))a13)[7] = h.i[13]; + + ((int * ALIGNED(64))a14)[0] = a.i[14]; + ((int * ALIGNED(64))a14)[1] = b.i[14]; + ((int * ALIGNED(64))a14)[2] = c.i[14]; + ((int * ALIGNED(64))a14)[3] = d.i[14]; + ((int * ALIGNED(64))a14)[4] = e.i[14]; + ((int * ALIGNED(64))a14)[5] = f.i[14]; + ((int * ALIGNED(64))a14)[6] = g.i[14]; + ((int * ALIGNED(64))a14)[7] = h.i[14]; + + ((int * ALIGNED(64))a15)[0] = a.i[15]; + ((int * ALIGNED(64))a15)[1] = b.i[15]; + ((int * ALIGNED(64))a15)[2] = c.i[15]; + ((int * ALIGNED(64))a15)[3] = d.i[15]; + ((int * ALIGNED(64))a15)[4] = e.i[15]; + ((int * ALIGNED(64))a15)[5] = f.i[15]; + ((int * ALIGNED(64))a15)[6] = g.i[15]; + ((int * ALIGNED(64))a15)[7] = h.i[15]; + } + + inline void store_16x16_tr( const v16 &b00, const v16 &b01, const v16 &b02, const v16 &b03, + const v16 &b04, const v16 &b05, const v16 &b06, const v16 &b07, + const v16 &b08, const v16 &b09, const v16 &b10, const v16 &b11, + const v16 &b12, const v16 &b13, const v16 &b14, const v16 &b15, + void * ALIGNED(64) a00, void * ALIGNED(64) a01, + void * ALIGNED(64) a02, void * ALIGNED(64) a03, + void * ALIGNED(64) a04, void * ALIGNED(64) a05, + void * ALIGNED(64) a06, void * ALIGNED(64) a07, + void * ALIGNED(64) a08, void * ALIGNED(64) a09, + void * ALIGNED(64) a10, void * ALIGNED(64) a11, + void * ALIGNED(64) a12, void * ALIGNED(64) a13, + void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) + { + __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, t14, t15; + __m512 u00, u01, u02, u03, u04, u05, u06, u07, u08, u09, u10, u11, u12, u13, u14, u15; + + // Start a00 = 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // a01 = 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + // a02 = 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + // a03 = 48 49 50 51 52 53 54 55 
56 57 58 59 60 61 62 63 + // a04 = 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 + // a05 = 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 + // a06 = 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 + // a07 = 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 + // a08 = 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 + // a09 = 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 + // a10 = 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 + // a11 = 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 + // a12 = 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 + // a13 = 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 + // a14 = 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 + // a15 = 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 + + t00 = _mm512_unpacklo_ps( b00.v, b01.v ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 + t01 = _mm512_unpackhi_ps( b00.v, b01.v ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 + t02 = _mm512_unpacklo_ps( b02.v, b03.v ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 + t03 = _mm512_unpackhi_ps( b02.v, b03.v ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 + t04 = _mm512_unpacklo_ps( b04.v, b05.v ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 + t05 = _mm512_unpackhi_ps( b04.v, b05.v ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 + t06 = _mm512_unpacklo_ps( b06.v, b07.v ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 + t07 = _mm512_unpackhi_ps( b06.v, b07.v ); // 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127 + t08 = _mm512_unpacklo_ps( b08.v, b09.v ); // 128 144 129 145 132 148 133 149 136 152 137 153 140 156 141 157 + t09 = _mm512_unpackhi_ps( b08.v, b09.v ); // 130 146 131 147 134 150 135 151 138 154 139 155 142 158 143 159 + t10 = _mm512_unpacklo_ps( b10.v, b11.v ); // 160 176 161 177 164 180 165 181 168 184 169 185 172 188 173 189 + t11 = _mm512_unpackhi_ps( b10.v, b11.v ); // 162 178 163 179 166 182 167 183 170 186 171 187 174 190 175 191 + t12 = _mm512_unpacklo_ps( b12.v, b13.v ); // 192 208 193 209 196 212 197 213 200 216 201 217 204 220 205 221 + t13 = _mm512_unpackhi_ps( b12.v, b13.v ); // 194 210 195 211 198 214 199 215 202 218 203 219 206 222 207 223 + t14 = _mm512_unpacklo_ps( b14.v, b15.v ); // 224 240 225 241 228 244 229 245 232 248 233 249 236 252 237 253 + t15 = _mm512_unpackhi_ps( b14.v, b15.v ); // 226 242 227 243 230 246 231 247 234 250 235 251 238 254 239 255 + + u00 = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 32 48 + u01 = _mm512_shuffle_ps( t00, t02, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 1 17 33 49 + u02 = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 34 50 + u03 = _mm512_shuffle_ps( t01, t03, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 3 19 35 51 + u04 = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 96 112 + u05 = _mm512_shuffle_ps( t04, t06, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 65 81 97 113 + u06 = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 98 114 + u07 = _mm512_shuffle_ps( t05, t07, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 67 83 99 115 + u08 = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 160 176 + u09 = _mm512_shuffle_ps( t08, t10, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 129 145 161 177 + u10 = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 162 178 + u11 = _mm512_shuffle_ps( t09, t11, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 131 147 163 179 + u12 = 
_mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 228 240 + u13 = _mm512_shuffle_ps( t12, t14, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 193 209 229 241 + u14 = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 230 242 + u15 = _mm512_shuffle_ps( t13, t15, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 195 211 231 243 + + t00 = _mm512_shuffle_f32x4( u00, u04, 0x88 ); // 0 16 32 48 8 24 40 56 64 80 96 112 ... + t01 = _mm512_shuffle_f32x4( u01, u05, 0x88 ); // 1 17 33 49 ... + t02 = _mm512_shuffle_f32x4( u02, u06, 0x88 ); // 2 18 34 50 ... + t03 = _mm512_shuffle_f32x4( u03, u07, 0x88 ); // 3 19 35 51 ... + t04 = _mm512_shuffle_f32x4( u00, u04, 0xdd ); // 4 20 36 52 ... + t05 = _mm512_shuffle_f32x4( u01, u05, 0xdd ); // 5 21 37 53 ... + t06 = _mm512_shuffle_f32x4( u02, u06, 0xdd ); // 6 22 38 54 ... + t07 = _mm512_shuffle_f32x4( u03, u07, 0xdd ); // 7 23 39 55 ... + t08 = _mm512_shuffle_f32x4( u08, u12, 0x88 ); // 128 144 160 176 ... + t09 = _mm512_shuffle_f32x4( u09, u13, 0x88 ); // 129 145 161 177 ... + t10 = _mm512_shuffle_f32x4( u10, u14, 0x88 ); // 130 146 162 178 ... + t11 = _mm512_shuffle_f32x4( u11, u15, 0x88 ); // 131 147 163 179 ... + t12 = _mm512_shuffle_f32x4( u08, u12, 0xdd ); // 132 148 164 180 ... + t13 = _mm512_shuffle_f32x4( u09, u13, 0xdd ); // 133 149 165 181 ... + t14 = _mm512_shuffle_f32x4( u10, u14, 0xdd ); // 134 150 166 182 ... + t15 = _mm512_shuffle_f32x4( u11, u15, 0xdd ); // 135 151 167 183 ... + + u00 = _mm512_shuffle_f32x4( t00, t08, 0x88 ); // 0 16 32 48 64 80 96 112 ... 240 + u01 = _mm512_shuffle_f32x4( t01, t09, 0x88 ); // 1 17 33 49 66 81 97 113 ... 241 + u02 = _mm512_shuffle_f32x4( t02, t10, 0x88 ); // 2 18 34 50 67 82 98 114 ... 242 + u03 = _mm512_shuffle_f32x4( t03, t11, 0x88 ); // 3 19 35 51 68 83 99 115 ... 243 + u04 = _mm512_shuffle_f32x4( t04, t12, 0x88 ); // 4 ... + u05 = _mm512_shuffle_f32x4( t05, t13, 0x88 ); // 5 ... + u06 = _mm512_shuffle_f32x4( t06, t14, 0x88 ); // 6 ... + u07 = _mm512_shuffle_f32x4( t07, t15, 0x88 ); // 7 ... + u08 = _mm512_shuffle_f32x4( t00, t08, 0xdd ); // 8 ... + u09 = _mm512_shuffle_f32x4( t01, t09, 0xdd ); // 9 ... + u10 = _mm512_shuffle_f32x4( t02, t10, 0xdd ); // 10 ... + u11 = _mm512_shuffle_f32x4( t03, t11, 0xdd ); // 11 ... + u12 = _mm512_shuffle_f32x4( t04, t12, 0xdd ); // 12 ... + u13 = _mm512_shuffle_f32x4( t05, t13, 0xdd ); // 13 ... + u14 = _mm512_shuffle_f32x4( t06, t14, 0xdd ); // 14 ... + u15 = _mm512_shuffle_f32x4( t07, t15, 0xdd ); // 15 31 47 63 79 96 111 127 ... 255 + + _mm512_store_ps( (float *)a00, u00 ); + _mm512_store_ps( (float *)a01, u01 ); + _mm512_store_ps( (float *)a02, u02 ); + _mm512_store_ps( (float *)a03, u03 ); + _mm512_store_ps( (float *)a04, u04 ); + _mm512_store_ps( (float *)a05, u05 ); + _mm512_store_ps( (float *)a06, u06 ); + _mm512_store_ps( (float *)a07, u07 ); + _mm512_store_ps( (float *)a08, u08 ); + _mm512_store_ps( (float *)a09, u09 ); + _mm512_store_ps( (float *)a10, u10 ); + _mm512_store_ps( (float *)a11, u11 ); + _mm512_store_ps( (float *)a12, u12 ); + _mm512_store_ps( (float *)a13, u13 ); + _mm512_store_ps( (float *)a14, u14 ); + _mm512_store_ps( (float *)a15, u15 ); + } + + // This is the reference AVX-512 implementation. 
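The two store_*_tr_p routines that follow invert the permuted layout produced by the loads. Besides _mm512_permutexvar_ps they rely on the two-source permute _mm512_permutex2var_ps, in which index values 0-15 select from the first operand and 16-31 from the second; the idx1/idx2 constants spelled as n and n+16 make that split explicit. A scalar sketch of the primitive (illustrative only, not part of this patch):

#include <array>
#include <cstdio>

using Vec16 = std::array<float,16>;

// Model of _mm512_permutex2var_ps( a, idx, b ): each output element indexes the
// 32-element concatenation of a (indices 0-15) and b (indices 16-31).
static Vec16 permutex2var_model( const Vec16 &a, const std::array<int,16> &idx,
                                 const Vec16 &b )
{
  Vec16 r{};
  for( int i = 0; i < 16; i++ )
  {
    int k = idx[i] & 31;                        // only the low 5 index bits matter
    r[i] = ( k < 16 ) ? a[k] : b[k - 16];
  }
  return r;
}

int main()
{
  Vec16 t, u;
  for( int i = 0; i < 16; i++ ) { t[i] = float( i ); u[i] = float( i + 100 ); }

  // idx1 below reads, in memory order, { 0,1,2,3, 16,17,18,19, 4,5,6,7, 20,21,22,23 }:
  // it interleaves the two low 128-bit lanes of the first source with the two low
  // lanes of the second; idx2 does the same for the high lanes.
  std::array<int,16> idx1 = { 0,1,2,3, 16,17,18,19, 4,5,6,7, 20,21,22,23 };

  Vec16 r = permutex2var_model( t, idx1, u );   // 0-3, 100-103, 4-7, 104-107
  for( float f : r ) printf( "%g ", f );  printf( "\n" );
}

In store_16x8_tr_p, for example, u00 = _mm512_permutex2var_ps( t00, idx1, t04 ) interleaves the low 128-bit lanes of t00 and t04, matching the comment on that line; the remaining _mm512_shuffle_ps passes then finish the within-lane transpose before the final stores.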
+ inline void store_16x8_tr_p( const v16 &b00, + const v16 &b01, + const v16 &b02, + const v16 &b03, + const v16 &b04, + const v16 &b05, + const v16 &b06, + const v16 &b07, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07 ) + { + __m512 t00, t01, t02, t03, t04, t05, t06, t07; + __m512 u00, u01, u02, u03, u04, u05, u06, u07; + + __m512i idx = _mm512_set_epi32( 15, 13, 11, 9, 14, 12, 10, 8, 7, 5, 3, 1, 6, 4, 2, 0 ); + + __m512i idx1, idx2; + + // Start b00 = 0 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 + // b01 = 1 9 17 25 33 41 49 57 65 73 81 89 97 105 113 121 + // b02 = 2 10 18 26 34 42 50 58 66 74 82 90 98 106 114 122 + // b03 = 3 11 19 27 35 43 51 59 67 75 83 91 99 107 115 123 + // b04 = 4 12 20 28 36 44 52 60 68 76 84 92 100 108 116 124 + // b05 = 5 13 21 29 37 45 53 61 69 77 85 93 101 109 117 125 + // b06 = 6 14 22 30 38 46 54 62 70 78 86 94 102 110 118 126 + // b07 = 7 15 23 31 39 47 55 63 71 79 87 95 103 111 119 127 + + t00 = _mm512_permutexvar_ps( idx, b00.v ); // 0 16 32 48 8 24 40 56 64 80 96 112 72 88 104 120 + t01 = _mm512_permutexvar_ps( idx, b01.v ); // 1 17 33 49 9 25 41 57 65 81 97 113 73 89 105 121 + t02 = _mm512_permutexvar_ps( idx, b02.v ); // 2 18 34 50 10 26 42 58 66 82 98 114 74 90 106 122 + t03 = _mm512_permutexvar_ps( idx, b03.v ); // 3 19 35 51 11 27 43 59 67 83 99 115 75 91 107 123 + t04 = _mm512_permutexvar_ps( idx, b04.v ); // 4 20 36 52 12 28 44 60 68 84 100 116 76 92 108 124 + t05 = _mm512_permutexvar_ps( idx, b05.v ); // 5 21 37 53 13 29 45 61 69 85 101 117 77 93 109 125 + t06 = _mm512_permutexvar_ps( idx, b06.v ); // 6 22 38 54 14 30 46 62 70 86 102 118 78 94 110 126 + t07 = _mm512_permutexvar_ps( idx, b07.v ); // 7 23 39 55 15 31 47 63 71 87 103 119 79 95 111 127 + + idx1 = _mm512_set_epi32( 7+16, 6+16, 5+16, 4+16, 7, 6, 5, 4, 3+16, 2+16, 1+16, 0+16, 3, 2, 1, 0 ); + idx2 = _mm512_set_epi32( 15+16, 14+16, 13+16, 12+16, 15, 14, 13, 12, 11+16, 10+16, 9+16, 8+16, 11, 10, 9, 8 ); + + u00 = _mm512_permutex2var_ps( t00, idx1, t04 ); // 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 + u01 = _mm512_permutex2var_ps( t01, idx1, t05 ); // 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61 + u02 = _mm512_permutex2var_ps( t02, idx1, t06 ); // 2 18 34 50 6 22 38 54 10 26 42 58 14 30 46 62 + u03 = _mm512_permutex2var_ps( t03, idx1, t07 ); // 3 19 35 51 7 23 39 55 11 27 43 59 15 31 47 63 + u04 = _mm512_permutex2var_ps( t00, idx2, t04 ); // 64 80 96 112 68 84 100 116 72 88 104 120 76 92 108 124 + u05 = _mm512_permutex2var_ps( t01, idx2, t05 ); // 65 81 97 113 69 85 101 117 73 89 105 121 77 93 109 125 + u06 = _mm512_permutex2var_ps( t02, idx2, t06 ); // 66 82 98 114 70 86 102 118 74 90 106 122 78 94 110 126 + u07 = _mm512_permutex2var_ps( t03, idx2, t07 ); // 67 83 99 115 71 87 103 119 75 91 107 123 79 95 111 127 + + t00 = _mm512_shuffle_ps( u00, u01, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 + t01 = _mm512_shuffle_ps( u02, u03, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 + t02 = _mm512_shuffle_ps( u00, u01, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 + t03 = _mm512_shuffle_ps( u02, u03, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 + t04 = _mm512_shuffle_ps( u04, u05, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 + t05 = _mm512_shuffle_ps( u06, u07, 
_MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 + t06 = _mm512_shuffle_ps( u04, u05, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 + t07 = _mm512_shuffle_ps( u06, u07, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127 + + u00 = _mm512_shuffle_ps( t00, t01, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + u01 = _mm512_shuffle_ps( t00, t01, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + u02 = _mm512_shuffle_ps( t02, t03, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + u03 = _mm512_shuffle_ps( t02, t03, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + u04 = _mm512_shuffle_ps( t04, t05, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 + u05 = _mm512_shuffle_ps( t04, t05, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 + u06 = _mm512_shuffle_ps( t06, t07, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 + u07 = _mm512_shuffle_ps( t06, t07, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 + + _mm512_store_ps( (float *)a00, u00 ); + _mm512_store_ps( (float *)a01, u01 ); + _mm512_store_ps( (float *)a02, u02 ); + _mm512_store_ps( (float *)a03, u03 ); + _mm512_store_ps( (float *)a04, u04 ); + _mm512_store_ps( (float *)a05, u05 ); + _mm512_store_ps( (float *)a06, u06 ); + _mm512_store_ps( (float *)a07, u07 ); + } + + // This is the reference AVX-512 implementation. + inline void store_16x16_tr_p( const v16 &b00, const v16 &b01, const v16 &b02, const v16 &b03, + const v16 &b04, const v16 &b05, const v16 &b06, const v16 &b07, + const v16 &b08, const v16 &b09, const v16 &b10, const v16 &b11, + const v16 &b12, const v16 &b13, const v16 &b14, const v16 &b15, + void * ALIGNED(64) a00, void * ALIGNED(64) a01, + void * ALIGNED(64) a02, void * ALIGNED(64) a03, + void * ALIGNED(64) a04, void * ALIGNED(64) a05, + void * ALIGNED(64) a06, void * ALIGNED(64) a07, + void * ALIGNED(64) a08, void * ALIGNED(64) a09, + void * ALIGNED(64) a10, void * ALIGNED(64) a11, + void * ALIGNED(64) a12, void * ALIGNED(64) a13, + void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) + { + __m512 t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, t14, t15; + __m512 u00, u01, u02, u03, u04, u05, u06, u07, u08, u09, u10, u11, u12, u13, u14, u15; + + __m512i idx = _mm512_set_epi32( 15, 13, 11, 9, 14, 12, 10, 8, 7, 5, 3, 1, 6, 4, 2, 0 ); + + __m512i idx1, idx2; + + // Start b00 = 0 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 + // b01 = 1 9 17 25 33 41 49 57 65 73 81 89 97 105 113 121 + // b02 = 2 10 18 26 34 42 50 58 66 74 82 90 98 106 114 122 + // b03 = 3 11 19 27 35 43 51 59 67 75 83 91 99 107 115 123 + // b04 = 4 12 20 28 36 44 52 60 68 76 84 92 100 108 116 124 + // b05 = 5 13 21 29 37 45 53 61 69 77 85 93 101 109 117 125 + // b06 = 6 14 22 30 38 46 54 62 70 78 86 94 102 110 118 126 + // b07 = 7 15 23 31 39 47 55 63 71 79 87 95 103 111 119 127 + // b08 = 128 136 144 152 160 168 176 184 192 200 208 216 224 232 240 248 + // b09 = 129 137 145 153 161 169 177 185 193 201 209 217 225 233 241 249 + // b10 = 130 138 146 154 162 170 178 186 194 202 210 218 226 234 242 250 + // b11 = 131 139 147 155 163 171 179 187 195 203 211 219 227 235 243 251 + // b12 = 132 140 148 156 164 172 180 188 196 204 212 220 228 236 244 252 + // b13 = 
133 141 149 157 165 173 181 189 197 205 213 221 229 237 245 253 + // b14 = 134 142 150 158 166 174 182 190 198 206 214 222 230 238 246 254 + // b15 = 135 143 151 159 167 175 183 191 199 207 215 223 231 239 247 255 + + t00 = _mm512_permutexvar_ps( idx, b00.v ); // 0 16 32 48 8 24 40 56 64 80 96 112 72 88 104 120 + t01 = _mm512_permutexvar_ps( idx, b01.v ); // 1 17 33 49 9 25 41 57 65 81 97 113 73 89 105 121 + t02 = _mm512_permutexvar_ps( idx, b02.v ); // 2 18 34 50 10 26 42 58 66 82 98 114 74 90 106 122 + t03 = _mm512_permutexvar_ps( idx, b03.v ); // 3 19 35 51 11 27 43 59 67 83 99 115 75 91 107 123 + t04 = _mm512_permutexvar_ps( idx, b04.v ); // 4 20 36 52 12 28 44 60 68 84 100 116 76 92 108 124 + t05 = _mm512_permutexvar_ps( idx, b05.v ); // 5 21 37 53 13 29 45 61 69 85 101 117 77 93 109 125 + t06 = _mm512_permutexvar_ps( idx, b06.v ); // 6 22 38 54 14 30 46 62 70 86 102 118 78 94 110 126 + t07 = _mm512_permutexvar_ps( idx, b07.v ); // 7 23 39 55 15 31 47 63 71 87 103 119 79 95 111 127 + t08 = _mm512_permutexvar_ps( idx, b08.v ); // 128 144 160 176 136 152 168 184 192 208 228 240 200 216 232 248 + t09 = _mm512_permutexvar_ps( idx, b09.v ); // 129 145 161 177 137 153 169 185 193 209 229 241 201 217 233 249 + t10 = _mm512_permutexvar_ps( idx, b10.v ); // 130 146 162 178 138 154 170 186 194 210 230 242 202 218 234 250 + t11 = _mm512_permutexvar_ps( idx, b11.v ); // 131 147 163 179 139 155 171 187 195 211 231 243 203 219 235 251 + t12 = _mm512_permutexvar_ps( idx, b12.v ); // 132 148 164 180 140 156 172 188 196 212 228 244 204 220 236 252 + t13 = _mm512_permutexvar_ps( idx, b13.v ); // 133 149 165 181 141 157 173 189 197 213 229 245 205 221 237 253 + t14 = _mm512_permutexvar_ps( idx, b14.v ); // 134 150 166 182 142 158 174 190 198 214 230 246 206 222 238 254 + t15 = _mm512_permutexvar_ps( idx, b15.v ); // 135 151 167 183 143 159 175 191 199 215 231 247 207 223 239 255 + + idx1 = _mm512_set_epi32( 7+16, 6+16, 5+16, 4+16, 7, 6, 5, 4, 3+16, 2+16, 1+16, 0+16, 3, 2, 1, 0 ); + idx2 = _mm512_set_epi32( 15+16, 14+16, 13+16, 12+16, 15, 14, 13, 12, 11+16, 10+16, 9+16, 8+16, 11, 10, 9, 8 ); + + u00 = _mm512_permutex2var_ps( t00, idx1, t04 ); // 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 + u01 = _mm512_permutex2var_ps( t01, idx1, t05 ); // 1 17 33 49 5 21 37 53 9 25 41 57 13 29 45 61 + u02 = _mm512_permutex2var_ps( t02, idx1, t06 ); // 2 18 34 50 6 22 38 54 10 26 42 58 14 30 46 62 + u03 = _mm512_permutex2var_ps( t03, idx1, t07 ); // 3 19 35 51 7 23 39 55 11 27 43 59 15 31 47 63 + u04 = _mm512_permutex2var_ps( t00, idx2, t04 ); // 64 80 96 112 68 84 100 116 72 88 104 120 76 92 108 124 + u05 = _mm512_permutex2var_ps( t01, idx2, t05 ); // 65 81 97 113 69 85 101 117 73 89 105 121 77 93 109 125 + u06 = _mm512_permutex2var_ps( t02, idx2, t06 ); // 66 82 98 114 70 86 102 118 74 90 106 122 78 94 110 126 + u07 = _mm512_permutex2var_ps( t03, idx2, t07 ); // 67 83 99 115 71 87 103 119 75 91 107 123 79 95 111 127 + u08 = _mm512_permutex2var_ps( t08, idx1, t12 ); // 128 144 160 176 132 148 164 180 136 152 168 184 140 156 172 188 + u09 = _mm512_permutex2var_ps( t09, idx1, t13 ); // 129 145 161 177 133 149 165 181 137 153 169 185 141 157 173 189 + u10 = _mm512_permutex2var_ps( t10, idx1, t14 ); // 130 146 162 178 134 150 166 182 138 154 170 186 142 158 174 190 + u11 = _mm512_permutex2var_ps( t11, idx1, t15 ); // 131 147 163 179 135 151 167 183 139 155 171 187 143 159 175 191 + u12 = _mm512_permutex2var_ps( t08, idx2, t12 ); // 192 208 224 240 196 212 228 244 200 216 232 248 204 220 236 252 + u13 = 
_mm512_permutex2var_ps( t09, idx2, t13 ); // 193 209 225 241 197 213 229 245 201 217 233 249 205 221 237 253 + u14 = _mm512_permutex2var_ps( t10, idx2, t14 ); // 194 210 226 242 198 214 230 246 202 218 234 250 206 222 238 254 + u15 = _mm512_permutex2var_ps( t11, idx2, t15 ); // 195 211 227 243 199 215 231 247 203 219 235 251 207 223 239 255 + + t00 = _mm512_shuffle_ps( u00, u01, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 + t01 = _mm512_shuffle_ps( u02, u03, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 + t02 = _mm512_shuffle_ps( u00, u01, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 32 48 33 49 36 52 37 53 40 56 41 57 44 60 45 61 + t03 = _mm512_shuffle_ps( u02, u03, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 34 50 35 51 38 54 39 55 42 58 43 59 46 62 47 63 + t04 = _mm512_shuffle_ps( u04, u05, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 64 80 65 81 68 84 69 85 72 88 73 89 76 92 77 93 + t05 = _mm512_shuffle_ps( u06, u07, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 66 82 67 83 70 86 71 87 74 90 75 91 78 94 79 95 + t06 = _mm512_shuffle_ps( u04, u05, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 96 112 97 113 100 116 101 117 104 120 105 121 108 124 109 125 + t07 = _mm512_shuffle_ps( u06, u07, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 98 114 99 115 102 118 103 119 106 122 107 123 110 126 111 127 + t08 = _mm512_shuffle_ps( u08, u09, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 128 144 129 145 132 148 133 149 136 152 137 153 140 156 141 157 + t09 = _mm512_shuffle_ps( u10, u11, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 130 146 131 147 134 150 135 151 138 154 139 155 142 158 143 159 + t10 = _mm512_shuffle_ps( u08, u09, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 160 176 161 177 164 180 165 181 168 184 169 185 172 188 173 189 + t11 = _mm512_shuffle_ps( u10, u11, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 162 178 163 179 166 182 167 183 170 186 171 187 174 190 175 191 + t12 = _mm512_shuffle_ps( u12, u13, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 192 208 193 209 196 212 197 213 200 216 201 217 204 220 205 221 + t13 = _mm512_shuffle_ps( u14, u15, _MM_SHUFFLE( 1, 0, 1, 0 ) ); // 194 210 195 211 198 214 199 215 202 218 203 219 206 222 207 223 + t14 = _mm512_shuffle_ps( u12, u13, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 224 240 225 241 228 244 229 245 232 248 233 249 236 252 237 253 + t15 = _mm512_shuffle_ps( u14, u15, _MM_SHUFFLE( 3, 2, 3, 2 ) ); // 226 242 227 243 230 246 231 247 234 250 235 251 238 254 239 255 + + u00 = _mm512_shuffle_ps( t00, t01, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + u01 = _mm512_shuffle_ps( t00, t01, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + u02 = _mm512_shuffle_ps( t02, t03, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + u03 = _mm512_shuffle_ps( t02, t03, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + u04 = _mm512_shuffle_ps( t04, t05, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 + u05 = _mm512_shuffle_ps( t04, t05, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 + u06 = _mm512_shuffle_ps( t06, t07, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 + u07 = _mm512_shuffle_ps( t06, t07, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 + u08 = _mm512_shuffle_ps( t08, t09, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 + u09 = _mm512_shuffle_ps( t08, t09, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 144 145 146 147 148 149 150 151 
152 153 154 155 156 157 158 159 + u10 = _mm512_shuffle_ps( t10, t11, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 + u11 = _mm512_shuffle_ps( t10, t11, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 + u12 = _mm512_shuffle_ps( t12, t13, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 + u13 = _mm512_shuffle_ps( t12, t13, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 + u14 = _mm512_shuffle_ps( t14, t15, _MM_SHUFFLE( 2, 0, 2, 0 ) ); // 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 + u15 = _mm512_shuffle_ps( t14, t15, _MM_SHUFFLE( 3, 1, 3, 1 ) ); // 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 + + _mm512_store_ps( (float *)a00, u00 ); + _mm512_store_ps( (float *)a01, u01 ); + _mm512_store_ps( (float *)a02, u02 ); + _mm512_store_ps( (float *)a03, u03 ); + _mm512_store_ps( (float *)a04, u04 ); + _mm512_store_ps( (float *)a05, u05 ); + _mm512_store_ps( (float *)a06, u06 ); + _mm512_store_ps( (float *)a07, u07 ); + _mm512_store_ps( (float *)a08, u08 ); + _mm512_store_ps( (float *)a09, u09 ); + _mm512_store_ps( (float *)a10, u10 ); + _mm512_store_ps( (float *)a11, u11 ); + _mm512_store_ps( (float *)a12, u12 ); + _mm512_store_ps( (float *)a13, u13 ); + _mm512_store_ps( (float *)a14, u14 ); + _mm512_store_ps( (float *)a15, u15 ); + } + + ////////////// + // v16int class + + class v16int : public v16 + { + // v16int prefix unary operator friends + + friend inline v16int operator +( const v16int & a ) ALWAYS_INLINE; + friend inline v16int operator -( const v16int & a ) ALWAYS_INLINE; + friend inline v16int operator ~( const v16int & a ) ALWAYS_INLINE; + friend inline v16int operator !( const v16int & a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v16int prefix increment / decrement operator friends + + friend inline v16int operator ++( v16int & a ) ALWAYS_INLINE; + friend inline v16int operator --( v16int & a ) ALWAYS_INLINE; + + // v16int postfix increment / decrement operator friends + + friend inline v16int operator ++( v16int & a, int ) ALWAYS_INLINE; + friend inline v16int operator --( v16int & a, int ) ALWAYS_INLINE; + + // v16int binary operator friends + + friend inline v16int operator +( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator -( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator *( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator /( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator %( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator ^( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator &( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator |( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator <<( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator >>( const v16int &a, const v16int &b ) ALWAYS_INLINE; + + // v16int logical operator friends + + friend inline v16int operator <( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator >( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator ==( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator 
!=( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator <=( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator >=( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator &&( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator ||( const v16int &a, const v16int &b ) ALWAYS_INLINE; + + // v16int miscellaneous friends + + friend inline v16int abs( const v16int &a ) ALWAYS_INLINE; + friend inline v16 czero( const v16int &c, const v16 &a ) ALWAYS_INLINE; + friend inline v16 notczero( const v16int &c, const v16 &a ) ALWAYS_INLINE; + // FIXME: cswap, notcswap! + friend inline v16 merge( const v16int &c, const v16 &t, const v16 &f ) ALWAYS_INLINE; + + // v16float unary operator friends + + friend inline v16int operator !( const v16float & a ) ALWAYS_INLINE; + + // v16float logical operator friends + + friend inline v16int operator <( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator >( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator ==( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator !=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator <=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator >=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator &&( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator ||( const v16float &a, const v16float &b ) ALWAYS_INLINE; + + // v16float miscellaneous friends + + friend inline v16float clear_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline v16float set_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline v16float toggle_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + + public: + + // v16int constructors / destructors + + v16int() {} // Default constructor + + v16int( const v16int &a ) // Copy constructor + { + v = a.v; + } + + v16int( const v16 &a ) // Init from mixed + { + v = a.v; + } + + v16int( int a ) // Init from scalar + { + union + { + int i; + float f; + } u; + u.i = a; + v = _mm512_set1_ps( u.f ); + } + + v16int( int i00, int i01, int i02, int i03, + int i04, int i05, int i06, int i07, + int i08, int i09, int i10, int i11, + int i12, int i13, int i14, int i15 ) // Init from scalars + { + union + { + int i; + float f; + } u00, u01, u02, u03, u04, u05, u06, u07, + u08, u09, u10, u11, u12, u13, u14, u15; + + u00.i = i00; u01.i = i01; u02.i = i02; u03.i = i03; + u04.i = i04; u05.i = i05; u06.i = i06; u07.i = i07; + u08.i = i08; u09.i = i09; u10.i = i10; u11.i = i11; + u12.i = i12; u13.i = i13; u14.i = i14; u15.i = i15; + + v = _mm512_setr_ps( u00.f, u01.f, u02.f, u03.f, + u04.f, u05.f, u06.f, u07.f, + u08.f, u09.f, u10.f, u11.f, + u12.f, u13.f, u14.f, u15.f ); + } + + ~v16int() {} // Destructor + + // v16int assignment operators + +# define ASSIGN(op) \ + inline v16int &operator op( const v16int &b ) \ + { \ + for( int j = 0; j < 16; j++ ) \ + i[j] op b.i[j]; \ + return *this; \ + } + + inline v16int &operator =( const v16int &b ) + { + v = b.v; + return *this; + } + + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + ASSIGN(%=) + + inline v16int &operator ^=( const v16int &b ) + { + v = _mm512_xor_ps( v, b.v ); + return *this; + } + + inline v16int &operator &=( const v16int &b ) + { + v = _mm512_and_ps( v, b.v ); + return *this; + } + 
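The ASSIGN forms above fall back to a per-lane loop (ASSIGN(+=), for instance, just does i[j] += b.i[j] for all 16 lanes), while ^=, &=, and the |= that follows map directly onto the 512-bit float logical intrinsics. That is safe because those intrinsics operate on raw lane bits, and a v16int lane only ever holds an integer bit pattern punned through the int/float union used by the constructors. A small standalone check of the idiom (illustrative only, independent of the v16 classes):

#include <immintrin.h>
#include <cstdio>

int main()
{
  // Put integer bit patterns into float lanes via the same union trick the
  // v16int constructors use.
  union { int i; float f; } ua, ub, ur;
  ua.i = 0x0ff0f00f;
  ub.i = 0x00ffff00;

  __m512 a = _mm512_set1_ps( ua.f );
  __m512 b = _mm512_set1_ps( ub.f );
  __m512 r = _mm512_and_ps( a, b );      // operates on the raw lane bits

  alignas(64) float out[16];
  _mm512_store_ps( out, r );

  ur.f = out[0];
  printf( "0x%08x == 0x%08x\n", ur.i, ua.i & ub.i );   // same bit pattern
}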
+ inline v16int &operator |=( const v16int &b ) + { + v = _mm512_or_ps( v, b.v ); + return *this; + } + + ASSIGN(<<=) + ASSIGN(>>=) + +# undef ASSIGN + + // v16int member access operator + + inline int &operator []( int n ) + { + return i[n]; + } + + inline int operator ()( int n ) + { + return i[n]; + } + }; + + // v16int prefix unary operators + +# define PREFIX_UNARY(op) \ + inline v16int operator op( const v16int & a ) \ + { \ + v16int b; \ + for( int j = 0; j < 16; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } + + inline v16int operator +( const v16int & a ) + { + v16int b; + + b.v = a.v; + + return b; + } + + PREFIX_UNARY(-) + + inline v16int operator !( const v16int & a ) + { + v16int b; + + for( int j = 0; j < 16; j++ ) + b.i[j] = - ( !a.i[j] ); + + return b; + } + + inline v16int operator ~( const v16int & a ) + { + v16int b; + union + { + int i; + float f; + } u; + u.i = -1; + b.v = _mm512_xor_ps( a.v, _mm512_set1_ps( u.f ) ); + return b; + } + +# undef PREFIX_UNARY + + // v16int prefix increment / decrement + +# define PREFIX_INCDEC(op) \ + inline v16int operator op( v16int & a ) \ + { \ + v16int b; \ + for( int j = 0; j < 16; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } + + PREFIX_INCDEC(++) + PREFIX_INCDEC(--) + +# undef PREFIX_INCDEC + + // v16int postfix increment / decrement + +# define POSTFIX_INCDEC(op) \ + inline v16int operator op( v16int & a, int ) \ + { \ + v16int b; \ + for( int j = 0; j < 16; j++ ) \ + b.i[j] = ( a.i[j] op ); \ + return b; \ + } + + POSTFIX_INCDEC(++) + POSTFIX_INCDEC(--) + +# undef POSTFIX_INCDEC + + // v16int binary operators + +# define BINARY(op) \ + inline v16int operator op( const v16int &a, const v16int &b ) \ + { \ + v16int c; \ + for( int j = 0; j < 16; j++ ) \ + c.i[j] = a.i[j] op b.i[j]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + BINARY(%) + + inline v16int operator ^( const v16int &a, const v16int &b ) + { + v16int c; + + c.v = _mm512_xor_ps( a.v, b.v ); + + return c; + } + + inline v16int operator &( const v16int &a, const v16int &b ) + { + v16int c; + + c.v = _mm512_and_ps( a.v, b.v ); + + return c; + } + + #if 0 + inline v16int operator |( const v16int &a, const v16int &b ) + { + v16int c; + + c.v = _mm512_or_ps( a.v, b.v ); + + return c; + } + #endif + + BINARY(|) + BINARY(<<) + BINARY(>>) + + #undef BINARY + + // v16int logical operators + +# define LOGICAL(op) \ + inline v16int operator op( const v16int &a, const v16int &b ) \ + { \ + v16int c; \ + for( int j = 0; j < 16; j++ ) \ + c.i[j] = - ( a.i[j] op b.i[j] ); \ + return c; \ + } + + LOGICAL(<) + LOGICAL(>) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v16int miscellaneous functions + + inline v16int abs( const v16int &a ) + { + v16int b; + + for( int j = 0; j < 16; j++ ) + b.i[j] = ( a.i[j] >= 0 ) ? 
a.i[j] : -a.i[j]; + + return b; + } + + inline v16 czero( const v16int &c, const v16 &a ) + { + v16 b; + + for( int j = 0; j < 16; j++ ) + b.i[j] = a.i[j] & ~c.i[j]; + + return b; + } + + #if 0 + inline v16 czero( const v16int &c, const v16 &a ) + { + v16 b; + + b.v = _mm512_andnot_ps( c.v, a.v ); + + return b; + } + #endif + + inline v16 notczero( const v16int &c, const v16 &a ) + { + v16 b; + + b.v = _mm512_and_ps( c.v, a.v ); + + return b; + } + + inline v16 merge( const v16int &c, const v16 &t, const v16 &f ) + { + v16 m; + + for( int j = 0; j < 16; j++ ) + m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); + + return m; + } + + #if 0 + inline v16 merge( const v16int &c, const v16 &t, const v16 &f ) + { + __m512 c_v = c.v; + + v16 tf; + + tf.v = _mm512_or_ps( _mm512_andnot_ps( c_v, f.v ), + _mm512_and_ps( c_v, t.v ) ); + + return tf; + } + #endif + + //////////////// + // v16float class + + class v16float : public v16 + { + // v16float prefix unary operator friends + + friend inline v16float operator +( const v16float &a ) ALWAYS_INLINE; + friend inline v16float operator -( const v16float &a ) ALWAYS_INLINE; + friend inline v16float operator ~( const v16float &a ) ALWAYS_INLINE; + friend inline v16int operator !( const v16float &a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v16float prefix increment / decrement operator friends + + friend inline v16float operator ++( v16float &a ) ALWAYS_INLINE; + friend inline v16float operator --( v16float &a ) ALWAYS_INLINE; + + // v16float postfix increment / decrement operator friends + + friend inline v16float operator ++( v16float &a, int ) ALWAYS_INLINE; + friend inline v16float operator --( v16float &a, int ) ALWAYS_INLINE; + + // v16float binary operator friends + + friend inline v16float operator +( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16float operator -( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16float operator *( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16float operator /( const v16float &a, const v16float &b ) ALWAYS_INLINE; + + // v16float logical operator friends + + friend inline v16int operator <( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator >( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator ==( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator !=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator <=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator >=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator &&( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator ||( const v16float &a, const v16float &b ) ALWAYS_INLINE; + + // v16float math library friends + +# define CMATH_FR1(fn) friend inline v16float fn( const v16float &a ) ALWAYS_INLINE +# define CMATH_FR2(fn) friend inline v16float fn( const v16float &a, \ + const v16float &b ) ALWAYS_INLINE + + CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); + CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); + CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); + CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); + CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); + + CMATH_FR2(copysign); + +# undef CMATH_FR1 +# undef CMATH_FR2 + 
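+    // The CMATH_FR1 / CMATH_FR2 friends above declare element-wise versions
+    // of the scalar libm routines.  In the definitions further below most of
+    // them loop over the 16 lanes with ::fn; fabs, sqrt and copysign are
+    // instead implemented with native AVX-512 intrinsics.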
+ // v16float miscellaneous friends + + friend inline v16float rsqrt_approx( const v16float &a ) ALWAYS_INLINE; + friend inline v16float rsqrt ( const v16float &a ) ALWAYS_INLINE; + friend inline v16float rcp_approx( const v16float &a ) ALWAYS_INLINE; + friend inline v16float rcp ( const v16float &a ) ALWAYS_INLINE; + friend inline v16float fma ( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; + friend inline v16float fms ( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; + friend inline v16float fnms( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; + friend inline v16float clear_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline v16float set_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline v16float toggle_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline void increment_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; + friend inline void decrement_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; + friend inline void scale_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; + + public: + + // v16float constructors / destructors + + v16float() {} // Default constructor + + v16float( const v16float &a ) // Copy constructor + { + v = a.v; + } + + v16float( const v16 &a ) // Init from mixed + { + v = a.v; + } + + v16float( float a ) // Init from scalar + { + v = _mm512_set1_ps( a ); + } + + v16float( float f00, float f01, float f02, float f03, + float f04, float f05, float f06, float f07, + float f08, float f09, float f10, float f11, + float f12, float f13, float f14, float f15 ) // Init from scalars + { + v = _mm512_setr_ps( f00, f01, f02, f03, f04, f05, f06, f07, + f08, f09, f10, f11, f12, f13, f14, f15 ); + } + + ~v16float() {} // Destructor + + // v16float assignment operators + +# define ASSIGN(op,intrin) \ + inline v16float &operator op( const v16float &b ) \ + { \ + v = intrin( v, b.v ); \ + return *this; \ + } + + inline v16float &operator =( const v16float &b ) + { + v = b.v; + return *this; + } + + ASSIGN( +=, _mm512_add_ps ) + ASSIGN( -=, _mm512_sub_ps ) + ASSIGN( *=, _mm512_mul_ps ) + ASSIGN( /=, _mm512_div_ps ) + +# undef ASSIGN + + // v16float member access operator + + inline float &operator []( int n ) + { + return f[n]; + } + + inline float operator ()( int n ) + { + return f[n]; + } + }; + + // v16float prefix unary operators + + inline v16float operator +( const v16float &a ) + { + v16float b; + + b.v = a.v; + + return b; + } + + inline v16float operator -( const v16float &a ) + { + v16float b; + + b.v = _mm512_sub_ps( _mm512_setzero_ps(), a.v ); + + return b; + } + + inline v16int operator !( const v16float &a ) + { + v16int b; + + for( int j = 0; j < 16; j++ ) + b.i[j] = a.i[j] ? 
0 : -1; + + return b; + } + + #if 0 + inline v16int operator !( const v16float &a ) + { + v16int b; + + b.v = _mm512_cmp_ps( _mm512_setzero_ps(), a.v, _CMP_EQ_OS ); + + return b; + } + #endif + + // v16float prefix increment / decrement operators + + inline v16float operator ++( v16float &a ) + { + v16float b; + __m512 t = _mm512_add_ps( a.v, _mm512_set1_ps( 1.0f ) ); + + a.v = t; + b.v = t; + + return b; + } + + inline v16float operator --( v16float &a ) + { + v16float b; + __m512 t = _mm512_sub_ps( a.v, _mm512_set1_ps( 1.0f ) ); + + a.v = t; + b.v = t; + + return b; + } + + // v16float postfix increment / decrement operators + + inline v16float operator ++( v16float &a, int ) + { + v16float b; + __m512 a_v = a.v; + + a.v = _mm512_add_ps( a_v, _mm512_set1_ps( 1.0f ) ); + b.v = a_v; + + return b; + } + + inline v16float operator --( v16float &a, int ) + { + v16float b; + __m512 a_v = a.v; + + a.v = _mm512_sub_ps( a_v, _mm512_set1_ps( 1.0f ) ); + b.v = a_v; + + return b; + } + + // v16float binary operators + +# define BINARY(op,intrin) \ + inline v16float operator op( const v16float &a, const v16float &b ) \ + { \ + v16float c; \ + c.v = intrin( a.v, b.v ); \ + return c; \ + } + + BINARY( +, _mm512_add_ps ) + BINARY( -, _mm512_sub_ps ) + BINARY( *, _mm512_mul_ps ) + BINARY( /, _mm512_div_ps ) + +# undef BINARY + + // v16float logical operators + +# define LOGICAL(op) \ + inline v16int operator op( const v16float &a, const v16float &b ) \ + { \ + v16int c; \ + for( int j = 0; j < 16; j++ ) \ + c.i[j] = -( a.f[j] op b.f[j] ); \ + return c; \ + } + + LOGICAL(< ) + LOGICAL(> ) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + +#if 0 +# define LOGICAL(op,intrin,flag) \ + inline v16int operator op( const v16float &a, const v16float &b ) \ + { \ + v16int c; \ + c.v = intrin( a.v, b.v, flag ); \ + return c; \ + } + + LOGICAL( <, _mm512_cmp_ps, _CMP_LT_OS ) + LOGICAL( >, _mm512_cmp_ps, _CMP_GT_OS ) + LOGICAL( ==, _mm512_cmp_ps, _CMP_EQ_OS ) + LOGICAL( !=, _mm512_cmp_ps, _CMP_NEQ_OS ) + LOGICAL( <=, _mm512_cmp_ps, _CMP_LE_OS ) + LOGICAL( >=, _mm512_cmp_ps, _CMP_GE_OS ) + + inline v16int operator &&( const v16float &a, const v16float &b ) + { + v16int c; + __m512 vzero = _mm512_setzero_ps(); + c.v = _mm512_and_ps( _mm512_cmp_ps( a.v, vzero, _CMP_NEQ_OS ), + _mm512_cmp_ps( b.v, vzero, _CMP_NEQ_OS ) ); + return c; + } + + inline v16int operator ||( const v16float &a, const v16float &b ) + { + v16int c; + __m512 vzero = _mm512_setzero_ps(); + c.v = _mm512_or_ps( _mm512_cmp_ps( a.v, vzero, _CMP_NEQ_OS ), + _mm512_cmp_ps( b.v, vzero, _CMP_NEQ_OS ) ); + return c; + } + +# undef LOGICAL +#endif + + // v16float math library functions + +# define CMATH_FR1(fn) \ + inline v16float fn( const v16float &a ) \ + { \ + v16float b; \ + for( int j = 0; j < 16; j++ ) \ + b.f[j] = ::fn( a.f[j] ); \ + return b; \ + } + +# define CMATH_FR2(fn) \ + inline v16float fn( const v16float &a, const v16float &b ) \ + { \ + v16float c; \ + for( int j = 0; j < 16; j++ ) \ + c.f[j] = ::fn( a.f[j], b.f[j] ); \ + return c; \ + } + + CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) + CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) + /*CMATH_FR1(fabs)*/ CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) + CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) + /*CMATH_FR1(sqrt)*/ CMATH_FR1(tan) CMATH_FR1(tanh) + + inline v16float fabs( const v16float &a ) + { + v16float b; + + b.v = _mm512_andnot_ps( _mm512_set1_ps( -0.0f ), a.v ); + + return b; 
+  }
+
+  inline v16float sqrt( const v16float &a )
+  {
+    v16float b;
+
+    b.v = _mm512_sqrt_ps( a.v );
+
+    return b;
+  }
+
+  inline v16float copysign( const v16float &a, const v16float &b )
+  {
+    v16float c;
+    __m512 t = _mm512_set1_ps( -0.0f );
+
+    c.v = _mm512_or_ps( _mm512_and_ps( t, b.v ), _mm512_andnot_ps( t, a.v ) );
+
+    return c;
+  }
+
+# undef CMATH_FR1
+# undef CMATH_FR2
+
+  // v16float miscellaneous functions
+
+  inline v16float rsqrt_approx( const v16float &a )
+  {
+    v16float b;
+
+    b.v = _mm512_rsqrt14_ps(a.v);
+
+    // b.v = _mm512_rsqrt28_ps(a.v);
+
+    return b;
+  }
+
+  inline v16float rsqrt( const v16float &a )
+  {
+    v16float b;
+    __m512 a_v = a.v, b_v;
+
+    // b_v = _mm512_rsqrt28_ps(a_v);
+
+    b_v = _mm512_rsqrt14_ps(a_v);
+
+    // One Newton-Raphson step, b <- b + 0.5*( b - a*b^3 ), refines the
+    // 14-bit estimate above.
+    b.v = _mm512_add_ps( b_v, _mm512_mul_ps( _mm512_set1_ps( 0.5f ),
+                                             _mm512_sub_ps( b_v,
+                                                            _mm512_mul_ps( a_v,
+                                                                           _mm512_mul_ps( b_v,
+                                                                                          _mm512_mul_ps( b_v, b_v ) ) ) ) ) );
+
+    // Note: It is quicker to just call div_ps and sqrt_ps if more refinement
+    // is desired.
+    // b.v = _mm512_div_ps( _mm512_set1_ps( 1.0f ), _mm512_sqrt_ps( a.v ) );
+
+    return b;
+  }
+
+  inline v16float rcp_approx( const v16float &a )
+  {
+    v16float b;
+
+    // b.v = _mm512_rcp28_ps( a.v );
+
+    b.v = _mm512_rcp14_ps( a.v );
+
+    return b;
+  }
+
+  inline v16float rcp( const v16float &a )
+  {
+    v16float b;
+    __m512 a_v = a.v, b_v;
+
+    // b_v = _mm512_rcp28_ps( a_v );
+
+    b_v = _mm512_rcp14_ps( a_v );
+
+    // One Newton-Raphson step, b <- 2*b - a*b^2, refines the 14-bit estimate
+    // above.
+    b.v = _mm512_sub_ps( _mm512_add_ps( b_v, b_v ),
+                         _mm512_mul_ps( a_v, _mm512_mul_ps( b_v, b_v ) ) );
+
+    // b.v = _mm512_div_ps( _mm512_set1_ps( 1.0f ), a.v );
+
+    return b;
+  }
+
+  inline v16float fma( const v16float &a, const v16float &b, const v16float &c )
+  {
+    v16float d;
+
+    d.v = _mm512_fmadd_ps( a.v, b.v, c.v );
+
+    return d;
+  }
+
+  inline v16float fms( const v16float &a, const v16float &b, const v16float &c )
+  {
+    v16float d;
+
+    d.v = _mm512_fmsub_ps( a.v, b.v, c.v );
+
+    return d;
+  }
+
+  inline v16float fnms( const v16float &a, const v16float &b, const v16float &c )
+  {
+    v16float d;
+
+    d.v = _mm512_fnmadd_ps( a.v, b.v, c.v );
+
+    return d;
+  }
+
+  inline v16float clear_bits( const v16int &m, const v16float &a )
+  {
+    v16float b;
+
+    b.v = _mm512_andnot_ps( m.v, a.v );
+
+    return b;
+  }
+
+  inline v16float set_bits( const v16int &m, const v16float &a )
+  {
+    v16float b;
+
+    b.v = _mm512_or_ps( m.v, a.v );
+
+    return b;
+  }
+
+  inline v16float toggle_bits( const v16int &m, const v16float &a )
+  {
+    v16float b;
+
+    b.v = _mm512_xor_ps( m.v, a.v );
+
+    return b;
+  }
+
+  inline void increment_16x1( float * ALIGNED(64) p, const v16float &a )
+  {
+    _mm512_store_ps( p, _mm512_add_ps( _mm512_load_ps( p ), a.v ) );
+  }
+
+  inline void decrement_16x1( float * ALIGNED(64) p, const v16float &a )
+  {
+    _mm512_store_ps( p, _mm512_sub_ps( _mm512_load_ps( p ), a.v ) );
+  }
+
+  inline void scale_16x1( float * ALIGNED(64) p, const v16float &a )
+  {
+    _mm512_store_ps( p, _mm512_mul_ps( _mm512_load_ps( p ), a.v ) );
+  }
+
+} // namespace v16
+
+#endif // _v16_avx512_h_
diff --git a/src/util/v16/v16_portable.h b/src/util/v16/v16_portable.h
new file mode 100644
index 00000000..084d1bb2
--- /dev/null
+++ b/src/util/v16/v16_portable.h
@@ -0,0 +1,4253 @@
+#ifndef _v16_portable_h_
+#define _v16_portable_h_
+
+#ifndef IN_v16_h
+#error "Do not include v16_portable.h directly; use v16.h"
+#endif
+
+#define V16_ACCELERATION
+#define V16_PORTABLE_ACCELERATION
+
+#include <math.h>  // for the scalar libm calls used by the per-lane CMATH wrappers
+
+#ifndef ALIGNED
+#define ALIGNED(n)
+#endif
+
+#define ALWAYS_INLINE __attribute__((always_inline))
+
+namespace v16
+{
+  class v16;
+ class v16int; + class v16float; + + //////////////// + // v16 base class + + class v16 + { + friend class v16int; + friend class v16float; + + // v16 miscellaneous friends + + friend inline int any( const v16 &a ) ALWAYS_INLINE; + friend inline int all( const v16 &a ) ALWAYS_INLINE; + + template + friend inline v16 splat( const v16 &a ) ALWAYS_INLINE; + + template + friend inline v16 shuffle( const v16 &a ) ALWAYS_INLINE; + + friend inline void swap( v16 &a, v16 &b ) ALWAYS_INLINE; + friend inline void transpose( v16 &a00, v16 &a01, v16 &a02, v16 &a03, + v16 &a04, v16 &a05, v16 &a06, v16 &a07, + v16 &a08, v16 &a09, v16 &a10, v16 &a11, + v16 &a12, v16 &a13, v16 &a14, v16 &a15 ) ALWAYS_INLINE; + + // v16int miscellaneous friends + + friend inline v16 czero( const v16int &c, const v16 &a ) ALWAYS_INLINE; + friend inline v16 notczero( const v16int &c, const v16 &a ) ALWAYS_INLINE; + friend inline v16 merge( const v16int &c, const v16 &a, const v16 &b ) ALWAYS_INLINE; + + // v16 memory manipulation friends + + friend inline void load_16x1( const void * ALIGNED(64) p, v16 &a ) ALWAYS_INLINE; + friend inline void store_16x1( const v16 &a, void * ALIGNED(64) p ) ALWAYS_INLINE; + friend inline void stream_16x1( const v16 &a, void * ALIGNED(64) p ) ALWAYS_INLINE; + friend inline void clear_16x1( void * ALIGNED(64) dst ) ALWAYS_INLINE; + friend inline void copy_16x1( void * ALIGNED(64) dst, + const void * ALIGNED(64) src ) ALWAYS_INLINE; + friend inline void swap_16x1( void * ALIGNED(64) a, void * ALIGNED(64) b ) ALWAYS_INLINE; + + // v16 transposed memory manipulation friends + // Note: Half aligned values are permissible in the 16x2_tr variants. + + friend inline void load_16x1_tr( const void *a00, const void *a01, + const void *a02, const void *a03, + const void *a04, const void *a05, + const void *a06, const void *a07, + const void *a08, const void *a09, + const void *a10, const void *a11, + const void *a12, const void *a13, + const void *a14, const void *a15, + v16 &a ) ALWAYS_INLINE; + friend inline void load_16x2_tr( const void * ALIGNED(8) a00, + const void * ALIGNED(8) a01, + const void * ALIGNED(8) a02, + const void * ALIGNED(8) a03, + const void * ALIGNED(8) a04, + const void * ALIGNED(8) a05, + const void * ALIGNED(8) a06, + const void * ALIGNED(8) a07, + const void * ALIGNED(8) a08, + const void * ALIGNED(8) a09, + const void * ALIGNED(8) a10, + const void * ALIGNED(8) a11, + const void * ALIGNED(8) a12, + const void * ALIGNED(8) a13, + const void * ALIGNED(8) a14, + const void * ALIGNED(8) a15, + v16 &a, v16 &b ) ALWAYS_INLINE; + friend inline void load_16x3_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c ) ALWAYS_INLINE; + friend inline void load_16x4_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + 
const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c, v16 &d ) ALWAYS_INLINE; + friend inline void load_16x8_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c, v16 &d, + v16 &e, v16 &f, v16 &g, v16 &h ) ALWAYS_INLINE; + friend inline void load_16x16_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07, + v16 &b08, v16 &b09, v16 &b10, v16 &b11, + v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) ALWAYS_INLINE; + friend inline void load_16x8_tr_p( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + v16 &a, v16 &b, v16 &c, v16 &d, + v16 &e, v16 &f, v16 &g, v16 &h ) ALWAYS_INLINE; + friend inline void load_16x16_tr_p( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07, + v16 &b08, v16 &b09, v16 &b10, v16 &b11, + v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) ALWAYS_INLINE; + + friend inline void store_16x1_tr( const v16 &a, + void *a00, void *a01, void *a02, void *a03, + void *a04, void *a05, void *a06, void *a07, + void *a08, void *a09, void *a10, void *a11, + void *a12, void *a13, void *a14, void *a15 ) ALWAYS_INLINE; + friend inline void store_16x2_tr( const v16 &a, const v16 &b, + void * ALIGNED(8) a00, + void * ALIGNED(8) a01, + void * ALIGNED(8) a02, + void * ALIGNED(8) a03, + void * ALIGNED(8) a04, + void * ALIGNED(8) a05, + void * ALIGNED(8) a06, + void * ALIGNED(8) a07, + void * ALIGNED(8) a08, + void * ALIGNED(8) a09, + void * ALIGNED(8) a10, + void * ALIGNED(8) a11, + void * ALIGNED(8) a12, + void * ALIGNED(8) a13, + void * ALIGNED(8) a14, + void * ALIGNED(8) a15 ) ALWAYS_INLINE; + friend inline void store_16x3_tr( const v16 &a, const v16 &b, const v16 &c, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * 
ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07, + void * ALIGNED(64) a08, + void * ALIGNED(64) a09, + void * ALIGNED(64) a10, + void * ALIGNED(64) a11, + void * ALIGNED(64) a12, + void * ALIGNED(64) a13, + void * ALIGNED(64) a14, + void * ALIGNED(64) a15 ) ALWAYS_INLINE; + friend inline void store_16x4_tr( const v16 &a, const v16 &b, + const v16 &c, const v16 &d, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07, + void * ALIGNED(64) a08, + void * ALIGNED(64) a09, + void * ALIGNED(64) a10, + void * ALIGNED(64) a11, + void * ALIGNED(64) a12, + void * ALIGNED(64) a13, + void * ALIGNED(64) a14, + void * ALIGNED(64) a15 ) ALWAYS_INLINE; + friend inline void store_16x8_tr( const v16 &a, const v16 &b, + const v16 &c, const v16 &d, + const v16 &e, const v16 &f, + const v16 &g, const v16 &h, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07, + void * ALIGNED(64) a08, + void * ALIGNED(64) a09, + void * ALIGNED(64) a10, + void * ALIGNED(64) a11, + void * ALIGNED(64) a12, + void * ALIGNED(64) a13, + void * ALIGNED(64) a14, + void * ALIGNED(64) a15 ) ALWAYS_INLINE; + friend inline void store_16x16_tr( const v16 &b00, const v16 &b01, + const v16 &b02, const v16 &b03, + const v16 &b04, const v16 &b05, + const v16 &b06, const v16 &b07, + const v16 &b08, const v16 &b09, + const v16 &b10, const v16 &b11, + const v16 &b12, const v16 &b13, + const v16 &b14, const v16 &b15, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07, + void * ALIGNED(64) a08, + void * ALIGNED(64) a09, + void * ALIGNED(64) a10, + void * ALIGNED(64) a11, + void * ALIGNED(64) a12, + void * ALIGNED(64) a13, + void * ALIGNED(64) a14, + void * ALIGNED(64) a15 ) ALWAYS_INLINE; + friend inline void store_16x8_tr_p( const v16 &a, const v16 &b, + const v16 &c, const v16 &d, + const v16 &e, const v16 &f, + const v16 &g, const v16 &h, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07 ) ALWAYS_INLINE; + friend inline void store_16x16_tr_p( const v16 &b00, const v16 &b01, + const v16 &b02, const v16 &b03, + const v16 &b04, const v16 &b05, + const v16 &b06, const v16 &b07, + const v16 &b08, const v16 &b09, + const v16 &b10, const v16 &b11, + const v16 &b12, const v16 &b13, + const v16 &b14, const v16 &b15, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07, + void * ALIGNED(64) a08, + void * ALIGNED(64) a09, + void * ALIGNED(64) a10, + void * ALIGNED(64) a11, + void * ALIGNED(64) a12, + void * ALIGNED(64) a13, + void * ALIGNED(64) a14, + void * ALIGNED(64) a15 ) ALWAYS_INLINE; + + protected: + + union + { + int i[16]; + float f[16]; + }; + + public: + + v16() {} // Default constructor + + v16( const v16 &a ) // Copy constructor + { + i[ 0]=a.i[ 0]; i[ 1]=a.i[ 1]; i[ 2]=a.i[ 2]; i[ 
3]=a.i[ 3]; + i[ 4]=a.i[ 4]; i[ 5]=a.i[ 5]; i[ 6]=a.i[ 6]; i[ 7]=a.i[ 7]; + i[ 8]=a.i[ 8]; i[ 9]=a.i[ 9]; i[10]=a.i[10]; i[11]=a.i[11]; + i[12]=a.i[12]; i[13]=a.i[13]; i[14]=a.i[14]; i[15]=a.i[15]; + } + + ~v16() {} // Default destructor + }; + + // v16 miscellaneous functions + + inline int any( const v16 &a ) + { + return a.i[ 0] || a.i[ 1] || a.i[ 2] || a.i[ 3] || + a.i[ 4] || a.i[ 5] || a.i[ 6] || a.i[ 7] || + a.i[ 8] || a.i[ 9] || a.i[10] || a.i[11] || + a.i[12] || a.i[13] || a.i[14] || a.i[15]; + } + + inline int all( const v16 &a ) + { + return a.i[ 0] && a.i[ 1] && a.i[ 2] && a.i[ 3] && + a.i[ 4] && a.i[ 5] && a.i[ 6] && a.i[ 7] && + a.i[ 8] && a.i[ 9] && a.i[10] && a.i[11] && + a.i[12] && a.i[13] && a.i[14] && a.i[15]; + } + + template + inline v16 splat( const v16 & a ) + { + v16 b; + + b.i[ 0] = a.i[n]; + b.i[ 1] = a.i[n]; + b.i[ 2] = a.i[n]; + b.i[ 3] = a.i[n]; + b.i[ 4] = a.i[n]; + b.i[ 5] = a.i[n]; + b.i[ 6] = a.i[n]; + b.i[ 7] = a.i[n]; + b.i[ 8] = a.i[n]; + b.i[ 9] = a.i[n]; + b.i[10] = a.i[n]; + b.i[11] = a.i[n]; + b.i[12] = a.i[n]; + b.i[13] = a.i[n]; + b.i[14] = a.i[n]; + b.i[15] = a.i[n]; + + return b; + } + + template + inline v16 shuffle( const v16 & a ) + { + v16 b; + + b.i[ 0] = a.i[i00]; + b.i[ 1] = a.i[i01]; + b.i[ 2] = a.i[i02]; + b.i[ 3] = a.i[i03]; + b.i[ 4] = a.i[i04]; + b.i[ 5] = a.i[i05]; + b.i[ 6] = a.i[i06]; + b.i[ 7] = a.i[i07]; + b.i[ 8] = a.i[i08]; + b.i[ 9] = a.i[i09]; + b.i[10] = a.i[i10]; + b.i[11] = a.i[i11]; + b.i[12] = a.i[i12]; + b.i[13] = a.i[i13]; + b.i[14] = a.i[i14]; + b.i[15] = a.i[i15]; + + return b; + } + +# define sw(x,y) x^=y, y^=x, x^=y + + inline void swap( v16 &a, v16 &b ) + { + sw( a.i[ 0], b.i[ 0] ); + sw( a.i[ 1], b.i[ 1] ); + sw( a.i[ 2], b.i[ 2] ); + sw( a.i[ 3], b.i[ 3] ); + sw( a.i[ 4], b.i[ 4] ); + sw( a.i[ 5], b.i[ 5] ); + sw( a.i[ 6], b.i[ 6] ); + sw( a.i[ 7], b.i[ 7] ); + sw( a.i[ 8], b.i[ 8] ); + sw( a.i[ 9], b.i[ 9] ); + sw( a.i[10], b.i[10] ); + sw( a.i[11], b.i[11] ); + sw( a.i[12], b.i[12] ); + sw( a.i[13], b.i[13] ); + sw( a.i[14], b.i[14] ); + sw( a.i[15], b.i[15] ); + } + + inline void transpose( v16 &a00, v16 &a01, v16 &a02, v16 &a03, + v16 &a04, v16 &a05, v16 &a06, v16 &a07, + v16 &a08, v16 &a09, v16 &a10, v16 &a11, + v16 &a12, v16 &a13, v16 &a14, v16 &a15 ) + { + sw( a00.i[1],a01.i[0] ); sw( a00.i[2],a02.i[0] ); sw( a00.i[3],a03.i[0] ); sw( a00.i[4],a04.i[0] ); sw( a00.i[5],a05.i[0] ); sw( a00.i[6],a06.i[0] ); sw( a00.i[7],a07.i[0] ); sw( a00.i[8],a08.i[0] ); sw( a00.i[9],a09.i[0] ); sw( a00.i[10],a10.i[0] ); sw( a00.i[11],a11.i[ 0] ); sw( a00.i[12],a12.i[ 0] ); sw( a00.i[13],a13.i[ 0] ); sw( a00.i[14],a14.i[ 0] ); sw( a00.i[15],a15.i[ 0] ); + sw( a01.i[2],a02.i[1] ); sw( a01.i[3],a03.i[1] ); sw( a01.i[4],a04.i[1] ); sw( a01.i[5],a05.i[1] ); sw( a01.i[6],a06.i[1] ); sw( a01.i[7],a07.i[1] ); sw( a01.i[8],a08.i[1] ); sw( a01.i[9],a09.i[1] ); sw( a01.i[10],a10.i[1] ); sw( a01.i[11],a11.i[ 1] ); sw( a01.i[12],a12.i[ 1] ); sw( a01.i[13],a13.i[ 1] ); sw( a01.i[14],a14.i[ 1] ); sw( a01.i[15],a15.i[ 1] ); + sw( a02.i[3],a03.i[2] ); sw( a02.i[4],a04.i[2] ); sw( a02.i[5],a05.i[2] ); sw( a02.i[6],a06.i[2] ); sw( a02.i[7],a07.i[2] ); sw( a02.i[8],a08.i[2] ); sw( a02.i[9],a09.i[2] ); sw( a02.i[10],a10.i[2] ); sw( a02.i[11],a11.i[ 2] ); sw( a02.i[12],a12.i[ 2] ); sw( a02.i[13],a13.i[ 2] ); sw( a02.i[14],a14.i[ 2] ); sw( a02.i[15],a15.i[ 2] ); + sw( a03.i[4],a04.i[3] ); sw( a03.i[5],a05.i[3] ); sw( a03.i[6],a06.i[3] ); sw( a03.i[7],a07.i[3] ); sw( a03.i[8],a08.i[3] ); sw( a03.i[9],a09.i[3] ); sw( a03.i[10],a10.i[3] ); sw( 
a03.i[11],a11.i[ 3] ); sw( a03.i[12],a12.i[ 3] ); sw( a03.i[13],a13.i[ 3] ); sw( a03.i[14],a14.i[ 3] ); sw( a03.i[15],a15.i[ 3] ); + sw( a04.i[5],a05.i[4] ); sw( a04.i[6],a06.i[4] ); sw( a04.i[7],a07.i[4] ); sw( a04.i[8],a08.i[4] ); sw( a04.i[9],a09.i[4] ); sw( a04.i[10],a10.i[4] ); sw( a04.i[11],a11.i[ 4] ); sw( a04.i[12],a12.i[ 4] ); sw( a04.i[13],a13.i[ 4] ); sw( a04.i[14],a14.i[ 4] ); sw( a04.i[15],a15.i[ 4] ); + sw( a05.i[6],a06.i[5] ); sw( a05.i[7],a07.i[5] ); sw( a05.i[8],a08.i[5] ); sw( a05.i[9],a09.i[5] ); sw( a05.i[10],a10.i[5] ); sw( a05.i[11],a11.i[ 5] ); sw( a05.i[12],a12.i[ 5] ); sw( a05.i[13],a13.i[ 5] ); sw( a05.i[14],a14.i[ 5] ); sw( a05.i[15],a15.i[ 5] ); + sw( a06.i[7],a07.i[6] ); sw( a06.i[8],a08.i[6] ); sw( a06.i[9],a09.i[6] ); sw( a06.i[10],a10.i[6] ); sw( a06.i[11],a11.i[ 6] ); sw( a06.i[12],a12.i[ 6] ); sw( a06.i[13],a13.i[ 6] ); sw( a06.i[14],a14.i[ 6] ); sw( a06.i[15],a15.i[ 6] ); + sw( a07.i[8],a08.i[7] ); sw( a07.i[9],a09.i[7] ); sw( a07.i[10],a10.i[7] ); sw( a07.i[11],a11.i[ 7] ); sw( a07.i[12],a12.i[ 7] ); sw( a07.i[13],a13.i[ 7] ); sw( a07.i[14],a14.i[ 7] ); sw( a07.i[15],a15.i[ 7] ); + sw( a08.i[9],a09.i[8] ); sw( a08.i[10],a10.i[8] ); sw( a08.i[11],a11.i[ 8] ); sw( a08.i[12],a12.i[ 8] ); sw( a08.i[13],a13.i[ 8] ); sw( a08.i[14],a14.i[ 8] ); sw( a08.i[15],a15.i[ 8] ); + sw( a09.i[10],a10.i[9] ); sw( a09.i[11],a11.i[ 9] ); sw( a09.i[12],a12.i[ 9] ); sw( a09.i[13],a13.i[ 9] ); sw( a09.i[14],a14.i[ 9] ); sw( a09.i[15],a15.i[ 9] ); + sw( a10.i[11],a11.i[10] ); sw( a10.i[12],a12.i[10] ); sw( a10.i[13],a13.i[10] ); sw( a10.i[14],a14.i[10] ); sw( a10.i[15],a15.i[10] ); + sw( a11.i[12],a12.i[11] ); sw( a11.i[13],a13.i[11] ); sw( a11.i[14],a14.i[11] ); sw( a11.i[15],a15.i[11] ); + sw( a12.i[13],a13.i[12] ); sw( a12.i[14],a14.i[12] ); sw( a12.i[15],a15.i[12] ); + sw( a13.i[14],a14.i[13] ); sw( a13.i[15],a15.i[13] ); + sw( a14.i[15],a15.i[14] ); + } + +# undef sw + + // v16 memory manipulation functions + + inline void load_16x1( const void * ALIGNED(64) p, + v16 &a ) + { + a.i[ 0] = ((const int * ALIGNED(64))p)[ 0]; + a.i[ 1] = ((const int * ALIGNED(64))p)[ 1]; + a.i[ 2] = ((const int * ALIGNED(64))p)[ 2]; + a.i[ 3] = ((const int * ALIGNED(64))p)[ 3]; + a.i[ 4] = ((const int * ALIGNED(64))p)[ 4]; + a.i[ 5] = ((const int * ALIGNED(64))p)[ 5]; + a.i[ 6] = ((const int * ALIGNED(64))p)[ 6]; + a.i[ 7] = ((const int * ALIGNED(64))p)[ 7]; + a.i[ 8] = ((const int * ALIGNED(64))p)[ 8]; + a.i[ 9] = ((const int * ALIGNED(64))p)[ 9]; + a.i[10] = ((const int * ALIGNED(64))p)[10]; + a.i[11] = ((const int * ALIGNED(64))p)[11]; + a.i[12] = ((const int * ALIGNED(64))p)[12]; + a.i[13] = ((const int * ALIGNED(64))p)[13]; + a.i[14] = ((const int * ALIGNED(64))p)[14]; + a.i[15] = ((const int * ALIGNED(64))p)[15]; + } + + inline void store_16x1( const v16 &a, + void * ALIGNED(64) p ) + { + ((int * ALIGNED(64))p)[ 0] = a.i[ 0]; + ((int * ALIGNED(64))p)[ 1] = a.i[ 1]; + ((int * ALIGNED(64))p)[ 2] = a.i[ 2]; + ((int * ALIGNED(64))p)[ 3] = a.i[ 3]; + ((int * ALIGNED(64))p)[ 4] = a.i[ 4]; + ((int * ALIGNED(64))p)[ 5] = a.i[ 5]; + ((int * ALIGNED(64))p)[ 6] = a.i[ 6]; + ((int * ALIGNED(64))p)[ 7] = a.i[ 7]; + ((int * ALIGNED(64))p)[ 8] = a.i[ 8]; + ((int * ALIGNED(64))p)[ 9] = a.i[ 9]; + ((int * ALIGNED(64))p)[10] = a.i[10]; + ((int * ALIGNED(64))p)[11] = a.i[11]; + ((int * ALIGNED(64))p)[12] = a.i[12]; + ((int * ALIGNED(64))p)[13] = a.i[13]; + ((int * ALIGNED(64))p)[14] = a.i[14]; + ((int * ALIGNED(64))p)[15] = a.i[15]; + } + + inline void stream_16x1( const v16 &a, + void * ALIGNED(64) p ) + { 
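+    // There is no non-temporal store in the portable implementation, so
+    // stream_16x1 performs the same element-wise copy as store_16x1 above.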
+ ((int * ALIGNED(64))p)[ 0] = a.i[ 0]; + ((int * ALIGNED(64))p)[ 1] = a.i[ 1]; + ((int * ALIGNED(64))p)[ 2] = a.i[ 2]; + ((int * ALIGNED(64))p)[ 3] = a.i[ 3]; + ((int * ALIGNED(64))p)[ 4] = a.i[ 4]; + ((int * ALIGNED(64))p)[ 5] = a.i[ 5]; + ((int * ALIGNED(64))p)[ 6] = a.i[ 6]; + ((int * ALIGNED(64))p)[ 7] = a.i[ 7]; + ((int * ALIGNED(64))p)[ 8] = a.i[ 8]; + ((int * ALIGNED(64))p)[ 9] = a.i[ 9]; + ((int * ALIGNED(64))p)[10] = a.i[10]; + ((int * ALIGNED(64))p)[11] = a.i[11]; + ((int * ALIGNED(64))p)[12] = a.i[12]; + ((int * ALIGNED(64))p)[13] = a.i[13]; + ((int * ALIGNED(64))p)[14] = a.i[14]; + ((int * ALIGNED(64))p)[15] = a.i[15]; + } + + inline void clear_16x1( void * ALIGNED(64) p ) + { + ((int * ALIGNED(64))p)[ 0] = 0; + ((int * ALIGNED(64))p)[ 1] = 0; + ((int * ALIGNED(64))p)[ 2] = 0; + ((int * ALIGNED(64))p)[ 3] = 0; + ((int * ALIGNED(64))p)[ 4] = 0; + ((int * ALIGNED(64))p)[ 5] = 0; + ((int * ALIGNED(64))p)[ 6] = 0; + ((int * ALIGNED(64))p)[ 7] = 0; + ((int * ALIGNED(64))p)[ 8] = 0; + ((int * ALIGNED(64))p)[ 9] = 0; + ((int * ALIGNED(64))p)[10] = 0; + ((int * ALIGNED(64))p)[11] = 0; + ((int * ALIGNED(64))p)[12] = 0; + ((int * ALIGNED(64))p)[13] = 0; + ((int * ALIGNED(64))p)[14] = 0; + ((int * ALIGNED(64))p)[15] = 0; + } + + // FIXME: Ordering semantics + inline void copy_16x1( void * ALIGNED(64) dst, + const void * ALIGNED(64) src ) + { + ((int * ALIGNED(64))dst)[ 0] = ((const int * ALIGNED(64))src)[ 0]; + ((int * ALIGNED(64))dst)[ 1] = ((const int * ALIGNED(64))src)[ 1]; + ((int * ALIGNED(64))dst)[ 2] = ((const int * ALIGNED(64))src)[ 2]; + ((int * ALIGNED(64))dst)[ 3] = ((const int * ALIGNED(64))src)[ 3]; + ((int * ALIGNED(64))dst)[ 4] = ((const int * ALIGNED(64))src)[ 4]; + ((int * ALIGNED(64))dst)[ 5] = ((const int * ALIGNED(64))src)[ 5]; + ((int * ALIGNED(64))dst)[ 6] = ((const int * ALIGNED(64))src)[ 6]; + ((int * ALIGNED(64))dst)[ 7] = ((const int * ALIGNED(64))src)[ 7]; + ((int * ALIGNED(64))dst)[ 8] = ((const int * ALIGNED(64))src)[ 8]; + ((int * ALIGNED(64))dst)[ 9] = ((const int * ALIGNED(64))src)[ 9]; + ((int * ALIGNED(64))dst)[10] = ((const int * ALIGNED(64))src)[10]; + ((int * ALIGNED(64))dst)[11] = ((const int * ALIGNED(64))src)[11]; + ((int * ALIGNED(64))dst)[12] = ((const int * ALIGNED(64))src)[12]; + ((int * ALIGNED(64))dst)[13] = ((const int * ALIGNED(64))src)[13]; + ((int * ALIGNED(64))dst)[14] = ((const int * ALIGNED(64))src)[14]; + ((int * ALIGNED(64))dst)[15] = ((const int * ALIGNED(64))src)[15]; + } + + inline void swap_16x1( void * ALIGNED(64) a, + void * ALIGNED(64) b ) + { + int t; + + t = ((int * ALIGNED(64))a)[ 0]; + ((int * ALIGNED(64))a)[ 0] = ((int * ALIGNED(64))b)[ 0]; + ((int * ALIGNED(64))b)[ 0] = t; + + t = ((int * ALIGNED(64))a)[ 1]; + ((int * ALIGNED(64))a)[ 1] = ((int * ALIGNED(64))b)[ 1]; + ((int * ALIGNED(64))b)[ 1] = t; + + t = ((int * ALIGNED(64))a)[ 2]; + ((int * ALIGNED(64))a)[ 2] = ((int * ALIGNED(64))b)[ 2]; + ((int * ALIGNED(64))b)[ 2] = t; + + t = ((int * ALIGNED(64))a)[ 3]; + ((int * ALIGNED(64))a)[ 3] = ((int * ALIGNED(64))b)[ 3]; + ((int * ALIGNED(64))b)[ 3] = t; + + t = ((int * ALIGNED(64))a)[ 4]; + ((int * ALIGNED(64))a)[ 4] = ((int * ALIGNED(64))b)[ 4]; + ((int * ALIGNED(64))b)[ 4] = t; + + t = ((int * ALIGNED(64))a)[ 5]; + ((int * ALIGNED(64))a)[ 5] = ((int * ALIGNED(64))b)[ 5]; + ((int * ALIGNED(64))b)[ 5] = t; + + t = ((int * ALIGNED(64))a)[ 6]; + ((int * ALIGNED(64))a)[ 6] = ((int * ALIGNED(64))b)[ 6]; + ((int * ALIGNED(64))b)[ 6] = t; + + t = ((int * ALIGNED(64))a)[ 7]; + ((int * ALIGNED(64))a)[ 7] = ((int * ALIGNED(64))b)[ 
7]; + ((int * ALIGNED(64))b)[ 7] = t; + + t = ((int * ALIGNED(64))a)[ 8]; + ((int * ALIGNED(64))a)[ 8] = ((int * ALIGNED(64))b)[ 8]; + ((int * ALIGNED(64))b)[ 8] = t; + + t = ((int * ALIGNED(64))a)[ 9]; + ((int * ALIGNED(64))a)[ 9] = ((int * ALIGNED(64))b)[ 9]; + ((int * ALIGNED(64))b)[ 9] = t; + + t = ((int * ALIGNED(64))a)[10]; + ((int * ALIGNED(64))a)[10] = ((int * ALIGNED(64))b)[10]; + ((int * ALIGNED(64))b)[10] = t; + + t = ((int * ALIGNED(64))a)[11]; + ((int * ALIGNED(64))a)[11] = ((int * ALIGNED(64))b)[11]; + ((int * ALIGNED(64))b)[11] = t; + + t = ((int * ALIGNED(64))a)[12]; + ((int * ALIGNED(64))a)[12] = ((int * ALIGNED(64))b)[12]; + ((int * ALIGNED(64))b)[12] = t; + + t = ((int * ALIGNED(64))a)[13]; + ((int * ALIGNED(64))a)[13] = ((int * ALIGNED(64))b)[13]; + ((int * ALIGNED(64))b)[13] = t; + + t = ((int * ALIGNED(64))a)[14]; + ((int * ALIGNED(64))a)[14] = ((int * ALIGNED(64))b)[14]; + ((int * ALIGNED(64))b)[14] = t; + + t = ((int * ALIGNED(64))a)[15]; + ((int * ALIGNED(64))a)[15] = ((int * ALIGNED(64))b)[15]; + ((int * ALIGNED(64))b)[15] = t; + } + + // v16 transposed memory manipulation functions + + inline void load_16x1_tr( const void *a00, const void *a01, + const void *a02, const void *a03, + const void *a04, const void *a05, + const void *a06, const void *a07, + const void *a08, const void *a09, + const void *a10, const void *a11, + const void *a12, const void *a13, + const void *a14, const void *a15, + v16 &a ) + { + a.i[ 0] = ((const int *)a00)[0]; + a.i[ 1] = ((const int *)a01)[0]; + a.i[ 2] = ((const int *)a02)[0]; + a.i[ 3] = ((const int *)a03)[0]; + a.i[ 4] = ((const int *)a04)[0]; + a.i[ 5] = ((const int *)a05)[0]; + a.i[ 6] = ((const int *)a06)[0]; + a.i[ 7] = ((const int *)a07)[0]; + a.i[ 8] = ((const int *)a08)[0]; + a.i[ 9] = ((const int *)a09)[0]; + a.i[10] = ((const int *)a10)[0]; + a.i[11] = ((const int *)a11)[0]; + a.i[12] = ((const int *)a12)[0]; + a.i[13] = ((const int *)a13)[0]; + a.i[14] = ((const int *)a14)[0]; + a.i[15] = ((const int *)a15)[0]; + } + + inline void load_16x2_tr( const void * ALIGNED(8) a00, + const void * ALIGNED(8) a01, + const void * ALIGNED(8) a02, + const void * ALIGNED(8) a03, + const void * ALIGNED(8) a04, + const void * ALIGNED(8) a05, + const void * ALIGNED(8) a06, + const void * ALIGNED(8) a07, + const void * ALIGNED(8) a08, + const void * ALIGNED(8) a09, + const void * ALIGNED(8) a10, + const void * ALIGNED(8) a11, + const void * ALIGNED(8) a12, + const void * ALIGNED(8) a13, + const void * ALIGNED(8) a14, + const void * ALIGNED(8) a15, + v16 &a, v16 &b ) + { + a.i[ 0] = ((const int * ALIGNED(8))a00)[0]; + b.i[ 0] = ((const int * ALIGNED(8))a00)[1]; + + a.i[ 1] = ((const int * ALIGNED(8))a01)[0]; + b.i[ 1] = ((const int * ALIGNED(8))a01)[1]; + + a.i[ 2] = ((const int * ALIGNED(8))a02)[0]; + b.i[ 2] = ((const int * ALIGNED(8))a02)[1]; + + a.i[ 3] = ((const int * ALIGNED(8))a03)[0]; + b.i[ 3] = ((const int * ALIGNED(8))a03)[1]; + + a.i[ 4] = ((const int * ALIGNED(8))a04)[0]; + b.i[ 4] = ((const int * ALIGNED(8))a04)[1]; + + a.i[ 5] = ((const int * ALIGNED(8))a05)[0]; + b.i[ 5] = ((const int * ALIGNED(8))a05)[1]; + + a.i[ 6] = ((const int * ALIGNED(8))a06)[0]; + b.i[ 6] = ((const int * ALIGNED(8))a06)[1]; + + a.i[ 7] = ((const int * ALIGNED(8))a07)[0]; + b.i[ 7] = ((const int * ALIGNED(8))a07)[1]; + + a.i[ 8] = ((const int * ALIGNED(8))a08)[0]; + b.i[ 8] = ((const int * ALIGNED(8))a08)[1]; + + a.i[ 9] = ((const int * ALIGNED(8))a09)[0]; + b.i[ 9] = ((const int * ALIGNED(8))a09)[1]; + + a.i[10] = ((const int * ALIGNED(8))a10)[0]; 
+ b.i[10] = ((const int * ALIGNED(8))a10)[1]; + + a.i[11] = ((const int * ALIGNED(8))a11)[0]; + b.i[11] = ((const int * ALIGNED(8))a11)[1]; + + a.i[12] = ((const int * ALIGNED(8))a12)[0]; + b.i[12] = ((const int * ALIGNED(8))a12)[1]; + + a.i[13] = ((const int * ALIGNED(8))a13)[0]; + b.i[13] = ((const int * ALIGNED(8))a13)[1]; + + a.i[14] = ((const int * ALIGNED(8))a14)[0]; + b.i[14] = ((const int * ALIGNED(8))a14)[1]; + + a.i[15] = ((const int * ALIGNED(8))a15)[0]; + b.i[15] = ((const int * ALIGNED(8))a15)[1]; + } + + inline void load_16x3_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c ) + { + a.i[ 0] = ((const int * ALIGNED(64))a00)[0]; + b.i[ 0] = ((const int * ALIGNED(64))a00)[1]; + c.i[ 0] = ((const int * ALIGNED(64))a00)[2]; + + a.i[ 1] = ((const int * ALIGNED(64))a01)[0]; + b.i[ 1] = ((const int * ALIGNED(64))a01)[1]; + c.i[ 1] = ((const int * ALIGNED(64))a01)[2]; + + a.i[ 2] = ((const int * ALIGNED(64))a02)[0]; + b.i[ 2] = ((const int * ALIGNED(64))a02)[1]; + c.i[ 2] = ((const int * ALIGNED(64))a02)[2]; + + a.i[ 3] = ((const int * ALIGNED(64))a03)[0]; + b.i[ 3] = ((const int * ALIGNED(64))a03)[1]; + c.i[ 3] = ((const int * ALIGNED(64))a03)[2]; + + a.i[ 4] = ((const int * ALIGNED(64))a04)[0]; + b.i[ 4] = ((const int * ALIGNED(64))a04)[1]; + c.i[ 4] = ((const int * ALIGNED(64))a04)[2]; + + a.i[ 5] = ((const int * ALIGNED(64))a05)[0]; + b.i[ 5] = ((const int * ALIGNED(64))a05)[1]; + c.i[ 5] = ((const int * ALIGNED(64))a05)[2]; + + a.i[ 6] = ((const int * ALIGNED(64))a06)[0]; + b.i[ 6] = ((const int * ALIGNED(64))a06)[1]; + c.i[ 6] = ((const int * ALIGNED(64))a06)[2]; + + a.i[ 7] = ((const int * ALIGNED(64))a07)[0]; + b.i[ 7] = ((const int * ALIGNED(64))a07)[1]; + c.i[ 7] = ((const int * ALIGNED(64))a07)[2]; + + a.i[ 8] = ((const int * ALIGNED(64))a08)[0]; + b.i[ 8] = ((const int * ALIGNED(64))a08)[1]; + c.i[ 8] = ((const int * ALIGNED(64))a08)[2]; + + a.i[ 9] = ((const int * ALIGNED(64))a09)[0]; + b.i[ 9] = ((const int * ALIGNED(64))a09)[1]; + c.i[ 9] = ((const int * ALIGNED(64))a09)[2]; + + a.i[10] = ((const int * ALIGNED(64))a10)[0]; + b.i[10] = ((const int * ALIGNED(64))a10)[1]; + c.i[10] = ((const int * ALIGNED(64))a10)[2]; + + a.i[11] = ((const int * ALIGNED(64))a11)[0]; + b.i[11] = ((const int * ALIGNED(64))a11)[1]; + c.i[11] = ((const int * ALIGNED(64))a11)[2]; + + a.i[12] = ((const int * ALIGNED(64))a12)[0]; + b.i[12] = ((const int * ALIGNED(64))a12)[1]; + c.i[12] = ((const int * ALIGNED(64))a12)[2]; + + a.i[13] = ((const int * ALIGNED(64))a13)[0]; + b.i[13] = ((const int * ALIGNED(64))a13)[1]; + c.i[13] = ((const int * ALIGNED(64))a13)[2]; + + a.i[14] = ((const int * ALIGNED(64))a14)[0]; + b.i[14] = ((const int * ALIGNED(64))a14)[1]; + c.i[14] = ((const int * ALIGNED(64))a14)[2]; + + a.i[15] = ((const int * ALIGNED(64))a15)[0]; + b.i[15] = ((const int * ALIGNED(64))a15)[1]; + c.i[15] = ((const int * ALIGNED(64))a15)[2]; + } + + inline void load_16x4_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * 
ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c, v16 &d ) + { + a.i[ 0] = ((const int * ALIGNED(64))a00)[0]; + b.i[ 0] = ((const int * ALIGNED(64))a00)[1]; + c.i[ 0] = ((const int * ALIGNED(64))a00)[2]; + d.i[ 0] = ((const int * ALIGNED(64))a00)[3]; + + a.i[ 1] = ((const int * ALIGNED(64))a01)[0]; + b.i[ 1] = ((const int * ALIGNED(64))a01)[1]; + c.i[ 1] = ((const int * ALIGNED(64))a01)[2]; + d.i[ 1] = ((const int * ALIGNED(64))a01)[3]; + + a.i[ 2] = ((const int * ALIGNED(64))a02)[0]; + b.i[ 2] = ((const int * ALIGNED(64))a02)[1]; + c.i[ 2] = ((const int * ALIGNED(64))a02)[2]; + d.i[ 2] = ((const int * ALIGNED(64))a02)[3]; + + a.i[ 3] = ((const int * ALIGNED(64))a03)[0]; + b.i[ 3] = ((const int * ALIGNED(64))a03)[1]; + c.i[ 3] = ((const int * ALIGNED(64))a03)[2]; + d.i[ 3] = ((const int * ALIGNED(64))a03)[3]; + + a.i[ 4] = ((const int * ALIGNED(64))a04)[0]; + b.i[ 4] = ((const int * ALIGNED(64))a04)[1]; + c.i[ 4] = ((const int * ALIGNED(64))a04)[2]; + d.i[ 4] = ((const int * ALIGNED(64))a04)[3]; + + a.i[ 5] = ((const int * ALIGNED(64))a05)[0]; + b.i[ 5] = ((const int * ALIGNED(64))a05)[1]; + c.i[ 5] = ((const int * ALIGNED(64))a05)[2]; + d.i[ 5] = ((const int * ALIGNED(64))a05)[3]; + + a.i[ 6] = ((const int * ALIGNED(64))a06)[0]; + b.i[ 6] = ((const int * ALIGNED(64))a06)[1]; + c.i[ 6] = ((const int * ALIGNED(64))a06)[2]; + d.i[ 6] = ((const int * ALIGNED(64))a06)[3]; + + a.i[ 7] = ((const int * ALIGNED(64))a07)[0]; + b.i[ 7] = ((const int * ALIGNED(64))a07)[1]; + c.i[ 7] = ((const int * ALIGNED(64))a07)[2]; + d.i[ 7] = ((const int * ALIGNED(64))a07)[3]; + + a.i[ 8] = ((const int * ALIGNED(64))a08)[0]; + b.i[ 8] = ((const int * ALIGNED(64))a08)[1]; + c.i[ 8] = ((const int * ALIGNED(64))a08)[2]; + d.i[ 8] = ((const int * ALIGNED(64))a08)[3]; + + a.i[ 9] = ((const int * ALIGNED(64))a09)[0]; + b.i[ 9] = ((const int * ALIGNED(64))a09)[1]; + c.i[ 9] = ((const int * ALIGNED(64))a09)[2]; + d.i[ 9] = ((const int * ALIGNED(64))a09)[3]; + + a.i[10] = ((const int * ALIGNED(64))a10)[0]; + b.i[10] = ((const int * ALIGNED(64))a10)[1]; + c.i[10] = ((const int * ALIGNED(64))a10)[2]; + d.i[10] = ((const int * ALIGNED(64))a10)[3]; + + a.i[11] = ((const int * ALIGNED(64))a11)[0]; + b.i[11] = ((const int * ALIGNED(64))a11)[1]; + c.i[11] = ((const int * ALIGNED(64))a11)[2]; + d.i[11] = ((const int * ALIGNED(64))a11)[3]; + + a.i[12] = ((const int * ALIGNED(64))a12)[0]; + b.i[12] = ((const int * ALIGNED(64))a12)[1]; + c.i[12] = ((const int * ALIGNED(64))a12)[2]; + d.i[12] = ((const int * ALIGNED(64))a12)[3]; + + a.i[13] = ((const int * ALIGNED(64))a13)[0]; + b.i[13] = ((const int * ALIGNED(64))a13)[1]; + c.i[13] = ((const int * ALIGNED(64))a13)[2]; + d.i[13] = ((const int * ALIGNED(64))a13)[3]; + + a.i[14] = ((const int * ALIGNED(64))a14)[0]; + b.i[14] = ((const int * ALIGNED(64))a14)[1]; + c.i[14] = ((const int * ALIGNED(64))a14)[2]; + d.i[14] = ((const int * ALIGNED(64))a14)[3]; + + a.i[15] = ((const int * ALIGNED(64))a15)[0]; + b.i[15] = ((const int * ALIGNED(64))a15)[1]; + c.i[15] = ((const int * ALIGNED(64))a15)[2]; + d.i[15] = ((const int * ALIGNED(64))a15)[3]; + } + + inline void load_16x8_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void 
* ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c, v16 &d, + v16 &e, v16 &f, v16 &g, v16 &h ) + { + a.i[ 0] = ((const int * ALIGNED(64))a00)[0]; + b.i[ 0] = ((const int * ALIGNED(64))a00)[1]; + c.i[ 0] = ((const int * ALIGNED(64))a00)[2]; + d.i[ 0] = ((const int * ALIGNED(64))a00)[3]; + e.i[ 0] = ((const int * ALIGNED(64))a00)[4]; + f.i[ 0] = ((const int * ALIGNED(64))a00)[5]; + g.i[ 0] = ((const int * ALIGNED(64))a00)[6]; + h.i[ 0] = ((const int * ALIGNED(64))a00)[7]; + + a.i[ 1] = ((const int * ALIGNED(64))a01)[0]; + b.i[ 1] = ((const int * ALIGNED(64))a01)[1]; + c.i[ 1] = ((const int * ALIGNED(64))a01)[2]; + d.i[ 1] = ((const int * ALIGNED(64))a01)[3]; + e.i[ 1] = ((const int * ALIGNED(64))a01)[4]; + f.i[ 1] = ((const int * ALIGNED(64))a01)[5]; + g.i[ 1] = ((const int * ALIGNED(64))a01)[6]; + h.i[ 1] = ((const int * ALIGNED(64))a01)[7]; + + a.i[ 2] = ((const int * ALIGNED(64))a02)[0]; + b.i[ 2] = ((const int * ALIGNED(64))a02)[1]; + c.i[ 2] = ((const int * ALIGNED(64))a02)[2]; + d.i[ 2] = ((const int * ALIGNED(64))a02)[3]; + e.i[ 2] = ((const int * ALIGNED(64))a02)[4]; + f.i[ 2] = ((const int * ALIGNED(64))a02)[5]; + g.i[ 2] = ((const int * ALIGNED(64))a02)[6]; + h.i[ 2] = ((const int * ALIGNED(64))a02)[7]; + + a.i[ 3] = ((const int * ALIGNED(64))a03)[0]; + b.i[ 3] = ((const int * ALIGNED(64))a03)[1]; + c.i[ 3] = ((const int * ALIGNED(64))a03)[2]; + d.i[ 3] = ((const int * ALIGNED(64))a03)[3]; + e.i[ 3] = ((const int * ALIGNED(64))a03)[4]; + f.i[ 3] = ((const int * ALIGNED(64))a03)[5]; + g.i[ 3] = ((const int * ALIGNED(64))a03)[6]; + h.i[ 3] = ((const int * ALIGNED(64))a03)[7]; + + a.i[ 4] = ((const int * ALIGNED(64))a04)[0]; + b.i[ 4] = ((const int * ALIGNED(64))a04)[1]; + c.i[ 4] = ((const int * ALIGNED(64))a04)[2]; + d.i[ 4] = ((const int * ALIGNED(64))a04)[3]; + e.i[ 4] = ((const int * ALIGNED(64))a04)[4]; + f.i[ 4] = ((const int * ALIGNED(64))a04)[5]; + g.i[ 4] = ((const int * ALIGNED(64))a04)[6]; + h.i[ 4] = ((const int * ALIGNED(64))a04)[7]; + + a.i[ 5] = ((const int * ALIGNED(64))a05)[0]; + b.i[ 5] = ((const int * ALIGNED(64))a05)[1]; + c.i[ 5] = ((const int * ALIGNED(64))a05)[2]; + d.i[ 5] = ((const int * ALIGNED(64))a05)[3]; + e.i[ 5] = ((const int * ALIGNED(64))a05)[4]; + f.i[ 5] = ((const int * ALIGNED(64))a05)[5]; + g.i[ 5] = ((const int * ALIGNED(64))a05)[6]; + h.i[ 5] = ((const int * ALIGNED(64))a05)[7]; + + a.i[ 6] = ((const int * ALIGNED(64))a06)[0]; + b.i[ 6] = ((const int * ALIGNED(64))a06)[1]; + c.i[ 6] = ((const int * ALIGNED(64))a06)[2]; + d.i[ 6] = ((const int * ALIGNED(64))a06)[3]; + e.i[ 6] = ((const int * ALIGNED(64))a06)[4]; + f.i[ 6] = ((const int * ALIGNED(64))a06)[5]; + g.i[ 6] = ((const int * ALIGNED(64))a06)[6]; + h.i[ 6] = ((const int * ALIGNED(64))a06)[7]; + + a.i[ 7] = ((const int * ALIGNED(64))a07)[0]; + b.i[ 7] = ((const int * ALIGNED(64))a07)[1]; + c.i[ 7] = ((const int * ALIGNED(64))a07)[2]; + d.i[ 7] = ((const int * ALIGNED(64))a07)[3]; + e.i[ 7] = ((const int * ALIGNED(64))a07)[4]; + f.i[ 7] = ((const int * ALIGNED(64))a07)[5]; + g.i[ 7] = ((const int * ALIGNED(64))a07)[6]; + h.i[ 7] = ((const int * ALIGNED(64))a07)[7]; + + a.i[ 8] = ((const 
int * ALIGNED(64))a08)[0]; + b.i[ 8] = ((const int * ALIGNED(64))a08)[1]; + c.i[ 8] = ((const int * ALIGNED(64))a08)[2]; + d.i[ 8] = ((const int * ALIGNED(64))a08)[3]; + e.i[ 8] = ((const int * ALIGNED(64))a08)[4]; + f.i[ 8] = ((const int * ALIGNED(64))a08)[5]; + g.i[ 8] = ((const int * ALIGNED(64))a08)[6]; + h.i[ 8] = ((const int * ALIGNED(64))a08)[7]; + + a.i[ 9] = ((const int * ALIGNED(64))a09)[0]; + b.i[ 9] = ((const int * ALIGNED(64))a09)[1]; + c.i[ 9] = ((const int * ALIGNED(64))a09)[2]; + d.i[ 9] = ((const int * ALIGNED(64))a09)[3]; + e.i[ 9] = ((const int * ALIGNED(64))a09)[4]; + f.i[ 9] = ((const int * ALIGNED(64))a09)[5]; + g.i[ 9] = ((const int * ALIGNED(64))a09)[6]; + h.i[ 9] = ((const int * ALIGNED(64))a09)[7]; + + a.i[10] = ((const int * ALIGNED(64))a10)[0]; + b.i[10] = ((const int * ALIGNED(64))a10)[1]; + c.i[10] = ((const int * ALIGNED(64))a10)[2]; + d.i[10] = ((const int * ALIGNED(64))a10)[3]; + e.i[10] = ((const int * ALIGNED(64))a10)[4]; + f.i[10] = ((const int * ALIGNED(64))a10)[5]; + g.i[10] = ((const int * ALIGNED(64))a10)[6]; + h.i[10] = ((const int * ALIGNED(64))a10)[7]; + + a.i[11] = ((const int * ALIGNED(64))a11)[0]; + b.i[11] = ((const int * ALIGNED(64))a11)[1]; + c.i[11] = ((const int * ALIGNED(64))a11)[2]; + d.i[11] = ((const int * ALIGNED(64))a11)[3]; + e.i[11] = ((const int * ALIGNED(64))a11)[4]; + f.i[11] = ((const int * ALIGNED(64))a11)[5]; + g.i[11] = ((const int * ALIGNED(64))a11)[6]; + h.i[11] = ((const int * ALIGNED(64))a11)[7]; + + a.i[12] = ((const int * ALIGNED(64))a12)[0]; + b.i[12] = ((const int * ALIGNED(64))a12)[1]; + c.i[12] = ((const int * ALIGNED(64))a12)[2]; + d.i[12] = ((const int * ALIGNED(64))a12)[3]; + e.i[12] = ((const int * ALIGNED(64))a12)[4]; + f.i[12] = ((const int * ALIGNED(64))a12)[5]; + g.i[12] = ((const int * ALIGNED(64))a12)[6]; + h.i[12] = ((const int * ALIGNED(64))a12)[7]; + + a.i[13] = ((const int * ALIGNED(64))a13)[0]; + b.i[13] = ((const int * ALIGNED(64))a13)[1]; + c.i[13] = ((const int * ALIGNED(64))a13)[2]; + d.i[13] = ((const int * ALIGNED(64))a13)[3]; + e.i[13] = ((const int * ALIGNED(64))a13)[4]; + f.i[13] = ((const int * ALIGNED(64))a13)[5]; + g.i[13] = ((const int * ALIGNED(64))a13)[6]; + h.i[13] = ((const int * ALIGNED(64))a13)[7]; + + a.i[14] = ((const int * ALIGNED(64))a14)[0]; + b.i[14] = ((const int * ALIGNED(64))a14)[1]; + c.i[14] = ((const int * ALIGNED(64))a14)[2]; + d.i[14] = ((const int * ALIGNED(64))a14)[3]; + e.i[14] = ((const int * ALIGNED(64))a14)[4]; + f.i[14] = ((const int * ALIGNED(64))a14)[5]; + g.i[14] = ((const int * ALIGNED(64))a14)[6]; + h.i[14] = ((const int * ALIGNED(64))a14)[7]; + + a.i[15] = ((const int * ALIGNED(64))a15)[0]; + b.i[15] = ((const int * ALIGNED(64))a15)[1]; + c.i[15] = ((const int * ALIGNED(64))a15)[2]; + d.i[15] = ((const int * ALIGNED(64))a15)[3]; + e.i[15] = ((const int * ALIGNED(64))a15)[4]; + f.i[15] = ((const int * ALIGNED(64))a15)[5]; + g.i[15] = ((const int * ALIGNED(64))a15)[6]; + h.i[15] = ((const int * ALIGNED(64))a15)[7]; + } + + inline void load_16x16_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 
&b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07, + v16 &b08, v16 &b09, v16 &b10, v16 &b11, + v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) + { + b00.i[ 0] = ((const int * ALIGNED(64))a00)[ 0]; + b01.i[ 0] = ((const int * ALIGNED(64))a00)[ 1]; + b02.i[ 0] = ((const int * ALIGNED(64))a00)[ 2]; + b03.i[ 0] = ((const int * ALIGNED(64))a00)[ 3]; + b04.i[ 0] = ((const int * ALIGNED(64))a00)[ 4]; + b05.i[ 0] = ((const int * ALIGNED(64))a00)[ 5]; + b06.i[ 0] = ((const int * ALIGNED(64))a00)[ 6]; + b07.i[ 0] = ((const int * ALIGNED(64))a00)[ 7]; + b08.i[ 0] = ((const int * ALIGNED(64))a00)[ 8]; + b09.i[ 0] = ((const int * ALIGNED(64))a00)[ 9]; + b10.i[ 0] = ((const int * ALIGNED(64))a00)[10]; + b11.i[ 0] = ((const int * ALIGNED(64))a00)[11]; + b12.i[ 0] = ((const int * ALIGNED(64))a00)[12]; + b13.i[ 0] = ((const int * ALIGNED(64))a00)[13]; + b14.i[ 0] = ((const int * ALIGNED(64))a00)[14]; + b15.i[ 0] = ((const int * ALIGNED(64))a00)[15]; + + b00.i[ 1] = ((const int * ALIGNED(64))a01)[ 0]; + b01.i[ 1] = ((const int * ALIGNED(64))a01)[ 1]; + b02.i[ 1] = ((const int * ALIGNED(64))a01)[ 2]; + b03.i[ 1] = ((const int * ALIGNED(64))a01)[ 3]; + b04.i[ 1] = ((const int * ALIGNED(64))a01)[ 4]; + b05.i[ 1] = ((const int * ALIGNED(64))a01)[ 5]; + b06.i[ 1] = ((const int * ALIGNED(64))a01)[ 6]; + b07.i[ 1] = ((const int * ALIGNED(64))a01)[ 7]; + b08.i[ 1] = ((const int * ALIGNED(64))a01)[ 8]; + b09.i[ 1] = ((const int * ALIGNED(64))a01)[ 9]; + b10.i[ 1] = ((const int * ALIGNED(64))a01)[10]; + b11.i[ 1] = ((const int * ALIGNED(64))a01)[11]; + b12.i[ 1] = ((const int * ALIGNED(64))a01)[12]; + b13.i[ 1] = ((const int * ALIGNED(64))a01)[13]; + b14.i[ 1] = ((const int * ALIGNED(64))a01)[14]; + b15.i[ 1] = ((const int * ALIGNED(64))a01)[15]; + + b00.i[ 2] = ((const int * ALIGNED(64))a02)[ 0]; + b01.i[ 2] = ((const int * ALIGNED(64))a02)[ 1]; + b02.i[ 2] = ((const int * ALIGNED(64))a02)[ 2]; + b03.i[ 2] = ((const int * ALIGNED(64))a02)[ 3]; + b04.i[ 2] = ((const int * ALIGNED(64))a02)[ 4]; + b05.i[ 2] = ((const int * ALIGNED(64))a02)[ 5]; + b06.i[ 2] = ((const int * ALIGNED(64))a02)[ 6]; + b07.i[ 2] = ((const int * ALIGNED(64))a02)[ 7]; + b08.i[ 2] = ((const int * ALIGNED(64))a02)[ 8]; + b09.i[ 2] = ((const int * ALIGNED(64))a02)[ 9]; + b10.i[ 2] = ((const int * ALIGNED(64))a02)[10]; + b11.i[ 2] = ((const int * ALIGNED(64))a02)[11]; + b12.i[ 2] = ((const int * ALIGNED(64))a02)[12]; + b13.i[ 2] = ((const int * ALIGNED(64))a02)[13]; + b14.i[ 2] = ((const int * ALIGNED(64))a02)[14]; + b15.i[ 2] = ((const int * ALIGNED(64))a02)[15]; + + b00.i[ 3] = ((const int * ALIGNED(64))a03)[ 0]; + b01.i[ 3] = ((const int * ALIGNED(64))a03)[ 1]; + b02.i[ 3] = ((const int * ALIGNED(64))a03)[ 2]; + b03.i[ 3] = ((const int * ALIGNED(64))a03)[ 3]; + b04.i[ 3] = ((const int * ALIGNED(64))a03)[ 4]; + b05.i[ 3] = ((const int * ALIGNED(64))a03)[ 5]; + b06.i[ 3] = ((const int * ALIGNED(64))a03)[ 6]; + b07.i[ 3] = ((const int * ALIGNED(64))a03)[ 7]; + b08.i[ 3] = ((const int * ALIGNED(64))a03)[ 8]; + b09.i[ 3] = ((const int * ALIGNED(64))a03)[ 9]; + b10.i[ 3] = ((const int * ALIGNED(64))a03)[10]; + b11.i[ 3] = ((const int * ALIGNED(64))a03)[11]; + b12.i[ 3] = ((const int * ALIGNED(64))a03)[12]; + b13.i[ 3] = ((const int * ALIGNED(64))a03)[13]; + b14.i[ 3] = ((const int * ALIGNED(64))a03)[14]; + b15.i[ 3] = ((const int * ALIGNED(64))a03)[15]; + + b00.i[ 4] = ((const int * ALIGNED(64))a04)[ 0]; + b01.i[ 4] = ((const int * ALIGNED(64))a04)[ 1]; + b02.i[ 4] = ((const int * ALIGNED(64))a04)[ 2]; + b03.i[ 4] = ((const int * 
ALIGNED(64))a04)[ 3]; + b04.i[ 4] = ((const int * ALIGNED(64))a04)[ 4]; + b05.i[ 4] = ((const int * ALIGNED(64))a04)[ 5]; + b06.i[ 4] = ((const int * ALIGNED(64))a04)[ 6]; + b07.i[ 4] = ((const int * ALIGNED(64))a04)[ 7]; + b08.i[ 4] = ((const int * ALIGNED(64))a04)[ 8]; + b09.i[ 4] = ((const int * ALIGNED(64))a04)[ 9]; + b10.i[ 4] = ((const int * ALIGNED(64))a04)[10]; + b11.i[ 4] = ((const int * ALIGNED(64))a04)[11]; + b12.i[ 4] = ((const int * ALIGNED(64))a04)[12]; + b13.i[ 4] = ((const int * ALIGNED(64))a04)[13]; + b14.i[ 4] = ((const int * ALIGNED(64))a04)[14]; + b15.i[ 4] = ((const int * ALIGNED(64))a04)[15]; + + b00.i[ 5] = ((const int * ALIGNED(64))a05)[ 0]; + b01.i[ 5] = ((const int * ALIGNED(64))a05)[ 1]; + b02.i[ 5] = ((const int * ALIGNED(64))a05)[ 2]; + b03.i[ 5] = ((const int * ALIGNED(64))a05)[ 3]; + b04.i[ 5] = ((const int * ALIGNED(64))a05)[ 4]; + b05.i[ 5] = ((const int * ALIGNED(64))a05)[ 5]; + b06.i[ 5] = ((const int * ALIGNED(64))a05)[ 6]; + b07.i[ 5] = ((const int * ALIGNED(64))a05)[ 7]; + b08.i[ 5] = ((const int * ALIGNED(64))a05)[ 8]; + b09.i[ 5] = ((const int * ALIGNED(64))a05)[ 9]; + b10.i[ 5] = ((const int * ALIGNED(64))a05)[10]; + b11.i[ 5] = ((const int * ALIGNED(64))a05)[11]; + b12.i[ 5] = ((const int * ALIGNED(64))a05)[12]; + b13.i[ 5] = ((const int * ALIGNED(64))a05)[13]; + b14.i[ 5] = ((const int * ALIGNED(64))a05)[14]; + b15.i[ 5] = ((const int * ALIGNED(64))a05)[15]; + + b00.i[ 6] = ((const int * ALIGNED(64))a06)[ 0]; + b01.i[ 6] = ((const int * ALIGNED(64))a06)[ 1]; + b02.i[ 6] = ((const int * ALIGNED(64))a06)[ 2]; + b03.i[ 6] = ((const int * ALIGNED(64))a06)[ 3]; + b04.i[ 6] = ((const int * ALIGNED(64))a06)[ 4]; + b05.i[ 6] = ((const int * ALIGNED(64))a06)[ 5]; + b06.i[ 6] = ((const int * ALIGNED(64))a06)[ 6]; + b07.i[ 6] = ((const int * ALIGNED(64))a06)[ 7]; + b08.i[ 6] = ((const int * ALIGNED(64))a06)[ 8]; + b09.i[ 6] = ((const int * ALIGNED(64))a06)[ 9]; + b10.i[ 6] = ((const int * ALIGNED(64))a06)[10]; + b11.i[ 6] = ((const int * ALIGNED(64))a06)[11]; + b12.i[ 6] = ((const int * ALIGNED(64))a06)[12]; + b13.i[ 6] = ((const int * ALIGNED(64))a06)[13]; + b14.i[ 6] = ((const int * ALIGNED(64))a06)[14]; + b15.i[ 6] = ((const int * ALIGNED(64))a06)[15]; + + b00.i[ 7] = ((const int * ALIGNED(64))a07)[ 0]; + b01.i[ 7] = ((const int * ALIGNED(64))a07)[ 1]; + b02.i[ 7] = ((const int * ALIGNED(64))a07)[ 2]; + b03.i[ 7] = ((const int * ALIGNED(64))a07)[ 3]; + b04.i[ 7] = ((const int * ALIGNED(64))a07)[ 4]; + b05.i[ 7] = ((const int * ALIGNED(64))a07)[ 5]; + b06.i[ 7] = ((const int * ALIGNED(64))a07)[ 6]; + b07.i[ 7] = ((const int * ALIGNED(64))a07)[ 7]; + b08.i[ 7] = ((const int * ALIGNED(64))a07)[ 8]; + b09.i[ 7] = ((const int * ALIGNED(64))a07)[ 9]; + b10.i[ 7] = ((const int * ALIGNED(64))a07)[10]; + b11.i[ 7] = ((const int * ALIGNED(64))a07)[11]; + b12.i[ 7] = ((const int * ALIGNED(64))a07)[12]; + b13.i[ 7] = ((const int * ALIGNED(64))a07)[13]; + b14.i[ 7] = ((const int * ALIGNED(64))a07)[14]; + b15.i[ 7] = ((const int * ALIGNED(64))a07)[15]; + + b00.i[ 8] = ((const int * ALIGNED(64))a08)[ 0]; + b01.i[ 8] = ((const int * ALIGNED(64))a08)[ 1]; + b02.i[ 8] = ((const int * ALIGNED(64))a08)[ 2]; + b03.i[ 8] = ((const int * ALIGNED(64))a08)[ 3]; + b04.i[ 8] = ((const int * ALIGNED(64))a08)[ 4]; + b05.i[ 8] = ((const int * ALIGNED(64))a08)[ 5]; + b06.i[ 8] = ((const int * ALIGNED(64))a08)[ 6]; + b07.i[ 8] = ((const int * ALIGNED(64))a08)[ 7]; + b08.i[ 8] = ((const int * ALIGNED(64))a08)[ 8]; + b09.i[ 8] = ((const int * ALIGNED(64))a08)[ 9]; + b10.i[ 8] = ((const 
int * ALIGNED(64))a08)[10]; + b11.i[ 8] = ((const int * ALIGNED(64))a08)[11]; + b12.i[ 8] = ((const int * ALIGNED(64))a08)[12]; + b13.i[ 8] = ((const int * ALIGNED(64))a08)[13]; + b14.i[ 8] = ((const int * ALIGNED(64))a08)[14]; + b15.i[ 8] = ((const int * ALIGNED(64))a08)[15]; + + b00.i[ 9] = ((const int * ALIGNED(64))a09)[ 0]; + b01.i[ 9] = ((const int * ALIGNED(64))a09)[ 1]; + b02.i[ 9] = ((const int * ALIGNED(64))a09)[ 2]; + b03.i[ 9] = ((const int * ALIGNED(64))a09)[ 3]; + b04.i[ 9] = ((const int * ALIGNED(64))a09)[ 4]; + b05.i[ 9] = ((const int * ALIGNED(64))a09)[ 5]; + b06.i[ 9] = ((const int * ALIGNED(64))a09)[ 6]; + b07.i[ 9] = ((const int * ALIGNED(64))a09)[ 7]; + b08.i[ 9] = ((const int * ALIGNED(64))a09)[ 8]; + b09.i[ 9] = ((const int * ALIGNED(64))a09)[ 9]; + b10.i[ 9] = ((const int * ALIGNED(64))a09)[10]; + b11.i[ 9] = ((const int * ALIGNED(64))a09)[11]; + b12.i[ 9] = ((const int * ALIGNED(64))a09)[12]; + b13.i[ 9] = ((const int * ALIGNED(64))a09)[13]; + b14.i[ 9] = ((const int * ALIGNED(64))a09)[14]; + b15.i[ 9] = ((const int * ALIGNED(64))a09)[15]; + + b00.i[10] = ((const int * ALIGNED(64))a10)[ 0]; + b01.i[10] = ((const int * ALIGNED(64))a10)[ 1]; + b02.i[10] = ((const int * ALIGNED(64))a10)[ 2]; + b03.i[10] = ((const int * ALIGNED(64))a10)[ 3]; + b04.i[10] = ((const int * ALIGNED(64))a10)[ 4]; + b05.i[10] = ((const int * ALIGNED(64))a10)[ 5]; + b06.i[10] = ((const int * ALIGNED(64))a10)[ 6]; + b07.i[10] = ((const int * ALIGNED(64))a10)[ 7]; + b08.i[10] = ((const int * ALIGNED(64))a10)[ 8]; + b09.i[10] = ((const int * ALIGNED(64))a10)[ 9]; + b10.i[10] = ((const int * ALIGNED(64))a10)[10]; + b11.i[10] = ((const int * ALIGNED(64))a10)[11]; + b12.i[10] = ((const int * ALIGNED(64))a10)[12]; + b13.i[10] = ((const int * ALIGNED(64))a10)[13]; + b14.i[10] = ((const int * ALIGNED(64))a10)[14]; + b15.i[10] = ((const int * ALIGNED(64))a10)[15]; + + b00.i[11] = ((const int * ALIGNED(64))a11)[ 0]; + b01.i[11] = ((const int * ALIGNED(64))a11)[ 1]; + b02.i[11] = ((const int * ALIGNED(64))a11)[ 2]; + b03.i[11] = ((const int * ALIGNED(64))a11)[ 3]; + b04.i[11] = ((const int * ALIGNED(64))a11)[ 4]; + b05.i[11] = ((const int * ALIGNED(64))a11)[ 5]; + b06.i[11] = ((const int * ALIGNED(64))a11)[ 6]; + b07.i[11] = ((const int * ALIGNED(64))a11)[ 7]; + b08.i[11] = ((const int * ALIGNED(64))a11)[ 8]; + b09.i[11] = ((const int * ALIGNED(64))a11)[ 9]; + b10.i[11] = ((const int * ALIGNED(64))a11)[10]; + b11.i[11] = ((const int * ALIGNED(64))a11)[11]; + b12.i[11] = ((const int * ALIGNED(64))a11)[12]; + b13.i[11] = ((const int * ALIGNED(64))a11)[13]; + b14.i[11] = ((const int * ALIGNED(64))a11)[14]; + b15.i[11] = ((const int * ALIGNED(64))a11)[15]; + + b00.i[12] = ((const int * ALIGNED(64))a12)[ 0]; + b01.i[12] = ((const int * ALIGNED(64))a12)[ 1]; + b02.i[12] = ((const int * ALIGNED(64))a12)[ 2]; + b03.i[12] = ((const int * ALIGNED(64))a12)[ 3]; + b04.i[12] = ((const int * ALIGNED(64))a12)[ 4]; + b05.i[12] = ((const int * ALIGNED(64))a12)[ 5]; + b06.i[12] = ((const int * ALIGNED(64))a12)[ 6]; + b07.i[12] = ((const int * ALIGNED(64))a12)[ 7]; + b08.i[12] = ((const int * ALIGNED(64))a12)[ 8]; + b09.i[12] = ((const int * ALIGNED(64))a12)[ 9]; + b10.i[12] = ((const int * ALIGNED(64))a12)[10]; + b11.i[12] = ((const int * ALIGNED(64))a12)[11]; + b12.i[12] = ((const int * ALIGNED(64))a12)[12]; + b13.i[12] = ((const int * ALIGNED(64))a12)[13]; + b14.i[12] = ((const int * ALIGNED(64))a12)[14]; + b15.i[12] = ((const int * ALIGNED(64))a12)[15]; + + b00.i[13] = ((const int * ALIGNED(64))a13)[ 0]; + b01.i[13] = 
((const int * ALIGNED(64))a13)[ 1]; + b02.i[13] = ((const int * ALIGNED(64))a13)[ 2]; + b03.i[13] = ((const int * ALIGNED(64))a13)[ 3]; + b04.i[13] = ((const int * ALIGNED(64))a13)[ 4]; + b05.i[13] = ((const int * ALIGNED(64))a13)[ 5]; + b06.i[13] = ((const int * ALIGNED(64))a13)[ 6]; + b07.i[13] = ((const int * ALIGNED(64))a13)[ 7]; + b08.i[13] = ((const int * ALIGNED(64))a13)[ 8]; + b09.i[13] = ((const int * ALIGNED(64))a13)[ 9]; + b10.i[13] = ((const int * ALIGNED(64))a13)[10]; + b11.i[13] = ((const int * ALIGNED(64))a13)[11]; + b12.i[13] = ((const int * ALIGNED(64))a13)[12]; + b13.i[13] = ((const int * ALIGNED(64))a13)[13]; + b14.i[13] = ((const int * ALIGNED(64))a13)[14]; + b15.i[13] = ((const int * ALIGNED(64))a13)[15]; + + b00.i[14] = ((const int * ALIGNED(64))a14)[ 0]; + b01.i[14] = ((const int * ALIGNED(64))a14)[ 1]; + b02.i[14] = ((const int * ALIGNED(64))a14)[ 2]; + b03.i[14] = ((const int * ALIGNED(64))a14)[ 3]; + b04.i[14] = ((const int * ALIGNED(64))a14)[ 4]; + b05.i[14] = ((const int * ALIGNED(64))a14)[ 5]; + b06.i[14] = ((const int * ALIGNED(64))a14)[ 6]; + b07.i[14] = ((const int * ALIGNED(64))a14)[ 7]; + b08.i[14] = ((const int * ALIGNED(64))a14)[ 8]; + b09.i[14] = ((const int * ALIGNED(64))a14)[ 9]; + b10.i[14] = ((const int * ALIGNED(64))a14)[10]; + b11.i[14] = ((const int * ALIGNED(64))a14)[11]; + b12.i[14] = ((const int * ALIGNED(64))a14)[12]; + b13.i[14] = ((const int * ALIGNED(64))a14)[13]; + b14.i[14] = ((const int * ALIGNED(64))a14)[14]; + b15.i[14] = ((const int * ALIGNED(64))a14)[15]; + + b00.i[15] = ((const int * ALIGNED(64))a15)[ 0]; + b01.i[15] = ((const int * ALIGNED(64))a15)[ 1]; + b02.i[15] = ((const int * ALIGNED(64))a15)[ 2]; + b03.i[15] = ((const int * ALIGNED(64))a15)[ 3]; + b04.i[15] = ((const int * ALIGNED(64))a15)[ 4]; + b05.i[15] = ((const int * ALIGNED(64))a15)[ 5]; + b06.i[15] = ((const int * ALIGNED(64))a15)[ 6]; + b07.i[15] = ((const int * ALIGNED(64))a15)[ 7]; + b08.i[15] = ((const int * ALIGNED(64))a15)[ 8]; + b09.i[15] = ((const int * ALIGNED(64))a15)[ 9]; + b10.i[15] = ((const int * ALIGNED(64))a15)[10]; + b11.i[15] = ((const int * ALIGNED(64))a15)[11]; + b12.i[15] = ((const int * ALIGNED(64))a15)[12]; + b13.i[15] = ((const int * ALIGNED(64))a15)[13]; + b14.i[15] = ((const int * ALIGNED(64))a15)[14]; + b15.i[15] = ((const int * ALIGNED(64))a15)[15]; + } + + inline void load_16x8_tr_p( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07 ) + { + b00.i[ 0] = ((const int * ALIGNED(64))a00)[ 0]; + b01.i[ 0] = ((const int * ALIGNED(64))a00)[ 1]; + b02.i[ 0] = ((const int * ALIGNED(64))a00)[ 2]; + b03.i[ 0] = ((const int * ALIGNED(64))a00)[ 3]; + b04.i[ 0] = ((const int * ALIGNED(64))a00)[ 4]; + b05.i[ 0] = ((const int * ALIGNED(64))a00)[ 5]; + b06.i[ 0] = ((const int * ALIGNED(64))a00)[ 6]; + b07.i[ 0] = ((const int * ALIGNED(64))a00)[ 7]; + b00.i[ 1] = ((const int * ALIGNED(64))a00)[ 8]; + b01.i[ 1] = ((const int * ALIGNED(64))a00)[ 9]; + b02.i[ 1] = ((const int * ALIGNED(64))a00)[10]; + b03.i[ 1] = ((const int * ALIGNED(64))a00)[11]; + b04.i[ 1] = ((const int * ALIGNED(64))a00)[12]; + b05.i[ 1] = ((const int * ALIGNED(64))a00)[13]; + b06.i[ 1] = ((const int * ALIGNED(64))a00)[14]; + b07.i[ 1] = ((const int * ALIGNED(64))a00)[15]; + + b00.i[ 2] = ((const int * 
ALIGNED(64))a01)[ 0]; + b01.i[ 2] = ((const int * ALIGNED(64))a01)[ 1]; + b02.i[ 2] = ((const int * ALIGNED(64))a01)[ 2]; + b03.i[ 2] = ((const int * ALIGNED(64))a01)[ 3]; + b04.i[ 2] = ((const int * ALIGNED(64))a01)[ 4]; + b05.i[ 2] = ((const int * ALIGNED(64))a01)[ 5]; + b06.i[ 2] = ((const int * ALIGNED(64))a01)[ 6]; + b07.i[ 2] = ((const int * ALIGNED(64))a01)[ 7]; + b00.i[ 3] = ((const int * ALIGNED(64))a01)[ 8]; + b01.i[ 3] = ((const int * ALIGNED(64))a01)[ 9]; + b02.i[ 3] = ((const int * ALIGNED(64))a01)[10]; + b03.i[ 3] = ((const int * ALIGNED(64))a01)[11]; + b04.i[ 3] = ((const int * ALIGNED(64))a01)[12]; + b05.i[ 3] = ((const int * ALIGNED(64))a01)[13]; + b06.i[ 3] = ((const int * ALIGNED(64))a01)[14]; + b07.i[ 3] = ((const int * ALIGNED(64))a01)[15]; + + b00.i[ 4] = ((const int * ALIGNED(64))a02)[ 0]; + b01.i[ 4] = ((const int * ALIGNED(64))a02)[ 1]; + b02.i[ 4] = ((const int * ALIGNED(64))a02)[ 2]; + b03.i[ 4] = ((const int * ALIGNED(64))a02)[ 3]; + b04.i[ 4] = ((const int * ALIGNED(64))a02)[ 4]; + b05.i[ 4] = ((const int * ALIGNED(64))a02)[ 5]; + b06.i[ 4] = ((const int * ALIGNED(64))a02)[ 6]; + b07.i[ 4] = ((const int * ALIGNED(64))a02)[ 7]; + b00.i[ 5] = ((const int * ALIGNED(64))a02)[ 8]; + b01.i[ 5] = ((const int * ALIGNED(64))a02)[ 9]; + b02.i[ 5] = ((const int * ALIGNED(64))a02)[10]; + b03.i[ 5] = ((const int * ALIGNED(64))a02)[11]; + b04.i[ 5] = ((const int * ALIGNED(64))a02)[12]; + b05.i[ 5] = ((const int * ALIGNED(64))a02)[13]; + b06.i[ 5] = ((const int * ALIGNED(64))a02)[14]; + b07.i[ 5] = ((const int * ALIGNED(64))a02)[15]; + + b00.i[ 6] = ((const int * ALIGNED(64))a03)[ 0]; + b01.i[ 6] = ((const int * ALIGNED(64))a03)[ 1]; + b02.i[ 6] = ((const int * ALIGNED(64))a03)[ 2]; + b03.i[ 6] = ((const int * ALIGNED(64))a03)[ 3]; + b04.i[ 6] = ((const int * ALIGNED(64))a03)[ 4]; + b05.i[ 6] = ((const int * ALIGNED(64))a03)[ 5]; + b06.i[ 6] = ((const int * ALIGNED(64))a03)[ 6]; + b07.i[ 6] = ((const int * ALIGNED(64))a03)[ 7]; + b00.i[ 7] = ((const int * ALIGNED(64))a03)[ 8]; + b01.i[ 7] = ((const int * ALIGNED(64))a03)[ 9]; + b02.i[ 7] = ((const int * ALIGNED(64))a03)[10]; + b03.i[ 7] = ((const int * ALIGNED(64))a03)[11]; + b04.i[ 7] = ((const int * ALIGNED(64))a03)[12]; + b05.i[ 7] = ((const int * ALIGNED(64))a03)[13]; + b06.i[ 7] = ((const int * ALIGNED(64))a03)[14]; + b07.i[ 7] = ((const int * ALIGNED(64))a03)[15]; + + b00.i[ 8] = ((const int * ALIGNED(64))a04)[ 0]; + b01.i[ 8] = ((const int * ALIGNED(64))a04)[ 1]; + b02.i[ 8] = ((const int * ALIGNED(64))a04)[ 2]; + b03.i[ 8] = ((const int * ALIGNED(64))a04)[ 3]; + b04.i[ 8] = ((const int * ALIGNED(64))a04)[ 4]; + b05.i[ 8] = ((const int * ALIGNED(64))a04)[ 5]; + b06.i[ 8] = ((const int * ALIGNED(64))a04)[ 6]; + b07.i[ 8] = ((const int * ALIGNED(64))a04)[ 7]; + b00.i[ 9] = ((const int * ALIGNED(64))a04)[ 8]; + b01.i[ 9] = ((const int * ALIGNED(64))a04)[ 9]; + b02.i[ 9] = ((const int * ALIGNED(64))a04)[10]; + b03.i[ 9] = ((const int * ALIGNED(64))a04)[11]; + b04.i[ 9] = ((const int * ALIGNED(64))a04)[12]; + b05.i[ 9] = ((const int * ALIGNED(64))a04)[13]; + b06.i[ 9] = ((const int * ALIGNED(64))a04)[14]; + b07.i[ 9] = ((const int * ALIGNED(64))a04)[15]; + + b00.i[10] = ((const int * ALIGNED(64))a05)[ 0]; + b01.i[10] = ((const int * ALIGNED(64))a05)[ 1]; + b02.i[10] = ((const int * ALIGNED(64))a05)[ 2]; + b03.i[10] = ((const int * ALIGNED(64))a05)[ 3]; + b04.i[10] = ((const int * ALIGNED(64))a05)[ 4]; + b05.i[10] = ((const int * ALIGNED(64))a05)[ 5]; + b06.i[10] = ((const int * ALIGNED(64))a05)[ 6]; + b07.i[10] = ((const 
int * ALIGNED(64))a05)[ 7]; + b00.i[11] = ((const int * ALIGNED(64))a05)[ 8]; + b01.i[11] = ((const int * ALIGNED(64))a05)[ 9]; + b02.i[11] = ((const int * ALIGNED(64))a05)[10]; + b03.i[11] = ((const int * ALIGNED(64))a05)[11]; + b04.i[11] = ((const int * ALIGNED(64))a05)[12]; + b05.i[11] = ((const int * ALIGNED(64))a05)[13]; + b06.i[11] = ((const int * ALIGNED(64))a05)[14]; + b07.i[11] = ((const int * ALIGNED(64))a05)[15]; + + b00.i[12] = ((const int * ALIGNED(64))a06)[ 0]; + b01.i[12] = ((const int * ALIGNED(64))a06)[ 1]; + b02.i[12] = ((const int * ALIGNED(64))a06)[ 2]; + b03.i[12] = ((const int * ALIGNED(64))a06)[ 3]; + b04.i[12] = ((const int * ALIGNED(64))a06)[ 4]; + b05.i[12] = ((const int * ALIGNED(64))a06)[ 5]; + b06.i[12] = ((const int * ALIGNED(64))a06)[ 6]; + b07.i[12] = ((const int * ALIGNED(64))a06)[ 7]; + b00.i[13] = ((const int * ALIGNED(64))a06)[ 8]; + b01.i[13] = ((const int * ALIGNED(64))a06)[ 9]; + b02.i[13] = ((const int * ALIGNED(64))a06)[10]; + b03.i[13] = ((const int * ALIGNED(64))a06)[11]; + b04.i[13] = ((const int * ALIGNED(64))a06)[12]; + b05.i[13] = ((const int * ALIGNED(64))a06)[13]; + b06.i[13] = ((const int * ALIGNED(64))a06)[14]; + b07.i[13] = ((const int * ALIGNED(64))a06)[15]; + + b00.i[14] = ((const int * ALIGNED(64))a07)[ 0]; + b01.i[14] = ((const int * ALIGNED(64))a07)[ 1]; + b02.i[14] = ((const int * ALIGNED(64))a07)[ 2]; + b03.i[14] = ((const int * ALIGNED(64))a07)[ 3]; + b04.i[14] = ((const int * ALIGNED(64))a07)[ 4]; + b05.i[14] = ((const int * ALIGNED(64))a07)[ 5]; + b06.i[14] = ((const int * ALIGNED(64))a07)[ 6]; + b07.i[14] = ((const int * ALIGNED(64))a07)[ 7]; + b00.i[15] = ((const int * ALIGNED(64))a07)[ 8]; + b01.i[15] = ((const int * ALIGNED(64))a07)[ 9]; + b02.i[15] = ((const int * ALIGNED(64))a07)[10]; + b03.i[15] = ((const int * ALIGNED(64))a07)[11]; + b04.i[15] = ((const int * ALIGNED(64))a07)[12]; + b05.i[15] = ((const int * ALIGNED(64))a07)[13]; + b06.i[15] = ((const int * ALIGNED(64))a07)[14]; + b07.i[15] = ((const int * ALIGNED(64))a07)[15]; + } + + inline void load_16x16_tr_p( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07, + v16 &b08, v16 &b09, v16 &b10, v16 &b11, + v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) + { + b00.i[ 0] = ((const int * ALIGNED(64))a00)[ 0]; + b01.i[ 0] = ((const int * ALIGNED(64))a00)[ 1]; + b02.i[ 0] = ((const int * ALIGNED(64))a00)[ 2]; + b03.i[ 0] = ((const int * ALIGNED(64))a00)[ 3]; + b04.i[ 0] = ((const int * ALIGNED(64))a00)[ 4]; + b05.i[ 0] = ((const int * ALIGNED(64))a00)[ 5]; + b06.i[ 0] = ((const int * ALIGNED(64))a00)[ 6]; + b07.i[ 0] = ((const int * ALIGNED(64))a00)[ 7]; + b00.i[ 1] = ((const int * ALIGNED(64))a00)[ 8]; + b01.i[ 1] = ((const int * ALIGNED(64))a00)[ 9]; + b02.i[ 1] = ((const int * ALIGNED(64))a00)[10]; + b03.i[ 1] = ((const int * ALIGNED(64))a00)[11]; + b04.i[ 1] = ((const int * ALIGNED(64))a00)[12]; + b05.i[ 1] = ((const int * ALIGNED(64))a00)[13]; + b06.i[ 1] = ((const int * ALIGNED(64))a00)[14]; + b07.i[ 1] = ((const int * ALIGNED(64))a00)[15]; + 
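+ // Annotation (added comment, not in the original patch): in load_16x16_tr_p each source row feeds two lanes of the first eight outputs — a00 filled lanes 0 and 1 of b00..b07 above, a01 fills lanes 2 and 3 below (first eight ints into the even lane, next eight into the odd lane), and the same pattern repeats for a02..a07 and again for a08..a15 into b08..b15.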
+ b00.i[ 2] = ((const int * ALIGNED(64))a01)[ 0]; + b01.i[ 2] = ((const int * ALIGNED(64))a01)[ 1]; + b02.i[ 2] = ((const int * ALIGNED(64))a01)[ 2]; + b03.i[ 2] = ((const int * ALIGNED(64))a01)[ 3]; + b04.i[ 2] = ((const int * ALIGNED(64))a01)[ 4]; + b05.i[ 2] = ((const int * ALIGNED(64))a01)[ 5]; + b06.i[ 2] = ((const int * ALIGNED(64))a01)[ 6]; + b07.i[ 2] = ((const int * ALIGNED(64))a01)[ 7]; + b00.i[ 3] = ((const int * ALIGNED(64))a01)[ 8]; + b01.i[ 3] = ((const int * ALIGNED(64))a01)[ 9]; + b02.i[ 3] = ((const int * ALIGNED(64))a01)[10]; + b03.i[ 3] = ((const int * ALIGNED(64))a01)[11]; + b04.i[ 3] = ((const int * ALIGNED(64))a01)[12]; + b05.i[ 3] = ((const int * ALIGNED(64))a01)[13]; + b06.i[ 3] = ((const int * ALIGNED(64))a01)[14]; + b07.i[ 3] = ((const int * ALIGNED(64))a01)[15]; + + b00.i[ 4] = ((const int * ALIGNED(64))a02)[ 0]; + b01.i[ 4] = ((const int * ALIGNED(64))a02)[ 1]; + b02.i[ 4] = ((const int * ALIGNED(64))a02)[ 2]; + b03.i[ 4] = ((const int * ALIGNED(64))a02)[ 3]; + b04.i[ 4] = ((const int * ALIGNED(64))a02)[ 4]; + b05.i[ 4] = ((const int * ALIGNED(64))a02)[ 5]; + b06.i[ 4] = ((const int * ALIGNED(64))a02)[ 6]; + b07.i[ 4] = ((const int * ALIGNED(64))a02)[ 7]; + b00.i[ 5] = ((const int * ALIGNED(64))a02)[ 8]; + b01.i[ 5] = ((const int * ALIGNED(64))a02)[ 9]; + b02.i[ 5] = ((const int * ALIGNED(64))a02)[10]; + b03.i[ 5] = ((const int * ALIGNED(64))a02)[11]; + b04.i[ 5] = ((const int * ALIGNED(64))a02)[12]; + b05.i[ 5] = ((const int * ALIGNED(64))a02)[13]; + b06.i[ 5] = ((const int * ALIGNED(64))a02)[14]; + b07.i[ 5] = ((const int * ALIGNED(64))a02)[15]; + + b00.i[ 6] = ((const int * ALIGNED(64))a03)[ 0]; + b01.i[ 6] = ((const int * ALIGNED(64))a03)[ 1]; + b02.i[ 6] = ((const int * ALIGNED(64))a03)[ 2]; + b03.i[ 6] = ((const int * ALIGNED(64))a03)[ 3]; + b04.i[ 6] = ((const int * ALIGNED(64))a03)[ 4]; + b05.i[ 6] = ((const int * ALIGNED(64))a03)[ 5]; + b06.i[ 6] = ((const int * ALIGNED(64))a03)[ 6]; + b07.i[ 6] = ((const int * ALIGNED(64))a03)[ 7]; + b00.i[ 7] = ((const int * ALIGNED(64))a03)[ 8]; + b01.i[ 7] = ((const int * ALIGNED(64))a03)[ 9]; + b02.i[ 7] = ((const int * ALIGNED(64))a03)[10]; + b03.i[ 7] = ((const int * ALIGNED(64))a03)[11]; + b04.i[ 7] = ((const int * ALIGNED(64))a03)[12]; + b05.i[ 7] = ((const int * ALIGNED(64))a03)[13]; + b06.i[ 7] = ((const int * ALIGNED(64))a03)[14]; + b07.i[ 7] = ((const int * ALIGNED(64))a03)[15]; + + b00.i[ 8] = ((const int * ALIGNED(64))a04)[ 0]; + b01.i[ 8] = ((const int * ALIGNED(64))a04)[ 1]; + b02.i[ 8] = ((const int * ALIGNED(64))a04)[ 2]; + b03.i[ 8] = ((const int * ALIGNED(64))a04)[ 3]; + b04.i[ 8] = ((const int * ALIGNED(64))a04)[ 4]; + b05.i[ 8] = ((const int * ALIGNED(64))a04)[ 5]; + b06.i[ 8] = ((const int * ALIGNED(64))a04)[ 6]; + b07.i[ 8] = ((const int * ALIGNED(64))a04)[ 7]; + b00.i[ 9] = ((const int * ALIGNED(64))a04)[ 8]; + b01.i[ 9] = ((const int * ALIGNED(64))a04)[ 9]; + b02.i[ 9] = ((const int * ALIGNED(64))a04)[10]; + b03.i[ 9] = ((const int * ALIGNED(64))a04)[11]; + b04.i[ 9] = ((const int * ALIGNED(64))a04)[12]; + b05.i[ 9] = ((const int * ALIGNED(64))a04)[13]; + b06.i[ 9] = ((const int * ALIGNED(64))a04)[14]; + b07.i[ 9] = ((const int * ALIGNED(64))a04)[15]; + + b00.i[10] = ((const int * ALIGNED(64))a05)[ 0]; + b01.i[10] = ((const int * ALIGNED(64))a05)[ 1]; + b02.i[10] = ((const int * ALIGNED(64))a05)[ 2]; + b03.i[10] = ((const int * ALIGNED(64))a05)[ 3]; + b04.i[10] = ((const int * ALIGNED(64))a05)[ 4]; + b05.i[10] = ((const int * ALIGNED(64))a05)[ 5]; + b06.i[10] = ((const int * ALIGNED(64))a05)[ 
6]; + b07.i[10] = ((const int * ALIGNED(64))a05)[ 7]; + b00.i[11] = ((const int * ALIGNED(64))a05)[ 8]; + b01.i[11] = ((const int * ALIGNED(64))a05)[ 9]; + b02.i[11] = ((const int * ALIGNED(64))a05)[10]; + b03.i[11] = ((const int * ALIGNED(64))a05)[11]; + b04.i[11] = ((const int * ALIGNED(64))a05)[12]; + b05.i[11] = ((const int * ALIGNED(64))a05)[13]; + b06.i[11] = ((const int * ALIGNED(64))a05)[14]; + b07.i[11] = ((const int * ALIGNED(64))a05)[15]; + + b00.i[12] = ((const int * ALIGNED(64))a06)[ 0]; + b01.i[12] = ((const int * ALIGNED(64))a06)[ 1]; + b02.i[12] = ((const int * ALIGNED(64))a06)[ 2]; + b03.i[12] = ((const int * ALIGNED(64))a06)[ 3]; + b04.i[12] = ((const int * ALIGNED(64))a06)[ 4]; + b05.i[12] = ((const int * ALIGNED(64))a06)[ 5]; + b06.i[12] = ((const int * ALIGNED(64))a06)[ 6]; + b07.i[12] = ((const int * ALIGNED(64))a06)[ 7]; + b00.i[13] = ((const int * ALIGNED(64))a06)[ 8]; + b01.i[13] = ((const int * ALIGNED(64))a06)[ 9]; + b02.i[13] = ((const int * ALIGNED(64))a06)[10]; + b03.i[13] = ((const int * ALIGNED(64))a06)[11]; + b04.i[13] = ((const int * ALIGNED(64))a06)[12]; + b05.i[13] = ((const int * ALIGNED(64))a06)[13]; + b06.i[13] = ((const int * ALIGNED(64))a06)[14]; + b07.i[13] = ((const int * ALIGNED(64))a06)[15]; + + b00.i[14] = ((const int * ALIGNED(64))a07)[ 0]; + b01.i[14] = ((const int * ALIGNED(64))a07)[ 1]; + b02.i[14] = ((const int * ALIGNED(64))a07)[ 2]; + b03.i[14] = ((const int * ALIGNED(64))a07)[ 3]; + b04.i[14] = ((const int * ALIGNED(64))a07)[ 4]; + b05.i[14] = ((const int * ALIGNED(64))a07)[ 5]; + b06.i[14] = ((const int * ALIGNED(64))a07)[ 6]; + b07.i[14] = ((const int * ALIGNED(64))a07)[ 7]; + b00.i[15] = ((const int * ALIGNED(64))a07)[ 8]; + b01.i[15] = ((const int * ALIGNED(64))a07)[ 9]; + b02.i[15] = ((const int * ALIGNED(64))a07)[10]; + b03.i[15] = ((const int * ALIGNED(64))a07)[11]; + b04.i[15] = ((const int * ALIGNED(64))a07)[12]; + b05.i[15] = ((const int * ALIGNED(64))a07)[13]; + b06.i[15] = ((const int * ALIGNED(64))a07)[14]; + b07.i[15] = ((const int * ALIGNED(64))a07)[15]; + + b08.i[ 0] = ((const int * ALIGNED(64))a08)[ 0]; + b09.i[ 0] = ((const int * ALIGNED(64))a08)[ 1]; + b10.i[ 0] = ((const int * ALIGNED(64))a08)[ 2]; + b11.i[ 0] = ((const int * ALIGNED(64))a08)[ 3]; + b12.i[ 0] = ((const int * ALIGNED(64))a08)[ 4]; + b13.i[ 0] = ((const int * ALIGNED(64))a08)[ 5]; + b14.i[ 0] = ((const int * ALIGNED(64))a08)[ 6]; + b15.i[ 0] = ((const int * ALIGNED(64))a08)[ 7]; + b08.i[ 1] = ((const int * ALIGNED(64))a08)[ 8]; + b09.i[ 1] = ((const int * ALIGNED(64))a08)[ 9]; + b10.i[ 1] = ((const int * ALIGNED(64))a08)[10]; + b11.i[ 1] = ((const int * ALIGNED(64))a08)[11]; + b12.i[ 1] = ((const int * ALIGNED(64))a08)[12]; + b13.i[ 1] = ((const int * ALIGNED(64))a08)[13]; + b14.i[ 1] = ((const int * ALIGNED(64))a08)[14]; + b15.i[ 1] = ((const int * ALIGNED(64))a08)[15]; + + b08.i[ 2] = ((const int * ALIGNED(64))a09)[ 0]; + b09.i[ 2] = ((const int * ALIGNED(64))a09)[ 1]; + b10.i[ 2] = ((const int * ALIGNED(64))a09)[ 2]; + b11.i[ 2] = ((const int * ALIGNED(64))a09)[ 3]; + b12.i[ 2] = ((const int * ALIGNED(64))a09)[ 4]; + b13.i[ 2] = ((const int * ALIGNED(64))a09)[ 5]; + b14.i[ 2] = ((const int * ALIGNED(64))a09)[ 6]; + b15.i[ 2] = ((const int * ALIGNED(64))a09)[ 7]; + b08.i[ 3] = ((const int * ALIGNED(64))a09)[ 8]; + b09.i[ 3] = ((const int * ALIGNED(64))a09)[ 9]; + b10.i[ 3] = ((const int * ALIGNED(64))a09)[10]; + b11.i[ 3] = ((const int * ALIGNED(64))a09)[11]; + b12.i[ 3] = ((const int * ALIGNED(64))a09)[12]; + b13.i[ 3] = ((const int * 
ALIGNED(64))a09)[13]; + b14.i[ 3] = ((const int * ALIGNED(64))a09)[14]; + b15.i[ 3] = ((const int * ALIGNED(64))a09)[15]; + + b08.i[ 4] = ((const int * ALIGNED(64))a10)[ 0]; + b09.i[ 4] = ((const int * ALIGNED(64))a10)[ 1]; + b10.i[ 4] = ((const int * ALIGNED(64))a10)[ 2]; + b11.i[ 4] = ((const int * ALIGNED(64))a10)[ 3]; + b12.i[ 4] = ((const int * ALIGNED(64))a10)[ 4]; + b13.i[ 4] = ((const int * ALIGNED(64))a10)[ 5]; + b14.i[ 4] = ((const int * ALIGNED(64))a10)[ 6]; + b15.i[ 4] = ((const int * ALIGNED(64))a10)[ 7]; + b08.i[ 5] = ((const int * ALIGNED(64))a10)[ 8]; + b09.i[ 5] = ((const int * ALIGNED(64))a10)[ 9]; + b10.i[ 5] = ((const int * ALIGNED(64))a10)[10]; + b11.i[ 5] = ((const int * ALIGNED(64))a10)[11]; + b12.i[ 5] = ((const int * ALIGNED(64))a10)[12]; + b13.i[ 5] = ((const int * ALIGNED(64))a10)[13]; + b14.i[ 5] = ((const int * ALIGNED(64))a10)[14]; + b15.i[ 5] = ((const int * ALIGNED(64))a10)[15]; + + b08.i[ 6] = ((const int * ALIGNED(64))a11)[ 0]; + b09.i[ 6] = ((const int * ALIGNED(64))a11)[ 1]; + b10.i[ 6] = ((const int * ALIGNED(64))a11)[ 2]; + b11.i[ 6] = ((const int * ALIGNED(64))a11)[ 3]; + b12.i[ 6] = ((const int * ALIGNED(64))a11)[ 4]; + b13.i[ 6] = ((const int * ALIGNED(64))a11)[ 5]; + b14.i[ 6] = ((const int * ALIGNED(64))a11)[ 6]; + b15.i[ 6] = ((const int * ALIGNED(64))a11)[ 7]; + b08.i[ 7] = ((const int * ALIGNED(64))a11)[ 8]; + b09.i[ 7] = ((const int * ALIGNED(64))a11)[ 9]; + b10.i[ 7] = ((const int * ALIGNED(64))a11)[10]; + b11.i[ 7] = ((const int * ALIGNED(64))a11)[11]; + b12.i[ 7] = ((const int * ALIGNED(64))a11)[12]; + b13.i[ 7] = ((const int * ALIGNED(64))a11)[13]; + b14.i[ 7] = ((const int * ALIGNED(64))a11)[14]; + b15.i[ 7] = ((const int * ALIGNED(64))a11)[15]; + + b08.i[ 8] = ((const int * ALIGNED(64))a12)[ 0]; + b09.i[ 8] = ((const int * ALIGNED(64))a12)[ 1]; + b10.i[ 8] = ((const int * ALIGNED(64))a12)[ 2]; + b11.i[ 8] = ((const int * ALIGNED(64))a12)[ 3]; + b12.i[ 8] = ((const int * ALIGNED(64))a12)[ 4]; + b13.i[ 8] = ((const int * ALIGNED(64))a12)[ 5]; + b14.i[ 8] = ((const int * ALIGNED(64))a12)[ 6]; + b15.i[ 8] = ((const int * ALIGNED(64))a12)[ 7]; + b08.i[ 9] = ((const int * ALIGNED(64))a12)[ 8]; + b09.i[ 9] = ((const int * ALIGNED(64))a12)[ 9]; + b10.i[ 9] = ((const int * ALIGNED(64))a12)[10]; + b11.i[ 9] = ((const int * ALIGNED(64))a12)[11]; + b12.i[ 9] = ((const int * ALIGNED(64))a12)[12]; + b13.i[ 9] = ((const int * ALIGNED(64))a12)[13]; + b14.i[ 9] = ((const int * ALIGNED(64))a12)[14]; + b15.i[ 9] = ((const int * ALIGNED(64))a12)[15]; + + b08.i[10] = ((const int * ALIGNED(64))a13)[ 0]; + b09.i[10] = ((const int * ALIGNED(64))a13)[ 1]; + b10.i[10] = ((const int * ALIGNED(64))a13)[ 2]; + b11.i[10] = ((const int * ALIGNED(64))a13)[ 3]; + b12.i[10] = ((const int * ALIGNED(64))a13)[ 4]; + b13.i[10] = ((const int * ALIGNED(64))a13)[ 5]; + b14.i[10] = ((const int * ALIGNED(64))a13)[ 6]; + b15.i[10] = ((const int * ALIGNED(64))a13)[ 7]; + b08.i[11] = ((const int * ALIGNED(64))a13)[ 8]; + b09.i[11] = ((const int * ALIGNED(64))a13)[ 9]; + b10.i[11] = ((const int * ALIGNED(64))a13)[10]; + b11.i[11] = ((const int * ALIGNED(64))a13)[11]; + b12.i[11] = ((const int * ALIGNED(64))a13)[12]; + b13.i[11] = ((const int * ALIGNED(64))a13)[13]; + b14.i[11] = ((const int * ALIGNED(64))a13)[14]; + b15.i[11] = ((const int * ALIGNED(64))a13)[15]; + + b08.i[12] = ((const int * ALIGNED(64))a14)[ 0]; + b09.i[12] = ((const int * ALIGNED(64))a14)[ 1]; + b10.i[12] = ((const int * ALIGNED(64))a14)[ 2]; + b11.i[12] = ((const int * ALIGNED(64))a14)[ 3]; + b12.i[12] = ((const 
int * ALIGNED(64))a14)[ 4]; + b13.i[12] = ((const int * ALIGNED(64))a14)[ 5]; + b14.i[12] = ((const int * ALIGNED(64))a14)[ 6]; + b15.i[12] = ((const int * ALIGNED(64))a14)[ 7]; + b08.i[13] = ((const int * ALIGNED(64))a14)[ 8]; + b09.i[13] = ((const int * ALIGNED(64))a14)[ 9]; + b10.i[13] = ((const int * ALIGNED(64))a14)[10]; + b11.i[13] = ((const int * ALIGNED(64))a14)[11]; + b12.i[13] = ((const int * ALIGNED(64))a14)[12]; + b13.i[13] = ((const int * ALIGNED(64))a14)[13]; + b14.i[13] = ((const int * ALIGNED(64))a14)[14]; + b15.i[13] = ((const int * ALIGNED(64))a14)[15]; + + b08.i[14] = ((const int * ALIGNED(64))a15)[ 0]; + b09.i[14] = ((const int * ALIGNED(64))a15)[ 1]; + b10.i[14] = ((const int * ALIGNED(64))a15)[ 2]; + b11.i[14] = ((const int * ALIGNED(64))a15)[ 3]; + b12.i[14] = ((const int * ALIGNED(64))a15)[ 4]; + b13.i[14] = ((const int * ALIGNED(64))a15)[ 5]; + b14.i[14] = ((const int * ALIGNED(64))a15)[ 6]; + b15.i[14] = ((const int * ALIGNED(64))a15)[ 7]; + b08.i[15] = ((const int * ALIGNED(64))a15)[ 8]; + b09.i[15] = ((const int * ALIGNED(64))a15)[ 9]; + b10.i[15] = ((const int * ALIGNED(64))a15)[10]; + b11.i[15] = ((const int * ALIGNED(64))a15)[11]; + b12.i[15] = ((const int * ALIGNED(64))a15)[12]; + b13.i[15] = ((const int * ALIGNED(64))a15)[13]; + b14.i[15] = ((const int * ALIGNED(64))a15)[14]; + b15.i[15] = ((const int * ALIGNED(64))a15)[15]; + } + + inline void store_16x1_tr( const v16 &a, + void *a00, void *a01, void *a02, void *a03, + void *a04, void *a05, void *a06, void *a07, + void *a08, void *a09, void *a10, void *a11, + void *a12, void *a13, void *a14, void *a15 ) + { + ((int *)a00)[0] = a.i[ 0]; + ((int *)a01)[0] = a.i[ 1]; + ((int *)a02)[0] = a.i[ 2]; + ((int *)a03)[0] = a.i[ 3]; + ((int *)a04)[0] = a.i[ 4]; + ((int *)a05)[0] = a.i[ 5]; + ((int *)a06)[0] = a.i[ 6]; + ((int *)a07)[0] = a.i[ 7]; + ((int *)a08)[0] = a.i[ 8]; + ((int *)a09)[0] = a.i[ 9]; + ((int *)a10)[0] = a.i[10]; + ((int *)a11)[0] = a.i[11]; + ((int *)a12)[0] = a.i[12]; + ((int *)a13)[0] = a.i[13]; + ((int *)a14)[0] = a.i[14]; + ((int *)a15)[0] = a.i[15]; + } + + inline void store_16x2_tr( const v16 &a, const v16 &b, + void * ALIGNED(8) a00, void * ALIGNED(8) a01, + void * ALIGNED(8) a02, void * ALIGNED(8) a03, + void * ALIGNED(8) a04, void * ALIGNED(8) a05, + void * ALIGNED(8) a06, void * ALIGNED(8) a07, + void * ALIGNED(8) a08, void * ALIGNED(8) a09, + void * ALIGNED(8) a10, void * ALIGNED(8) a11, + void * ALIGNED(8) a12, void * ALIGNED(8) a13, + void * ALIGNED(8) a14, void * ALIGNED(8) a15 ) + { + ((int * ALIGNED(8))a00)[0] = a.i[ 0]; + ((int * ALIGNED(8))a00)[1] = b.i[ 0]; + + ((int * ALIGNED(8))a01)[0] = a.i[ 1]; + ((int * ALIGNED(8))a01)[1] = b.i[ 1]; + + ((int * ALIGNED(8))a02)[0] = a.i[ 2]; + ((int * ALIGNED(8))a02)[1] = b.i[ 2]; + + ((int * ALIGNED(8))a03)[0] = a.i[ 3]; + ((int * ALIGNED(8))a03)[1] = b.i[ 3]; + + ((int * ALIGNED(8))a04)[0] = a.i[ 4]; + ((int * ALIGNED(8))a04)[1] = b.i[ 4]; + + ((int * ALIGNED(8))a05)[0] = a.i[ 5]; + ((int * ALIGNED(8))a05)[1] = b.i[ 5]; + + ((int * ALIGNED(8))a06)[0] = a.i[ 6]; + ((int * ALIGNED(8))a06)[1] = b.i[ 6]; + + ((int * ALIGNED(8))a07)[0] = a.i[ 7]; + ((int * ALIGNED(8))a07)[1] = b.i[ 7]; + + ((int * ALIGNED(8))a08)[0] = a.i[ 8]; + ((int * ALIGNED(8))a08)[1] = b.i[ 8]; + + ((int * ALIGNED(8))a09)[0] = a.i[ 9]; + ((int * ALIGNED(8))a09)[1] = b.i[ 9]; + + ((int * ALIGNED(8))a10)[0] = a.i[10]; + ((int * ALIGNED(8))a10)[1] = b.i[10]; + + ((int * ALIGNED(8))a11)[0] = a.i[11]; + ((int * ALIGNED(8))a11)[1] = b.i[11]; + + ((int * ALIGNED(8))a12)[0] = 
a.i[12]; + ((int * ALIGNED(8))a12)[1] = b.i[12]; + + ((int * ALIGNED(8))a13)[0] = a.i[13]; + ((int * ALIGNED(8))a13)[1] = b.i[13]; + + ((int * ALIGNED(8))a14)[0] = a.i[14]; + ((int * ALIGNED(8))a14)[1] = b.i[14]; + + ((int * ALIGNED(8))a15)[0] = a.i[15]; + ((int * ALIGNED(8))a15)[1] = b.i[15]; + } + + inline void store_16x3_tr( const v16 &a, const v16 &b, const v16 &c, + void * ALIGNED(64) a00, void * ALIGNED(64) a01, + void * ALIGNED(64) a02, void * ALIGNED(64) a03, + void * ALIGNED(64) a04, void * ALIGNED(64) a05, + void * ALIGNED(64) a06, void * ALIGNED(64) a07, + void * ALIGNED(64) a08, void * ALIGNED(64) a09, + void * ALIGNED(64) a10, void * ALIGNED(64) a11, + void * ALIGNED(64) a12, void * ALIGNED(64) a13, + void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) + { + ((int * ALIGNED(64))a00)[0] = a.i[ 0]; + ((int * ALIGNED(64))a00)[1] = b.i[ 0]; + ((int * ALIGNED(64))a00)[2] = c.i[ 0]; + + ((int * ALIGNED(64))a01)[0] = a.i[ 1]; + ((int * ALIGNED(64))a01)[1] = b.i[ 1]; + ((int * ALIGNED(64))a01)[2] = c.i[ 1]; + + ((int * ALIGNED(64))a02)[0] = a.i[ 2]; + ((int * ALIGNED(64))a02)[1] = b.i[ 2]; + ((int * ALIGNED(64))a02)[2] = c.i[ 2]; + + ((int * ALIGNED(64))a03)[0] = a.i[ 3]; + ((int * ALIGNED(64))a03)[1] = b.i[ 3]; + ((int * ALIGNED(64))a03)[2] = c.i[ 3]; + + ((int * ALIGNED(64))a04)[0] = a.i[ 4]; + ((int * ALIGNED(64))a04)[1] = b.i[ 4]; + ((int * ALIGNED(64))a04)[2] = c.i[ 4]; + + ((int * ALIGNED(64))a05)[0] = a.i[ 5]; + ((int * ALIGNED(64))a05)[1] = b.i[ 5]; + ((int * ALIGNED(64))a05)[2] = c.i[ 5]; + + ((int * ALIGNED(64))a06)[0] = a.i[ 6]; + ((int * ALIGNED(64))a06)[1] = b.i[ 6]; + ((int * ALIGNED(64))a06)[2] = c.i[ 6]; + + ((int * ALIGNED(64))a07)[0] = a.i[ 7]; + ((int * ALIGNED(64))a07)[1] = b.i[ 7]; + ((int * ALIGNED(64))a07)[2] = c.i[ 7]; + + ((int * ALIGNED(64))a08)[0] = a.i[ 8]; + ((int * ALIGNED(64))a08)[1] = b.i[ 8]; + ((int * ALIGNED(64))a08)[2] = c.i[ 8]; + + ((int * ALIGNED(64))a09)[0] = a.i[ 9]; + ((int * ALIGNED(64))a09)[1] = b.i[ 9]; + ((int * ALIGNED(64))a09)[2] = c.i[ 9]; + + ((int * ALIGNED(64))a10)[0] = a.i[10]; + ((int * ALIGNED(64))a10)[1] = b.i[10]; + ((int * ALIGNED(64))a10)[2] = c.i[10]; + + ((int * ALIGNED(64))a11)[0] = a.i[11]; + ((int * ALIGNED(64))a11)[1] = b.i[11]; + ((int * ALIGNED(64))a11)[2] = c.i[11]; + + ((int * ALIGNED(64))a12)[0] = a.i[12]; + ((int * ALIGNED(64))a12)[1] = b.i[12]; + ((int * ALIGNED(64))a12)[2] = c.i[12]; + + ((int * ALIGNED(64))a13)[0] = a.i[13]; + ((int * ALIGNED(64))a13)[1] = b.i[13]; + ((int * ALIGNED(64))a13)[2] = c.i[13]; + + ((int * ALIGNED(64))a14)[0] = a.i[14]; + ((int * ALIGNED(64))a14)[1] = b.i[14]; + ((int * ALIGNED(64))a14)[2] = c.i[14]; + + ((int * ALIGNED(64))a15)[0] = a.i[15]; + ((int * ALIGNED(64))a15)[1] = b.i[15]; + ((int * ALIGNED(64))a15)[2] = c.i[15]; + } + + inline void store_16x4_tr( const v16 &a, const v16 &b, const v16 &c, const v16 &d, + void * ALIGNED(64) a00, void * ALIGNED(64) a01, + void * ALIGNED(64) a02, void * ALIGNED(64) a03, + void * ALIGNED(64) a04, void * ALIGNED(64) a05, + void * ALIGNED(64) a06, void * ALIGNED(64) a07, + void * ALIGNED(64) a08, void * ALIGNED(64) a09, + void * ALIGNED(64) a10, void * ALIGNED(64) a11, + void * ALIGNED(64) a12, void * ALIGNED(64) a13, + void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) + { + ((int * ALIGNED(64))a00)[0] = a.i[ 0]; + ((int * ALIGNED(64))a00)[1] = b.i[ 0]; + ((int * ALIGNED(64))a00)[2] = c.i[ 0]; + ((int * ALIGNED(64))a00)[3] = d.i[ 0]; + + ((int * ALIGNED(64))a01)[0] = a.i[ 1]; + ((int * ALIGNED(64))a01)[1] = b.i[ 1]; + ((int * ALIGNED(64))a01)[2] = 
c.i[ 1]; + ((int * ALIGNED(64))a01)[3] = d.i[ 1]; + + ((int * ALIGNED(64))a02)[0] = a.i[ 2]; + ((int * ALIGNED(64))a02)[1] = b.i[ 2]; + ((int * ALIGNED(64))a02)[2] = c.i[ 2]; + ((int * ALIGNED(64))a02)[3] = d.i[ 2]; + + ((int * ALIGNED(64))a03)[0] = a.i[ 3]; + ((int * ALIGNED(64))a03)[1] = b.i[ 3]; + ((int * ALIGNED(64))a03)[2] = c.i[ 3]; + ((int * ALIGNED(64))a03)[3] = d.i[ 3]; + + ((int * ALIGNED(64))a04)[0] = a.i[ 4]; + ((int * ALIGNED(64))a04)[1] = b.i[ 4]; + ((int * ALIGNED(64))a04)[2] = c.i[ 4]; + ((int * ALIGNED(64))a04)[3] = d.i[ 4]; + + ((int * ALIGNED(64))a05)[0] = a.i[ 5]; + ((int * ALIGNED(64))a05)[1] = b.i[ 5]; + ((int * ALIGNED(64))a05)[2] = c.i[ 5]; + ((int * ALIGNED(64))a05)[3] = d.i[ 5]; + + ((int * ALIGNED(64))a06)[0] = a.i[ 6]; + ((int * ALIGNED(64))a06)[1] = b.i[ 6]; + ((int * ALIGNED(64))a06)[2] = c.i[ 6]; + ((int * ALIGNED(64))a06)[3] = d.i[ 6]; + + ((int * ALIGNED(64))a07)[0] = a.i[ 7]; + ((int * ALIGNED(64))a07)[1] = b.i[ 7]; + ((int * ALIGNED(64))a07)[2] = c.i[ 7]; + ((int * ALIGNED(64))a07)[3] = d.i[ 7]; + + ((int * ALIGNED(64))a08)[0] = a.i[ 8]; + ((int * ALIGNED(64))a08)[1] = b.i[ 8]; + ((int * ALIGNED(64))a08)[2] = c.i[ 8]; + ((int * ALIGNED(64))a08)[3] = d.i[ 8]; + + ((int * ALIGNED(64))a09)[0] = a.i[ 9]; + ((int * ALIGNED(64))a09)[1] = b.i[ 9]; + ((int * ALIGNED(64))a09)[2] = c.i[ 9]; + ((int * ALIGNED(64))a09)[3] = d.i[ 9]; + + ((int * ALIGNED(64))a10)[0] = a.i[10]; + ((int * ALIGNED(64))a10)[1] = b.i[10]; + ((int * ALIGNED(64))a10)[2] = c.i[10]; + ((int * ALIGNED(64))a10)[3] = d.i[10]; + + ((int * ALIGNED(64))a11)[0] = a.i[11]; + ((int * ALIGNED(64))a11)[1] = b.i[11]; + ((int * ALIGNED(64))a11)[2] = c.i[11]; + ((int * ALIGNED(64))a11)[3] = d.i[11]; + + ((int * ALIGNED(64))a12)[0] = a.i[12]; + ((int * ALIGNED(64))a12)[1] = b.i[12]; + ((int * ALIGNED(64))a12)[2] = c.i[12]; + ((int * ALIGNED(64))a12)[3] = d.i[12]; + + ((int * ALIGNED(64))a13)[0] = a.i[13]; + ((int * ALIGNED(64))a13)[1] = b.i[13]; + ((int * ALIGNED(64))a13)[2] = c.i[13]; + ((int * ALIGNED(64))a13)[3] = d.i[13]; + + ((int * ALIGNED(64))a14)[0] = a.i[14]; + ((int * ALIGNED(64))a14)[1] = b.i[14]; + ((int * ALIGNED(64))a14)[2] = c.i[14]; + ((int * ALIGNED(64))a14)[3] = d.i[14]; + + ((int * ALIGNED(64))a15)[0] = a.i[15]; + ((int * ALIGNED(64))a15)[1] = b.i[15]; + ((int * ALIGNED(64))a15)[2] = c.i[15]; + ((int * ALIGNED(64))a15)[3] = d.i[15]; + } + + inline void store_16x8_tr( const v16 &a, const v16 &b, const v16 &c, const v16 &d, + const v16 &e, const v16 &f, const v16 &g, const v16 &h, + void * ALIGNED(64) a00, void * ALIGNED(64) a01, + void * ALIGNED(64) a02, void * ALIGNED(64) a03, + void * ALIGNED(64) a04, void * ALIGNED(64) a05, + void * ALIGNED(64) a06, void * ALIGNED(64) a07, + void * ALIGNED(64) a08, void * ALIGNED(64) a09, + void * ALIGNED(64) a10, void * ALIGNED(64) a11, + void * ALIGNED(64) a12, void * ALIGNED(64) a13, + void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) + { + ((int * ALIGNED(64))a00)[0] = a.i[ 0]; + ((int * ALIGNED(64))a00)[1] = b.i[ 0]; + ((int * ALIGNED(64))a00)[2] = c.i[ 0]; + ((int * ALIGNED(64))a00)[3] = d.i[ 0]; + ((int * ALIGNED(64))a00)[4] = e.i[ 0]; + ((int * ALIGNED(64))a00)[5] = f.i[ 0]; + ((int * ALIGNED(64))a00)[6] = g.i[ 0]; + ((int * ALIGNED(64))a00)[7] = h.i[ 0]; + + ((int * ALIGNED(64))a01)[0] = a.i[ 1]; + ((int * ALIGNED(64))a01)[1] = b.i[ 1]; + ((int * ALIGNED(64))a01)[2] = c.i[ 1]; + ((int * ALIGNED(64))a01)[3] = d.i[ 1]; + ((int * ALIGNED(64))a01)[4] = e.i[ 1]; + ((int * ALIGNED(64))a01)[5] = f.i[ 1]; + ((int * ALIGNED(64))a01)[6] = g.i[ 1]; + ((int * 
ALIGNED(64))a01)[7] = h.i[ 1]; + + ((int * ALIGNED(64))a02)[0] = a.i[ 2]; + ((int * ALIGNED(64))a02)[1] = b.i[ 2]; + ((int * ALIGNED(64))a02)[2] = c.i[ 2]; + ((int * ALIGNED(64))a02)[3] = d.i[ 2]; + ((int * ALIGNED(64))a02)[4] = e.i[ 2]; + ((int * ALIGNED(64))a02)[5] = f.i[ 2]; + ((int * ALIGNED(64))a02)[6] = g.i[ 2]; + ((int * ALIGNED(64))a02)[7] = h.i[ 2]; + + ((int * ALIGNED(64))a03)[0] = a.i[ 3]; + ((int * ALIGNED(64))a03)[1] = b.i[ 3]; + ((int * ALIGNED(64))a03)[2] = c.i[ 3]; + ((int * ALIGNED(64))a03)[3] = d.i[ 3]; + ((int * ALIGNED(64))a03)[4] = e.i[ 3]; + ((int * ALIGNED(64))a03)[5] = f.i[ 3]; + ((int * ALIGNED(64))a03)[6] = g.i[ 3]; + ((int * ALIGNED(64))a03)[7] = h.i[ 3]; + + ((int * ALIGNED(64))a04)[0] = a.i[ 4]; + ((int * ALIGNED(64))a04)[1] = b.i[ 4]; + ((int * ALIGNED(64))a04)[2] = c.i[ 4]; + ((int * ALIGNED(64))a04)[3] = d.i[ 4]; + ((int * ALIGNED(64))a04)[4] = e.i[ 4]; + ((int * ALIGNED(64))a04)[5] = f.i[ 4]; + ((int * ALIGNED(64))a04)[6] = g.i[ 4]; + ((int * ALIGNED(64))a04)[7] = h.i[ 4]; + + ((int * ALIGNED(64))a05)[0] = a.i[ 5]; + ((int * ALIGNED(64))a05)[1] = b.i[ 5]; + ((int * ALIGNED(64))a05)[2] = c.i[ 5]; + ((int * ALIGNED(64))a05)[3] = d.i[ 5]; + ((int * ALIGNED(64))a05)[4] = e.i[ 5]; + ((int * ALIGNED(64))a05)[5] = f.i[ 5]; + ((int * ALIGNED(64))a05)[6] = g.i[ 5]; + ((int * ALIGNED(64))a05)[7] = h.i[ 5]; + + ((int * ALIGNED(64))a06)[0] = a.i[ 6]; + ((int * ALIGNED(64))a06)[1] = b.i[ 6]; + ((int * ALIGNED(64))a06)[2] = c.i[ 6]; + ((int * ALIGNED(64))a06)[3] = d.i[ 6]; + ((int * ALIGNED(64))a06)[4] = e.i[ 6]; + ((int * ALIGNED(64))a06)[5] = f.i[ 6]; + ((int * ALIGNED(64))a06)[6] = g.i[ 6]; + ((int * ALIGNED(64))a06)[7] = h.i[ 6]; + + ((int * ALIGNED(64))a07)[0] = a.i[ 7]; + ((int * ALIGNED(64))a07)[1] = b.i[ 7]; + ((int * ALIGNED(64))a07)[2] = c.i[ 7]; + ((int * ALIGNED(64))a07)[3] = d.i[ 7]; + ((int * ALIGNED(64))a07)[4] = e.i[ 7]; + ((int * ALIGNED(64))a07)[5] = f.i[ 7]; + ((int * ALIGNED(64))a07)[6] = g.i[ 7]; + ((int * ALIGNED(64))a07)[7] = h.i[ 7]; + + ((int * ALIGNED(64))a08)[0] = a.i[ 8]; + ((int * ALIGNED(64))a08)[1] = b.i[ 8]; + ((int * ALIGNED(64))a08)[2] = c.i[ 8]; + ((int * ALIGNED(64))a08)[3] = d.i[ 8]; + ((int * ALIGNED(64))a08)[4] = e.i[ 8]; + ((int * ALIGNED(64))a08)[5] = f.i[ 8]; + ((int * ALIGNED(64))a08)[6] = g.i[ 8]; + ((int * ALIGNED(64))a08)[7] = h.i[ 8]; + + ((int * ALIGNED(64))a09)[0] = a.i[ 9]; + ((int * ALIGNED(64))a09)[1] = b.i[ 9]; + ((int * ALIGNED(64))a09)[2] = c.i[ 9]; + ((int * ALIGNED(64))a09)[3] = d.i[ 9]; + ((int * ALIGNED(64))a09)[4] = e.i[ 9]; + ((int * ALIGNED(64))a09)[5] = f.i[ 9]; + ((int * ALIGNED(64))a09)[6] = g.i[ 9]; + ((int * ALIGNED(64))a09)[7] = h.i[ 9]; + + ((int * ALIGNED(64))a10)[0] = a.i[10]; + ((int * ALIGNED(64))a10)[1] = b.i[10]; + ((int * ALIGNED(64))a10)[2] = c.i[10]; + ((int * ALIGNED(64))a10)[3] = d.i[10]; + ((int * ALIGNED(64))a10)[4] = e.i[10]; + ((int * ALIGNED(64))a10)[5] = f.i[10]; + ((int * ALIGNED(64))a10)[6] = g.i[10]; + ((int * ALIGNED(64))a10)[7] = h.i[10]; + + ((int * ALIGNED(64))a11)[0] = a.i[11]; + ((int * ALIGNED(64))a11)[1] = b.i[11]; + ((int * ALIGNED(64))a11)[2] = c.i[11]; + ((int * ALIGNED(64))a11)[3] = d.i[11]; + ((int * ALIGNED(64))a11)[4] = e.i[11]; + ((int * ALIGNED(64))a11)[5] = f.i[11]; + ((int * ALIGNED(64))a11)[6] = g.i[11]; + ((int * ALIGNED(64))a11)[7] = h.i[11]; + + ((int * ALIGNED(64))a12)[0] = a.i[12]; + ((int * ALIGNED(64))a12)[1] = b.i[12]; + ((int * ALIGNED(64))a12)[2] = c.i[12]; + ((int * ALIGNED(64))a12)[3] = d.i[12]; + ((int * ALIGNED(64))a12)[4] = e.i[12]; + ((int * 
ALIGNED(64))a12)[5] = f.i[12]; + ((int * ALIGNED(64))a12)[6] = g.i[12]; + ((int * ALIGNED(64))a12)[7] = h.i[12]; + + ((int * ALIGNED(64))a13)[0] = a.i[13]; + ((int * ALIGNED(64))a13)[1] = b.i[13]; + ((int * ALIGNED(64))a13)[2] = c.i[13]; + ((int * ALIGNED(64))a13)[3] = d.i[13]; + ((int * ALIGNED(64))a13)[4] = e.i[13]; + ((int * ALIGNED(64))a13)[5] = f.i[13]; + ((int * ALIGNED(64))a13)[6] = g.i[13]; + ((int * ALIGNED(64))a13)[7] = h.i[13]; + + ((int * ALIGNED(64))a14)[0] = a.i[14]; + ((int * ALIGNED(64))a14)[1] = b.i[14]; + ((int * ALIGNED(64))a14)[2] = c.i[14]; + ((int * ALIGNED(64))a14)[3] = d.i[14]; + ((int * ALIGNED(64))a14)[4] = e.i[14]; + ((int * ALIGNED(64))a14)[5] = f.i[14]; + ((int * ALIGNED(64))a14)[6] = g.i[14]; + ((int * ALIGNED(64))a14)[7] = h.i[14]; + + ((int * ALIGNED(64))a15)[0] = a.i[15]; + ((int * ALIGNED(64))a15)[1] = b.i[15]; + ((int * ALIGNED(64))a15)[2] = c.i[15]; + ((int * ALIGNED(64))a15)[3] = d.i[15]; + ((int * ALIGNED(64))a15)[4] = e.i[15]; + ((int * ALIGNED(64))a15)[5] = f.i[15]; + ((int * ALIGNED(64))a15)[6] = g.i[15]; + ((int * ALIGNED(64))a15)[7] = h.i[15]; + } + + inline void store_16x16_tr( const v16 &b00, const v16 &b01, const v16 &b02, const v16 &b03, + const v16 &b04, const v16 &b05, const v16 &b06, const v16 &b07, + const v16 &b08, const v16 &b09, const v16 &b10, const v16 &b11, + const v16 &b12, const v16 &b13, const v16 &b14, const v16 &b15, + void * ALIGNED(64) a00, void * ALIGNED(64) a01, + void * ALIGNED(64) a02, void * ALIGNED(64) a03, + void * ALIGNED(64) a04, void * ALIGNED(64) a05, + void * ALIGNED(64) a06, void * ALIGNED(64) a07, + void * ALIGNED(64) a08, void * ALIGNED(64) a09, + void * ALIGNED(64) a10, void * ALIGNED(64) a11, + void * ALIGNED(64) a12, void * ALIGNED(64) a13, + void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) + { + ((int * ALIGNED(64))a00)[ 0] = b00.i[ 0]; + ((int * ALIGNED(64))a00)[ 1] = b01.i[ 0]; + ((int * ALIGNED(64))a00)[ 2] = b02.i[ 0]; + ((int * ALIGNED(64))a00)[ 3] = b03.i[ 0]; + ((int * ALIGNED(64))a00)[ 4] = b04.i[ 0]; + ((int * ALIGNED(64))a00)[ 5] = b05.i[ 0]; + ((int * ALIGNED(64))a00)[ 6] = b06.i[ 0]; + ((int * ALIGNED(64))a00)[ 7] = b07.i[ 0]; + ((int * ALIGNED(64))a00)[ 8] = b08.i[ 0]; + ((int * ALIGNED(64))a00)[ 9] = b09.i[ 0]; + ((int * ALIGNED(64))a00)[10] = b10.i[ 0]; + ((int * ALIGNED(64))a00)[11] = b11.i[ 0]; + ((int * ALIGNED(64))a00)[12] = b12.i[ 0]; + ((int * ALIGNED(64))a00)[13] = b13.i[ 0]; + ((int * ALIGNED(64))a00)[14] = b14.i[ 0]; + ((int * ALIGNED(64))a00)[15] = b15.i[ 0]; + + ((int * ALIGNED(64))a01)[ 0] = b00.i[ 1]; + ((int * ALIGNED(64))a01)[ 1] = b01.i[ 1]; + ((int * ALIGNED(64))a01)[ 2] = b02.i[ 1]; + ((int * ALIGNED(64))a01)[ 3] = b03.i[ 1]; + ((int * ALIGNED(64))a01)[ 4] = b04.i[ 1]; + ((int * ALIGNED(64))a01)[ 5] = b05.i[ 1]; + ((int * ALIGNED(64))a01)[ 6] = b06.i[ 1]; + ((int * ALIGNED(64))a01)[ 7] = b07.i[ 1]; + ((int * ALIGNED(64))a01)[ 8] = b08.i[ 1]; + ((int * ALIGNED(64))a01)[ 9] = b09.i[ 1]; + ((int * ALIGNED(64))a01)[10] = b10.i[ 1]; + ((int * ALIGNED(64))a01)[11] = b11.i[ 1]; + ((int * ALIGNED(64))a01)[12] = b12.i[ 1]; + ((int * ALIGNED(64))a01)[13] = b13.i[ 1]; + ((int * ALIGNED(64))a01)[14] = b14.i[ 1]; + ((int * ALIGNED(64))a01)[15] = b15.i[ 1]; + + ((int * ALIGNED(64))a02)[ 0] = b00.i[ 2]; + ((int * ALIGNED(64))a02)[ 1] = b01.i[ 2]; + ((int * ALIGNED(64))a02)[ 2] = b02.i[ 2]; + ((int * ALIGNED(64))a02)[ 3] = b03.i[ 2]; + ((int * ALIGNED(64))a02)[ 4] = b04.i[ 2]; + ((int * ALIGNED(64))a02)[ 5] = b05.i[ 2]; + ((int * ALIGNED(64))a02)[ 6] = b06.i[ 2]; + ((int * ALIGNED(64))a02)[ 
7] = b07.i[ 2]; + ((int * ALIGNED(64))a02)[ 8] = b08.i[ 2]; + ((int * ALIGNED(64))a02)[ 9] = b09.i[ 2]; + ((int * ALIGNED(64))a02)[10] = b10.i[ 2]; + ((int * ALIGNED(64))a02)[11] = b11.i[ 2]; + ((int * ALIGNED(64))a02)[12] = b12.i[ 2]; + ((int * ALIGNED(64))a02)[13] = b13.i[ 2]; + ((int * ALIGNED(64))a02)[14] = b14.i[ 2]; + ((int * ALIGNED(64))a02)[15] = b15.i[ 2]; + + ((int * ALIGNED(64))a03)[ 0] = b00.i[ 3]; + ((int * ALIGNED(64))a03)[ 1] = b01.i[ 3]; + ((int * ALIGNED(64))a03)[ 2] = b02.i[ 3]; + ((int * ALIGNED(64))a03)[ 3] = b03.i[ 3]; + ((int * ALIGNED(64))a03)[ 4] = b04.i[ 3]; + ((int * ALIGNED(64))a03)[ 5] = b05.i[ 3]; + ((int * ALIGNED(64))a03)[ 6] = b06.i[ 3]; + ((int * ALIGNED(64))a03)[ 7] = b07.i[ 3]; + ((int * ALIGNED(64))a03)[ 8] = b08.i[ 3]; + ((int * ALIGNED(64))a03)[ 9] = b09.i[ 3]; + ((int * ALIGNED(64))a03)[10] = b10.i[ 3]; + ((int * ALIGNED(64))a03)[11] = b11.i[ 3]; + ((int * ALIGNED(64))a03)[12] = b12.i[ 3]; + ((int * ALIGNED(64))a03)[13] = b13.i[ 3]; + ((int * ALIGNED(64))a03)[14] = b14.i[ 3]; + ((int * ALIGNED(64))a03)[15] = b15.i[ 3]; + + ((int * ALIGNED(64))a04)[ 0] = b00.i[ 4]; + ((int * ALIGNED(64))a04)[ 1] = b01.i[ 4]; + ((int * ALIGNED(64))a04)[ 2] = b02.i[ 4]; + ((int * ALIGNED(64))a04)[ 3] = b03.i[ 4]; + ((int * ALIGNED(64))a04)[ 4] = b04.i[ 4]; + ((int * ALIGNED(64))a04)[ 5] = b05.i[ 4]; + ((int * ALIGNED(64))a04)[ 6] = b06.i[ 4]; + ((int * ALIGNED(64))a04)[ 7] = b07.i[ 4]; + ((int * ALIGNED(64))a04)[ 8] = b08.i[ 4]; + ((int * ALIGNED(64))a04)[ 9] = b09.i[ 4]; + ((int * ALIGNED(64))a04)[10] = b10.i[ 4]; + ((int * ALIGNED(64))a04)[11] = b11.i[ 4]; + ((int * ALIGNED(64))a04)[12] = b12.i[ 4]; + ((int * ALIGNED(64))a04)[13] = b13.i[ 4]; + ((int * ALIGNED(64))a04)[14] = b14.i[ 4]; + ((int * ALIGNED(64))a04)[15] = b15.i[ 4]; + + ((int * ALIGNED(64))a05)[ 0] = b00.i[ 5]; + ((int * ALIGNED(64))a05)[ 1] = b01.i[ 5]; + ((int * ALIGNED(64))a05)[ 2] = b02.i[ 5]; + ((int * ALIGNED(64))a05)[ 3] = b03.i[ 5]; + ((int * ALIGNED(64))a05)[ 4] = b04.i[ 5]; + ((int * ALIGNED(64))a05)[ 5] = b05.i[ 5]; + ((int * ALIGNED(64))a05)[ 6] = b06.i[ 5]; + ((int * ALIGNED(64))a05)[ 7] = b07.i[ 5]; + ((int * ALIGNED(64))a05)[ 8] = b08.i[ 5]; + ((int * ALIGNED(64))a05)[ 9] = b09.i[ 5]; + ((int * ALIGNED(64))a05)[10] = b10.i[ 5]; + ((int * ALIGNED(64))a05)[11] = b11.i[ 5]; + ((int * ALIGNED(64))a05)[12] = b12.i[ 5]; + ((int * ALIGNED(64))a05)[13] = b13.i[ 5]; + ((int * ALIGNED(64))a05)[14] = b14.i[ 5]; + ((int * ALIGNED(64))a05)[15] = b15.i[ 5]; + + ((int * ALIGNED(64))a06)[ 0] = b00.i[ 6]; + ((int * ALIGNED(64))a06)[ 1] = b01.i[ 6]; + ((int * ALIGNED(64))a06)[ 2] = b02.i[ 6]; + ((int * ALIGNED(64))a06)[ 3] = b03.i[ 6]; + ((int * ALIGNED(64))a06)[ 4] = b04.i[ 6]; + ((int * ALIGNED(64))a06)[ 5] = b05.i[ 6]; + ((int * ALIGNED(64))a06)[ 6] = b06.i[ 6]; + ((int * ALIGNED(64))a06)[ 7] = b07.i[ 6]; + ((int * ALIGNED(64))a06)[ 8] = b08.i[ 6]; + ((int * ALIGNED(64))a06)[ 9] = b09.i[ 6]; + ((int * ALIGNED(64))a06)[10] = b10.i[ 6]; + ((int * ALIGNED(64))a06)[11] = b11.i[ 6]; + ((int * ALIGNED(64))a06)[12] = b12.i[ 6]; + ((int * ALIGNED(64))a06)[13] = b13.i[ 6]; + ((int * ALIGNED(64))a06)[14] = b14.i[ 6]; + ((int * ALIGNED(64))a06)[15] = b15.i[ 6]; + + ((int * ALIGNED(64))a07)[ 0] = b00.i[ 7]; + ((int * ALIGNED(64))a07)[ 1] = b01.i[ 7]; + ((int * ALIGNED(64))a07)[ 2] = b02.i[ 7]; + ((int * ALIGNED(64))a07)[ 3] = b03.i[ 7]; + ((int * ALIGNED(64))a07)[ 4] = b04.i[ 7]; + ((int * ALIGNED(64))a07)[ 5] = b05.i[ 7]; + ((int * ALIGNED(64))a07)[ 6] = b06.i[ 7]; + ((int * ALIGNED(64))a07)[ 7] = b07.i[ 7]; + ((int 
* ALIGNED(64))a07)[ 8] = b08.i[ 7]; + ((int * ALIGNED(64))a07)[ 9] = b09.i[ 7]; + ((int * ALIGNED(64))a07)[10] = b10.i[ 7]; + ((int * ALIGNED(64))a07)[11] = b11.i[ 7]; + ((int * ALIGNED(64))a07)[12] = b12.i[ 7]; + ((int * ALIGNED(64))a07)[13] = b13.i[ 7]; + ((int * ALIGNED(64))a07)[14] = b14.i[ 7]; + ((int * ALIGNED(64))a07)[15] = b15.i[ 7]; + + ((int * ALIGNED(64))a08)[ 0] = b00.i[ 8]; + ((int * ALIGNED(64))a08)[ 1] = b01.i[ 8]; + ((int * ALIGNED(64))a08)[ 2] = b02.i[ 8]; + ((int * ALIGNED(64))a08)[ 3] = b03.i[ 8]; + ((int * ALIGNED(64))a08)[ 4] = b04.i[ 8]; + ((int * ALIGNED(64))a08)[ 5] = b05.i[ 8]; + ((int * ALIGNED(64))a08)[ 6] = b06.i[ 8]; + ((int * ALIGNED(64))a08)[ 7] = b07.i[ 8]; + ((int * ALIGNED(64))a08)[ 8] = b08.i[ 8]; + ((int * ALIGNED(64))a08)[ 9] = b09.i[ 8]; + ((int * ALIGNED(64))a08)[10] = b10.i[ 8]; + ((int * ALIGNED(64))a08)[11] = b11.i[ 8]; + ((int * ALIGNED(64))a08)[12] = b12.i[ 8]; + ((int * ALIGNED(64))a08)[13] = b13.i[ 8]; + ((int * ALIGNED(64))a08)[14] = b14.i[ 8]; + ((int * ALIGNED(64))a08)[15] = b15.i[ 8]; + + ((int * ALIGNED(64))a09)[ 0] = b00.i[ 9]; + ((int * ALIGNED(64))a09)[ 1] = b01.i[ 9]; + ((int * ALIGNED(64))a09)[ 2] = b02.i[ 9]; + ((int * ALIGNED(64))a09)[ 3] = b03.i[ 9]; + ((int * ALIGNED(64))a09)[ 4] = b04.i[ 9]; + ((int * ALIGNED(64))a09)[ 5] = b05.i[ 9]; + ((int * ALIGNED(64))a09)[ 6] = b06.i[ 9]; + ((int * ALIGNED(64))a09)[ 7] = b07.i[ 9]; + ((int * ALIGNED(64))a09)[ 8] = b08.i[ 9]; + ((int * ALIGNED(64))a09)[ 9] = b09.i[ 9]; + ((int * ALIGNED(64))a09)[10] = b10.i[ 9]; + ((int * ALIGNED(64))a09)[11] = b11.i[ 9]; + ((int * ALIGNED(64))a09)[12] = b12.i[ 9]; + ((int * ALIGNED(64))a09)[13] = b13.i[ 9]; + ((int * ALIGNED(64))a09)[14] = b14.i[ 9]; + ((int * ALIGNED(64))a09)[15] = b15.i[ 9]; + + ((int * ALIGNED(64))a10)[ 0] = b00.i[10]; + ((int * ALIGNED(64))a10)[ 1] = b01.i[10]; + ((int * ALIGNED(64))a10)[ 2] = b02.i[10]; + ((int * ALIGNED(64))a10)[ 3] = b03.i[10]; + ((int * ALIGNED(64))a10)[ 4] = b04.i[10]; + ((int * ALIGNED(64))a10)[ 5] = b05.i[10]; + ((int * ALIGNED(64))a10)[ 6] = b06.i[10]; + ((int * ALIGNED(64))a10)[ 7] = b07.i[10]; + ((int * ALIGNED(64))a10)[ 8] = b08.i[10]; + ((int * ALIGNED(64))a10)[ 9] = b09.i[10]; + ((int * ALIGNED(64))a10)[10] = b10.i[10]; + ((int * ALIGNED(64))a10)[11] = b11.i[10]; + ((int * ALIGNED(64))a10)[12] = b12.i[10]; + ((int * ALIGNED(64))a10)[13] = b13.i[10]; + ((int * ALIGNED(64))a10)[14] = b14.i[10]; + ((int * ALIGNED(64))a10)[15] = b15.i[10]; + + ((int * ALIGNED(64))a11)[ 0] = b00.i[11]; + ((int * ALIGNED(64))a11)[ 1] = b01.i[11]; + ((int * ALIGNED(64))a11)[ 2] = b02.i[11]; + ((int * ALIGNED(64))a11)[ 3] = b03.i[11]; + ((int * ALIGNED(64))a11)[ 4] = b04.i[11]; + ((int * ALIGNED(64))a11)[ 5] = b05.i[11]; + ((int * ALIGNED(64))a11)[ 6] = b06.i[11]; + ((int * ALIGNED(64))a11)[ 7] = b07.i[11]; + ((int * ALIGNED(64))a11)[ 8] = b08.i[11]; + ((int * ALIGNED(64))a11)[ 9] = b09.i[11]; + ((int * ALIGNED(64))a11)[10] = b10.i[11]; + ((int * ALIGNED(64))a11)[11] = b11.i[11]; + ((int * ALIGNED(64))a11)[12] = b12.i[11]; + ((int * ALIGNED(64))a11)[13] = b13.i[11]; + ((int * ALIGNED(64))a11)[14] = b14.i[11]; + ((int * ALIGNED(64))a11)[15] = b15.i[11]; + + ((int * ALIGNED(64))a12)[ 0] = b00.i[12]; + ((int * ALIGNED(64))a12)[ 1] = b01.i[12]; + ((int * ALIGNED(64))a12)[ 2] = b02.i[12]; + ((int * ALIGNED(64))a12)[ 3] = b03.i[12]; + ((int * ALIGNED(64))a12)[ 4] = b04.i[12]; + ((int * ALIGNED(64))a12)[ 5] = b05.i[12]; + ((int * ALIGNED(64))a12)[ 6] = b06.i[12]; + ((int * ALIGNED(64))a12)[ 7] = b07.i[12]; + ((int * ALIGNED(64))a12)[ 8] = 
b08.i[12]; + ((int * ALIGNED(64))a12)[ 9] = b09.i[12]; + ((int * ALIGNED(64))a12)[10] = b10.i[12]; + ((int * ALIGNED(64))a12)[11] = b11.i[12]; + ((int * ALIGNED(64))a12)[12] = b12.i[12]; + ((int * ALIGNED(64))a12)[13] = b13.i[12]; + ((int * ALIGNED(64))a12)[14] = b14.i[12]; + ((int * ALIGNED(64))a12)[15] = b15.i[12]; + + ((int * ALIGNED(64))a13)[ 0] = b00.i[13]; + ((int * ALIGNED(64))a13)[ 1] = b01.i[13]; + ((int * ALIGNED(64))a13)[ 2] = b02.i[13]; + ((int * ALIGNED(64))a13)[ 3] = b03.i[13]; + ((int * ALIGNED(64))a13)[ 4] = b04.i[13]; + ((int * ALIGNED(64))a13)[ 5] = b05.i[13]; + ((int * ALIGNED(64))a13)[ 6] = b06.i[13]; + ((int * ALIGNED(64))a13)[ 7] = b07.i[13]; + ((int * ALIGNED(64))a13)[ 8] = b08.i[13]; + ((int * ALIGNED(64))a13)[ 9] = b09.i[13]; + ((int * ALIGNED(64))a13)[10] = b10.i[13]; + ((int * ALIGNED(64))a13)[11] = b11.i[13]; + ((int * ALIGNED(64))a13)[12] = b12.i[13]; + ((int * ALIGNED(64))a13)[13] = b13.i[13]; + ((int * ALIGNED(64))a13)[14] = b14.i[13]; + ((int * ALIGNED(64))a13)[15] = b15.i[13]; + + ((int * ALIGNED(64))a14)[ 0] = b00.i[14]; + ((int * ALIGNED(64))a14)[ 1] = b01.i[14]; + ((int * ALIGNED(64))a14)[ 2] = b02.i[14]; + ((int * ALIGNED(64))a14)[ 3] = b03.i[14]; + ((int * ALIGNED(64))a14)[ 4] = b04.i[14]; + ((int * ALIGNED(64))a14)[ 5] = b05.i[14]; + ((int * ALIGNED(64))a14)[ 6] = b06.i[14]; + ((int * ALIGNED(64))a14)[ 7] = b07.i[14]; + ((int * ALIGNED(64))a14)[ 8] = b08.i[14]; + ((int * ALIGNED(64))a14)[ 9] = b09.i[14]; + ((int * ALIGNED(64))a14)[10] = b10.i[14]; + ((int * ALIGNED(64))a14)[11] = b11.i[14]; + ((int * ALIGNED(64))a14)[12] = b12.i[14]; + ((int * ALIGNED(64))a14)[13] = b13.i[14]; + ((int * ALIGNED(64))a14)[14] = b14.i[14]; + ((int * ALIGNED(64))a14)[15] = b15.i[14]; + + ((int * ALIGNED(64))a15)[ 0] = b00.i[15]; + ((int * ALIGNED(64))a15)[ 1] = b01.i[15]; + ((int * ALIGNED(64))a15)[ 2] = b02.i[15]; + ((int * ALIGNED(64))a15)[ 3] = b03.i[15]; + ((int * ALIGNED(64))a15)[ 4] = b04.i[15]; + ((int * ALIGNED(64))a15)[ 5] = b05.i[15]; + ((int * ALIGNED(64))a15)[ 6] = b06.i[15]; + ((int * ALIGNED(64))a15)[ 7] = b07.i[15]; + ((int * ALIGNED(64))a15)[ 8] = b08.i[15]; + ((int * ALIGNED(64))a15)[ 9] = b09.i[15]; + ((int * ALIGNED(64))a15)[10] = b10.i[15]; + ((int * ALIGNED(64))a15)[11] = b11.i[15]; + ((int * ALIGNED(64))a15)[12] = b12.i[15]; + ((int * ALIGNED(64))a15)[13] = b13.i[15]; + ((int * ALIGNED(64))a15)[14] = b14.i[15]; + ((int * ALIGNED(64))a15)[15] = b15.i[15]; + } + + inline void store_16x8_tr_p( const v16 &b00, + const v16 &b01, + const v16 &b02, + const v16 &b03, + const v16 &b04, + const v16 &b05, + const v16 &b06, + const v16 &b07, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07 ) + { + ((int * ALIGNED(64))a00)[ 0] = b00.i[ 0]; + ((int * ALIGNED(64))a00)[ 1] = b01.i[ 0]; + ((int * ALIGNED(64))a00)[ 2] = b02.i[ 0]; + ((int * ALIGNED(64))a00)[ 3] = b03.i[ 0]; + ((int * ALIGNED(64))a00)[ 4] = b04.i[ 0]; + ((int * ALIGNED(64))a00)[ 5] = b05.i[ 0]; + ((int * ALIGNED(64))a00)[ 6] = b06.i[ 0]; + ((int * ALIGNED(64))a00)[ 7] = b07.i[ 0]; + ((int * ALIGNED(64))a00)[ 8] = b00.i[ 1]; + ((int * ALIGNED(64))a00)[ 9] = b01.i[ 1]; + ((int * ALIGNED(64))a00)[10] = b02.i[ 1]; + ((int * ALIGNED(64))a00)[11] = b03.i[ 1]; + ((int * ALIGNED(64))a00)[12] = b04.i[ 1]; + ((int * ALIGNED(64))a00)[13] = b05.i[ 1]; + ((int * ALIGNED(64))a00)[14] = b06.i[ 1]; + ((int * ALIGNED(64))a00)[15] = b07.i[ 1]; + + ((int * 
ALIGNED(64))a01)[ 0] = b00.i[ 2]; + ((int * ALIGNED(64))a01)[ 1] = b01.i[ 2]; + ((int * ALIGNED(64))a01)[ 2] = b02.i[ 2]; + ((int * ALIGNED(64))a01)[ 3] = b03.i[ 2]; + ((int * ALIGNED(64))a01)[ 4] = b04.i[ 2]; + ((int * ALIGNED(64))a01)[ 5] = b05.i[ 2]; + ((int * ALIGNED(64))a01)[ 6] = b06.i[ 2]; + ((int * ALIGNED(64))a01)[ 7] = b07.i[ 2]; + ((int * ALIGNED(64))a01)[ 8] = b00.i[ 3]; + ((int * ALIGNED(64))a01)[ 9] = b01.i[ 3]; + ((int * ALIGNED(64))a01)[10] = b02.i[ 3]; + ((int * ALIGNED(64))a01)[11] = b03.i[ 3]; + ((int * ALIGNED(64))a01)[12] = b04.i[ 3]; + ((int * ALIGNED(64))a01)[13] = b05.i[ 3]; + ((int * ALIGNED(64))a01)[14] = b06.i[ 3]; + ((int * ALIGNED(64))a01)[15] = b07.i[ 3]; + + ((int * ALIGNED(64))a02)[ 0] = b00.i[ 4]; + ((int * ALIGNED(64))a02)[ 1] = b01.i[ 4]; + ((int * ALIGNED(64))a02)[ 2] = b02.i[ 4]; + ((int * ALIGNED(64))a02)[ 3] = b03.i[ 4]; + ((int * ALIGNED(64))a02)[ 4] = b04.i[ 4]; + ((int * ALIGNED(64))a02)[ 5] = b05.i[ 4]; + ((int * ALIGNED(64))a02)[ 6] = b06.i[ 4]; + ((int * ALIGNED(64))a02)[ 7] = b07.i[ 4]; + ((int * ALIGNED(64))a02)[ 8] = b00.i[ 5]; + ((int * ALIGNED(64))a02)[ 9] = b01.i[ 5]; + ((int * ALIGNED(64))a02)[10] = b02.i[ 5]; + ((int * ALIGNED(64))a02)[11] = b03.i[ 5]; + ((int * ALIGNED(64))a02)[12] = b04.i[ 5]; + ((int * ALIGNED(64))a02)[13] = b05.i[ 5]; + ((int * ALIGNED(64))a02)[14] = b06.i[ 5]; + ((int * ALIGNED(64))a02)[15] = b07.i[ 5]; + + ((int * ALIGNED(64))a03)[ 0] = b00.i[ 6]; + ((int * ALIGNED(64))a03)[ 1] = b01.i[ 6]; + ((int * ALIGNED(64))a03)[ 2] = b02.i[ 6]; + ((int * ALIGNED(64))a03)[ 3] = b03.i[ 6]; + ((int * ALIGNED(64))a03)[ 4] = b04.i[ 6]; + ((int * ALIGNED(64))a03)[ 5] = b05.i[ 6]; + ((int * ALIGNED(64))a03)[ 6] = b06.i[ 6]; + ((int * ALIGNED(64))a03)[ 7] = b07.i[ 6]; + ((int * ALIGNED(64))a03)[ 8] = b00.i[ 7]; + ((int * ALIGNED(64))a03)[ 9] = b01.i[ 7]; + ((int * ALIGNED(64))a03)[10] = b02.i[ 7]; + ((int * ALIGNED(64))a03)[11] = b03.i[ 7]; + ((int * ALIGNED(64))a03)[12] = b04.i[ 7]; + ((int * ALIGNED(64))a03)[13] = b05.i[ 7]; + ((int * ALIGNED(64))a03)[14] = b06.i[ 7]; + ((int * ALIGNED(64))a03)[15] = b07.i[ 7]; + + ((int * ALIGNED(64))a04)[ 0] = b00.i[ 8]; + ((int * ALIGNED(64))a04)[ 1] = b01.i[ 8]; + ((int * ALIGNED(64))a04)[ 2] = b02.i[ 8]; + ((int * ALIGNED(64))a04)[ 3] = b03.i[ 8]; + ((int * ALIGNED(64))a04)[ 4] = b04.i[ 8]; + ((int * ALIGNED(64))a04)[ 5] = b05.i[ 8]; + ((int * ALIGNED(64))a04)[ 6] = b06.i[ 8]; + ((int * ALIGNED(64))a04)[ 7] = b07.i[ 8]; + ((int * ALIGNED(64))a04)[ 8] = b00.i[ 9]; + ((int * ALIGNED(64))a04)[ 9] = b01.i[ 9]; + ((int * ALIGNED(64))a04)[10] = b02.i[ 9]; + ((int * ALIGNED(64))a04)[11] = b03.i[ 9]; + ((int * ALIGNED(64))a04)[12] = b04.i[ 9]; + ((int * ALIGNED(64))a04)[13] = b05.i[ 9]; + ((int * ALIGNED(64))a04)[14] = b06.i[ 9]; + ((int * ALIGNED(64))a04)[15] = b07.i[ 9]; + + ((int * ALIGNED(64))a05)[ 0] = b00.i[10]; + ((int * ALIGNED(64))a05)[ 1] = b01.i[10]; + ((int * ALIGNED(64))a05)[ 2] = b02.i[10]; + ((int * ALIGNED(64))a05)[ 3] = b03.i[10]; + ((int * ALIGNED(64))a05)[ 4] = b04.i[10]; + ((int * ALIGNED(64))a05)[ 5] = b05.i[10]; + ((int * ALIGNED(64))a05)[ 6] = b06.i[10]; + ((int * ALIGNED(64))a05)[ 7] = b07.i[10]; + ((int * ALIGNED(64))a05)[ 8] = b00.i[11]; + ((int * ALIGNED(64))a05)[ 9] = b01.i[11]; + ((int * ALIGNED(64))a05)[10] = b02.i[11]; + ((int * ALIGNED(64))a05)[11] = b03.i[11]; + ((int * ALIGNED(64))a05)[12] = b04.i[11]; + ((int * ALIGNED(64))a05)[13] = b05.i[11]; + ((int * ALIGNED(64))a05)[14] = b06.i[11]; + ((int * ALIGNED(64))a05)[15] = b07.i[11]; + + ((int * ALIGNED(64))a06)[ 0] = 
b00.i[12]; + ((int * ALIGNED(64))a06)[ 1] = b01.i[12]; + ((int * ALIGNED(64))a06)[ 2] = b02.i[12]; + ((int * ALIGNED(64))a06)[ 3] = b03.i[12]; + ((int * ALIGNED(64))a06)[ 4] = b04.i[12]; + ((int * ALIGNED(64))a06)[ 5] = b05.i[12]; + ((int * ALIGNED(64))a06)[ 6] = b06.i[12]; + ((int * ALIGNED(64))a06)[ 7] = b07.i[12]; + ((int * ALIGNED(64))a06)[ 8] = b00.i[13]; + ((int * ALIGNED(64))a06)[ 9] = b01.i[13]; + ((int * ALIGNED(64))a06)[10] = b02.i[13]; + ((int * ALIGNED(64))a06)[11] = b03.i[13]; + ((int * ALIGNED(64))a06)[12] = b04.i[13]; + ((int * ALIGNED(64))a06)[13] = b05.i[13]; + ((int * ALIGNED(64))a06)[14] = b06.i[13]; + ((int * ALIGNED(64))a06)[15] = b07.i[13]; + + ((int * ALIGNED(64))a07)[ 0] = b00.i[14]; + ((int * ALIGNED(64))a07)[ 1] = b01.i[14]; + ((int * ALIGNED(64))a07)[ 2] = b02.i[14]; + ((int * ALIGNED(64))a07)[ 3] = b03.i[14]; + ((int * ALIGNED(64))a07)[ 4] = b04.i[14]; + ((int * ALIGNED(64))a07)[ 5] = b05.i[14]; + ((int * ALIGNED(64))a07)[ 6] = b06.i[14]; + ((int * ALIGNED(64))a07)[ 7] = b07.i[14]; + ((int * ALIGNED(64))a07)[ 8] = b00.i[15]; + ((int * ALIGNED(64))a07)[ 9] = b01.i[15]; + ((int * ALIGNED(64))a07)[10] = b02.i[15]; + ((int * ALIGNED(64))a07)[11] = b03.i[15]; + ((int * ALIGNED(64))a07)[12] = b04.i[15]; + ((int * ALIGNED(64))a07)[13] = b05.i[15]; + ((int * ALIGNED(64))a07)[14] = b06.i[15]; + ((int * ALIGNED(64))a07)[15] = b07.i[15]; + } + + inline void store_16x16_tr_p( const v16 &b00, const v16 &b01, const v16 &b02, const v16 &b03, + const v16 &b04, const v16 &b05, const v16 &b06, const v16 &b07, + const v16 &b08, const v16 &b09, const v16 &b10, const v16 &b11, + const v16 &b12, const v16 &b13, const v16 &b14, const v16 &b15, + void * ALIGNED(64) a00, void * ALIGNED(64) a01, + void * ALIGNED(64) a02, void * ALIGNED(64) a03, + void * ALIGNED(64) a04, void * ALIGNED(64) a05, + void * ALIGNED(64) a06, void * ALIGNED(64) a07, + void * ALIGNED(64) a08, void * ALIGNED(64) a09, + void * ALIGNED(64) a10, void * ALIGNED(64) a11, + void * ALIGNED(64) a12, void * ALIGNED(64) a13, + void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) + { + ((int * ALIGNED(64))a00)[ 0] = b00.i[ 0]; + ((int * ALIGNED(64))a00)[ 1] = b01.i[ 0]; + ((int * ALIGNED(64))a00)[ 2] = b02.i[ 0]; + ((int * ALIGNED(64))a00)[ 3] = b03.i[ 0]; + ((int * ALIGNED(64))a00)[ 4] = b04.i[ 0]; + ((int * ALIGNED(64))a00)[ 5] = b05.i[ 0]; + ((int * ALIGNED(64))a00)[ 6] = b06.i[ 0]; + ((int * ALIGNED(64))a00)[ 7] = b07.i[ 0]; + ((int * ALIGNED(64))a00)[ 8] = b00.i[ 1]; + ((int * ALIGNED(64))a00)[ 9] = b01.i[ 1]; + ((int * ALIGNED(64))a00)[10] = b02.i[ 1]; + ((int * ALIGNED(64))a00)[11] = b03.i[ 1]; + ((int * ALIGNED(64))a00)[12] = b04.i[ 1]; + ((int * ALIGNED(64))a00)[13] = b05.i[ 1]; + ((int * ALIGNED(64))a00)[14] = b06.i[ 1]; + ((int * ALIGNED(64))a00)[15] = b07.i[ 1]; + + ((int * ALIGNED(64))a01)[ 0] = b00.i[ 2]; + ((int * ALIGNED(64))a01)[ 1] = b01.i[ 2]; + ((int * ALIGNED(64))a01)[ 2] = b02.i[ 2]; + ((int * ALIGNED(64))a01)[ 3] = b03.i[ 2]; + ((int * ALIGNED(64))a01)[ 4] = b04.i[ 2]; + ((int * ALIGNED(64))a01)[ 5] = b05.i[ 2]; + ((int * ALIGNED(64))a01)[ 6] = b06.i[ 2]; + ((int * ALIGNED(64))a01)[ 7] = b07.i[ 2]; + ((int * ALIGNED(64))a01)[ 8] = b00.i[ 3]; + ((int * ALIGNED(64))a01)[ 9] = b01.i[ 3]; + ((int * ALIGNED(64))a01)[10] = b02.i[ 3]; + ((int * ALIGNED(64))a01)[11] = b03.i[ 3]; + ((int * ALIGNED(64))a01)[12] = b04.i[ 3]; + ((int * ALIGNED(64))a01)[13] = b05.i[ 3]; + ((int * ALIGNED(64))a01)[14] = b06.i[ 3]; + ((int * ALIGNED(64))a01)[15] = b07.i[ 3]; + + ((int * ALIGNED(64))a02)[ 0] = b00.i[ 4]; + ((int * 
ALIGNED(64))a02)[ 1] = b01.i[ 4]; + ((int * ALIGNED(64))a02)[ 2] = b02.i[ 4]; + ((int * ALIGNED(64))a02)[ 3] = b03.i[ 4]; + ((int * ALIGNED(64))a02)[ 4] = b04.i[ 4]; + ((int * ALIGNED(64))a02)[ 5] = b05.i[ 4]; + ((int * ALIGNED(64))a02)[ 6] = b06.i[ 4]; + ((int * ALIGNED(64))a02)[ 7] = b07.i[ 4]; + ((int * ALIGNED(64))a02)[ 8] = b00.i[ 5]; + ((int * ALIGNED(64))a02)[ 9] = b01.i[ 5]; + ((int * ALIGNED(64))a02)[10] = b02.i[ 5]; + ((int * ALIGNED(64))a02)[11] = b03.i[ 5]; + ((int * ALIGNED(64))a02)[12] = b04.i[ 5]; + ((int * ALIGNED(64))a02)[13] = b05.i[ 5]; + ((int * ALIGNED(64))a02)[14] = b06.i[ 5]; + ((int * ALIGNED(64))a02)[15] = b07.i[ 5]; + + ((int * ALIGNED(64))a03)[ 0] = b00.i[ 6]; + ((int * ALIGNED(64))a03)[ 1] = b01.i[ 6]; + ((int * ALIGNED(64))a03)[ 2] = b02.i[ 6]; + ((int * ALIGNED(64))a03)[ 3] = b03.i[ 6]; + ((int * ALIGNED(64))a03)[ 4] = b04.i[ 6]; + ((int * ALIGNED(64))a03)[ 5] = b05.i[ 6]; + ((int * ALIGNED(64))a03)[ 6] = b06.i[ 6]; + ((int * ALIGNED(64))a03)[ 7] = b07.i[ 6]; + ((int * ALIGNED(64))a03)[ 8] = b00.i[ 7]; + ((int * ALIGNED(64))a03)[ 9] = b01.i[ 7]; + ((int * ALIGNED(64))a03)[10] = b02.i[ 7]; + ((int * ALIGNED(64))a03)[11] = b03.i[ 7]; + ((int * ALIGNED(64))a03)[12] = b04.i[ 7]; + ((int * ALIGNED(64))a03)[13] = b05.i[ 7]; + ((int * ALIGNED(64))a03)[14] = b06.i[ 7]; + ((int * ALIGNED(64))a03)[15] = b07.i[ 7]; + + ((int * ALIGNED(64))a04)[ 0] = b00.i[ 8]; + ((int * ALIGNED(64))a04)[ 1] = b01.i[ 8]; + ((int * ALIGNED(64))a04)[ 2] = b02.i[ 8]; + ((int * ALIGNED(64))a04)[ 3] = b03.i[ 8]; + ((int * ALIGNED(64))a04)[ 4] = b04.i[ 8]; + ((int * ALIGNED(64))a04)[ 5] = b05.i[ 8]; + ((int * ALIGNED(64))a04)[ 6] = b06.i[ 8]; + ((int * ALIGNED(64))a04)[ 7] = b07.i[ 8]; + ((int * ALIGNED(64))a04)[ 8] = b00.i[ 9]; + ((int * ALIGNED(64))a04)[ 9] = b01.i[ 9]; + ((int * ALIGNED(64))a04)[10] = b02.i[ 9]; + ((int * ALIGNED(64))a04)[11] = b03.i[ 9]; + ((int * ALIGNED(64))a04)[12] = b04.i[ 9]; + ((int * ALIGNED(64))a04)[13] = b05.i[ 9]; + ((int * ALIGNED(64))a04)[14] = b06.i[ 9]; + ((int * ALIGNED(64))a04)[15] = b07.i[ 9]; + + ((int * ALIGNED(64))a05)[ 0] = b00.i[10]; + ((int * ALIGNED(64))a05)[ 1] = b01.i[10]; + ((int * ALIGNED(64))a05)[ 2] = b02.i[10]; + ((int * ALIGNED(64))a05)[ 3] = b03.i[10]; + ((int * ALIGNED(64))a05)[ 4] = b04.i[10]; + ((int * ALIGNED(64))a05)[ 5] = b05.i[10]; + ((int * ALIGNED(64))a05)[ 6] = b06.i[10]; + ((int * ALIGNED(64))a05)[ 7] = b07.i[10]; + ((int * ALIGNED(64))a05)[ 8] = b00.i[11]; + ((int * ALIGNED(64))a05)[ 9] = b01.i[11]; + ((int * ALIGNED(64))a05)[10] = b02.i[11]; + ((int * ALIGNED(64))a05)[11] = b03.i[11]; + ((int * ALIGNED(64))a05)[12] = b04.i[11]; + ((int * ALIGNED(64))a05)[13] = b05.i[11]; + ((int * ALIGNED(64))a05)[14] = b06.i[11]; + ((int * ALIGNED(64))a05)[15] = b07.i[11]; + + ((int * ALIGNED(64))a06)[ 0] = b00.i[12]; + ((int * ALIGNED(64))a06)[ 1] = b01.i[12]; + ((int * ALIGNED(64))a06)[ 2] = b02.i[12]; + ((int * ALIGNED(64))a06)[ 3] = b03.i[12]; + ((int * ALIGNED(64))a06)[ 4] = b04.i[12]; + ((int * ALIGNED(64))a06)[ 5] = b05.i[12]; + ((int * ALIGNED(64))a06)[ 6] = b06.i[12]; + ((int * ALIGNED(64))a06)[ 7] = b07.i[12]; + ((int * ALIGNED(64))a06)[ 8] = b00.i[13]; + ((int * ALIGNED(64))a06)[ 9] = b01.i[13]; + ((int * ALIGNED(64))a06)[10] = b02.i[13]; + ((int * ALIGNED(64))a06)[11] = b03.i[13]; + ((int * ALIGNED(64))a06)[12] = b04.i[13]; + ((int * ALIGNED(64))a06)[13] = b05.i[13]; + ((int * ALIGNED(64))a06)[14] = b06.i[13]; + ((int * ALIGNED(64))a06)[15] = b07.i[13]; + + ((int * ALIGNED(64))a07)[ 0] = b00.i[14]; + ((int * ALIGNED(64))a07)[ 1] = 
b01.i[14]; + ((int * ALIGNED(64))a07)[ 2] = b02.i[14]; + ((int * ALIGNED(64))a07)[ 3] = b03.i[14]; + ((int * ALIGNED(64))a07)[ 4] = b04.i[14]; + ((int * ALIGNED(64))a07)[ 5] = b05.i[14]; + ((int * ALIGNED(64))a07)[ 6] = b06.i[14]; + ((int * ALIGNED(64))a07)[ 7] = b07.i[14]; + ((int * ALIGNED(64))a07)[ 8] = b00.i[15]; + ((int * ALIGNED(64))a07)[ 9] = b01.i[15]; + ((int * ALIGNED(64))a07)[10] = b02.i[15]; + ((int * ALIGNED(64))a07)[11] = b03.i[15]; + ((int * ALIGNED(64))a07)[12] = b04.i[15]; + ((int * ALIGNED(64))a07)[13] = b05.i[15]; + ((int * ALIGNED(64))a07)[14] = b06.i[15]; + ((int * ALIGNED(64))a07)[15] = b07.i[15]; + + ((int * ALIGNED(64))a08)[ 0] = b08.i[ 0]; + ((int * ALIGNED(64))a08)[ 1] = b09.i[ 0]; + ((int * ALIGNED(64))a08)[ 2] = b10.i[ 0]; + ((int * ALIGNED(64))a08)[ 3] = b11.i[ 0]; + ((int * ALIGNED(64))a08)[ 4] = b12.i[ 0]; + ((int * ALIGNED(64))a08)[ 5] = b13.i[ 0]; + ((int * ALIGNED(64))a08)[ 6] = b14.i[ 0]; + ((int * ALIGNED(64))a08)[ 7] = b15.i[ 0]; + ((int * ALIGNED(64))a08)[ 8] = b08.i[ 1]; + ((int * ALIGNED(64))a08)[ 9] = b09.i[ 1]; + ((int * ALIGNED(64))a08)[10] = b10.i[ 1]; + ((int * ALIGNED(64))a08)[11] = b11.i[ 1]; + ((int * ALIGNED(64))a08)[12] = b12.i[ 1]; + ((int * ALIGNED(64))a08)[13] = b13.i[ 1]; + ((int * ALIGNED(64))a08)[14] = b14.i[ 1]; + ((int * ALIGNED(64))a08)[15] = b15.i[ 1]; + + ((int * ALIGNED(64))a09)[ 0] = b08.i[ 2]; + ((int * ALIGNED(64))a09)[ 1] = b09.i[ 2]; + ((int * ALIGNED(64))a09)[ 2] = b10.i[ 2]; + ((int * ALIGNED(64))a09)[ 3] = b11.i[ 2]; + ((int * ALIGNED(64))a09)[ 4] = b12.i[ 2]; + ((int * ALIGNED(64))a09)[ 5] = b13.i[ 2]; + ((int * ALIGNED(64))a09)[ 6] = b14.i[ 2]; + ((int * ALIGNED(64))a09)[ 7] = b15.i[ 2]; + ((int * ALIGNED(64))a09)[ 8] = b08.i[ 3]; + ((int * ALIGNED(64))a09)[ 9] = b09.i[ 3]; + ((int * ALIGNED(64))a09)[10] = b10.i[ 3]; + ((int * ALIGNED(64))a09)[11] = b11.i[ 3]; + ((int * ALIGNED(64))a09)[12] = b12.i[ 3]; + ((int * ALIGNED(64))a09)[13] = b13.i[ 3]; + ((int * ALIGNED(64))a09)[14] = b14.i[ 3]; + ((int * ALIGNED(64))a09)[15] = b15.i[ 3]; + + ((int * ALIGNED(64))a10)[ 0] = b08.i[ 4]; + ((int * ALIGNED(64))a10)[ 1] = b09.i[ 4]; + ((int * ALIGNED(64))a10)[ 2] = b10.i[ 4]; + ((int * ALIGNED(64))a10)[ 3] = b11.i[ 4]; + ((int * ALIGNED(64))a10)[ 4] = b12.i[ 4]; + ((int * ALIGNED(64))a10)[ 5] = b13.i[ 4]; + ((int * ALIGNED(64))a10)[ 6] = b14.i[ 4]; + ((int * ALIGNED(64))a10)[ 7] = b15.i[ 4]; + ((int * ALIGNED(64))a10)[ 8] = b08.i[ 5]; + ((int * ALIGNED(64))a10)[ 9] = b09.i[ 5]; + ((int * ALIGNED(64))a10)[10] = b10.i[ 5]; + ((int * ALIGNED(64))a10)[11] = b11.i[ 5]; + ((int * ALIGNED(64))a10)[12] = b12.i[ 5]; + ((int * ALIGNED(64))a10)[13] = b13.i[ 5]; + ((int * ALIGNED(64))a10)[14] = b14.i[ 5]; + ((int * ALIGNED(64))a10)[15] = b15.i[ 5]; + + ((int * ALIGNED(64))a11)[ 0] = b08.i[ 6]; + ((int * ALIGNED(64))a11)[ 1] = b09.i[ 6]; + ((int * ALIGNED(64))a11)[ 2] = b10.i[ 6]; + ((int * ALIGNED(64))a11)[ 3] = b11.i[ 6]; + ((int * ALIGNED(64))a11)[ 4] = b12.i[ 6]; + ((int * ALIGNED(64))a11)[ 5] = b13.i[ 6]; + ((int * ALIGNED(64))a11)[ 6] = b14.i[ 6]; + ((int * ALIGNED(64))a11)[ 7] = b15.i[ 6]; + ((int * ALIGNED(64))a11)[ 8] = b08.i[ 7]; + ((int * ALIGNED(64))a11)[ 9] = b09.i[ 7]; + ((int * ALIGNED(64))a11)[10] = b10.i[ 7]; + ((int * ALIGNED(64))a11)[11] = b11.i[ 7]; + ((int * ALIGNED(64))a11)[12] = b12.i[ 7]; + ((int * ALIGNED(64))a11)[13] = b13.i[ 7]; + ((int * ALIGNED(64))a11)[14] = b14.i[ 7]; + ((int * ALIGNED(64))a11)[15] = b15.i[ 7]; + + ((int * ALIGNED(64))a12)[ 0] = b08.i[ 8]; + ((int * ALIGNED(64))a12)[ 1] = b09.i[ 8]; + ((int * 
ALIGNED(64))a12)[ 2] = b10.i[ 8]; + ((int * ALIGNED(64))a12)[ 3] = b11.i[ 8]; + ((int * ALIGNED(64))a12)[ 4] = b12.i[ 8]; + ((int * ALIGNED(64))a12)[ 5] = b13.i[ 8]; + ((int * ALIGNED(64))a12)[ 6] = b14.i[ 8]; + ((int * ALIGNED(64))a12)[ 7] = b15.i[ 8]; + ((int * ALIGNED(64))a12)[ 8] = b08.i[ 9]; + ((int * ALIGNED(64))a12)[ 9] = b09.i[ 9]; + ((int * ALIGNED(64))a12)[10] = b10.i[ 9]; + ((int * ALIGNED(64))a12)[11] = b11.i[ 9]; + ((int * ALIGNED(64))a12)[12] = b12.i[ 9]; + ((int * ALIGNED(64))a12)[13] = b13.i[ 9]; + ((int * ALIGNED(64))a12)[14] = b14.i[ 9]; + ((int * ALIGNED(64))a12)[15] = b15.i[ 9]; + + ((int * ALIGNED(64))a13)[ 0] = b08.i[10]; + ((int * ALIGNED(64))a13)[ 1] = b09.i[10]; + ((int * ALIGNED(64))a13)[ 2] = b10.i[10]; + ((int * ALIGNED(64))a13)[ 3] = b11.i[10]; + ((int * ALIGNED(64))a13)[ 4] = b12.i[10]; + ((int * ALIGNED(64))a13)[ 5] = b13.i[10]; + ((int * ALIGNED(64))a13)[ 6] = b14.i[10]; + ((int * ALIGNED(64))a13)[ 7] = b15.i[10]; + ((int * ALIGNED(64))a13)[ 8] = b08.i[11]; + ((int * ALIGNED(64))a13)[ 9] = b09.i[11]; + ((int * ALIGNED(64))a13)[10] = b10.i[11]; + ((int * ALIGNED(64))a13)[11] = b11.i[11]; + ((int * ALIGNED(64))a13)[12] = b12.i[11]; + ((int * ALIGNED(64))a13)[13] = b13.i[11]; + ((int * ALIGNED(64))a13)[14] = b14.i[11]; + ((int * ALIGNED(64))a13)[15] = b15.i[11]; + + ((int * ALIGNED(64))a14)[ 0] = b08.i[12]; + ((int * ALIGNED(64))a14)[ 1] = b09.i[12]; + ((int * ALIGNED(64))a14)[ 2] = b10.i[12]; + ((int * ALIGNED(64))a14)[ 3] = b11.i[12]; + ((int * ALIGNED(64))a14)[ 4] = b12.i[12]; + ((int * ALIGNED(64))a14)[ 5] = b13.i[12]; + ((int * ALIGNED(64))a14)[ 6] = b14.i[12]; + ((int * ALIGNED(64))a14)[ 7] = b15.i[12]; + ((int * ALIGNED(64))a14)[ 8] = b08.i[13]; + ((int * ALIGNED(64))a14)[ 9] = b09.i[13]; + ((int * ALIGNED(64))a14)[10] = b10.i[13]; + ((int * ALIGNED(64))a14)[11] = b11.i[13]; + ((int * ALIGNED(64))a14)[12] = b12.i[13]; + ((int * ALIGNED(64))a14)[13] = b13.i[13]; + ((int * ALIGNED(64))a14)[14] = b14.i[13]; + ((int * ALIGNED(64))a14)[15] = b15.i[13]; + + ((int * ALIGNED(64))a15)[ 0] = b08.i[14]; + ((int * ALIGNED(64))a15)[ 1] = b09.i[14]; + ((int * ALIGNED(64))a15)[ 2] = b10.i[14]; + ((int * ALIGNED(64))a15)[ 3] = b11.i[14]; + ((int * ALIGNED(64))a15)[ 4] = b12.i[14]; + ((int * ALIGNED(64))a15)[ 5] = b13.i[14]; + ((int * ALIGNED(64))a15)[ 6] = b14.i[14]; + ((int * ALIGNED(64))a15)[ 7] = b15.i[14]; + ((int * ALIGNED(64))a15)[ 8] = b08.i[15]; + ((int * ALIGNED(64))a15)[ 9] = b09.i[15]; + ((int * ALIGNED(64))a15)[10] = b10.i[15]; + ((int * ALIGNED(64))a15)[11] = b11.i[15]; + ((int * ALIGNED(64))a15)[12] = b12.i[15]; + ((int * ALIGNED(64))a15)[13] = b13.i[15]; + ((int * ALIGNED(64))a15)[14] = b14.i[15]; + ((int * ALIGNED(64))a15)[15] = b15.i[15]; + } + + ////////////// + // v16int class + + class v16int : public v16 + { + // v16int prefix unary operator friends + + friend inline v16int operator +( const v16int & a ) ALWAYS_INLINE; + friend inline v16int operator -( const v16int & a ) ALWAYS_INLINE; + friend inline v16int operator ~( const v16int & a ) ALWAYS_INLINE; + friend inline v16int operator !( const v16int & a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v16int prefix increment / decrement operator friends + + friend inline v16int operator ++( v16int & a ) ALWAYS_INLINE; + friend inline v16int operator --( v16int & a ) ALWAYS_INLINE; + + // v16int postfix increment / decrement operator friends + + friend inline v16int operator ++( v16int & a, int ) ALWAYS_INLINE; + friend inline v16int operator --( 
v16int & a, int ) ALWAYS_INLINE; + + // v16int binary operator friends + + friend inline v16int operator +( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator -( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator *( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator /( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator %( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator ^( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator &( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator |( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator <<( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator >>( const v16int &a, const v16int &b ) ALWAYS_INLINE; + + // v16int logical operator friends + + friend inline v16int operator <( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator >( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator ==( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator !=( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator <=( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator >=( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator &&( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator ||( const v16int &a, const v16int &b ) ALWAYS_INLINE; + + // v16int miscellaneous friends + + friend inline v16int abs( const v16int &a ) ALWAYS_INLINE; + friend inline v16 czero( const v16int &c, const v16 &a ) ALWAYS_INLINE; + friend inline v16 notczero( const v16int &c, const v16 &a ) ALWAYS_INLINE; + // FIXME: cswap, notcswap! 
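// Descriptive note (added annotation, grounded in the definitions later in this file):
// czero( c, a ) returns a with every lane cleared where the mask c is set (all bits one,
// as produced by the v16int logical operators), while notczero( c, a ) keeps only the
// lanes where c is set and clears the rest.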
+ friend inline v16 merge( const v16int &c, const v16 &t, const v16 &f ) ALWAYS_INLINE; + + // v16float unary operator friends + + friend inline v16int operator !( const v16float & a ) ALWAYS_INLINE; + + // v16float logical operator friends + + friend inline v16int operator <( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator >( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator ==( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator !=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator <=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator >=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator &&( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator ||( const v16float &a, const v16float &b ) ALWAYS_INLINE; + + // v16float miscellaneous friends + + friend inline v16float clear_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline v16float set_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline v16float toggle_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + + public: + + // v16int constructors / destructors + + v16int() {} // Default constructor + + v16int( const v16int &a ) // Copy constructor + { + i[ 0] = a.i[ 0]; i[ 1] = a.i[ 1]; i[ 2] = a.i[ 2]; i[ 3] = a.i[ 3]; + i[ 4] = a.i[ 4]; i[ 5] = a.i[ 5]; i[ 6] = a.i[ 6]; i[ 7] = a.i[ 7]; + i[ 8] = a.i[ 8]; i[ 9] = a.i[ 9]; i[10] = a.i[10]; i[11] = a.i[11]; + i[12] = a.i[12]; i[13] = a.i[13]; i[14] = a.i[14]; i[15] = a.i[15]; + } + + v16int( const v16 &a ) // Init from mixed + { + i[ 0] = a.i[ 0]; i[ 1] = a.i[ 1]; i[ 2] = a.i[ 2]; i[ 3] = a.i[ 3]; + i[ 4] = a.i[ 4]; i[ 5] = a.i[ 5]; i[ 6] = a.i[ 6]; i[ 7] = a.i[ 7]; + i[ 8] = a.i[ 8]; i[ 9] = a.i[ 9]; i[10] = a.i[10]; i[11] = a.i[11]; + i[12] = a.i[12]; i[13] = a.i[13]; i[14] = a.i[14]; i[15] = a.i[15]; + } + + v16int( int a ) // Init from scalar + { + i[ 0] = a; i[ 1] = a; i[ 2] = a; i[ 3] = a; + i[ 4] = a; i[ 5] = a; i[ 6] = a; i[ 7] = a; + i[ 8] = a; i[ 9] = a; i[10] = a; i[11] = a; + i[12] = a; i[13] = a; i[14] = a; i[15] = a; + } + + v16int( int i00, int i01, int i02, int i03, + int i04, int i05, int i06, int i07, + int i08, int i09, int i10, int i11, + int i12, int i13, int i14, int i15 ) // Init from scalars + { + i[ 0] = i00; i[ 1] = i01; i[ 2] = i02; i[ 3] = i03; + i[ 4] = i04; i[ 5] = i05; i[ 6] = i06; i[ 7] = i07; + i[ 8] = i08; i[ 9] = i09; i[10] = i10; i[11] = i11; + i[12] = i12; i[13] = i13; i[14] = i14; i[15] = i15; + } + + ~v16int() {} // Destructor + + // v16int assignment operators + +# define ASSIGN(op) \ + inline v16int &operator op( const v16int &b ) \ + { \ + i[ 0] op b.i[ 0]; \ + i[ 1] op b.i[ 1]; \ + i[ 2] op b.i[ 2]; \ + i[ 3] op b.i[ 3]; \ + i[ 4] op b.i[ 4]; \ + i[ 5] op b.i[ 5]; \ + i[ 6] op b.i[ 6]; \ + i[ 7] op b.i[ 7]; \ + i[ 8] op b.i[ 8]; \ + i[ 9] op b.i[ 9]; \ + i[10] op b.i[10]; \ + i[11] op b.i[11]; \ + i[12] op b.i[12]; \ + i[13] op b.i[13]; \ + i[14] op b.i[14]; \ + i[15] op b.i[15]; \ + return *this; \ + } + + ASSIGN( =) + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + ASSIGN(%=) + ASSIGN(^=) + ASSIGN(&=) + ASSIGN(|=) + ASSIGN(<<=) + ASSIGN(>>=) + +# undef ASSIGN + + // v16int member access operator + + inline int &operator []( int n ) + { + return i[n]; + } + + inline int operator ()( int n ) + { + return i[n]; + } + }; + + // v16int prefix unary operators 
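The element-wise operator and helper definitions follow. As a quick orientation, the mask convention they implement is: each lane of a v16int comparison result is -1 (all bits set) where the condition holds and 0 where it does not. A minimal usage sketch, illustrative only and not part of the patch (the function and variable names are hypothetical):

    // Clamp each lane of v to vmax using the -1/0 lane masks produced by the
    // v16float comparison operators and the merge() helper declared above.
    inline v16float clamp_to_vmax( const v16float &v, const v16float &vmax )
    {
      v16int over = ( v > vmax );     // lane is -1 where v exceeds vmax, 0 elsewhere
      return merge( over, vmax, v );  // take vmax in the flagged lanes, v otherwise
    }

Because the masks are full 32-bit patterns, merge() can combine the float payloads purely bitwise, with no conversion between the integer and float views of the union.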
+ +# define PREFIX_UNARY(op) \ + inline v16int operator op( const v16int & a ) \ + { \ + v16int b; \ + b.i[ 0] = (op a.i[ 0]); \ + b.i[ 1] = (op a.i[ 1]); \ + b.i[ 2] = (op a.i[ 2]); \ + b.i[ 3] = (op a.i[ 3]); \ + b.i[ 4] = (op a.i[ 4]); \ + b.i[ 5] = (op a.i[ 5]); \ + b.i[ 6] = (op a.i[ 6]); \ + b.i[ 7] = (op a.i[ 7]); \ + b.i[ 8] = (op a.i[ 8]); \ + b.i[ 9] = (op a.i[ 9]); \ + b.i[10] = (op a.i[10]); \ + b.i[11] = (op a.i[11]); \ + b.i[12] = (op a.i[12]); \ + b.i[13] = (op a.i[13]); \ + b.i[14] = (op a.i[14]); \ + b.i[15] = (op a.i[15]); \ + return b; \ + } + + PREFIX_UNARY(+) + PREFIX_UNARY(-) + + inline v16int operator !( const v16int & a ) + { + v16int b; + b.i[ 0] = - ( !a.i[ 0] ); + b.i[ 1] = - ( !a.i[ 1] ); + b.i[ 2] = - ( !a.i[ 2] ); + b.i[ 3] = - ( !a.i[ 3] ); + b.i[ 4] = - ( !a.i[ 4] ); + b.i[ 5] = - ( !a.i[ 5] ); + b.i[ 6] = - ( !a.i[ 6] ); + b.i[ 7] = - ( !a.i[ 7] ); + b.i[ 8] = - ( !a.i[ 8] ); + b.i[ 9] = - ( !a.i[ 9] ); + b.i[10] = - ( !a.i[10] ); + b.i[11] = - ( !a.i[11] ); + b.i[12] = - ( !a.i[12] ); + b.i[13] = - ( !a.i[13] ); + b.i[14] = - ( !a.i[14] ); + b.i[15] = - ( !a.i[15] ); + return b; + } + + PREFIX_UNARY(~) + +# undef PREFIX_UNARY + + // v16int prefix increment / decrement + +# define PREFIX_INCDEC(op) \ + inline v16int operator op( v16int & a ) \ + { \ + v16int b; \ + b.i[ 0] = ( op a.i[ 0] ); \ + b.i[ 1] = ( op a.i[ 1] ); \ + b.i[ 2] = ( op a.i[ 2] ); \ + b.i[ 3] = ( op a.i[ 3] ); \ + b.i[ 4] = ( op a.i[ 4] ); \ + b.i[ 5] = ( op a.i[ 5] ); \ + b.i[ 6] = ( op a.i[ 6] ); \ + b.i[ 7] = ( op a.i[ 7] ); \ + b.i[ 8] = ( op a.i[ 8] ); \ + b.i[ 9] = ( op a.i[ 9] ); \ + b.i[10] = ( op a.i[10] ); \ + b.i[11] = ( op a.i[11] ); \ + b.i[12] = ( op a.i[12] ); \ + b.i[13] = ( op a.i[13] ); \ + b.i[14] = ( op a.i[14] ); \ + b.i[15] = ( op a.i[15] ); \ + return b; \ + } + + PREFIX_INCDEC(++) + PREFIX_INCDEC(--) + +# undef PREFIX_INCDEC + + // v16int postfix increment / decrement + +# define POSTFIX_INCDEC(op) \ + inline v16int operator op( v16int & a, int ) \ + { \ + v16int b; \ + b.i[ 0] = ( a.i[ 0] op ); \ + b.i[ 1] = ( a.i[ 1] op ); \ + b.i[ 2] = ( a.i[ 2] op ); \ + b.i[ 3] = ( a.i[ 3] op ); \ + b.i[ 4] = ( a.i[ 4] op ); \ + b.i[ 5] = ( a.i[ 5] op ); \ + b.i[ 6] = ( a.i[ 6] op ); \ + b.i[ 7] = ( a.i[ 7] op ); \ + b.i[ 8] = ( a.i[ 8] op ); \ + b.i[ 9] = ( a.i[ 9] op ); \ + b.i[10] = ( a.i[10] op ); \ + b.i[11] = ( a.i[11] op ); \ + b.i[12] = ( a.i[12] op ); \ + b.i[13] = ( a.i[13] op ); \ + b.i[14] = ( a.i[14] op ); \ + b.i[15] = ( a.i[15] op ); \ + return b; \ + } + + POSTFIX_INCDEC(++) + POSTFIX_INCDEC(--) + +# undef POSTFIX_INCDEC + + // v16int binary operators + +# define BINARY(op) \ + inline v16int operator op( const v16int &a, const v16int &b ) \ + { \ + v16int c; \ + c.i[ 0] = a.i[ 0] op b.i[ 0]; \ + c.i[ 1] = a.i[ 1] op b.i[ 1]; \ + c.i[ 2] = a.i[ 2] op b.i[ 2]; \ + c.i[ 3] = a.i[ 3] op b.i[ 3]; \ + c.i[ 4] = a.i[ 4] op b.i[ 4]; \ + c.i[ 5] = a.i[ 5] op b.i[ 5]; \ + c.i[ 6] = a.i[ 6] op b.i[ 6]; \ + c.i[ 7] = a.i[ 7] op b.i[ 7]; \ + c.i[ 8] = a.i[ 8] op b.i[ 8]; \ + c.i[ 9] = a.i[ 9] op b.i[ 9]; \ + c.i[10] = a.i[10] op b.i[10]; \ + c.i[11] = a.i[11] op b.i[11]; \ + c.i[12] = a.i[12] op b.i[12]; \ + c.i[13] = a.i[13] op b.i[13]; \ + c.i[14] = a.i[14] op b.i[14]; \ + c.i[15] = a.i[15] op b.i[15]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + BINARY(%) + BINARY(^) + BINARY(&) + BINARY(|) + BINARY(<<) + BINARY(>>) + +# undef BINARY + + // v16int logical operators + +# define LOGICAL(op) \ + inline v16int operator op( const v16int &a, const 
v16int &b ) \ + { \ + v16int c; \ + c.i[ 0] = - ( a.i[ 0] op b.i[ 0] ); \ + c.i[ 1] = - ( a.i[ 1] op b.i[ 1] ); \ + c.i[ 2] = - ( a.i[ 2] op b.i[ 2] ); \ + c.i[ 3] = - ( a.i[ 3] op b.i[ 3] ); \ + c.i[ 4] = - ( a.i[ 4] op b.i[ 4] ); \ + c.i[ 5] = - ( a.i[ 5] op b.i[ 5] ); \ + c.i[ 6] = - ( a.i[ 6] op b.i[ 6] ); \ + c.i[ 7] = - ( a.i[ 7] op b.i[ 7] ); \ + c.i[ 8] = - ( a.i[ 8] op b.i[ 8] ); \ + c.i[ 9] = - ( a.i[ 9] op b.i[ 9] ); \ + c.i[10] = - ( a.i[10] op b.i[10] ); \ + c.i[11] = - ( a.i[11] op b.i[11] ); \ + c.i[12] = - ( a.i[12] op b.i[12] ); \ + c.i[13] = - ( a.i[13] op b.i[13] ); \ + c.i[14] = - ( a.i[14] op b.i[14] ); \ + c.i[15] = - ( a.i[15] op b.i[15] ); \ + return c; \ + } + + LOGICAL(<) + LOGICAL(>) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v16int miscellaneous functions + + inline v16int abs( const v16int &a ) + { + v16int b; + + b.i[ 0] = ( a.i[ 0] >= 0 ) ? a.i[ 0] : - a.i[ 0]; + b.i[ 1] = ( a.i[ 1] >= 0 ) ? a.i[ 1] : - a.i[ 1]; + b.i[ 2] = ( a.i[ 2] >= 0 ) ? a.i[ 2] : - a.i[ 2]; + b.i[ 3] = ( a.i[ 3] >= 0 ) ? a.i[ 3] : - a.i[ 3]; + b.i[ 4] = ( a.i[ 4] >= 0 ) ? a.i[ 4] : - a.i[ 4]; + b.i[ 5] = ( a.i[ 5] >= 0 ) ? a.i[ 5] : - a.i[ 5]; + b.i[ 6] = ( a.i[ 6] >= 0 ) ? a.i[ 6] : - a.i[ 6]; + b.i[ 7] = ( a.i[ 7] >= 0 ) ? a.i[ 7] : - a.i[ 7]; + b.i[ 8] = ( a.i[ 8] >= 0 ) ? a.i[ 8] : - a.i[ 8]; + b.i[ 9] = ( a.i[ 9] >= 0 ) ? a.i[ 9] : - a.i[ 9]; + b.i[10] = ( a.i[10] >= 0 ) ? a.i[10] : - a.i[10]; + b.i[11] = ( a.i[11] >= 0 ) ? a.i[11] : - a.i[11]; + b.i[12] = ( a.i[12] >= 0 ) ? a.i[12] : - a.i[12]; + b.i[13] = ( a.i[13] >= 0 ) ? a.i[13] : - a.i[13]; + b.i[14] = ( a.i[14] >= 0 ) ? a.i[14] : - a.i[14]; + b.i[15] = ( a.i[15] >= 0 ) ? a.i[15] : - a.i[15]; + + return b; + } + + inline v16 czero( const v16int &c, const v16 &a ) + { + v16 b; + + b.i[ 0] = a.i[ 0] & ~c.i[ 0]; + b.i[ 1] = a.i[ 1] & ~c.i[ 1]; + b.i[ 2] = a.i[ 2] & ~c.i[ 2]; + b.i[ 3] = a.i[ 3] & ~c.i[ 3]; + b.i[ 4] = a.i[ 4] & ~c.i[ 4]; + b.i[ 5] = a.i[ 5] & ~c.i[ 5]; + b.i[ 6] = a.i[ 6] & ~c.i[ 6]; + b.i[ 7] = a.i[ 7] & ~c.i[ 7]; + b.i[ 8] = a.i[ 8] & ~c.i[ 8]; + b.i[ 9] = a.i[ 9] & ~c.i[ 9]; + b.i[10] = a.i[10] & ~c.i[10]; + b.i[11] = a.i[11] & ~c.i[11]; + b.i[12] = a.i[12] & ~c.i[12]; + b.i[13] = a.i[13] & ~c.i[13]; + b.i[14] = a.i[14] & ~c.i[14]; + b.i[15] = a.i[15] & ~c.i[15]; + + return b; + } + + inline v16 notczero( const v16int &c, const v16 &a ) + { + v16 b; + + b.i[ 0] = a.i[ 0] & c.i[ 0]; + b.i[ 1] = a.i[ 1] & c.i[ 1]; + b.i[ 2] = a.i[ 2] & c.i[ 2]; + b.i[ 3] = a.i[ 3] & c.i[ 3]; + b.i[ 4] = a.i[ 4] & c.i[ 4]; + b.i[ 5] = a.i[ 5] & c.i[ 5]; + b.i[ 6] = a.i[ 6] & c.i[ 6]; + b.i[ 7] = a.i[ 7] & c.i[ 7]; + b.i[ 8] = a.i[ 8] & c.i[ 8]; + b.i[ 9] = a.i[ 9] & c.i[ 9]; + b.i[10] = a.i[10] & c.i[10]; + b.i[11] = a.i[11] & c.i[11]; + b.i[12] = a.i[12] & c.i[12]; + b.i[13] = a.i[13] & c.i[13]; + b.i[14] = a.i[14] & c.i[14]; + b.i[15] = a.i[15] & c.i[15]; + + return b; + } + + inline v16 merge( const v16int &c, const v16 &t, const v16 &f ) + { + v16 m; + + m.i[ 0] = ( f.i[ 0] & ~c.i[ 0] ) | ( t.i[ 0] & c.i[ 0] ); + m.i[ 1] = ( f.i[ 1] & ~c.i[ 1] ) | ( t.i[ 1] & c.i[ 1] ); + m.i[ 2] = ( f.i[ 2] & ~c.i[ 2] ) | ( t.i[ 2] & c.i[ 2] ); + m.i[ 3] = ( f.i[ 3] & ~c.i[ 3] ) | ( t.i[ 3] & c.i[ 3] ); + m.i[ 4] = ( f.i[ 4] & ~c.i[ 4] ) | ( t.i[ 4] & c.i[ 4] ); + m.i[ 5] = ( f.i[ 5] & ~c.i[ 5] ) | ( t.i[ 5] & c.i[ 5] ); + m.i[ 6] = ( f.i[ 6] & ~c.i[ 6] ) | ( t.i[ 6] & c.i[ 6] ); + m.i[ 7] = ( f.i[ 7] & ~c.i[ 7] ) | ( t.i[ 7] & c.i[ 7] ); + m.i[ 8] = ( f.i[ 8] & ~c.i[ 8] ) | 
( t.i[ 8] & c.i[ 8] ); + m.i[ 9] = ( f.i[ 9] & ~c.i[ 9] ) | ( t.i[ 9] & c.i[ 9] ); + m.i[10] = ( f.i[10] & ~c.i[10] ) | ( t.i[10] & c.i[10] ); + m.i[11] = ( f.i[11] & ~c.i[11] ) | ( t.i[11] & c.i[11] ); + m.i[12] = ( f.i[12] & ~c.i[12] ) | ( t.i[12] & c.i[12] ); + m.i[13] = ( f.i[13] & ~c.i[13] ) | ( t.i[13] & c.i[13] ); + m.i[14] = ( f.i[14] & ~c.i[14] ) | ( t.i[14] & c.i[14] ); + m.i[15] = ( f.i[15] & ~c.i[15] ) | ( t.i[15] & c.i[15] ); + + return m; + } + + //////////////// + // v16float class + + class v16float : public v16 + { + // v16float prefix unary operator friends + + friend inline v16float operator +( const v16float &a ) ALWAYS_INLINE; + friend inline v16float operator -( const v16float &a ) ALWAYS_INLINE; + friend inline v16float operator ~( const v16float &a ) ALWAYS_INLINE; + friend inline v16int operator !( const v16float &a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v16float prefix increment / decrement operator friends + + friend inline v16float operator ++( v16float &a ) ALWAYS_INLINE; + friend inline v16float operator --( v16float &a ) ALWAYS_INLINE; + + // v16float postfix increment / decrement operator friends + + friend inline v16float operator ++( v16float &a, int ) ALWAYS_INLINE; + friend inline v16float operator --( v16float &a, int ) ALWAYS_INLINE; + + // v16float binary operator friends + + friend inline v16float operator +( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16float operator -( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16float operator *( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16float operator /( const v16float &a, const v16float &b ) ALWAYS_INLINE; + + // v16float logical operator friends + + friend inline v16int operator <( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator >( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator ==( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator !=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator <=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator >=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator &&( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator ||( const v16float &a, const v16float &b ) ALWAYS_INLINE; + + // v16float math library friends + +# define CMATH_FR1(fn) friend inline v16float fn( const v16float &a ) ALWAYS_INLINE +# define CMATH_FR2(fn) friend inline v16float fn( const v16float &a, \ + const v16float &b ) ALWAYS_INLINE + + CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); + CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); + CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); + CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); + CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); + + CMATH_FR2(copysign); + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v16float miscellaneous friends + + friend inline v16float rsqrt_approx( const v16float &a ) ALWAYS_INLINE; + friend inline v16float rsqrt ( const v16float &a ) ALWAYS_INLINE; + friend inline v16float rcp_approx( const v16float &a ) ALWAYS_INLINE; + friend inline v16float rcp ( const v16float &a ) ALWAYS_INLINE; + friend inline v16float fma ( const v16float &a, const v16float &b, const v16float &c 
) ALWAYS_INLINE; + friend inline v16float fms ( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; + friend inline v16float fnms( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; + friend inline v16float clear_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline v16float set_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline v16float toggle_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline void increment_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; + friend inline void decrement_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; + friend inline void scale_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; + + public: + + // v16float constructors / destructors + + v16float() {} // Default constructor + + v16float( const v16float &a ) // Copy constructor + { + f[ 0] = a.f[ 0]; f[ 1] = a.f[ 1]; f[ 2] = a.f[ 2]; f[ 3] = a.f[ 3]; + f[ 4] = a.f[ 4]; f[ 5] = a.f[ 5]; f[ 6] = a.f[ 6]; f[ 7] = a.f[ 7]; + f[ 8] = a.f[ 8]; f[ 9] = a.f[ 9]; f[10] = a.f[10]; f[11] = a.f[11]; + f[12] = a.f[12]; f[13] = a.f[13]; f[14] = a.f[14]; f[15] = a.f[15]; + } + + v16float( const v16 &a ) // Init from mixed + { + f[ 0] = a.f[ 0]; f[ 1] = a.f[ 1]; f[ 2] = a.f[ 2]; f[ 3] = a.f[ 3]; + f[ 4] = a.f[ 4]; f[ 5] = a.f[ 5]; f[ 6] = a.f[ 6]; f[ 7] = a.f[ 7]; + f[ 8] = a.f[ 8]; f[ 9] = a.f[ 9]; f[10] = a.f[10]; f[11] = a.f[11]; + f[12] = a.f[12]; f[13] = a.f[13]; f[14] = a.f[14]; f[15] = a.f[15]; + } + + v16float( float a ) // Init from scalar + { + f[ 0] = a; f[ 1] = a; f[ 2] = a; f[ 3] = a; + f[ 4] = a; f[ 5] = a; f[ 6] = a; f[ 7] = a; + f[ 8] = a; f[ 9] = a; f[10] = a; f[11] = a; + f[12] = a; f[13] = a; f[14] = a; f[15] = a; + } + + v16float( float f00, float f01, float f02, float f03, + float f04, float f05, float f06, float f07, + float f08, float f09, float f10, float f11, + float f12, float f13, float f14, float f15 ) // Init from scalars + { + f[ 0] = f00; f[ 1] = f01; f[ 2] = f02; f[ 3] = f03; + f[ 4] = f04; f[ 5] = f05; f[ 6] = f06; f[ 7] = f07; + f[ 8] = f08; f[ 9] = f09; f[10] = f10; f[11] = f11; + f[12] = f12; f[13] = f13; f[14] = f14; f[15] = f15; + } + + ~v16float() {} // Destructor + + // v16float assignment operators + +# define ASSIGN(op) \ + inline v16float &operator op( const v16float &b ) \ + { \ + f[ 0] op b.f[ 0]; \ + f[ 1] op b.f[ 1]; \ + f[ 2] op b.f[ 2]; \ + f[ 3] op b.f[ 3]; \ + f[ 4] op b.f[ 4]; \ + f[ 5] op b.f[ 5]; \ + f[ 6] op b.f[ 6]; \ + f[ 7] op b.f[ 7]; \ + f[ 8] op b.f[ 8]; \ + f[ 9] op b.f[ 9]; \ + f[10] op b.f[10]; \ + f[11] op b.f[11]; \ + f[12] op b.f[12]; \ + f[13] op b.f[13]; \ + f[14] op b.f[14]; \ + f[15] op b.f[15]; \ + return *this; \ + } + + ASSIGN(=) + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + +# undef ASSIGN + + // v16float member access operator + + inline float &operator []( int n ) + { + return f[n]; + } + + inline float operator ()( int n ) + { + return f[n]; + } + }; + + // v16float prefix unary operators + + inline v16float operator +( const v16float &a ) + { + v16float b; + + b.f[ 0] = +a.f[ 0]; + b.f[ 1] = +a.f[ 1]; + b.f[ 2] = +a.f[ 2]; + b.f[ 3] = +a.f[ 3]; + b.f[ 4] = +a.f[ 4]; + b.f[ 5] = +a.f[ 5]; + b.f[ 6] = +a.f[ 6]; + b.f[ 7] = +a.f[ 7]; + b.f[ 8] = +a.f[ 8]; + b.f[ 9] = +a.f[ 9]; + b.f[10] = +a.f[10]; + b.f[11] = +a.f[11]; + b.f[12] = +a.f[12]; + b.f[13] = +a.f[13]; + b.f[14] = +a.f[14]; + b.f[15] = +a.f[15]; + + return b; + } + + inline v16float operator -( const v16float &a ) + { + v16float b; 
+ + b.f[ 0] = -a.f[ 0]; + b.f[ 1] = -a.f[ 1]; + b.f[ 2] = -a.f[ 2]; + b.f[ 3] = -a.f[ 3]; + b.f[ 4] = -a.f[ 4]; + b.f[ 5] = -a.f[ 5]; + b.f[ 6] = -a.f[ 6]; + b.f[ 7] = -a.f[ 7]; + b.f[ 8] = -a.f[ 8]; + b.f[ 9] = -a.f[ 9]; + b.f[10] = -a.f[10]; + b.f[11] = -a.f[11]; + b.f[12] = -a.f[12]; + b.f[13] = -a.f[13]; + b.f[14] = -a.f[14]; + b.f[15] = -a.f[15]; + + return b; + } + + inline v16int operator !( const v16float &a ) + { + v16int b; + + b.i[ 0] = a.i[ 0] ? 0 : -1; + b.i[ 1] = a.i[ 1] ? 0 : -1; + b.i[ 2] = a.i[ 2] ? 0 : -1; + b.i[ 3] = a.i[ 3] ? 0 : -1; + b.i[ 4] = a.i[ 4] ? 0 : -1; + b.i[ 5] = a.i[ 5] ? 0 : -1; + b.i[ 6] = a.i[ 6] ? 0 : -1; + b.i[ 7] = a.i[ 7] ? 0 : -1; + b.i[ 8] = a.i[ 8] ? 0 : -1; + b.i[ 9] = a.i[ 9] ? 0 : -1; + b.i[10] = a.i[10] ? 0 : -1; + b.i[11] = a.i[11] ? 0 : -1; + b.i[12] = a.i[12] ? 0 : -1; + b.i[13] = a.i[13] ? 0 : -1; + b.i[14] = a.i[14] ? 0 : -1; + b.i[15] = a.i[15] ? 0 : -1; + + return b; + } + + // v16float prefix increment / decrement operators + + inline v16float operator ++( v16float &a ) + { + v16float b; + + b.f[ 0] = ++a.f[ 0]; + b.f[ 1] = ++a.f[ 1]; + b.f[ 2] = ++a.f[ 2]; + b.f[ 3] = ++a.f[ 3]; + b.f[ 4] = ++a.f[ 4]; + b.f[ 5] = ++a.f[ 5]; + b.f[ 6] = ++a.f[ 6]; + b.f[ 7] = ++a.f[ 7]; + b.f[ 8] = ++a.f[ 8]; + b.f[ 9] = ++a.f[ 9]; + b.f[10] = ++a.f[10]; + b.f[11] = ++a.f[11]; + b.f[12] = ++a.f[12]; + b.f[13] = ++a.f[13]; + b.f[14] = ++a.f[14]; + b.f[15] = ++a.f[15]; + + return b; + } + + inline v16float operator --( v16float &a ) + { + v16float b; + + b.f[ 0] = --a.f[ 0]; + b.f[ 1] = --a.f[ 1]; + b.f[ 2] = --a.f[ 2]; + b.f[ 3] = --a.f[ 3]; + b.f[ 4] = --a.f[ 4]; + b.f[ 5] = --a.f[ 5]; + b.f[ 6] = --a.f[ 6]; + b.f[ 7] = --a.f[ 7]; + b.f[ 8] = --a.f[ 8]; + b.f[ 9] = --a.f[ 9]; + b.f[10] = --a.f[10]; + b.f[11] = --a.f[11]; + b.f[12] = --a.f[12]; + b.f[13] = --a.f[13]; + b.f[14] = --a.f[14]; + b.f[15] = --a.f[15]; + + return b; + } + + // v16float postfix increment / decrement operators + + inline v16float operator ++( v16float &a, int ) + { + v16float b; + + b.f[ 0] = a.f[ 0]++; + b.f[ 1] = a.f[ 1]++; + b.f[ 2] = a.f[ 2]++; + b.f[ 3] = a.f[ 3]++; + b.f[ 4] = a.f[ 4]++; + b.f[ 5] = a.f[ 5]++; + b.f[ 6] = a.f[ 6]++; + b.f[ 7] = a.f[ 7]++; + b.f[ 8] = a.f[ 8]++; + b.f[ 9] = a.f[ 9]++; + b.f[10] = a.f[10]++; + b.f[11] = a.f[11]++; + b.f[12] = a.f[12]++; + b.f[13] = a.f[13]++; + b.f[14] = a.f[14]++; + b.f[15] = a.f[15]++; + + return b; + } + + inline v16float operator --( v16float &a, int ) + { + v16float b; + + b.f[ 0] = a.f[ 0]--; + b.f[ 1] = a.f[ 1]--; + b.f[ 2] = a.f[ 2]--; + b.f[ 3] = a.f[ 3]--; + b.f[ 4] = a.f[ 4]--; + b.f[ 5] = a.f[ 5]--; + b.f[ 6] = a.f[ 6]--; + b.f[ 7] = a.f[ 7]--; + b.f[ 8] = a.f[ 8]--; + b.f[ 9] = a.f[ 9]--; + b.f[10] = a.f[10]--; + b.f[11] = a.f[11]--; + b.f[12] = a.f[12]--; + b.f[13] = a.f[13]--; + b.f[14] = a.f[14]--; + b.f[15] = a.f[15]--; + + return b; + } + + // v16float binary operators + +# define BINARY(op) \ + inline v16float operator op( const v16float &a, const v16float &b ) \ + { \ + v16float c; \ + c.f[ 0] = a.f[ 0] op b.f[ 0]; \ + c.f[ 1] = a.f[ 1] op b.f[ 1]; \ + c.f[ 2] = a.f[ 2] op b.f[ 2]; \ + c.f[ 3] = a.f[ 3] op b.f[ 3]; \ + c.f[ 4] = a.f[ 4] op b.f[ 4]; \ + c.f[ 5] = a.f[ 5] op b.f[ 5]; \ + c.f[ 6] = a.f[ 6] op b.f[ 6]; \ + c.f[ 7] = a.f[ 7] op b.f[ 7]; \ + c.f[ 8] = a.f[ 8] op b.f[ 8]; \ + c.f[ 9] = a.f[ 9] op b.f[ 9]; \ + c.f[10] = a.f[10] op b.f[10]; \ + c.f[11] = a.f[11] op b.f[11]; \ + c.f[12] = a.f[12] op b.f[12]; \ + c.f[13] = a.f[13] op b.f[13]; \ + c.f[14] = a.f[14] op b.f[14]; \ + c.f[15] = a.f[15] 
op b.f[15]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + +# undef BINARY + + // v16float logical operators + +# define LOGICAL(op) \ + inline v16int operator op( const v16float &a, const v16float &b ) \ + { \ + v16int c; \ + c.i[ 0] = -( a.f[ 0] op b.f[ 0] ); \ + c.i[ 1] = -( a.f[ 1] op b.f[ 1] ); \ + c.i[ 2] = -( a.f[ 2] op b.f[ 2] ); \ + c.i[ 3] = -( a.f[ 3] op b.f[ 3] ); \ + c.i[ 4] = -( a.f[ 4] op b.f[ 4] ); \ + c.i[ 5] = -( a.f[ 5] op b.f[ 5] ); \ + c.i[ 6] = -( a.f[ 6] op b.f[ 6] ); \ + c.i[ 7] = -( a.f[ 7] op b.f[ 7] ); \ + c.i[ 8] = -( a.f[ 8] op b.f[ 8] ); \ + c.i[ 9] = -( a.f[ 9] op b.f[ 9] ); \ + c.i[10] = -( a.f[10] op b.f[10] ); \ + c.i[11] = -( a.f[11] op b.f[11] ); \ + c.i[12] = -( a.f[12] op b.f[12] ); \ + c.i[13] = -( a.f[13] op b.f[13] ); \ + c.i[14] = -( a.f[14] op b.f[14] ); \ + c.i[15] = -( a.f[15] op b.f[15] ); \ + return c; \ + } + + LOGICAL(< ) + LOGICAL(> ) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v16float math library functions + +# define CMATH_FR1(fn) \ + inline v16float fn( const v16float &a ) \ + { \ + v16float b; \ + b.f[ 0] = ::fn( a.f[ 0] ); \ + b.f[ 1] = ::fn( a.f[ 1] ); \ + b.f[ 2] = ::fn( a.f[ 2] ); \ + b.f[ 3] = ::fn( a.f[ 3] ); \ + b.f[ 4] = ::fn( a.f[ 4] ); \ + b.f[ 5] = ::fn( a.f[ 5] ); \ + b.f[ 6] = ::fn( a.f[ 6] ); \ + b.f[ 7] = ::fn( a.f[ 7] ); \ + b.f[ 8] = ::fn( a.f[ 8] ); \ + b.f[ 9] = ::fn( a.f[ 9] ); \ + b.f[10] = ::fn( a.f[10] ); \ + b.f[11] = ::fn( a.f[11] ); \ + b.f[12] = ::fn( a.f[12] ); \ + b.f[13] = ::fn( a.f[13] ); \ + b.f[14] = ::fn( a.f[14] ); \ + b.f[15] = ::fn( a.f[15] ); \ + return b; \ + } + +# define CMATH_FR2(fn) \ + inline v16float fn( const v16float &a, const v16float &b ) \ + { \ + v16float c; \ + c.f[ 0] = ::fn( a.f[ 0], b.f[ 0] ); \ + c.f[ 1] = ::fn( a.f[ 1], b.f[ 1] ); \ + c.f[ 2] = ::fn( a.f[ 2], b.f[ 2] ); \ + c.f[ 3] = ::fn( a.f[ 3], b.f[ 3] ); \ + c.f[ 4] = ::fn( a.f[ 4], b.f[ 4] ); \ + c.f[ 5] = ::fn( a.f[ 5], b.f[ 5] ); \ + c.f[ 6] = ::fn( a.f[ 6], b.f[ 6] ); \ + c.f[ 7] = ::fn( a.f[ 7], b.f[ 7] ); \ + c.f[ 8] = ::fn( a.f[ 8], b.f[ 8] ); \ + c.f[ 9] = ::fn( a.f[ 9], b.f[ 9] ); \ + c.f[10] = ::fn( a.f[10], b.f[10] ); \ + c.f[11] = ::fn( a.f[11], b.f[11] ); \ + c.f[12] = ::fn( a.f[12], b.f[12] ); \ + c.f[13] = ::fn( a.f[13], b.f[13] ); \ + c.f[14] = ::fn( a.f[14], b.f[14] ); \ + c.f[15] = ::fn( a.f[15], b.f[15] ); \ + return c; \ + } + + CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) + CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) + CMATH_FR1(fabs) CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) + CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) + CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) + + inline v16float copysign( const v16float &a, const v16float &b ) + { + v16float c; + float t; + + t = ::fabs( a.f[ 0] ); + if( b.f[ 0] < 0 ) t = -t; + c.f[ 0] = t; + + t = ::fabs( a.f[ 1] ); + if( b.f[ 1] < 0 ) t = -t; + c.f[ 1] = t; + + t = ::fabs( a.f[ 2] ); + if( b.f[ 2] < 0 ) t = -t; + c.f[ 2] = t; + + t = ::fabs( a.f[ 3] ); + if( b.f[ 3] < 0 ) t = -t; + c.f[ 3] = t; + + t = ::fabs( a.f[ 4] ); + if( b.f[ 4] < 0 ) t = -t; + c.f[ 4] = t; + + t = ::fabs( a.f[ 5] ); + if( b.f[ 5] < 0 ) t = -t; + c.f[ 5] = t; + + t = ::fabs( a.f[ 6] ); + if( b.f[ 6] < 0 ) t = -t; + c.f[ 6] = t; + + t = ::fabs( a.f[ 7] ); + if( b.f[ 7] < 0 ) t = -t; + c.f[ 7] = t; + + t = ::fabs( a.f[ 8] ); + if( b.f[ 8] < 0 ) t = -t; + c.f[ 8] = t; + + t = ::fabs( a.f[ 9] ); + if( b.f[ 9] < 0 ) t = -t; + c.f[ 9] = t; 
+ + t = ::fabs( a.f[10] ); + if( b.f[10] < 0 ) t = -t; + c.f[10] = t; + + t = ::fabs( a.f[11] ); + if( b.f[11] < 0 ) t = -t; + c.f[11] = t; + + t = ::fabs( a.f[12] ); + if( b.f[12] < 0 ) t = -t; + c.f[12] = t; + + t = ::fabs( a.f[13] ); + if( b.f[13] < 0 ) t = -t; + c.f[13] = t; + + t = ::fabs( a.f[14] ); + if( b.f[14] < 0 ) t = -t; + c.f[14] = t; + + t = ::fabs( a.f[15] ); + if( b.f[15] < 0 ) t = -t; + c.f[15] = t; + + return c; + } + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v16float miscellaneous functions + + inline v16float rsqrt_approx( const v16float &a ) + { + v16float b; + + b.f[ 0] = ::sqrt( 1.0f/a.f[ 0] ); + b.f[ 1] = ::sqrt( 1.0f/a.f[ 1] ); + b.f[ 2] = ::sqrt( 1.0f/a.f[ 2] ); + b.f[ 3] = ::sqrt( 1.0f/a.f[ 3] ); + b.f[ 4] = ::sqrt( 1.0f/a.f[ 4] ); + b.f[ 5] = ::sqrt( 1.0f/a.f[ 5] ); + b.f[ 6] = ::sqrt( 1.0f/a.f[ 6] ); + b.f[ 7] = ::sqrt( 1.0f/a.f[ 7] ); + b.f[ 8] = ::sqrt( 1.0f/a.f[ 8] ); + b.f[ 9] = ::sqrt( 1.0f/a.f[ 9] ); + b.f[10] = ::sqrt( 1.0f/a.f[10] ); + b.f[11] = ::sqrt( 1.0f/a.f[11] ); + b.f[12] = ::sqrt( 1.0f/a.f[12] ); + b.f[13] = ::sqrt( 1.0f/a.f[13] ); + b.f[14] = ::sqrt( 1.0f/a.f[14] ); + b.f[15] = ::sqrt( 1.0f/a.f[15] ); + + return b; + } + + inline v16float rsqrt( const v16float &a ) + { + v16float b; + + b.f[ 0] = ::sqrt( 1.0f/a.f[ 0] ); + b.f[ 1] = ::sqrt( 1.0f/a.f[ 1] ); + b.f[ 2] = ::sqrt( 1.0f/a.f[ 2] ); + b.f[ 3] = ::sqrt( 1.0f/a.f[ 3] ); + b.f[ 4] = ::sqrt( 1.0f/a.f[ 4] ); + b.f[ 5] = ::sqrt( 1.0f/a.f[ 5] ); + b.f[ 6] = ::sqrt( 1.0f/a.f[ 6] ); + b.f[ 7] = ::sqrt( 1.0f/a.f[ 7] ); + b.f[ 8] = ::sqrt( 1.0f/a.f[ 8] ); + b.f[ 9] = ::sqrt( 1.0f/a.f[ 9] ); + b.f[10] = ::sqrt( 1.0f/a.f[10] ); + b.f[11] = ::sqrt( 1.0f/a.f[11] ); + b.f[12] = ::sqrt( 1.0f/a.f[12] ); + b.f[13] = ::sqrt( 1.0f/a.f[13] ); + b.f[14] = ::sqrt( 1.0f/a.f[14] ); + b.f[15] = ::sqrt( 1.0f/a.f[15] ); + + return b; + } + + inline v16float rcp_approx( const v16float &a ) + { + v16float b; + + b.f[ 0] = 1.0f/a.f[ 0]; + b.f[ 1] = 1.0f/a.f[ 1]; + b.f[ 2] = 1.0f/a.f[ 2]; + b.f[ 3] = 1.0f/a.f[ 3]; + b.f[ 4] = 1.0f/a.f[ 4]; + b.f[ 5] = 1.0f/a.f[ 5]; + b.f[ 6] = 1.0f/a.f[ 6]; + b.f[ 7] = 1.0f/a.f[ 7]; + b.f[ 8] = 1.0f/a.f[ 8]; + b.f[ 9] = 1.0f/a.f[ 9]; + b.f[10] = 1.0f/a.f[10]; + b.f[11] = 1.0f/a.f[11]; + b.f[12] = 1.0f/a.f[12]; + b.f[13] = 1.0f/a.f[13]; + b.f[14] = 1.0f/a.f[14]; + b.f[15] = 1.0f/a.f[15]; + + return b; + } + + inline v16float rcp( const v16float &a ) + { + v16float b; + + b.f[ 0] = 1.0f/a.f[ 0]; + b.f[ 1] = 1.0f/a.f[ 1]; + b.f[ 2] = 1.0f/a.f[ 2]; + b.f[ 3] = 1.0f/a.f[ 3]; + b.f[ 4] = 1.0f/a.f[ 4]; + b.f[ 5] = 1.0f/a.f[ 5]; + b.f[ 6] = 1.0f/a.f[ 6]; + b.f[ 7] = 1.0f/a.f[ 7]; + b.f[ 8] = 1.0f/a.f[ 8]; + b.f[ 9] = 1.0f/a.f[ 9]; + b.f[10] = 1.0f/a.f[10]; + b.f[11] = 1.0f/a.f[11]; + b.f[12] = 1.0f/a.f[12]; + b.f[13] = 1.0f/a.f[13]; + b.f[14] = 1.0f/a.f[14]; + b.f[15] = 1.0f/a.f[15]; + + return b; + } + + inline v16float fma( const v16float &a, const v16float &b, const v16float &c ) + { + v16float d; + + d.f[ 0] = a.f[ 0] * b.f[ 0] + c.f[ 0]; + d.f[ 1] = a.f[ 1] * b.f[ 1] + c.f[ 1]; + d.f[ 2] = a.f[ 2] * b.f[ 2] + c.f[ 2]; + d.f[ 3] = a.f[ 3] * b.f[ 3] + c.f[ 3]; + d.f[ 4] = a.f[ 4] * b.f[ 4] + c.f[ 4]; + d.f[ 5] = a.f[ 5] * b.f[ 5] + c.f[ 5]; + d.f[ 6] = a.f[ 6] * b.f[ 6] + c.f[ 6]; + d.f[ 7] = a.f[ 7] * b.f[ 7] + c.f[ 7]; + d.f[ 8] = a.f[ 8] * b.f[ 8] + c.f[ 8]; + d.f[ 9] = a.f[ 9] * b.f[ 9] + c.f[ 9]; + d.f[10] = a.f[10] * b.f[10] + c.f[10]; + d.f[11] = a.f[11] * b.f[11] + c.f[11]; + d.f[12] = a.f[12] * b.f[12] + c.f[12]; + d.f[13] = a.f[13] * b.f[13] + c.f[13]; + d.f[14] = a.f[14] * 
b.f[14] + c.f[14]; + d.f[15] = a.f[15] * b.f[15] + c.f[15]; + + return d; + } + + inline v16float fms( const v16float &a, const v16float &b, const v16float &c ) + { + v16float d; + + d.f[ 0] = a.f[ 0] * b.f[ 0] - c.f[ 0]; + d.f[ 1] = a.f[ 1] * b.f[ 1] - c.f[ 1]; + d.f[ 2] = a.f[ 2] * b.f[ 2] - c.f[ 2]; + d.f[ 3] = a.f[ 3] * b.f[ 3] - c.f[ 3]; + d.f[ 4] = a.f[ 4] * b.f[ 4] - c.f[ 4]; + d.f[ 5] = a.f[ 5] * b.f[ 5] - c.f[ 5]; + d.f[ 6] = a.f[ 6] * b.f[ 6] - c.f[ 6]; + d.f[ 7] = a.f[ 7] * b.f[ 7] - c.f[ 7]; + d.f[ 8] = a.f[ 8] * b.f[ 8] - c.f[ 8]; + d.f[ 9] = a.f[ 9] * b.f[ 9] - c.f[ 9]; + d.f[10] = a.f[10] * b.f[10] - c.f[10]; + d.f[11] = a.f[11] * b.f[11] - c.f[11]; + d.f[12] = a.f[12] * b.f[12] - c.f[12]; + d.f[13] = a.f[13] * b.f[13] - c.f[13]; + d.f[14] = a.f[14] * b.f[14] - c.f[14]; + d.f[15] = a.f[15] * b.f[15] - c.f[15]; + + return d; + } + + inline v16float fnms( const v16float &a, const v16float &b, const v16float &c ) + { + v16float d; + + d.f[ 0] = c.f[ 0] - a.f[ 0] * b.f[ 0]; + d.f[ 1] = c.f[ 1] - a.f[ 1] * b.f[ 1]; + d.f[ 2] = c.f[ 2] - a.f[ 2] * b.f[ 2]; + d.f[ 3] = c.f[ 3] - a.f[ 3] * b.f[ 3]; + d.f[ 4] = c.f[ 4] - a.f[ 4] * b.f[ 4]; + d.f[ 5] = c.f[ 5] - a.f[ 5] * b.f[ 5]; + d.f[ 6] = c.f[ 6] - a.f[ 6] * b.f[ 6]; + d.f[ 7] = c.f[ 7] - a.f[ 7] * b.f[ 7]; + d.f[ 8] = c.f[ 8] - a.f[ 8] * b.f[ 8]; + d.f[ 9] = c.f[ 9] - a.f[ 9] * b.f[ 9]; + d.f[10] = c.f[10] - a.f[10] * b.f[10]; + d.f[11] = c.f[11] - a.f[11] * b.f[11]; + d.f[12] = c.f[12] - a.f[12] * b.f[12]; + d.f[13] = c.f[13] - a.f[13] * b.f[13]; + d.f[14] = c.f[14] - a.f[14] * b.f[14]; + d.f[15] = c.f[15] - a.f[15] * b.f[15]; + + return d; + } + + inline v16float clear_bits( const v16int &m, const v16float &a ) + { + v16float b; + + b.i[ 0] = ( ~m.i[ 0] ) & a.i[ 0]; + b.i[ 1] = ( ~m.i[ 1] ) & a.i[ 1]; + b.i[ 2] = ( ~m.i[ 2] ) & a.i[ 2]; + b.i[ 3] = ( ~m.i[ 3] ) & a.i[ 3]; + b.i[ 4] = ( ~m.i[ 4] ) & a.i[ 4]; + b.i[ 5] = ( ~m.i[ 5] ) & a.i[ 5]; + b.i[ 6] = ( ~m.i[ 6] ) & a.i[ 6]; + b.i[ 7] = ( ~m.i[ 7] ) & a.i[ 7]; + b.i[ 8] = ( ~m.i[ 8] ) & a.i[ 8]; + b.i[ 9] = ( ~m.i[ 9] ) & a.i[ 9]; + b.i[10] = ( ~m.i[10] ) & a.i[10]; + b.i[11] = ( ~m.i[11] ) & a.i[11]; + b.i[12] = ( ~m.i[12] ) & a.i[12]; + b.i[13] = ( ~m.i[13] ) & a.i[13]; + b.i[14] = ( ~m.i[14] ) & a.i[14]; + b.i[15] = ( ~m.i[15] ) & a.i[15]; + + return b; + } + + inline v16float set_bits( const v16int &m, const v16float &a ) + { + v16float b; + + b.i[ 0] = m.i[ 0] | a.i[ 0]; + b.i[ 1] = m.i[ 1] | a.i[ 1]; + b.i[ 2] = m.i[ 2] | a.i[ 2]; + b.i[ 3] = m.i[ 3] | a.i[ 3]; + b.i[ 4] = m.i[ 4] | a.i[ 4]; + b.i[ 5] = m.i[ 5] | a.i[ 5]; + b.i[ 6] = m.i[ 6] | a.i[ 6]; + b.i[ 7] = m.i[ 7] | a.i[ 7]; + b.i[ 8] = m.i[ 8] | a.i[ 8]; + b.i[ 9] = m.i[ 9] | a.i[ 9]; + b.i[10] = m.i[10] | a.i[10]; + b.i[11] = m.i[11] | a.i[11]; + b.i[12] = m.i[12] | a.i[12]; + b.i[13] = m.i[13] | a.i[13]; + b.i[14] = m.i[14] | a.i[14]; + b.i[15] = m.i[15] | a.i[15]; + + return b; + } + + inline v16float toggle_bits( const v16int &m, const v16float &a ) + { + v16float b; + + b.i[ 0] = m.i[ 0] ^ a.i[ 0]; + b.i[ 1] = m.i[ 1] ^ a.i[ 1]; + b.i[ 2] = m.i[ 2] ^ a.i[ 2]; + b.i[ 3] = m.i[ 3] ^ a.i[ 3]; + b.i[ 4] = m.i[ 4] ^ a.i[ 4]; + b.i[ 5] = m.i[ 5] ^ a.i[ 5]; + b.i[ 6] = m.i[ 6] ^ a.i[ 6]; + b.i[ 7] = m.i[ 7] ^ a.i[ 7]; + b.i[ 8] = m.i[ 8] ^ a.i[ 8]; + b.i[ 9] = m.i[ 9] ^ a.i[ 9]; + b.i[10] = m.i[10] ^ a.i[10]; + b.i[11] = m.i[11] ^ a.i[11]; + b.i[12] = m.i[12] ^ a.i[12]; + b.i[13] = m.i[13] ^ a.i[13]; + b.i[14] = m.i[14] ^ a.i[14]; + b.i[15] = m.i[15] ^ a.i[15]; + + return b; + } + + inline void increment_16x1( 
float * ALIGNED(64) p, const v16float &a ) + { + p[ 0] += a.f[ 0]; + p[ 1] += a.f[ 1]; + p[ 2] += a.f[ 2]; + p[ 3] += a.f[ 3]; + p[ 4] += a.f[ 4]; + p[ 5] += a.f[ 5]; + p[ 6] += a.f[ 6]; + p[ 7] += a.f[ 7]; + p[ 8] += a.f[ 8]; + p[ 9] += a.f[ 9]; + p[10] += a.f[10]; + p[11] += a.f[11]; + p[12] += a.f[12]; + p[13] += a.f[13]; + p[14] += a.f[14]; + p[15] += a.f[15]; + } + + inline void decrement_16x1( float * ALIGNED(64) p, const v16float &a ) + { + p[ 0] -= a.f[ 0]; + p[ 1] -= a.f[ 1]; + p[ 2] -= a.f[ 2]; + p[ 3] -= a.f[ 3]; + p[ 4] -= a.f[ 4]; + p[ 5] -= a.f[ 5]; + p[ 6] -= a.f[ 6]; + p[ 7] -= a.f[ 7]; + p[ 8] -= a.f[ 8]; + p[ 9] -= a.f[ 9]; + p[10] -= a.f[10]; + p[11] -= a.f[11]; + p[12] -= a.f[12]; + p[13] -= a.f[13]; + p[14] -= a.f[14]; + p[15] -= a.f[15]; + } + + inline void scale_16x1( float * ALIGNED(64) p, const v16float &a ) + { + p[ 0] *= a.f[ 0]; + p[ 1] *= a.f[ 1]; + p[ 2] *= a.f[ 2]; + p[ 3] *= a.f[ 3]; + p[ 4] *= a.f[ 4]; + p[ 5] *= a.f[ 5]; + p[ 6] *= a.f[ 6]; + p[ 7] *= a.f[ 7]; + p[ 8] *= a.f[ 8]; + p[ 9] *= a.f[ 9]; + p[10] *= a.f[10]; + p[11] *= a.f[11]; + p[12] *= a.f[12]; + p[13] *= a.f[13]; + p[14] *= a.f[14]; + p[15] *= a.f[15]; + } + +} // namespace v16 + +#endif // _v16_portable_h_ diff --git a/src/util/v16/v16_portable_v0.h b/src/util/v16/v16_portable_v0.h new file mode 100644 index 00000000..084d1bb2 --- /dev/null +++ b/src/util/v16/v16_portable_v0.h @@ -0,0 +1,4253 @@ +#ifndef _v16_portable_h_ +#define _v16_portable_h_ + +#ifndef IN_v16_h +#error "Do not include v16_portable.h directly; use v16.h" +#endif + +#define V16_ACCELERATION +#define V16_PORTABLE_ACCELERATION + +#include + +#ifndef ALIGNED +#define ALIGNED(n) +#endif + +#define ALWAYS_INLINE __attribute__((always_inline)) + +namespace v16 +{ + class v16; + class v16int; + class v16float; + + //////////////// + // v16 base class + + class v16 + { + friend class v16int; + friend class v16float; + + // v16 miscellaneous friends + + friend inline int any( const v16 &a ) ALWAYS_INLINE; + friend inline int all( const v16 &a ) ALWAYS_INLINE; + + template + friend inline v16 splat( const v16 &a ) ALWAYS_INLINE; + + template + friend inline v16 shuffle( const v16 &a ) ALWAYS_INLINE; + + friend inline void swap( v16 &a, v16 &b ) ALWAYS_INLINE; + friend inline void transpose( v16 &a00, v16 &a01, v16 &a02, v16 &a03, + v16 &a04, v16 &a05, v16 &a06, v16 &a07, + v16 &a08, v16 &a09, v16 &a10, v16 &a11, + v16 &a12, v16 &a13, v16 &a14, v16 &a15 ) ALWAYS_INLINE; + + // v16int miscellaneous friends + + friend inline v16 czero( const v16int &c, const v16 &a ) ALWAYS_INLINE; + friend inline v16 notczero( const v16int &c, const v16 &a ) ALWAYS_INLINE; + friend inline v16 merge( const v16int &c, const v16 &a, const v16 &b ) ALWAYS_INLINE; + + // v16 memory manipulation friends + + friend inline void load_16x1( const void * ALIGNED(64) p, v16 &a ) ALWAYS_INLINE; + friend inline void store_16x1( const v16 &a, void * ALIGNED(64) p ) ALWAYS_INLINE; + friend inline void stream_16x1( const v16 &a, void * ALIGNED(64) p ) ALWAYS_INLINE; + friend inline void clear_16x1( void * ALIGNED(64) dst ) ALWAYS_INLINE; + friend inline void copy_16x1( void * ALIGNED(64) dst, + const void * ALIGNED(64) src ) ALWAYS_INLINE; + friend inline void swap_16x1( void * ALIGNED(64) a, void * ALIGNED(64) b ) ALWAYS_INLINE; + + // v16 transposed memory manipulation friends + // Note: Half aligned values are permissible in the 16x2_tr variants. 
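// Descriptive note (added annotation): the *_tr ("transposed") accessors convert between
// 16 separate addresses and the 16 lanes of the vectors: store_16xN_tr scatters lane k of
// the j-th vector to word j at address k, and load_16xN_tr performs the corresponding
// inverse gather. The _p variants use the packed layout visible in store_16x8_tr_p and
// store_16x16_tr_p in v16_portable.h earlier in this patch, where each 64-byte destination
// row holds two consecutive lanes of eight of the vectors.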
+ + friend inline void load_16x1_tr( const void *a00, const void *a01, + const void *a02, const void *a03, + const void *a04, const void *a05, + const void *a06, const void *a07, + const void *a08, const void *a09, + const void *a10, const void *a11, + const void *a12, const void *a13, + const void *a14, const void *a15, + v16 &a ) ALWAYS_INLINE; + friend inline void load_16x2_tr( const void * ALIGNED(8) a00, + const void * ALIGNED(8) a01, + const void * ALIGNED(8) a02, + const void * ALIGNED(8) a03, + const void * ALIGNED(8) a04, + const void * ALIGNED(8) a05, + const void * ALIGNED(8) a06, + const void * ALIGNED(8) a07, + const void * ALIGNED(8) a08, + const void * ALIGNED(8) a09, + const void * ALIGNED(8) a10, + const void * ALIGNED(8) a11, + const void * ALIGNED(8) a12, + const void * ALIGNED(8) a13, + const void * ALIGNED(8) a14, + const void * ALIGNED(8) a15, + v16 &a, v16 &b ) ALWAYS_INLINE; + friend inline void load_16x3_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c ) ALWAYS_INLINE; + friend inline void load_16x4_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c, v16 &d ) ALWAYS_INLINE; + friend inline void load_16x8_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c, v16 &d, + v16 &e, v16 &f, v16 &g, v16 &h ) ALWAYS_INLINE; + friend inline void load_16x16_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07, + v16 &b08, v16 &b09, v16 &b10, v16 &b11, + v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) ALWAYS_INLINE; + friend inline void load_16x8_tr_p( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + 
const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + v16 &a, v16 &b, v16 &c, v16 &d, + v16 &e, v16 &f, v16 &g, v16 &h ) ALWAYS_INLINE; + friend inline void load_16x16_tr_p( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07, + v16 &b08, v16 &b09, v16 &b10, v16 &b11, + v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) ALWAYS_INLINE; + + friend inline void store_16x1_tr( const v16 &a, + void *a00, void *a01, void *a02, void *a03, + void *a04, void *a05, void *a06, void *a07, + void *a08, void *a09, void *a10, void *a11, + void *a12, void *a13, void *a14, void *a15 ) ALWAYS_INLINE; + friend inline void store_16x2_tr( const v16 &a, const v16 &b, + void * ALIGNED(8) a00, + void * ALIGNED(8) a01, + void * ALIGNED(8) a02, + void * ALIGNED(8) a03, + void * ALIGNED(8) a04, + void * ALIGNED(8) a05, + void * ALIGNED(8) a06, + void * ALIGNED(8) a07, + void * ALIGNED(8) a08, + void * ALIGNED(8) a09, + void * ALIGNED(8) a10, + void * ALIGNED(8) a11, + void * ALIGNED(8) a12, + void * ALIGNED(8) a13, + void * ALIGNED(8) a14, + void * ALIGNED(8) a15 ) ALWAYS_INLINE; + friend inline void store_16x3_tr( const v16 &a, const v16 &b, const v16 &c, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07, + void * ALIGNED(64) a08, + void * ALIGNED(64) a09, + void * ALIGNED(64) a10, + void * ALIGNED(64) a11, + void * ALIGNED(64) a12, + void * ALIGNED(64) a13, + void * ALIGNED(64) a14, + void * ALIGNED(64) a15 ) ALWAYS_INLINE; + friend inline void store_16x4_tr( const v16 &a, const v16 &b, + const v16 &c, const v16 &d, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07, + void * ALIGNED(64) a08, + void * ALIGNED(64) a09, + void * ALIGNED(64) a10, + void * ALIGNED(64) a11, + void * ALIGNED(64) a12, + void * ALIGNED(64) a13, + void * ALIGNED(64) a14, + void * ALIGNED(64) a15 ) ALWAYS_INLINE; + friend inline void store_16x8_tr( const v16 &a, const v16 &b, + const v16 &c, const v16 &d, + const v16 &e, const v16 &f, + const v16 &g, const v16 &h, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07, + void * ALIGNED(64) a08, + void * ALIGNED(64) a09, + void * ALIGNED(64) a10, + void * ALIGNED(64) a11, + void * ALIGNED(64) a12, + void * ALIGNED(64) a13, + void * ALIGNED(64) a14, + void * ALIGNED(64) a15 ) ALWAYS_INLINE; + friend inline void store_16x16_tr( const v16 &b00, const v16 &b01, + const v16 &b02, const v16 &b03, + const v16 &b04, const v16 &b05, + const v16 &b06, const v16 &b07, + const v16 &b08, const 
v16 &b09, + const v16 &b10, const v16 &b11, + const v16 &b12, const v16 &b13, + const v16 &b14, const v16 &b15, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07, + void * ALIGNED(64) a08, + void * ALIGNED(64) a09, + void * ALIGNED(64) a10, + void * ALIGNED(64) a11, + void * ALIGNED(64) a12, + void * ALIGNED(64) a13, + void * ALIGNED(64) a14, + void * ALIGNED(64) a15 ) ALWAYS_INLINE; + friend inline void store_16x8_tr_p( const v16 &a, const v16 &b, + const v16 &c, const v16 &d, + const v16 &e, const v16 &f, + const v16 &g, const v16 &h, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07 ) ALWAYS_INLINE; + friend inline void store_16x16_tr_p( const v16 &b00, const v16 &b01, + const v16 &b02, const v16 &b03, + const v16 &b04, const v16 &b05, + const v16 &b06, const v16 &b07, + const v16 &b08, const v16 &b09, + const v16 &b10, const v16 &b11, + const v16 &b12, const v16 &b13, + const v16 &b14, const v16 &b15, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07, + void * ALIGNED(64) a08, + void * ALIGNED(64) a09, + void * ALIGNED(64) a10, + void * ALIGNED(64) a11, + void * ALIGNED(64) a12, + void * ALIGNED(64) a13, + void * ALIGNED(64) a14, + void * ALIGNED(64) a15 ) ALWAYS_INLINE; + + protected: + + union + { + int i[16]; + float f[16]; + }; + + public: + + v16() {} // Default constructor + + v16( const v16 &a ) // Copy constructor + { + i[ 0]=a.i[ 0]; i[ 1]=a.i[ 1]; i[ 2]=a.i[ 2]; i[ 3]=a.i[ 3]; + i[ 4]=a.i[ 4]; i[ 5]=a.i[ 5]; i[ 6]=a.i[ 6]; i[ 7]=a.i[ 7]; + i[ 8]=a.i[ 8]; i[ 9]=a.i[ 9]; i[10]=a.i[10]; i[11]=a.i[11]; + i[12]=a.i[12]; i[13]=a.i[13]; i[14]=a.i[14]; i[15]=a.i[15]; + } + + ~v16() {} // Default destructor + }; + + // v16 miscellaneous functions + + inline int any( const v16 &a ) + { + return a.i[ 0] || a.i[ 1] || a.i[ 2] || a.i[ 3] || + a.i[ 4] || a.i[ 5] || a.i[ 6] || a.i[ 7] || + a.i[ 8] || a.i[ 9] || a.i[10] || a.i[11] || + a.i[12] || a.i[13] || a.i[14] || a.i[15]; + } + + inline int all( const v16 &a ) + { + return a.i[ 0] && a.i[ 1] && a.i[ 2] && a.i[ 3] && + a.i[ 4] && a.i[ 5] && a.i[ 6] && a.i[ 7] && + a.i[ 8] && a.i[ 9] && a.i[10] && a.i[11] && + a.i[12] && a.i[13] && a.i[14] && a.i[15]; + } + + template + inline v16 splat( const v16 & a ) + { + v16 b; + + b.i[ 0] = a.i[n]; + b.i[ 1] = a.i[n]; + b.i[ 2] = a.i[n]; + b.i[ 3] = a.i[n]; + b.i[ 4] = a.i[n]; + b.i[ 5] = a.i[n]; + b.i[ 6] = a.i[n]; + b.i[ 7] = a.i[n]; + b.i[ 8] = a.i[n]; + b.i[ 9] = a.i[n]; + b.i[10] = a.i[n]; + b.i[11] = a.i[n]; + b.i[12] = a.i[n]; + b.i[13] = a.i[n]; + b.i[14] = a.i[n]; + b.i[15] = a.i[n]; + + return b; + } + + template + inline v16 shuffle( const v16 & a ) + { + v16 b; + + b.i[ 0] = a.i[i00]; + b.i[ 1] = a.i[i01]; + b.i[ 2] = a.i[i02]; + b.i[ 3] = a.i[i03]; + b.i[ 4] = a.i[i04]; + b.i[ 5] = a.i[i05]; + b.i[ 6] = a.i[i06]; + b.i[ 7] = a.i[i07]; + b.i[ 8] = a.i[i08]; + b.i[ 9] = a.i[i09]; + b.i[10] = a.i[i10]; + b.i[11] = a.i[i11]; + b.i[12] = a.i[i12]; + b.i[13] = a.i[i13]; + b.i[14] = a.i[i14]; + b.i[15] = a.i[i15]; + + return b; + } + +# define sw(x,y) x^=y, y^=x, x^=y + + inline void swap( v16 &a, v16 &b ) + { + sw( a.i[ 0], b.i[ 
0] ); + sw( a.i[ 1], b.i[ 1] ); + sw( a.i[ 2], b.i[ 2] ); + sw( a.i[ 3], b.i[ 3] ); + sw( a.i[ 4], b.i[ 4] ); + sw( a.i[ 5], b.i[ 5] ); + sw( a.i[ 6], b.i[ 6] ); + sw( a.i[ 7], b.i[ 7] ); + sw( a.i[ 8], b.i[ 8] ); + sw( a.i[ 9], b.i[ 9] ); + sw( a.i[10], b.i[10] ); + sw( a.i[11], b.i[11] ); + sw( a.i[12], b.i[12] ); + sw( a.i[13], b.i[13] ); + sw( a.i[14], b.i[14] ); + sw( a.i[15], b.i[15] ); + } + + inline void transpose( v16 &a00, v16 &a01, v16 &a02, v16 &a03, + v16 &a04, v16 &a05, v16 &a06, v16 &a07, + v16 &a08, v16 &a09, v16 &a10, v16 &a11, + v16 &a12, v16 &a13, v16 &a14, v16 &a15 ) + { + sw( a00.i[1],a01.i[0] ); sw( a00.i[2],a02.i[0] ); sw( a00.i[3],a03.i[0] ); sw( a00.i[4],a04.i[0] ); sw( a00.i[5],a05.i[0] ); sw( a00.i[6],a06.i[0] ); sw( a00.i[7],a07.i[0] ); sw( a00.i[8],a08.i[0] ); sw( a00.i[9],a09.i[0] ); sw( a00.i[10],a10.i[0] ); sw( a00.i[11],a11.i[ 0] ); sw( a00.i[12],a12.i[ 0] ); sw( a00.i[13],a13.i[ 0] ); sw( a00.i[14],a14.i[ 0] ); sw( a00.i[15],a15.i[ 0] ); + sw( a01.i[2],a02.i[1] ); sw( a01.i[3],a03.i[1] ); sw( a01.i[4],a04.i[1] ); sw( a01.i[5],a05.i[1] ); sw( a01.i[6],a06.i[1] ); sw( a01.i[7],a07.i[1] ); sw( a01.i[8],a08.i[1] ); sw( a01.i[9],a09.i[1] ); sw( a01.i[10],a10.i[1] ); sw( a01.i[11],a11.i[ 1] ); sw( a01.i[12],a12.i[ 1] ); sw( a01.i[13],a13.i[ 1] ); sw( a01.i[14],a14.i[ 1] ); sw( a01.i[15],a15.i[ 1] ); + sw( a02.i[3],a03.i[2] ); sw( a02.i[4],a04.i[2] ); sw( a02.i[5],a05.i[2] ); sw( a02.i[6],a06.i[2] ); sw( a02.i[7],a07.i[2] ); sw( a02.i[8],a08.i[2] ); sw( a02.i[9],a09.i[2] ); sw( a02.i[10],a10.i[2] ); sw( a02.i[11],a11.i[ 2] ); sw( a02.i[12],a12.i[ 2] ); sw( a02.i[13],a13.i[ 2] ); sw( a02.i[14],a14.i[ 2] ); sw( a02.i[15],a15.i[ 2] ); + sw( a03.i[4],a04.i[3] ); sw( a03.i[5],a05.i[3] ); sw( a03.i[6],a06.i[3] ); sw( a03.i[7],a07.i[3] ); sw( a03.i[8],a08.i[3] ); sw( a03.i[9],a09.i[3] ); sw( a03.i[10],a10.i[3] ); sw( a03.i[11],a11.i[ 3] ); sw( a03.i[12],a12.i[ 3] ); sw( a03.i[13],a13.i[ 3] ); sw( a03.i[14],a14.i[ 3] ); sw( a03.i[15],a15.i[ 3] ); + sw( a04.i[5],a05.i[4] ); sw( a04.i[6],a06.i[4] ); sw( a04.i[7],a07.i[4] ); sw( a04.i[8],a08.i[4] ); sw( a04.i[9],a09.i[4] ); sw( a04.i[10],a10.i[4] ); sw( a04.i[11],a11.i[ 4] ); sw( a04.i[12],a12.i[ 4] ); sw( a04.i[13],a13.i[ 4] ); sw( a04.i[14],a14.i[ 4] ); sw( a04.i[15],a15.i[ 4] ); + sw( a05.i[6],a06.i[5] ); sw( a05.i[7],a07.i[5] ); sw( a05.i[8],a08.i[5] ); sw( a05.i[9],a09.i[5] ); sw( a05.i[10],a10.i[5] ); sw( a05.i[11],a11.i[ 5] ); sw( a05.i[12],a12.i[ 5] ); sw( a05.i[13],a13.i[ 5] ); sw( a05.i[14],a14.i[ 5] ); sw( a05.i[15],a15.i[ 5] ); + sw( a06.i[7],a07.i[6] ); sw( a06.i[8],a08.i[6] ); sw( a06.i[9],a09.i[6] ); sw( a06.i[10],a10.i[6] ); sw( a06.i[11],a11.i[ 6] ); sw( a06.i[12],a12.i[ 6] ); sw( a06.i[13],a13.i[ 6] ); sw( a06.i[14],a14.i[ 6] ); sw( a06.i[15],a15.i[ 6] ); + sw( a07.i[8],a08.i[7] ); sw( a07.i[9],a09.i[7] ); sw( a07.i[10],a10.i[7] ); sw( a07.i[11],a11.i[ 7] ); sw( a07.i[12],a12.i[ 7] ); sw( a07.i[13],a13.i[ 7] ); sw( a07.i[14],a14.i[ 7] ); sw( a07.i[15],a15.i[ 7] ); + sw( a08.i[9],a09.i[8] ); sw( a08.i[10],a10.i[8] ); sw( a08.i[11],a11.i[ 8] ); sw( a08.i[12],a12.i[ 8] ); sw( a08.i[13],a13.i[ 8] ); sw( a08.i[14],a14.i[ 8] ); sw( a08.i[15],a15.i[ 8] ); + sw( a09.i[10],a10.i[9] ); sw( a09.i[11],a11.i[ 9] ); sw( a09.i[12],a12.i[ 9] ); sw( a09.i[13],a13.i[ 9] ); sw( a09.i[14],a14.i[ 9] ); sw( a09.i[15],a15.i[ 9] ); + sw( a10.i[11],a11.i[10] ); sw( a10.i[12],a12.i[10] ); sw( a10.i[13],a13.i[10] ); sw( a10.i[14],a14.i[10] ); sw( a10.i[15],a15.i[10] ); + sw( a11.i[12],a12.i[11] ); sw( a11.i[13],a13.i[11] ); sw( 
a11.i[14],a14.i[11] ); sw( a11.i[15],a15.i[11] ); + sw( a12.i[13],a13.i[12] ); sw( a12.i[14],a14.i[12] ); sw( a12.i[15],a15.i[12] ); + sw( a13.i[14],a14.i[13] ); sw( a13.i[15],a15.i[13] ); + sw( a14.i[15],a15.i[14] ); + } + +# undef sw + + // v16 memory manipulation functions + + inline void load_16x1( const void * ALIGNED(64) p, + v16 &a ) + { + a.i[ 0] = ((const int * ALIGNED(64))p)[ 0]; + a.i[ 1] = ((const int * ALIGNED(64))p)[ 1]; + a.i[ 2] = ((const int * ALIGNED(64))p)[ 2]; + a.i[ 3] = ((const int * ALIGNED(64))p)[ 3]; + a.i[ 4] = ((const int * ALIGNED(64))p)[ 4]; + a.i[ 5] = ((const int * ALIGNED(64))p)[ 5]; + a.i[ 6] = ((const int * ALIGNED(64))p)[ 6]; + a.i[ 7] = ((const int * ALIGNED(64))p)[ 7]; + a.i[ 8] = ((const int * ALIGNED(64))p)[ 8]; + a.i[ 9] = ((const int * ALIGNED(64))p)[ 9]; + a.i[10] = ((const int * ALIGNED(64))p)[10]; + a.i[11] = ((const int * ALIGNED(64))p)[11]; + a.i[12] = ((const int * ALIGNED(64))p)[12]; + a.i[13] = ((const int * ALIGNED(64))p)[13]; + a.i[14] = ((const int * ALIGNED(64))p)[14]; + a.i[15] = ((const int * ALIGNED(64))p)[15]; + } + + inline void store_16x1( const v16 &a, + void * ALIGNED(64) p ) + { + ((int * ALIGNED(64))p)[ 0] = a.i[ 0]; + ((int * ALIGNED(64))p)[ 1] = a.i[ 1]; + ((int * ALIGNED(64))p)[ 2] = a.i[ 2]; + ((int * ALIGNED(64))p)[ 3] = a.i[ 3]; + ((int * ALIGNED(64))p)[ 4] = a.i[ 4]; + ((int * ALIGNED(64))p)[ 5] = a.i[ 5]; + ((int * ALIGNED(64))p)[ 6] = a.i[ 6]; + ((int * ALIGNED(64))p)[ 7] = a.i[ 7]; + ((int * ALIGNED(64))p)[ 8] = a.i[ 8]; + ((int * ALIGNED(64))p)[ 9] = a.i[ 9]; + ((int * ALIGNED(64))p)[10] = a.i[10]; + ((int * ALIGNED(64))p)[11] = a.i[11]; + ((int * ALIGNED(64))p)[12] = a.i[12]; + ((int * ALIGNED(64))p)[13] = a.i[13]; + ((int * ALIGNED(64))p)[14] = a.i[14]; + ((int * ALIGNED(64))p)[15] = a.i[15]; + } + + inline void stream_16x1( const v16 &a, + void * ALIGNED(64) p ) + { + ((int * ALIGNED(64))p)[ 0] = a.i[ 0]; + ((int * ALIGNED(64))p)[ 1] = a.i[ 1]; + ((int * ALIGNED(64))p)[ 2] = a.i[ 2]; + ((int * ALIGNED(64))p)[ 3] = a.i[ 3]; + ((int * ALIGNED(64))p)[ 4] = a.i[ 4]; + ((int * ALIGNED(64))p)[ 5] = a.i[ 5]; + ((int * ALIGNED(64))p)[ 6] = a.i[ 6]; + ((int * ALIGNED(64))p)[ 7] = a.i[ 7]; + ((int * ALIGNED(64))p)[ 8] = a.i[ 8]; + ((int * ALIGNED(64))p)[ 9] = a.i[ 9]; + ((int * ALIGNED(64))p)[10] = a.i[10]; + ((int * ALIGNED(64))p)[11] = a.i[11]; + ((int * ALIGNED(64))p)[12] = a.i[12]; + ((int * ALIGNED(64))p)[13] = a.i[13]; + ((int * ALIGNED(64))p)[14] = a.i[14]; + ((int * ALIGNED(64))p)[15] = a.i[15]; + } + + inline void clear_16x1( void * ALIGNED(64) p ) + { + ((int * ALIGNED(64))p)[ 0] = 0; + ((int * ALIGNED(64))p)[ 1] = 0; + ((int * ALIGNED(64))p)[ 2] = 0; + ((int * ALIGNED(64))p)[ 3] = 0; + ((int * ALIGNED(64))p)[ 4] = 0; + ((int * ALIGNED(64))p)[ 5] = 0; + ((int * ALIGNED(64))p)[ 6] = 0; + ((int * ALIGNED(64))p)[ 7] = 0; + ((int * ALIGNED(64))p)[ 8] = 0; + ((int * ALIGNED(64))p)[ 9] = 0; + ((int * ALIGNED(64))p)[10] = 0; + ((int * ALIGNED(64))p)[11] = 0; + ((int * ALIGNED(64))p)[12] = 0; + ((int * ALIGNED(64))p)[13] = 0; + ((int * ALIGNED(64))p)[14] = 0; + ((int * ALIGNED(64))p)[15] = 0; + } + + // FIXME: Ordering semantics + inline void copy_16x1( void * ALIGNED(64) dst, + const void * ALIGNED(64) src ) + { + ((int * ALIGNED(64))dst)[ 0] = ((const int * ALIGNED(64))src)[ 0]; + ((int * ALIGNED(64))dst)[ 1] = ((const int * ALIGNED(64))src)[ 1]; + ((int * ALIGNED(64))dst)[ 2] = ((const int * ALIGNED(64))src)[ 2]; + ((int * ALIGNED(64))dst)[ 3] = ((const int * ALIGNED(64))src)[ 3]; + ((int * ALIGNED(64))dst)[ 4] = 
((const int * ALIGNED(64))src)[ 4]; + ((int * ALIGNED(64))dst)[ 5] = ((const int * ALIGNED(64))src)[ 5]; + ((int * ALIGNED(64))dst)[ 6] = ((const int * ALIGNED(64))src)[ 6]; + ((int * ALIGNED(64))dst)[ 7] = ((const int * ALIGNED(64))src)[ 7]; + ((int * ALIGNED(64))dst)[ 8] = ((const int * ALIGNED(64))src)[ 8]; + ((int * ALIGNED(64))dst)[ 9] = ((const int * ALIGNED(64))src)[ 9]; + ((int * ALIGNED(64))dst)[10] = ((const int * ALIGNED(64))src)[10]; + ((int * ALIGNED(64))dst)[11] = ((const int * ALIGNED(64))src)[11]; + ((int * ALIGNED(64))dst)[12] = ((const int * ALIGNED(64))src)[12]; + ((int * ALIGNED(64))dst)[13] = ((const int * ALIGNED(64))src)[13]; + ((int * ALIGNED(64))dst)[14] = ((const int * ALIGNED(64))src)[14]; + ((int * ALIGNED(64))dst)[15] = ((const int * ALIGNED(64))src)[15]; + } + + inline void swap_16x1( void * ALIGNED(64) a, + void * ALIGNED(64) b ) + { + int t; + + t = ((int * ALIGNED(64))a)[ 0]; + ((int * ALIGNED(64))a)[ 0] = ((int * ALIGNED(64))b)[ 0]; + ((int * ALIGNED(64))b)[ 0] = t; + + t = ((int * ALIGNED(64))a)[ 1]; + ((int * ALIGNED(64))a)[ 1] = ((int * ALIGNED(64))b)[ 1]; + ((int * ALIGNED(64))b)[ 1] = t; + + t = ((int * ALIGNED(64))a)[ 2]; + ((int * ALIGNED(64))a)[ 2] = ((int * ALIGNED(64))b)[ 2]; + ((int * ALIGNED(64))b)[ 2] = t; + + t = ((int * ALIGNED(64))a)[ 3]; + ((int * ALIGNED(64))a)[ 3] = ((int * ALIGNED(64))b)[ 3]; + ((int * ALIGNED(64))b)[ 3] = t; + + t = ((int * ALIGNED(64))a)[ 4]; + ((int * ALIGNED(64))a)[ 4] = ((int * ALIGNED(64))b)[ 4]; + ((int * ALIGNED(64))b)[ 4] = t; + + t = ((int * ALIGNED(64))a)[ 5]; + ((int * ALIGNED(64))a)[ 5] = ((int * ALIGNED(64))b)[ 5]; + ((int * ALIGNED(64))b)[ 5] = t; + + t = ((int * ALIGNED(64))a)[ 6]; + ((int * ALIGNED(64))a)[ 6] = ((int * ALIGNED(64))b)[ 6]; + ((int * ALIGNED(64))b)[ 6] = t; + + t = ((int * ALIGNED(64))a)[ 7]; + ((int * ALIGNED(64))a)[ 7] = ((int * ALIGNED(64))b)[ 7]; + ((int * ALIGNED(64))b)[ 7] = t; + + t = ((int * ALIGNED(64))a)[ 8]; + ((int * ALIGNED(64))a)[ 8] = ((int * ALIGNED(64))b)[ 8]; + ((int * ALIGNED(64))b)[ 8] = t; + + t = ((int * ALIGNED(64))a)[ 9]; + ((int * ALIGNED(64))a)[ 9] = ((int * ALIGNED(64))b)[ 9]; + ((int * ALIGNED(64))b)[ 9] = t; + + t = ((int * ALIGNED(64))a)[10]; + ((int * ALIGNED(64))a)[10] = ((int * ALIGNED(64))b)[10]; + ((int * ALIGNED(64))b)[10] = t; + + t = ((int * ALIGNED(64))a)[11]; + ((int * ALIGNED(64))a)[11] = ((int * ALIGNED(64))b)[11]; + ((int * ALIGNED(64))b)[11] = t; + + t = ((int * ALIGNED(64))a)[12]; + ((int * ALIGNED(64))a)[12] = ((int * ALIGNED(64))b)[12]; + ((int * ALIGNED(64))b)[12] = t; + + t = ((int * ALIGNED(64))a)[13]; + ((int * ALIGNED(64))a)[13] = ((int * ALIGNED(64))b)[13]; + ((int * ALIGNED(64))b)[13] = t; + + t = ((int * ALIGNED(64))a)[14]; + ((int * ALIGNED(64))a)[14] = ((int * ALIGNED(64))b)[14]; + ((int * ALIGNED(64))b)[14] = t; + + t = ((int * ALIGNED(64))a)[15]; + ((int * ALIGNED(64))a)[15] = ((int * ALIGNED(64))b)[15]; + ((int * ALIGNED(64))b)[15] = t; + } + + // v16 transposed memory manipulation functions + + inline void load_16x1_tr( const void *a00, const void *a01, + const void *a02, const void *a03, + const void *a04, const void *a05, + const void *a06, const void *a07, + const void *a08, const void *a09, + const void *a10, const void *a11, + const void *a12, const void *a13, + const void *a14, const void *a15, + v16 &a ) + { + a.i[ 0] = ((const int *)a00)[0]; + a.i[ 1] = ((const int *)a01)[0]; + a.i[ 2] = ((const int *)a02)[0]; + a.i[ 3] = ((const int *)a03)[0]; + a.i[ 4] = ((const int *)a04)[0]; + a.i[ 5] = ((const int *)a05)[0]; + a.i[ 
6] = ((const int *)a06)[0]; + a.i[ 7] = ((const int *)a07)[0]; + a.i[ 8] = ((const int *)a08)[0]; + a.i[ 9] = ((const int *)a09)[0]; + a.i[10] = ((const int *)a10)[0]; + a.i[11] = ((const int *)a11)[0]; + a.i[12] = ((const int *)a12)[0]; + a.i[13] = ((const int *)a13)[0]; + a.i[14] = ((const int *)a14)[0]; + a.i[15] = ((const int *)a15)[0]; + } + + inline void load_16x2_tr( const void * ALIGNED(8) a00, + const void * ALIGNED(8) a01, + const void * ALIGNED(8) a02, + const void * ALIGNED(8) a03, + const void * ALIGNED(8) a04, + const void * ALIGNED(8) a05, + const void * ALIGNED(8) a06, + const void * ALIGNED(8) a07, + const void * ALIGNED(8) a08, + const void * ALIGNED(8) a09, + const void * ALIGNED(8) a10, + const void * ALIGNED(8) a11, + const void * ALIGNED(8) a12, + const void * ALIGNED(8) a13, + const void * ALIGNED(8) a14, + const void * ALIGNED(8) a15, + v16 &a, v16 &b ) + { + a.i[ 0] = ((const int * ALIGNED(8))a00)[0]; + b.i[ 0] = ((const int * ALIGNED(8))a00)[1]; + + a.i[ 1] = ((const int * ALIGNED(8))a01)[0]; + b.i[ 1] = ((const int * ALIGNED(8))a01)[1]; + + a.i[ 2] = ((const int * ALIGNED(8))a02)[0]; + b.i[ 2] = ((const int * ALIGNED(8))a02)[1]; + + a.i[ 3] = ((const int * ALIGNED(8))a03)[0]; + b.i[ 3] = ((const int * ALIGNED(8))a03)[1]; + + a.i[ 4] = ((const int * ALIGNED(8))a04)[0]; + b.i[ 4] = ((const int * ALIGNED(8))a04)[1]; + + a.i[ 5] = ((const int * ALIGNED(8))a05)[0]; + b.i[ 5] = ((const int * ALIGNED(8))a05)[1]; + + a.i[ 6] = ((const int * ALIGNED(8))a06)[0]; + b.i[ 6] = ((const int * ALIGNED(8))a06)[1]; + + a.i[ 7] = ((const int * ALIGNED(8))a07)[0]; + b.i[ 7] = ((const int * ALIGNED(8))a07)[1]; + + a.i[ 8] = ((const int * ALIGNED(8))a08)[0]; + b.i[ 8] = ((const int * ALIGNED(8))a08)[1]; + + a.i[ 9] = ((const int * ALIGNED(8))a09)[0]; + b.i[ 9] = ((const int * ALIGNED(8))a09)[1]; + + a.i[10] = ((const int * ALIGNED(8))a10)[0]; + b.i[10] = ((const int * ALIGNED(8))a10)[1]; + + a.i[11] = ((const int * ALIGNED(8))a11)[0]; + b.i[11] = ((const int * ALIGNED(8))a11)[1]; + + a.i[12] = ((const int * ALIGNED(8))a12)[0]; + b.i[12] = ((const int * ALIGNED(8))a12)[1]; + + a.i[13] = ((const int * ALIGNED(8))a13)[0]; + b.i[13] = ((const int * ALIGNED(8))a13)[1]; + + a.i[14] = ((const int * ALIGNED(8))a14)[0]; + b.i[14] = ((const int * ALIGNED(8))a14)[1]; + + a.i[15] = ((const int * ALIGNED(8))a15)[0]; + b.i[15] = ((const int * ALIGNED(8))a15)[1]; + } + + inline void load_16x3_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c ) + { + a.i[ 0] = ((const int * ALIGNED(64))a00)[0]; + b.i[ 0] = ((const int * ALIGNED(64))a00)[1]; + c.i[ 0] = ((const int * ALIGNED(64))a00)[2]; + + a.i[ 1] = ((const int * ALIGNED(64))a01)[0]; + b.i[ 1] = ((const int * ALIGNED(64))a01)[1]; + c.i[ 1] = ((const int * ALIGNED(64))a01)[2]; + + a.i[ 2] = ((const int * ALIGNED(64))a02)[0]; + b.i[ 2] = ((const int * ALIGNED(64))a02)[1]; + c.i[ 2] = ((const int * ALIGNED(64))a02)[2]; + + a.i[ 3] = ((const int * ALIGNED(64))a03)[0]; + b.i[ 3] = ((const int * ALIGNED(64))a03)[1]; + c.i[ 3] = ((const int * ALIGNED(64))a03)[2]; + + a.i[ 4] = 
((const int * ALIGNED(64))a04)[0]; + b.i[ 4] = ((const int * ALIGNED(64))a04)[1]; + c.i[ 4] = ((const int * ALIGNED(64))a04)[2]; + + a.i[ 5] = ((const int * ALIGNED(64))a05)[0]; + b.i[ 5] = ((const int * ALIGNED(64))a05)[1]; + c.i[ 5] = ((const int * ALIGNED(64))a05)[2]; + + a.i[ 6] = ((const int * ALIGNED(64))a06)[0]; + b.i[ 6] = ((const int * ALIGNED(64))a06)[1]; + c.i[ 6] = ((const int * ALIGNED(64))a06)[2]; + + a.i[ 7] = ((const int * ALIGNED(64))a07)[0]; + b.i[ 7] = ((const int * ALIGNED(64))a07)[1]; + c.i[ 7] = ((const int * ALIGNED(64))a07)[2]; + + a.i[ 8] = ((const int * ALIGNED(64))a08)[0]; + b.i[ 8] = ((const int * ALIGNED(64))a08)[1]; + c.i[ 8] = ((const int * ALIGNED(64))a08)[2]; + + a.i[ 9] = ((const int * ALIGNED(64))a09)[0]; + b.i[ 9] = ((const int * ALIGNED(64))a09)[1]; + c.i[ 9] = ((const int * ALIGNED(64))a09)[2]; + + a.i[10] = ((const int * ALIGNED(64))a10)[0]; + b.i[10] = ((const int * ALIGNED(64))a10)[1]; + c.i[10] = ((const int * ALIGNED(64))a10)[2]; + + a.i[11] = ((const int * ALIGNED(64))a11)[0]; + b.i[11] = ((const int * ALIGNED(64))a11)[1]; + c.i[11] = ((const int * ALIGNED(64))a11)[2]; + + a.i[12] = ((const int * ALIGNED(64))a12)[0]; + b.i[12] = ((const int * ALIGNED(64))a12)[1]; + c.i[12] = ((const int * ALIGNED(64))a12)[2]; + + a.i[13] = ((const int * ALIGNED(64))a13)[0]; + b.i[13] = ((const int * ALIGNED(64))a13)[1]; + c.i[13] = ((const int * ALIGNED(64))a13)[2]; + + a.i[14] = ((const int * ALIGNED(64))a14)[0]; + b.i[14] = ((const int * ALIGNED(64))a14)[1]; + c.i[14] = ((const int * ALIGNED(64))a14)[2]; + + a.i[15] = ((const int * ALIGNED(64))a15)[0]; + b.i[15] = ((const int * ALIGNED(64))a15)[1]; + c.i[15] = ((const int * ALIGNED(64))a15)[2]; + } + + inline void load_16x4_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c, v16 &d ) + { + a.i[ 0] = ((const int * ALIGNED(64))a00)[0]; + b.i[ 0] = ((const int * ALIGNED(64))a00)[1]; + c.i[ 0] = ((const int * ALIGNED(64))a00)[2]; + d.i[ 0] = ((const int * ALIGNED(64))a00)[3]; + + a.i[ 1] = ((const int * ALIGNED(64))a01)[0]; + b.i[ 1] = ((const int * ALIGNED(64))a01)[1]; + c.i[ 1] = ((const int * ALIGNED(64))a01)[2]; + d.i[ 1] = ((const int * ALIGNED(64))a01)[3]; + + a.i[ 2] = ((const int * ALIGNED(64))a02)[0]; + b.i[ 2] = ((const int * ALIGNED(64))a02)[1]; + c.i[ 2] = ((const int * ALIGNED(64))a02)[2]; + d.i[ 2] = ((const int * ALIGNED(64))a02)[3]; + + a.i[ 3] = ((const int * ALIGNED(64))a03)[0]; + b.i[ 3] = ((const int * ALIGNED(64))a03)[1]; + c.i[ 3] = ((const int * ALIGNED(64))a03)[2]; + d.i[ 3] = ((const int * ALIGNED(64))a03)[3]; + + a.i[ 4] = ((const int * ALIGNED(64))a04)[0]; + b.i[ 4] = ((const int * ALIGNED(64))a04)[1]; + c.i[ 4] = ((const int * ALIGNED(64))a04)[2]; + d.i[ 4] = ((const int * ALIGNED(64))a04)[3]; + + a.i[ 5] = ((const int * ALIGNED(64))a05)[0]; + b.i[ 5] = ((const int * ALIGNED(64))a05)[1]; + c.i[ 5] = ((const int * ALIGNED(64))a05)[2]; + d.i[ 5] = ((const int * ALIGNED(64))a05)[3]; + + a.i[ 6] = ((const int * ALIGNED(64))a06)[0]; + b.i[ 6] = ((const int * ALIGNED(64))a06)[1]; + c.i[ 6] = ((const int * 
ALIGNED(64))a06)[2]; + d.i[ 6] = ((const int * ALIGNED(64))a06)[3]; + + a.i[ 7] = ((const int * ALIGNED(64))a07)[0]; + b.i[ 7] = ((const int * ALIGNED(64))a07)[1]; + c.i[ 7] = ((const int * ALIGNED(64))a07)[2]; + d.i[ 7] = ((const int * ALIGNED(64))a07)[3]; + + a.i[ 8] = ((const int * ALIGNED(64))a08)[0]; + b.i[ 8] = ((const int * ALIGNED(64))a08)[1]; + c.i[ 8] = ((const int * ALIGNED(64))a08)[2]; + d.i[ 8] = ((const int * ALIGNED(64))a08)[3]; + + a.i[ 9] = ((const int * ALIGNED(64))a09)[0]; + b.i[ 9] = ((const int * ALIGNED(64))a09)[1]; + c.i[ 9] = ((const int * ALIGNED(64))a09)[2]; + d.i[ 9] = ((const int * ALIGNED(64))a09)[3]; + + a.i[10] = ((const int * ALIGNED(64))a10)[0]; + b.i[10] = ((const int * ALIGNED(64))a10)[1]; + c.i[10] = ((const int * ALIGNED(64))a10)[2]; + d.i[10] = ((const int * ALIGNED(64))a10)[3]; + + a.i[11] = ((const int * ALIGNED(64))a11)[0]; + b.i[11] = ((const int * ALIGNED(64))a11)[1]; + c.i[11] = ((const int * ALIGNED(64))a11)[2]; + d.i[11] = ((const int * ALIGNED(64))a11)[3]; + + a.i[12] = ((const int * ALIGNED(64))a12)[0]; + b.i[12] = ((const int * ALIGNED(64))a12)[1]; + c.i[12] = ((const int * ALIGNED(64))a12)[2]; + d.i[12] = ((const int * ALIGNED(64))a12)[3]; + + a.i[13] = ((const int * ALIGNED(64))a13)[0]; + b.i[13] = ((const int * ALIGNED(64))a13)[1]; + c.i[13] = ((const int * ALIGNED(64))a13)[2]; + d.i[13] = ((const int * ALIGNED(64))a13)[3]; + + a.i[14] = ((const int * ALIGNED(64))a14)[0]; + b.i[14] = ((const int * ALIGNED(64))a14)[1]; + c.i[14] = ((const int * ALIGNED(64))a14)[2]; + d.i[14] = ((const int * ALIGNED(64))a14)[3]; + + a.i[15] = ((const int * ALIGNED(64))a15)[0]; + b.i[15] = ((const int * ALIGNED(64))a15)[1]; + c.i[15] = ((const int * ALIGNED(64))a15)[2]; + d.i[15] = ((const int * ALIGNED(64))a15)[3]; + } + + inline void load_16x8_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c, v16 &d, + v16 &e, v16 &f, v16 &g, v16 &h ) + { + a.i[ 0] = ((const int * ALIGNED(64))a00)[0]; + b.i[ 0] = ((const int * ALIGNED(64))a00)[1]; + c.i[ 0] = ((const int * ALIGNED(64))a00)[2]; + d.i[ 0] = ((const int * ALIGNED(64))a00)[3]; + e.i[ 0] = ((const int * ALIGNED(64))a00)[4]; + f.i[ 0] = ((const int * ALIGNED(64))a00)[5]; + g.i[ 0] = ((const int * ALIGNED(64))a00)[6]; + h.i[ 0] = ((const int * ALIGNED(64))a00)[7]; + + a.i[ 1] = ((const int * ALIGNED(64))a01)[0]; + b.i[ 1] = ((const int * ALIGNED(64))a01)[1]; + c.i[ 1] = ((const int * ALIGNED(64))a01)[2]; + d.i[ 1] = ((const int * ALIGNED(64))a01)[3]; + e.i[ 1] = ((const int * ALIGNED(64))a01)[4]; + f.i[ 1] = ((const int * ALIGNED(64))a01)[5]; + g.i[ 1] = ((const int * ALIGNED(64))a01)[6]; + h.i[ 1] = ((const int * ALIGNED(64))a01)[7]; + + a.i[ 2] = ((const int * ALIGNED(64))a02)[0]; + b.i[ 2] = ((const int * ALIGNED(64))a02)[1]; + c.i[ 2] = ((const int * ALIGNED(64))a02)[2]; + d.i[ 2] = ((const int * ALIGNED(64))a02)[3]; + e.i[ 2] = ((const int * ALIGNED(64))a02)[4]; + f.i[ 2] = ((const int * ALIGNED(64))a02)[5]; + g.i[ 2] = ((const int * ALIGNED(64))a02)[6]; + h.i[ 2] = ((const int * ALIGNED(64))a02)[7]; + + a.i[ 3] = ((const int * 
ALIGNED(64))a03)[0]; + b.i[ 3] = ((const int * ALIGNED(64))a03)[1]; + c.i[ 3] = ((const int * ALIGNED(64))a03)[2]; + d.i[ 3] = ((const int * ALIGNED(64))a03)[3]; + e.i[ 3] = ((const int * ALIGNED(64))a03)[4]; + f.i[ 3] = ((const int * ALIGNED(64))a03)[5]; + g.i[ 3] = ((const int * ALIGNED(64))a03)[6]; + h.i[ 3] = ((const int * ALIGNED(64))a03)[7]; + + a.i[ 4] = ((const int * ALIGNED(64))a04)[0]; + b.i[ 4] = ((const int * ALIGNED(64))a04)[1]; + c.i[ 4] = ((const int * ALIGNED(64))a04)[2]; + d.i[ 4] = ((const int * ALIGNED(64))a04)[3]; + e.i[ 4] = ((const int * ALIGNED(64))a04)[4]; + f.i[ 4] = ((const int * ALIGNED(64))a04)[5]; + g.i[ 4] = ((const int * ALIGNED(64))a04)[6]; + h.i[ 4] = ((const int * ALIGNED(64))a04)[7]; + + a.i[ 5] = ((const int * ALIGNED(64))a05)[0]; + b.i[ 5] = ((const int * ALIGNED(64))a05)[1]; + c.i[ 5] = ((const int * ALIGNED(64))a05)[2]; + d.i[ 5] = ((const int * ALIGNED(64))a05)[3]; + e.i[ 5] = ((const int * ALIGNED(64))a05)[4]; + f.i[ 5] = ((const int * ALIGNED(64))a05)[5]; + g.i[ 5] = ((const int * ALIGNED(64))a05)[6]; + h.i[ 5] = ((const int * ALIGNED(64))a05)[7]; + + a.i[ 6] = ((const int * ALIGNED(64))a06)[0]; + b.i[ 6] = ((const int * ALIGNED(64))a06)[1]; + c.i[ 6] = ((const int * ALIGNED(64))a06)[2]; + d.i[ 6] = ((const int * ALIGNED(64))a06)[3]; + e.i[ 6] = ((const int * ALIGNED(64))a06)[4]; + f.i[ 6] = ((const int * ALIGNED(64))a06)[5]; + g.i[ 6] = ((const int * ALIGNED(64))a06)[6]; + h.i[ 6] = ((const int * ALIGNED(64))a06)[7]; + + a.i[ 7] = ((const int * ALIGNED(64))a07)[0]; + b.i[ 7] = ((const int * ALIGNED(64))a07)[1]; + c.i[ 7] = ((const int * ALIGNED(64))a07)[2]; + d.i[ 7] = ((const int * ALIGNED(64))a07)[3]; + e.i[ 7] = ((const int * ALIGNED(64))a07)[4]; + f.i[ 7] = ((const int * ALIGNED(64))a07)[5]; + g.i[ 7] = ((const int * ALIGNED(64))a07)[6]; + h.i[ 7] = ((const int * ALIGNED(64))a07)[7]; + + a.i[ 8] = ((const int * ALIGNED(64))a08)[0]; + b.i[ 8] = ((const int * ALIGNED(64))a08)[1]; + c.i[ 8] = ((const int * ALIGNED(64))a08)[2]; + d.i[ 8] = ((const int * ALIGNED(64))a08)[3]; + e.i[ 8] = ((const int * ALIGNED(64))a08)[4]; + f.i[ 8] = ((const int * ALIGNED(64))a08)[5]; + g.i[ 8] = ((const int * ALIGNED(64))a08)[6]; + h.i[ 8] = ((const int * ALIGNED(64))a08)[7]; + + a.i[ 9] = ((const int * ALIGNED(64))a09)[0]; + b.i[ 9] = ((const int * ALIGNED(64))a09)[1]; + c.i[ 9] = ((const int * ALIGNED(64))a09)[2]; + d.i[ 9] = ((const int * ALIGNED(64))a09)[3]; + e.i[ 9] = ((const int * ALIGNED(64))a09)[4]; + f.i[ 9] = ((const int * ALIGNED(64))a09)[5]; + g.i[ 9] = ((const int * ALIGNED(64))a09)[6]; + h.i[ 9] = ((const int * ALIGNED(64))a09)[7]; + + a.i[10] = ((const int * ALIGNED(64))a10)[0]; + b.i[10] = ((const int * ALIGNED(64))a10)[1]; + c.i[10] = ((const int * ALIGNED(64))a10)[2]; + d.i[10] = ((const int * ALIGNED(64))a10)[3]; + e.i[10] = ((const int * ALIGNED(64))a10)[4]; + f.i[10] = ((const int * ALIGNED(64))a10)[5]; + g.i[10] = ((const int * ALIGNED(64))a10)[6]; + h.i[10] = ((const int * ALIGNED(64))a10)[7]; + + a.i[11] = ((const int * ALIGNED(64))a11)[0]; + b.i[11] = ((const int * ALIGNED(64))a11)[1]; + c.i[11] = ((const int * ALIGNED(64))a11)[2]; + d.i[11] = ((const int * ALIGNED(64))a11)[3]; + e.i[11] = ((const int * ALIGNED(64))a11)[4]; + f.i[11] = ((const int * ALIGNED(64))a11)[5]; + g.i[11] = ((const int * ALIGNED(64))a11)[6]; + h.i[11] = ((const int * ALIGNED(64))a11)[7]; + + a.i[12] = ((const int * ALIGNED(64))a12)[0]; + b.i[12] = ((const int * ALIGNED(64))a12)[1]; + c.i[12] = ((const int * ALIGNED(64))a12)[2]; + d.i[12] = ((const int * 
ALIGNED(64))a12)[3]; + e.i[12] = ((const int * ALIGNED(64))a12)[4]; + f.i[12] = ((const int * ALIGNED(64))a12)[5]; + g.i[12] = ((const int * ALIGNED(64))a12)[6]; + h.i[12] = ((const int * ALIGNED(64))a12)[7]; + + a.i[13] = ((const int * ALIGNED(64))a13)[0]; + b.i[13] = ((const int * ALIGNED(64))a13)[1]; + c.i[13] = ((const int * ALIGNED(64))a13)[2]; + d.i[13] = ((const int * ALIGNED(64))a13)[3]; + e.i[13] = ((const int * ALIGNED(64))a13)[4]; + f.i[13] = ((const int * ALIGNED(64))a13)[5]; + g.i[13] = ((const int * ALIGNED(64))a13)[6]; + h.i[13] = ((const int * ALIGNED(64))a13)[7]; + + a.i[14] = ((const int * ALIGNED(64))a14)[0]; + b.i[14] = ((const int * ALIGNED(64))a14)[1]; + c.i[14] = ((const int * ALIGNED(64))a14)[2]; + d.i[14] = ((const int * ALIGNED(64))a14)[3]; + e.i[14] = ((const int * ALIGNED(64))a14)[4]; + f.i[14] = ((const int * ALIGNED(64))a14)[5]; + g.i[14] = ((const int * ALIGNED(64))a14)[6]; + h.i[14] = ((const int * ALIGNED(64))a14)[7]; + + a.i[15] = ((const int * ALIGNED(64))a15)[0]; + b.i[15] = ((const int * ALIGNED(64))a15)[1]; + c.i[15] = ((const int * ALIGNED(64))a15)[2]; + d.i[15] = ((const int * ALIGNED(64))a15)[3]; + e.i[15] = ((const int * ALIGNED(64))a15)[4]; + f.i[15] = ((const int * ALIGNED(64))a15)[5]; + g.i[15] = ((const int * ALIGNED(64))a15)[6]; + h.i[15] = ((const int * ALIGNED(64))a15)[7]; + } + + inline void load_16x16_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07, + v16 &b08, v16 &b09, v16 &b10, v16 &b11, + v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) + { + b00.i[ 0] = ((const int * ALIGNED(64))a00)[ 0]; + b01.i[ 0] = ((const int * ALIGNED(64))a00)[ 1]; + b02.i[ 0] = ((const int * ALIGNED(64))a00)[ 2]; + b03.i[ 0] = ((const int * ALIGNED(64))a00)[ 3]; + b04.i[ 0] = ((const int * ALIGNED(64))a00)[ 4]; + b05.i[ 0] = ((const int * ALIGNED(64))a00)[ 5]; + b06.i[ 0] = ((const int * ALIGNED(64))a00)[ 6]; + b07.i[ 0] = ((const int * ALIGNED(64))a00)[ 7]; + b08.i[ 0] = ((const int * ALIGNED(64))a00)[ 8]; + b09.i[ 0] = ((const int * ALIGNED(64))a00)[ 9]; + b10.i[ 0] = ((const int * ALIGNED(64))a00)[10]; + b11.i[ 0] = ((const int * ALIGNED(64))a00)[11]; + b12.i[ 0] = ((const int * ALIGNED(64))a00)[12]; + b13.i[ 0] = ((const int * ALIGNED(64))a00)[13]; + b14.i[ 0] = ((const int * ALIGNED(64))a00)[14]; + b15.i[ 0] = ((const int * ALIGNED(64))a00)[15]; + + b00.i[ 1] = ((const int * ALIGNED(64))a01)[ 0]; + b01.i[ 1] = ((const int * ALIGNED(64))a01)[ 1]; + b02.i[ 1] = ((const int * ALIGNED(64))a01)[ 2]; + b03.i[ 1] = ((const int * ALIGNED(64))a01)[ 3]; + b04.i[ 1] = ((const int * ALIGNED(64))a01)[ 4]; + b05.i[ 1] = ((const int * ALIGNED(64))a01)[ 5]; + b06.i[ 1] = ((const int * ALIGNED(64))a01)[ 6]; + b07.i[ 1] = ((const int * ALIGNED(64))a01)[ 7]; + b08.i[ 1] = ((const int * ALIGNED(64))a01)[ 8]; + b09.i[ 1] = ((const int * ALIGNED(64))a01)[ 9]; + b10.i[ 1] = ((const int * ALIGNED(64))a01)[10]; + b11.i[ 1] = ((const int * ALIGNED(64))a01)[11]; + b12.i[ 1] = ((const int * ALIGNED(64))a01)[12]; + b13.i[ 1] = ((const int * 
ALIGNED(64))a01)[13]; + b14.i[ 1] = ((const int * ALIGNED(64))a01)[14]; + b15.i[ 1] = ((const int * ALIGNED(64))a01)[15]; + + b00.i[ 2] = ((const int * ALIGNED(64))a02)[ 0]; + b01.i[ 2] = ((const int * ALIGNED(64))a02)[ 1]; + b02.i[ 2] = ((const int * ALIGNED(64))a02)[ 2]; + b03.i[ 2] = ((const int * ALIGNED(64))a02)[ 3]; + b04.i[ 2] = ((const int * ALIGNED(64))a02)[ 4]; + b05.i[ 2] = ((const int * ALIGNED(64))a02)[ 5]; + b06.i[ 2] = ((const int * ALIGNED(64))a02)[ 6]; + b07.i[ 2] = ((const int * ALIGNED(64))a02)[ 7]; + b08.i[ 2] = ((const int * ALIGNED(64))a02)[ 8]; + b09.i[ 2] = ((const int * ALIGNED(64))a02)[ 9]; + b10.i[ 2] = ((const int * ALIGNED(64))a02)[10]; + b11.i[ 2] = ((const int * ALIGNED(64))a02)[11]; + b12.i[ 2] = ((const int * ALIGNED(64))a02)[12]; + b13.i[ 2] = ((const int * ALIGNED(64))a02)[13]; + b14.i[ 2] = ((const int * ALIGNED(64))a02)[14]; + b15.i[ 2] = ((const int * ALIGNED(64))a02)[15]; + + b00.i[ 3] = ((const int * ALIGNED(64))a03)[ 0]; + b01.i[ 3] = ((const int * ALIGNED(64))a03)[ 1]; + b02.i[ 3] = ((const int * ALIGNED(64))a03)[ 2]; + b03.i[ 3] = ((const int * ALIGNED(64))a03)[ 3]; + b04.i[ 3] = ((const int * ALIGNED(64))a03)[ 4]; + b05.i[ 3] = ((const int * ALIGNED(64))a03)[ 5]; + b06.i[ 3] = ((const int * ALIGNED(64))a03)[ 6]; + b07.i[ 3] = ((const int * ALIGNED(64))a03)[ 7]; + b08.i[ 3] = ((const int * ALIGNED(64))a03)[ 8]; + b09.i[ 3] = ((const int * ALIGNED(64))a03)[ 9]; + b10.i[ 3] = ((const int * ALIGNED(64))a03)[10]; + b11.i[ 3] = ((const int * ALIGNED(64))a03)[11]; + b12.i[ 3] = ((const int * ALIGNED(64))a03)[12]; + b13.i[ 3] = ((const int * ALIGNED(64))a03)[13]; + b14.i[ 3] = ((const int * ALIGNED(64))a03)[14]; + b15.i[ 3] = ((const int * ALIGNED(64))a03)[15]; + + b00.i[ 4] = ((const int * ALIGNED(64))a04)[ 0]; + b01.i[ 4] = ((const int * ALIGNED(64))a04)[ 1]; + b02.i[ 4] = ((const int * ALIGNED(64))a04)[ 2]; + b03.i[ 4] = ((const int * ALIGNED(64))a04)[ 3]; + b04.i[ 4] = ((const int * ALIGNED(64))a04)[ 4]; + b05.i[ 4] = ((const int * ALIGNED(64))a04)[ 5]; + b06.i[ 4] = ((const int * ALIGNED(64))a04)[ 6]; + b07.i[ 4] = ((const int * ALIGNED(64))a04)[ 7]; + b08.i[ 4] = ((const int * ALIGNED(64))a04)[ 8]; + b09.i[ 4] = ((const int * ALIGNED(64))a04)[ 9]; + b10.i[ 4] = ((const int * ALIGNED(64))a04)[10]; + b11.i[ 4] = ((const int * ALIGNED(64))a04)[11]; + b12.i[ 4] = ((const int * ALIGNED(64))a04)[12]; + b13.i[ 4] = ((const int * ALIGNED(64))a04)[13]; + b14.i[ 4] = ((const int * ALIGNED(64))a04)[14]; + b15.i[ 4] = ((const int * ALIGNED(64))a04)[15]; + + b00.i[ 5] = ((const int * ALIGNED(64))a05)[ 0]; + b01.i[ 5] = ((const int * ALIGNED(64))a05)[ 1]; + b02.i[ 5] = ((const int * ALIGNED(64))a05)[ 2]; + b03.i[ 5] = ((const int * ALIGNED(64))a05)[ 3]; + b04.i[ 5] = ((const int * ALIGNED(64))a05)[ 4]; + b05.i[ 5] = ((const int * ALIGNED(64))a05)[ 5]; + b06.i[ 5] = ((const int * ALIGNED(64))a05)[ 6]; + b07.i[ 5] = ((const int * ALIGNED(64))a05)[ 7]; + b08.i[ 5] = ((const int * ALIGNED(64))a05)[ 8]; + b09.i[ 5] = ((const int * ALIGNED(64))a05)[ 9]; + b10.i[ 5] = ((const int * ALIGNED(64))a05)[10]; + b11.i[ 5] = ((const int * ALIGNED(64))a05)[11]; + b12.i[ 5] = ((const int * ALIGNED(64))a05)[12]; + b13.i[ 5] = ((const int * ALIGNED(64))a05)[13]; + b14.i[ 5] = ((const int * ALIGNED(64))a05)[14]; + b15.i[ 5] = ((const int * ALIGNED(64))a05)[15]; + + b00.i[ 6] = ((const int * ALIGNED(64))a06)[ 0]; + b01.i[ 6] = ((const int * ALIGNED(64))a06)[ 1]; + b02.i[ 6] = ((const int * ALIGNED(64))a06)[ 2]; + b03.i[ 6] = ((const int * ALIGNED(64))a06)[ 3]; + b04.i[ 6] = ((const 
int * ALIGNED(64))a06)[ 4]; + b05.i[ 6] = ((const int * ALIGNED(64))a06)[ 5]; + b06.i[ 6] = ((const int * ALIGNED(64))a06)[ 6]; + b07.i[ 6] = ((const int * ALIGNED(64))a06)[ 7]; + b08.i[ 6] = ((const int * ALIGNED(64))a06)[ 8]; + b09.i[ 6] = ((const int * ALIGNED(64))a06)[ 9]; + b10.i[ 6] = ((const int * ALIGNED(64))a06)[10]; + b11.i[ 6] = ((const int * ALIGNED(64))a06)[11]; + b12.i[ 6] = ((const int * ALIGNED(64))a06)[12]; + b13.i[ 6] = ((const int * ALIGNED(64))a06)[13]; + b14.i[ 6] = ((const int * ALIGNED(64))a06)[14]; + b15.i[ 6] = ((const int * ALIGNED(64))a06)[15]; + + b00.i[ 7] = ((const int * ALIGNED(64))a07)[ 0]; + b01.i[ 7] = ((const int * ALIGNED(64))a07)[ 1]; + b02.i[ 7] = ((const int * ALIGNED(64))a07)[ 2]; + b03.i[ 7] = ((const int * ALIGNED(64))a07)[ 3]; + b04.i[ 7] = ((const int * ALIGNED(64))a07)[ 4]; + b05.i[ 7] = ((const int * ALIGNED(64))a07)[ 5]; + b06.i[ 7] = ((const int * ALIGNED(64))a07)[ 6]; + b07.i[ 7] = ((const int * ALIGNED(64))a07)[ 7]; + b08.i[ 7] = ((const int * ALIGNED(64))a07)[ 8]; + b09.i[ 7] = ((const int * ALIGNED(64))a07)[ 9]; + b10.i[ 7] = ((const int * ALIGNED(64))a07)[10]; + b11.i[ 7] = ((const int * ALIGNED(64))a07)[11]; + b12.i[ 7] = ((const int * ALIGNED(64))a07)[12]; + b13.i[ 7] = ((const int * ALIGNED(64))a07)[13]; + b14.i[ 7] = ((const int * ALIGNED(64))a07)[14]; + b15.i[ 7] = ((const int * ALIGNED(64))a07)[15]; + + b00.i[ 8] = ((const int * ALIGNED(64))a08)[ 0]; + b01.i[ 8] = ((const int * ALIGNED(64))a08)[ 1]; + b02.i[ 8] = ((const int * ALIGNED(64))a08)[ 2]; + b03.i[ 8] = ((const int * ALIGNED(64))a08)[ 3]; + b04.i[ 8] = ((const int * ALIGNED(64))a08)[ 4]; + b05.i[ 8] = ((const int * ALIGNED(64))a08)[ 5]; + b06.i[ 8] = ((const int * ALIGNED(64))a08)[ 6]; + b07.i[ 8] = ((const int * ALIGNED(64))a08)[ 7]; + b08.i[ 8] = ((const int * ALIGNED(64))a08)[ 8]; + b09.i[ 8] = ((const int * ALIGNED(64))a08)[ 9]; + b10.i[ 8] = ((const int * ALIGNED(64))a08)[10]; + b11.i[ 8] = ((const int * ALIGNED(64))a08)[11]; + b12.i[ 8] = ((const int * ALIGNED(64))a08)[12]; + b13.i[ 8] = ((const int * ALIGNED(64))a08)[13]; + b14.i[ 8] = ((const int * ALIGNED(64))a08)[14]; + b15.i[ 8] = ((const int * ALIGNED(64))a08)[15]; + + b00.i[ 9] = ((const int * ALIGNED(64))a09)[ 0]; + b01.i[ 9] = ((const int * ALIGNED(64))a09)[ 1]; + b02.i[ 9] = ((const int * ALIGNED(64))a09)[ 2]; + b03.i[ 9] = ((const int * ALIGNED(64))a09)[ 3]; + b04.i[ 9] = ((const int * ALIGNED(64))a09)[ 4]; + b05.i[ 9] = ((const int * ALIGNED(64))a09)[ 5]; + b06.i[ 9] = ((const int * ALIGNED(64))a09)[ 6]; + b07.i[ 9] = ((const int * ALIGNED(64))a09)[ 7]; + b08.i[ 9] = ((const int * ALIGNED(64))a09)[ 8]; + b09.i[ 9] = ((const int * ALIGNED(64))a09)[ 9]; + b10.i[ 9] = ((const int * ALIGNED(64))a09)[10]; + b11.i[ 9] = ((const int * ALIGNED(64))a09)[11]; + b12.i[ 9] = ((const int * ALIGNED(64))a09)[12]; + b13.i[ 9] = ((const int * ALIGNED(64))a09)[13]; + b14.i[ 9] = ((const int * ALIGNED(64))a09)[14]; + b15.i[ 9] = ((const int * ALIGNED(64))a09)[15]; + + b00.i[10] = ((const int * ALIGNED(64))a10)[ 0]; + b01.i[10] = ((const int * ALIGNED(64))a10)[ 1]; + b02.i[10] = ((const int * ALIGNED(64))a10)[ 2]; + b03.i[10] = ((const int * ALIGNED(64))a10)[ 3]; + b04.i[10] = ((const int * ALIGNED(64))a10)[ 4]; + b05.i[10] = ((const int * ALIGNED(64))a10)[ 5]; + b06.i[10] = ((const int * ALIGNED(64))a10)[ 6]; + b07.i[10] = ((const int * ALIGNED(64))a10)[ 7]; + b08.i[10] = ((const int * ALIGNED(64))a10)[ 8]; + b09.i[10] = ((const int * ALIGNED(64))a10)[ 9]; + b10.i[10] = ((const int * ALIGNED(64))a10)[10]; + b11.i[10] = 
((const int * ALIGNED(64))a10)[11]; + b12.i[10] = ((const int * ALIGNED(64))a10)[12]; + b13.i[10] = ((const int * ALIGNED(64))a10)[13]; + b14.i[10] = ((const int * ALIGNED(64))a10)[14]; + b15.i[10] = ((const int * ALIGNED(64))a10)[15]; + + b00.i[11] = ((const int * ALIGNED(64))a11)[ 0]; + b01.i[11] = ((const int * ALIGNED(64))a11)[ 1]; + b02.i[11] = ((const int * ALIGNED(64))a11)[ 2]; + b03.i[11] = ((const int * ALIGNED(64))a11)[ 3]; + b04.i[11] = ((const int * ALIGNED(64))a11)[ 4]; + b05.i[11] = ((const int * ALIGNED(64))a11)[ 5]; + b06.i[11] = ((const int * ALIGNED(64))a11)[ 6]; + b07.i[11] = ((const int * ALIGNED(64))a11)[ 7]; + b08.i[11] = ((const int * ALIGNED(64))a11)[ 8]; + b09.i[11] = ((const int * ALIGNED(64))a11)[ 9]; + b10.i[11] = ((const int * ALIGNED(64))a11)[10]; + b11.i[11] = ((const int * ALIGNED(64))a11)[11]; + b12.i[11] = ((const int * ALIGNED(64))a11)[12]; + b13.i[11] = ((const int * ALIGNED(64))a11)[13]; + b14.i[11] = ((const int * ALIGNED(64))a11)[14]; + b15.i[11] = ((const int * ALIGNED(64))a11)[15]; + + b00.i[12] = ((const int * ALIGNED(64))a12)[ 0]; + b01.i[12] = ((const int * ALIGNED(64))a12)[ 1]; + b02.i[12] = ((const int * ALIGNED(64))a12)[ 2]; + b03.i[12] = ((const int * ALIGNED(64))a12)[ 3]; + b04.i[12] = ((const int * ALIGNED(64))a12)[ 4]; + b05.i[12] = ((const int * ALIGNED(64))a12)[ 5]; + b06.i[12] = ((const int * ALIGNED(64))a12)[ 6]; + b07.i[12] = ((const int * ALIGNED(64))a12)[ 7]; + b08.i[12] = ((const int * ALIGNED(64))a12)[ 8]; + b09.i[12] = ((const int * ALIGNED(64))a12)[ 9]; + b10.i[12] = ((const int * ALIGNED(64))a12)[10]; + b11.i[12] = ((const int * ALIGNED(64))a12)[11]; + b12.i[12] = ((const int * ALIGNED(64))a12)[12]; + b13.i[12] = ((const int * ALIGNED(64))a12)[13]; + b14.i[12] = ((const int * ALIGNED(64))a12)[14]; + b15.i[12] = ((const int * ALIGNED(64))a12)[15]; + + b00.i[13] = ((const int * ALIGNED(64))a13)[ 0]; + b01.i[13] = ((const int * ALIGNED(64))a13)[ 1]; + b02.i[13] = ((const int * ALIGNED(64))a13)[ 2]; + b03.i[13] = ((const int * ALIGNED(64))a13)[ 3]; + b04.i[13] = ((const int * ALIGNED(64))a13)[ 4]; + b05.i[13] = ((const int * ALIGNED(64))a13)[ 5]; + b06.i[13] = ((const int * ALIGNED(64))a13)[ 6]; + b07.i[13] = ((const int * ALIGNED(64))a13)[ 7]; + b08.i[13] = ((const int * ALIGNED(64))a13)[ 8]; + b09.i[13] = ((const int * ALIGNED(64))a13)[ 9]; + b10.i[13] = ((const int * ALIGNED(64))a13)[10]; + b11.i[13] = ((const int * ALIGNED(64))a13)[11]; + b12.i[13] = ((const int * ALIGNED(64))a13)[12]; + b13.i[13] = ((const int * ALIGNED(64))a13)[13]; + b14.i[13] = ((const int * ALIGNED(64))a13)[14]; + b15.i[13] = ((const int * ALIGNED(64))a13)[15]; + + b00.i[14] = ((const int * ALIGNED(64))a14)[ 0]; + b01.i[14] = ((const int * ALIGNED(64))a14)[ 1]; + b02.i[14] = ((const int * ALIGNED(64))a14)[ 2]; + b03.i[14] = ((const int * ALIGNED(64))a14)[ 3]; + b04.i[14] = ((const int * ALIGNED(64))a14)[ 4]; + b05.i[14] = ((const int * ALIGNED(64))a14)[ 5]; + b06.i[14] = ((const int * ALIGNED(64))a14)[ 6]; + b07.i[14] = ((const int * ALIGNED(64))a14)[ 7]; + b08.i[14] = ((const int * ALIGNED(64))a14)[ 8]; + b09.i[14] = ((const int * ALIGNED(64))a14)[ 9]; + b10.i[14] = ((const int * ALIGNED(64))a14)[10]; + b11.i[14] = ((const int * ALIGNED(64))a14)[11]; + b12.i[14] = ((const int * ALIGNED(64))a14)[12]; + b13.i[14] = ((const int * ALIGNED(64))a14)[13]; + b14.i[14] = ((const int * ALIGNED(64))a14)[14]; + b15.i[14] = ((const int * ALIGNED(64))a14)[15]; + + b00.i[15] = ((const int * ALIGNED(64))a15)[ 0]; + b01.i[15] = ((const int * ALIGNED(64))a15)[ 1]; + 
b02.i[15] = ((const int * ALIGNED(64))a15)[ 2]; + b03.i[15] = ((const int * ALIGNED(64))a15)[ 3]; + b04.i[15] = ((const int * ALIGNED(64))a15)[ 4]; + b05.i[15] = ((const int * ALIGNED(64))a15)[ 5]; + b06.i[15] = ((const int * ALIGNED(64))a15)[ 6]; + b07.i[15] = ((const int * ALIGNED(64))a15)[ 7]; + b08.i[15] = ((const int * ALIGNED(64))a15)[ 8]; + b09.i[15] = ((const int * ALIGNED(64))a15)[ 9]; + b10.i[15] = ((const int * ALIGNED(64))a15)[10]; + b11.i[15] = ((const int * ALIGNED(64))a15)[11]; + b12.i[15] = ((const int * ALIGNED(64))a15)[12]; + b13.i[15] = ((const int * ALIGNED(64))a15)[13]; + b14.i[15] = ((const int * ALIGNED(64))a15)[14]; + b15.i[15] = ((const int * ALIGNED(64))a15)[15]; + } + + inline void load_16x8_tr_p( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07 ) + { + b00.i[ 0] = ((const int * ALIGNED(64))a00)[ 0]; + b01.i[ 0] = ((const int * ALIGNED(64))a00)[ 1]; + b02.i[ 0] = ((const int * ALIGNED(64))a00)[ 2]; + b03.i[ 0] = ((const int * ALIGNED(64))a00)[ 3]; + b04.i[ 0] = ((const int * ALIGNED(64))a00)[ 4]; + b05.i[ 0] = ((const int * ALIGNED(64))a00)[ 5]; + b06.i[ 0] = ((const int * ALIGNED(64))a00)[ 6]; + b07.i[ 0] = ((const int * ALIGNED(64))a00)[ 7]; + b00.i[ 1] = ((const int * ALIGNED(64))a00)[ 8]; + b01.i[ 1] = ((const int * ALIGNED(64))a00)[ 9]; + b02.i[ 1] = ((const int * ALIGNED(64))a00)[10]; + b03.i[ 1] = ((const int * ALIGNED(64))a00)[11]; + b04.i[ 1] = ((const int * ALIGNED(64))a00)[12]; + b05.i[ 1] = ((const int * ALIGNED(64))a00)[13]; + b06.i[ 1] = ((const int * ALIGNED(64))a00)[14]; + b07.i[ 1] = ((const int * ALIGNED(64))a00)[15]; + + b00.i[ 2] = ((const int * ALIGNED(64))a01)[ 0]; + b01.i[ 2] = ((const int * ALIGNED(64))a01)[ 1]; + b02.i[ 2] = ((const int * ALIGNED(64))a01)[ 2]; + b03.i[ 2] = ((const int * ALIGNED(64))a01)[ 3]; + b04.i[ 2] = ((const int * ALIGNED(64))a01)[ 4]; + b05.i[ 2] = ((const int * ALIGNED(64))a01)[ 5]; + b06.i[ 2] = ((const int * ALIGNED(64))a01)[ 6]; + b07.i[ 2] = ((const int * ALIGNED(64))a01)[ 7]; + b00.i[ 3] = ((const int * ALIGNED(64))a01)[ 8]; + b01.i[ 3] = ((const int * ALIGNED(64))a01)[ 9]; + b02.i[ 3] = ((const int * ALIGNED(64))a01)[10]; + b03.i[ 3] = ((const int * ALIGNED(64))a01)[11]; + b04.i[ 3] = ((const int * ALIGNED(64))a01)[12]; + b05.i[ 3] = ((const int * ALIGNED(64))a01)[13]; + b06.i[ 3] = ((const int * ALIGNED(64))a01)[14]; + b07.i[ 3] = ((const int * ALIGNED(64))a01)[15]; + + b00.i[ 4] = ((const int * ALIGNED(64))a02)[ 0]; + b01.i[ 4] = ((const int * ALIGNED(64))a02)[ 1]; + b02.i[ 4] = ((const int * ALIGNED(64))a02)[ 2]; + b03.i[ 4] = ((const int * ALIGNED(64))a02)[ 3]; + b04.i[ 4] = ((const int * ALIGNED(64))a02)[ 4]; + b05.i[ 4] = ((const int * ALIGNED(64))a02)[ 5]; + b06.i[ 4] = ((const int * ALIGNED(64))a02)[ 6]; + b07.i[ 4] = ((const int * ALIGNED(64))a02)[ 7]; + b00.i[ 5] = ((const int * ALIGNED(64))a02)[ 8]; + b01.i[ 5] = ((const int * ALIGNED(64))a02)[ 9]; + b02.i[ 5] = ((const int * ALIGNED(64))a02)[10]; + b03.i[ 5] = ((const int * ALIGNED(64))a02)[11]; + b04.i[ 5] = ((const int * ALIGNED(64))a02)[12]; + b05.i[ 5] = ((const int * ALIGNED(64))a02)[13]; + b06.i[ 5] = ((const int * ALIGNED(64))a02)[14]; + b07.i[ 5] = ((const int * ALIGNED(64))a02)[15]; + + b00.i[ 6] = ((const int * ALIGNED(64))a03)[ 0]; + b01.i[ 6] = ((const 
int * ALIGNED(64))a03)[ 1]; + b02.i[ 6] = ((const int * ALIGNED(64))a03)[ 2]; + b03.i[ 6] = ((const int * ALIGNED(64))a03)[ 3]; + b04.i[ 6] = ((const int * ALIGNED(64))a03)[ 4]; + b05.i[ 6] = ((const int * ALIGNED(64))a03)[ 5]; + b06.i[ 6] = ((const int * ALIGNED(64))a03)[ 6]; + b07.i[ 6] = ((const int * ALIGNED(64))a03)[ 7]; + b00.i[ 7] = ((const int * ALIGNED(64))a03)[ 8]; + b01.i[ 7] = ((const int * ALIGNED(64))a03)[ 9]; + b02.i[ 7] = ((const int * ALIGNED(64))a03)[10]; + b03.i[ 7] = ((const int * ALIGNED(64))a03)[11]; + b04.i[ 7] = ((const int * ALIGNED(64))a03)[12]; + b05.i[ 7] = ((const int * ALIGNED(64))a03)[13]; + b06.i[ 7] = ((const int * ALIGNED(64))a03)[14]; + b07.i[ 7] = ((const int * ALIGNED(64))a03)[15]; + + b00.i[ 8] = ((const int * ALIGNED(64))a04)[ 0]; + b01.i[ 8] = ((const int * ALIGNED(64))a04)[ 1]; + b02.i[ 8] = ((const int * ALIGNED(64))a04)[ 2]; + b03.i[ 8] = ((const int * ALIGNED(64))a04)[ 3]; + b04.i[ 8] = ((const int * ALIGNED(64))a04)[ 4]; + b05.i[ 8] = ((const int * ALIGNED(64))a04)[ 5]; + b06.i[ 8] = ((const int * ALIGNED(64))a04)[ 6]; + b07.i[ 8] = ((const int * ALIGNED(64))a04)[ 7]; + b00.i[ 9] = ((const int * ALIGNED(64))a04)[ 8]; + b01.i[ 9] = ((const int * ALIGNED(64))a04)[ 9]; + b02.i[ 9] = ((const int * ALIGNED(64))a04)[10]; + b03.i[ 9] = ((const int * ALIGNED(64))a04)[11]; + b04.i[ 9] = ((const int * ALIGNED(64))a04)[12]; + b05.i[ 9] = ((const int * ALIGNED(64))a04)[13]; + b06.i[ 9] = ((const int * ALIGNED(64))a04)[14]; + b07.i[ 9] = ((const int * ALIGNED(64))a04)[15]; + + b00.i[10] = ((const int * ALIGNED(64))a05)[ 0]; + b01.i[10] = ((const int * ALIGNED(64))a05)[ 1]; + b02.i[10] = ((const int * ALIGNED(64))a05)[ 2]; + b03.i[10] = ((const int * ALIGNED(64))a05)[ 3]; + b04.i[10] = ((const int * ALIGNED(64))a05)[ 4]; + b05.i[10] = ((const int * ALIGNED(64))a05)[ 5]; + b06.i[10] = ((const int * ALIGNED(64))a05)[ 6]; + b07.i[10] = ((const int * ALIGNED(64))a05)[ 7]; + b00.i[11] = ((const int * ALIGNED(64))a05)[ 8]; + b01.i[11] = ((const int * ALIGNED(64))a05)[ 9]; + b02.i[11] = ((const int * ALIGNED(64))a05)[10]; + b03.i[11] = ((const int * ALIGNED(64))a05)[11]; + b04.i[11] = ((const int * ALIGNED(64))a05)[12]; + b05.i[11] = ((const int * ALIGNED(64))a05)[13]; + b06.i[11] = ((const int * ALIGNED(64))a05)[14]; + b07.i[11] = ((const int * ALIGNED(64))a05)[15]; + + b00.i[12] = ((const int * ALIGNED(64))a06)[ 0]; + b01.i[12] = ((const int * ALIGNED(64))a06)[ 1]; + b02.i[12] = ((const int * ALIGNED(64))a06)[ 2]; + b03.i[12] = ((const int * ALIGNED(64))a06)[ 3]; + b04.i[12] = ((const int * ALIGNED(64))a06)[ 4]; + b05.i[12] = ((const int * ALIGNED(64))a06)[ 5]; + b06.i[12] = ((const int * ALIGNED(64))a06)[ 6]; + b07.i[12] = ((const int * ALIGNED(64))a06)[ 7]; + b00.i[13] = ((const int * ALIGNED(64))a06)[ 8]; + b01.i[13] = ((const int * ALIGNED(64))a06)[ 9]; + b02.i[13] = ((const int * ALIGNED(64))a06)[10]; + b03.i[13] = ((const int * ALIGNED(64))a06)[11]; + b04.i[13] = ((const int * ALIGNED(64))a06)[12]; + b05.i[13] = ((const int * ALIGNED(64))a06)[13]; + b06.i[13] = ((const int * ALIGNED(64))a06)[14]; + b07.i[13] = ((const int * ALIGNED(64))a06)[15]; + + b00.i[14] = ((const int * ALIGNED(64))a07)[ 0]; + b01.i[14] = ((const int * ALIGNED(64))a07)[ 1]; + b02.i[14] = ((const int * ALIGNED(64))a07)[ 2]; + b03.i[14] = ((const int * ALIGNED(64))a07)[ 3]; + b04.i[14] = ((const int * ALIGNED(64))a07)[ 4]; + b05.i[14] = ((const int * ALIGNED(64))a07)[ 5]; + b06.i[14] = ((const int * ALIGNED(64))a07)[ 6]; + b07.i[14] = ((const int * ALIGNED(64))a07)[ 7]; + b00.i[15] = 
((const int * ALIGNED(64))a07)[ 8]; + b01.i[15] = ((const int * ALIGNED(64))a07)[ 9]; + b02.i[15] = ((const int * ALIGNED(64))a07)[10]; + b03.i[15] = ((const int * ALIGNED(64))a07)[11]; + b04.i[15] = ((const int * ALIGNED(64))a07)[12]; + b05.i[15] = ((const int * ALIGNED(64))a07)[13]; + b06.i[15] = ((const int * ALIGNED(64))a07)[14]; + b07.i[15] = ((const int * ALIGNED(64))a07)[15]; + } + + inline void load_16x16_tr_p( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07, + v16 &b08, v16 &b09, v16 &b10, v16 &b11, + v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) + { + b00.i[ 0] = ((const int * ALIGNED(64))a00)[ 0]; + b01.i[ 0] = ((const int * ALIGNED(64))a00)[ 1]; + b02.i[ 0] = ((const int * ALIGNED(64))a00)[ 2]; + b03.i[ 0] = ((const int * ALIGNED(64))a00)[ 3]; + b04.i[ 0] = ((const int * ALIGNED(64))a00)[ 4]; + b05.i[ 0] = ((const int * ALIGNED(64))a00)[ 5]; + b06.i[ 0] = ((const int * ALIGNED(64))a00)[ 6]; + b07.i[ 0] = ((const int * ALIGNED(64))a00)[ 7]; + b00.i[ 1] = ((const int * ALIGNED(64))a00)[ 8]; + b01.i[ 1] = ((const int * ALIGNED(64))a00)[ 9]; + b02.i[ 1] = ((const int * ALIGNED(64))a00)[10]; + b03.i[ 1] = ((const int * ALIGNED(64))a00)[11]; + b04.i[ 1] = ((const int * ALIGNED(64))a00)[12]; + b05.i[ 1] = ((const int * ALIGNED(64))a00)[13]; + b06.i[ 1] = ((const int * ALIGNED(64))a00)[14]; + b07.i[ 1] = ((const int * ALIGNED(64))a00)[15]; + + b00.i[ 2] = ((const int * ALIGNED(64))a01)[ 0]; + b01.i[ 2] = ((const int * ALIGNED(64))a01)[ 1]; + b02.i[ 2] = ((const int * ALIGNED(64))a01)[ 2]; + b03.i[ 2] = ((const int * ALIGNED(64))a01)[ 3]; + b04.i[ 2] = ((const int * ALIGNED(64))a01)[ 4]; + b05.i[ 2] = ((const int * ALIGNED(64))a01)[ 5]; + b06.i[ 2] = ((const int * ALIGNED(64))a01)[ 6]; + b07.i[ 2] = ((const int * ALIGNED(64))a01)[ 7]; + b00.i[ 3] = ((const int * ALIGNED(64))a01)[ 8]; + b01.i[ 3] = ((const int * ALIGNED(64))a01)[ 9]; + b02.i[ 3] = ((const int * ALIGNED(64))a01)[10]; + b03.i[ 3] = ((const int * ALIGNED(64))a01)[11]; + b04.i[ 3] = ((const int * ALIGNED(64))a01)[12]; + b05.i[ 3] = ((const int * ALIGNED(64))a01)[13]; + b06.i[ 3] = ((const int * ALIGNED(64))a01)[14]; + b07.i[ 3] = ((const int * ALIGNED(64))a01)[15]; + + b00.i[ 4] = ((const int * ALIGNED(64))a02)[ 0]; + b01.i[ 4] = ((const int * ALIGNED(64))a02)[ 1]; + b02.i[ 4] = ((const int * ALIGNED(64))a02)[ 2]; + b03.i[ 4] = ((const int * ALIGNED(64))a02)[ 3]; + b04.i[ 4] = ((const int * ALIGNED(64))a02)[ 4]; + b05.i[ 4] = ((const int * ALIGNED(64))a02)[ 5]; + b06.i[ 4] = ((const int * ALIGNED(64))a02)[ 6]; + b07.i[ 4] = ((const int * ALIGNED(64))a02)[ 7]; + b00.i[ 5] = ((const int * ALIGNED(64))a02)[ 8]; + b01.i[ 5] = ((const int * ALIGNED(64))a02)[ 9]; + b02.i[ 5] = ((const int * ALIGNED(64))a02)[10]; + b03.i[ 5] = ((const int * ALIGNED(64))a02)[11]; + b04.i[ 5] = ((const int * ALIGNED(64))a02)[12]; + b05.i[ 5] = ((const int * ALIGNED(64))a02)[13]; + b06.i[ 5] = ((const int * ALIGNED(64))a02)[14]; + b07.i[ 5] = ((const int * ALIGNED(64))a02)[15]; + + b00.i[ 6] = ((const int * 
ALIGNED(64))a03)[ 0]; + b01.i[ 6] = ((const int * ALIGNED(64))a03)[ 1]; + b02.i[ 6] = ((const int * ALIGNED(64))a03)[ 2]; + b03.i[ 6] = ((const int * ALIGNED(64))a03)[ 3]; + b04.i[ 6] = ((const int * ALIGNED(64))a03)[ 4]; + b05.i[ 6] = ((const int * ALIGNED(64))a03)[ 5]; + b06.i[ 6] = ((const int * ALIGNED(64))a03)[ 6]; + b07.i[ 6] = ((const int * ALIGNED(64))a03)[ 7]; + b00.i[ 7] = ((const int * ALIGNED(64))a03)[ 8]; + b01.i[ 7] = ((const int * ALIGNED(64))a03)[ 9]; + b02.i[ 7] = ((const int * ALIGNED(64))a03)[10]; + b03.i[ 7] = ((const int * ALIGNED(64))a03)[11]; + b04.i[ 7] = ((const int * ALIGNED(64))a03)[12]; + b05.i[ 7] = ((const int * ALIGNED(64))a03)[13]; + b06.i[ 7] = ((const int * ALIGNED(64))a03)[14]; + b07.i[ 7] = ((const int * ALIGNED(64))a03)[15]; + + b00.i[ 8] = ((const int * ALIGNED(64))a04)[ 0]; + b01.i[ 8] = ((const int * ALIGNED(64))a04)[ 1]; + b02.i[ 8] = ((const int * ALIGNED(64))a04)[ 2]; + b03.i[ 8] = ((const int * ALIGNED(64))a04)[ 3]; + b04.i[ 8] = ((const int * ALIGNED(64))a04)[ 4]; + b05.i[ 8] = ((const int * ALIGNED(64))a04)[ 5]; + b06.i[ 8] = ((const int * ALIGNED(64))a04)[ 6]; + b07.i[ 8] = ((const int * ALIGNED(64))a04)[ 7]; + b00.i[ 9] = ((const int * ALIGNED(64))a04)[ 8]; + b01.i[ 9] = ((const int * ALIGNED(64))a04)[ 9]; + b02.i[ 9] = ((const int * ALIGNED(64))a04)[10]; + b03.i[ 9] = ((const int * ALIGNED(64))a04)[11]; + b04.i[ 9] = ((const int * ALIGNED(64))a04)[12]; + b05.i[ 9] = ((const int * ALIGNED(64))a04)[13]; + b06.i[ 9] = ((const int * ALIGNED(64))a04)[14]; + b07.i[ 9] = ((const int * ALIGNED(64))a04)[15]; + + b00.i[10] = ((const int * ALIGNED(64))a05)[ 0]; + b01.i[10] = ((const int * ALIGNED(64))a05)[ 1]; + b02.i[10] = ((const int * ALIGNED(64))a05)[ 2]; + b03.i[10] = ((const int * ALIGNED(64))a05)[ 3]; + b04.i[10] = ((const int * ALIGNED(64))a05)[ 4]; + b05.i[10] = ((const int * ALIGNED(64))a05)[ 5]; + b06.i[10] = ((const int * ALIGNED(64))a05)[ 6]; + b07.i[10] = ((const int * ALIGNED(64))a05)[ 7]; + b00.i[11] = ((const int * ALIGNED(64))a05)[ 8]; + b01.i[11] = ((const int * ALIGNED(64))a05)[ 9]; + b02.i[11] = ((const int * ALIGNED(64))a05)[10]; + b03.i[11] = ((const int * ALIGNED(64))a05)[11]; + b04.i[11] = ((const int * ALIGNED(64))a05)[12]; + b05.i[11] = ((const int * ALIGNED(64))a05)[13]; + b06.i[11] = ((const int * ALIGNED(64))a05)[14]; + b07.i[11] = ((const int * ALIGNED(64))a05)[15]; + + b00.i[12] = ((const int * ALIGNED(64))a06)[ 0]; + b01.i[12] = ((const int * ALIGNED(64))a06)[ 1]; + b02.i[12] = ((const int * ALIGNED(64))a06)[ 2]; + b03.i[12] = ((const int * ALIGNED(64))a06)[ 3]; + b04.i[12] = ((const int * ALIGNED(64))a06)[ 4]; + b05.i[12] = ((const int * ALIGNED(64))a06)[ 5]; + b06.i[12] = ((const int * ALIGNED(64))a06)[ 6]; + b07.i[12] = ((const int * ALIGNED(64))a06)[ 7]; + b00.i[13] = ((const int * ALIGNED(64))a06)[ 8]; + b01.i[13] = ((const int * ALIGNED(64))a06)[ 9]; + b02.i[13] = ((const int * ALIGNED(64))a06)[10]; + b03.i[13] = ((const int * ALIGNED(64))a06)[11]; + b04.i[13] = ((const int * ALIGNED(64))a06)[12]; + b05.i[13] = ((const int * ALIGNED(64))a06)[13]; + b06.i[13] = ((const int * ALIGNED(64))a06)[14]; + b07.i[13] = ((const int * ALIGNED(64))a06)[15]; + + b00.i[14] = ((const int * ALIGNED(64))a07)[ 0]; + b01.i[14] = ((const int * ALIGNED(64))a07)[ 1]; + b02.i[14] = ((const int * ALIGNED(64))a07)[ 2]; + b03.i[14] = ((const int * ALIGNED(64))a07)[ 3]; + b04.i[14] = ((const int * ALIGNED(64))a07)[ 4]; + b05.i[14] = ((const int * ALIGNED(64))a07)[ 5]; + b06.i[14] = ((const int * ALIGNED(64))a07)[ 6]; + b07.i[14] = ((const 
int * ALIGNED(64))a07)[ 7]; + b00.i[15] = ((const int * ALIGNED(64))a07)[ 8]; + b01.i[15] = ((const int * ALIGNED(64))a07)[ 9]; + b02.i[15] = ((const int * ALIGNED(64))a07)[10]; + b03.i[15] = ((const int * ALIGNED(64))a07)[11]; + b04.i[15] = ((const int * ALIGNED(64))a07)[12]; + b05.i[15] = ((const int * ALIGNED(64))a07)[13]; + b06.i[15] = ((const int * ALIGNED(64))a07)[14]; + b07.i[15] = ((const int * ALIGNED(64))a07)[15]; + + b08.i[ 0] = ((const int * ALIGNED(64))a08)[ 0]; + b09.i[ 0] = ((const int * ALIGNED(64))a08)[ 1]; + b10.i[ 0] = ((const int * ALIGNED(64))a08)[ 2]; + b11.i[ 0] = ((const int * ALIGNED(64))a08)[ 3]; + b12.i[ 0] = ((const int * ALIGNED(64))a08)[ 4]; + b13.i[ 0] = ((const int * ALIGNED(64))a08)[ 5]; + b14.i[ 0] = ((const int * ALIGNED(64))a08)[ 6]; + b15.i[ 0] = ((const int * ALIGNED(64))a08)[ 7]; + b08.i[ 1] = ((const int * ALIGNED(64))a08)[ 8]; + b09.i[ 1] = ((const int * ALIGNED(64))a08)[ 9]; + b10.i[ 1] = ((const int * ALIGNED(64))a08)[10]; + b11.i[ 1] = ((const int * ALIGNED(64))a08)[11]; + b12.i[ 1] = ((const int * ALIGNED(64))a08)[12]; + b13.i[ 1] = ((const int * ALIGNED(64))a08)[13]; + b14.i[ 1] = ((const int * ALIGNED(64))a08)[14]; + b15.i[ 1] = ((const int * ALIGNED(64))a08)[15]; + + b08.i[ 2] = ((const int * ALIGNED(64))a09)[ 0]; + b09.i[ 2] = ((const int * ALIGNED(64))a09)[ 1]; + b10.i[ 2] = ((const int * ALIGNED(64))a09)[ 2]; + b11.i[ 2] = ((const int * ALIGNED(64))a09)[ 3]; + b12.i[ 2] = ((const int * ALIGNED(64))a09)[ 4]; + b13.i[ 2] = ((const int * ALIGNED(64))a09)[ 5]; + b14.i[ 2] = ((const int * ALIGNED(64))a09)[ 6]; + b15.i[ 2] = ((const int * ALIGNED(64))a09)[ 7]; + b08.i[ 3] = ((const int * ALIGNED(64))a09)[ 8]; + b09.i[ 3] = ((const int * ALIGNED(64))a09)[ 9]; + b10.i[ 3] = ((const int * ALIGNED(64))a09)[10]; + b11.i[ 3] = ((const int * ALIGNED(64))a09)[11]; + b12.i[ 3] = ((const int * ALIGNED(64))a09)[12]; + b13.i[ 3] = ((const int * ALIGNED(64))a09)[13]; + b14.i[ 3] = ((const int * ALIGNED(64))a09)[14]; + b15.i[ 3] = ((const int * ALIGNED(64))a09)[15]; + + b08.i[ 4] = ((const int * ALIGNED(64))a10)[ 0]; + b09.i[ 4] = ((const int * ALIGNED(64))a10)[ 1]; + b10.i[ 4] = ((const int * ALIGNED(64))a10)[ 2]; + b11.i[ 4] = ((const int * ALIGNED(64))a10)[ 3]; + b12.i[ 4] = ((const int * ALIGNED(64))a10)[ 4]; + b13.i[ 4] = ((const int * ALIGNED(64))a10)[ 5]; + b14.i[ 4] = ((const int * ALIGNED(64))a10)[ 6]; + b15.i[ 4] = ((const int * ALIGNED(64))a10)[ 7]; + b08.i[ 5] = ((const int * ALIGNED(64))a10)[ 8]; + b09.i[ 5] = ((const int * ALIGNED(64))a10)[ 9]; + b10.i[ 5] = ((const int * ALIGNED(64))a10)[10]; + b11.i[ 5] = ((const int * ALIGNED(64))a10)[11]; + b12.i[ 5] = ((const int * ALIGNED(64))a10)[12]; + b13.i[ 5] = ((const int * ALIGNED(64))a10)[13]; + b14.i[ 5] = ((const int * ALIGNED(64))a10)[14]; + b15.i[ 5] = ((const int * ALIGNED(64))a10)[15]; + + b08.i[ 6] = ((const int * ALIGNED(64))a11)[ 0]; + b09.i[ 6] = ((const int * ALIGNED(64))a11)[ 1]; + b10.i[ 6] = ((const int * ALIGNED(64))a11)[ 2]; + b11.i[ 6] = ((const int * ALIGNED(64))a11)[ 3]; + b12.i[ 6] = ((const int * ALIGNED(64))a11)[ 4]; + b13.i[ 6] = ((const int * ALIGNED(64))a11)[ 5]; + b14.i[ 6] = ((const int * ALIGNED(64))a11)[ 6]; + b15.i[ 6] = ((const int * ALIGNED(64))a11)[ 7]; + b08.i[ 7] = ((const int * ALIGNED(64))a11)[ 8]; + b09.i[ 7] = ((const int * ALIGNED(64))a11)[ 9]; + b10.i[ 7] = ((const int * ALIGNED(64))a11)[10]; + b11.i[ 7] = ((const int * ALIGNED(64))a11)[11]; + b12.i[ 7] = ((const int * ALIGNED(64))a11)[12]; + b13.i[ 7] = ((const int * ALIGNED(64))a11)[13]; + b14.i[ 7] = 
((const int * ALIGNED(64))a11)[14]; + b15.i[ 7] = ((const int * ALIGNED(64))a11)[15]; + + b08.i[ 8] = ((const int * ALIGNED(64))a12)[ 0]; + b09.i[ 8] = ((const int * ALIGNED(64))a12)[ 1]; + b10.i[ 8] = ((const int * ALIGNED(64))a12)[ 2]; + b11.i[ 8] = ((const int * ALIGNED(64))a12)[ 3]; + b12.i[ 8] = ((const int * ALIGNED(64))a12)[ 4]; + b13.i[ 8] = ((const int * ALIGNED(64))a12)[ 5]; + b14.i[ 8] = ((const int * ALIGNED(64))a12)[ 6]; + b15.i[ 8] = ((const int * ALIGNED(64))a12)[ 7]; + b08.i[ 9] = ((const int * ALIGNED(64))a12)[ 8]; + b09.i[ 9] = ((const int * ALIGNED(64))a12)[ 9]; + b10.i[ 9] = ((const int * ALIGNED(64))a12)[10]; + b11.i[ 9] = ((const int * ALIGNED(64))a12)[11]; + b12.i[ 9] = ((const int * ALIGNED(64))a12)[12]; + b13.i[ 9] = ((const int * ALIGNED(64))a12)[13]; + b14.i[ 9] = ((const int * ALIGNED(64))a12)[14]; + b15.i[ 9] = ((const int * ALIGNED(64))a12)[15]; + + b08.i[10] = ((const int * ALIGNED(64))a13)[ 0]; + b09.i[10] = ((const int * ALIGNED(64))a13)[ 1]; + b10.i[10] = ((const int * ALIGNED(64))a13)[ 2]; + b11.i[10] = ((const int * ALIGNED(64))a13)[ 3]; + b12.i[10] = ((const int * ALIGNED(64))a13)[ 4]; + b13.i[10] = ((const int * ALIGNED(64))a13)[ 5]; + b14.i[10] = ((const int * ALIGNED(64))a13)[ 6]; + b15.i[10] = ((const int * ALIGNED(64))a13)[ 7]; + b08.i[11] = ((const int * ALIGNED(64))a13)[ 8]; + b09.i[11] = ((const int * ALIGNED(64))a13)[ 9]; + b10.i[11] = ((const int * ALIGNED(64))a13)[10]; + b11.i[11] = ((const int * ALIGNED(64))a13)[11]; + b12.i[11] = ((const int * ALIGNED(64))a13)[12]; + b13.i[11] = ((const int * ALIGNED(64))a13)[13]; + b14.i[11] = ((const int * ALIGNED(64))a13)[14]; + b15.i[11] = ((const int * ALIGNED(64))a13)[15]; + + b08.i[12] = ((const int * ALIGNED(64))a14)[ 0]; + b09.i[12] = ((const int * ALIGNED(64))a14)[ 1]; + b10.i[12] = ((const int * ALIGNED(64))a14)[ 2]; + b11.i[12] = ((const int * ALIGNED(64))a14)[ 3]; + b12.i[12] = ((const int * ALIGNED(64))a14)[ 4]; + b13.i[12] = ((const int * ALIGNED(64))a14)[ 5]; + b14.i[12] = ((const int * ALIGNED(64))a14)[ 6]; + b15.i[12] = ((const int * ALIGNED(64))a14)[ 7]; + b08.i[13] = ((const int * ALIGNED(64))a14)[ 8]; + b09.i[13] = ((const int * ALIGNED(64))a14)[ 9]; + b10.i[13] = ((const int * ALIGNED(64))a14)[10]; + b11.i[13] = ((const int * ALIGNED(64))a14)[11]; + b12.i[13] = ((const int * ALIGNED(64))a14)[12]; + b13.i[13] = ((const int * ALIGNED(64))a14)[13]; + b14.i[13] = ((const int * ALIGNED(64))a14)[14]; + b15.i[13] = ((const int * ALIGNED(64))a14)[15]; + + b08.i[14] = ((const int * ALIGNED(64))a15)[ 0]; + b09.i[14] = ((const int * ALIGNED(64))a15)[ 1]; + b10.i[14] = ((const int * ALIGNED(64))a15)[ 2]; + b11.i[14] = ((const int * ALIGNED(64))a15)[ 3]; + b12.i[14] = ((const int * ALIGNED(64))a15)[ 4]; + b13.i[14] = ((const int * ALIGNED(64))a15)[ 5]; + b14.i[14] = ((const int * ALIGNED(64))a15)[ 6]; + b15.i[14] = ((const int * ALIGNED(64))a15)[ 7]; + b08.i[15] = ((const int * ALIGNED(64))a15)[ 8]; + b09.i[15] = ((const int * ALIGNED(64))a15)[ 9]; + b10.i[15] = ((const int * ALIGNED(64))a15)[10]; + b11.i[15] = ((const int * ALIGNED(64))a15)[11]; + b12.i[15] = ((const int * ALIGNED(64))a15)[12]; + b13.i[15] = ((const int * ALIGNED(64))a15)[13]; + b14.i[15] = ((const int * ALIGNED(64))a15)[14]; + b15.i[15] = ((const int * ALIGNED(64))a15)[15]; + } + + inline void store_16x1_tr( const v16 &a, + void *a00, void *a01, void *a02, void *a03, + void *a04, void *a05, void *a06, void *a07, + void *a08, void *a09, void *a10, void *a11, + void *a12, void *a13, void *a14, void *a15 ) + { + ((int *)a00)[0] = 
a.i[ 0]; + ((int *)a01)[0] = a.i[ 1]; + ((int *)a02)[0] = a.i[ 2]; + ((int *)a03)[0] = a.i[ 3]; + ((int *)a04)[0] = a.i[ 4]; + ((int *)a05)[0] = a.i[ 5]; + ((int *)a06)[0] = a.i[ 6]; + ((int *)a07)[0] = a.i[ 7]; + ((int *)a08)[0] = a.i[ 8]; + ((int *)a09)[0] = a.i[ 9]; + ((int *)a10)[0] = a.i[10]; + ((int *)a11)[0] = a.i[11]; + ((int *)a12)[0] = a.i[12]; + ((int *)a13)[0] = a.i[13]; + ((int *)a14)[0] = a.i[14]; + ((int *)a15)[0] = a.i[15]; + } + + inline void store_16x2_tr( const v16 &a, const v16 &b, + void * ALIGNED(8) a00, void * ALIGNED(8) a01, + void * ALIGNED(8) a02, void * ALIGNED(8) a03, + void * ALIGNED(8) a04, void * ALIGNED(8) a05, + void * ALIGNED(8) a06, void * ALIGNED(8) a07, + void * ALIGNED(8) a08, void * ALIGNED(8) a09, + void * ALIGNED(8) a10, void * ALIGNED(8) a11, + void * ALIGNED(8) a12, void * ALIGNED(8) a13, + void * ALIGNED(8) a14, void * ALIGNED(8) a15 ) + { + ((int * ALIGNED(8))a00)[0] = a.i[ 0]; + ((int * ALIGNED(8))a00)[1] = b.i[ 0]; + + ((int * ALIGNED(8))a01)[0] = a.i[ 1]; + ((int * ALIGNED(8))a01)[1] = b.i[ 1]; + + ((int * ALIGNED(8))a02)[0] = a.i[ 2]; + ((int * ALIGNED(8))a02)[1] = b.i[ 2]; + + ((int * ALIGNED(8))a03)[0] = a.i[ 3]; + ((int * ALIGNED(8))a03)[1] = b.i[ 3]; + + ((int * ALIGNED(8))a04)[0] = a.i[ 4]; + ((int * ALIGNED(8))a04)[1] = b.i[ 4]; + + ((int * ALIGNED(8))a05)[0] = a.i[ 5]; + ((int * ALIGNED(8))a05)[1] = b.i[ 5]; + + ((int * ALIGNED(8))a06)[0] = a.i[ 6]; + ((int * ALIGNED(8))a06)[1] = b.i[ 6]; + + ((int * ALIGNED(8))a07)[0] = a.i[ 7]; + ((int * ALIGNED(8))a07)[1] = b.i[ 7]; + + ((int * ALIGNED(8))a08)[0] = a.i[ 8]; + ((int * ALIGNED(8))a08)[1] = b.i[ 8]; + + ((int * ALIGNED(8))a09)[0] = a.i[ 9]; + ((int * ALIGNED(8))a09)[1] = b.i[ 9]; + + ((int * ALIGNED(8))a10)[0] = a.i[10]; + ((int * ALIGNED(8))a10)[1] = b.i[10]; + + ((int * ALIGNED(8))a11)[0] = a.i[11]; + ((int * ALIGNED(8))a11)[1] = b.i[11]; + + ((int * ALIGNED(8))a12)[0] = a.i[12]; + ((int * ALIGNED(8))a12)[1] = b.i[12]; + + ((int * ALIGNED(8))a13)[0] = a.i[13]; + ((int * ALIGNED(8))a13)[1] = b.i[13]; + + ((int * ALIGNED(8))a14)[0] = a.i[14]; + ((int * ALIGNED(8))a14)[1] = b.i[14]; + + ((int * ALIGNED(8))a15)[0] = a.i[15]; + ((int * ALIGNED(8))a15)[1] = b.i[15]; + } + + inline void store_16x3_tr( const v16 &a, const v16 &b, const v16 &c, + void * ALIGNED(64) a00, void * ALIGNED(64) a01, + void * ALIGNED(64) a02, void * ALIGNED(64) a03, + void * ALIGNED(64) a04, void * ALIGNED(64) a05, + void * ALIGNED(64) a06, void * ALIGNED(64) a07, + void * ALIGNED(64) a08, void * ALIGNED(64) a09, + void * ALIGNED(64) a10, void * ALIGNED(64) a11, + void * ALIGNED(64) a12, void * ALIGNED(64) a13, + void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) + { + ((int * ALIGNED(64))a00)[0] = a.i[ 0]; + ((int * ALIGNED(64))a00)[1] = b.i[ 0]; + ((int * ALIGNED(64))a00)[2] = c.i[ 0]; + + ((int * ALIGNED(64))a01)[0] = a.i[ 1]; + ((int * ALIGNED(64))a01)[1] = b.i[ 1]; + ((int * ALIGNED(64))a01)[2] = c.i[ 1]; + + ((int * ALIGNED(64))a02)[0] = a.i[ 2]; + ((int * ALIGNED(64))a02)[1] = b.i[ 2]; + ((int * ALIGNED(64))a02)[2] = c.i[ 2]; + + ((int * ALIGNED(64))a03)[0] = a.i[ 3]; + ((int * ALIGNED(64))a03)[1] = b.i[ 3]; + ((int * ALIGNED(64))a03)[2] = c.i[ 3]; + + ((int * ALIGNED(64))a04)[0] = a.i[ 4]; + ((int * ALIGNED(64))a04)[1] = b.i[ 4]; + ((int * ALIGNED(64))a04)[2] = c.i[ 4]; + + ((int * ALIGNED(64))a05)[0] = a.i[ 5]; + ((int * ALIGNED(64))a05)[1] = b.i[ 5]; + ((int * ALIGNED(64))a05)[2] = c.i[ 5]; + + ((int * ALIGNED(64))a06)[0] = a.i[ 6]; + ((int * ALIGNED(64))a06)[1] = b.i[ 6]; + ((int * ALIGNED(64))a06)[2] = 
c.i[ 6]; + + ((int * ALIGNED(64))a07)[0] = a.i[ 7]; + ((int * ALIGNED(64))a07)[1] = b.i[ 7]; + ((int * ALIGNED(64))a07)[2] = c.i[ 7]; + + ((int * ALIGNED(64))a08)[0] = a.i[ 8]; + ((int * ALIGNED(64))a08)[1] = b.i[ 8]; + ((int * ALIGNED(64))a08)[2] = c.i[ 8]; + + ((int * ALIGNED(64))a09)[0] = a.i[ 9]; + ((int * ALIGNED(64))a09)[1] = b.i[ 9]; + ((int * ALIGNED(64))a09)[2] = c.i[ 9]; + + ((int * ALIGNED(64))a10)[0] = a.i[10]; + ((int * ALIGNED(64))a10)[1] = b.i[10]; + ((int * ALIGNED(64))a10)[2] = c.i[10]; + + ((int * ALIGNED(64))a11)[0] = a.i[11]; + ((int * ALIGNED(64))a11)[1] = b.i[11]; + ((int * ALIGNED(64))a11)[2] = c.i[11]; + + ((int * ALIGNED(64))a12)[0] = a.i[12]; + ((int * ALIGNED(64))a12)[1] = b.i[12]; + ((int * ALIGNED(64))a12)[2] = c.i[12]; + + ((int * ALIGNED(64))a13)[0] = a.i[13]; + ((int * ALIGNED(64))a13)[1] = b.i[13]; + ((int * ALIGNED(64))a13)[2] = c.i[13]; + + ((int * ALIGNED(64))a14)[0] = a.i[14]; + ((int * ALIGNED(64))a14)[1] = b.i[14]; + ((int * ALIGNED(64))a14)[2] = c.i[14]; + + ((int * ALIGNED(64))a15)[0] = a.i[15]; + ((int * ALIGNED(64))a15)[1] = b.i[15]; + ((int * ALIGNED(64))a15)[2] = c.i[15]; + } + + inline void store_16x4_tr( const v16 &a, const v16 &b, const v16 &c, const v16 &d, + void * ALIGNED(64) a00, void * ALIGNED(64) a01, + void * ALIGNED(64) a02, void * ALIGNED(64) a03, + void * ALIGNED(64) a04, void * ALIGNED(64) a05, + void * ALIGNED(64) a06, void * ALIGNED(64) a07, + void * ALIGNED(64) a08, void * ALIGNED(64) a09, + void * ALIGNED(64) a10, void * ALIGNED(64) a11, + void * ALIGNED(64) a12, void * ALIGNED(64) a13, + void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) + { + ((int * ALIGNED(64))a00)[0] = a.i[ 0]; + ((int * ALIGNED(64))a00)[1] = b.i[ 0]; + ((int * ALIGNED(64))a00)[2] = c.i[ 0]; + ((int * ALIGNED(64))a00)[3] = d.i[ 0]; + + ((int * ALIGNED(64))a01)[0] = a.i[ 1]; + ((int * ALIGNED(64))a01)[1] = b.i[ 1]; + ((int * ALIGNED(64))a01)[2] = c.i[ 1]; + ((int * ALIGNED(64))a01)[3] = d.i[ 1]; + + ((int * ALIGNED(64))a02)[0] = a.i[ 2]; + ((int * ALIGNED(64))a02)[1] = b.i[ 2]; + ((int * ALIGNED(64))a02)[2] = c.i[ 2]; + ((int * ALIGNED(64))a02)[3] = d.i[ 2]; + + ((int * ALIGNED(64))a03)[0] = a.i[ 3]; + ((int * ALIGNED(64))a03)[1] = b.i[ 3]; + ((int * ALIGNED(64))a03)[2] = c.i[ 3]; + ((int * ALIGNED(64))a03)[3] = d.i[ 3]; + + ((int * ALIGNED(64))a04)[0] = a.i[ 4]; + ((int * ALIGNED(64))a04)[1] = b.i[ 4]; + ((int * ALIGNED(64))a04)[2] = c.i[ 4]; + ((int * ALIGNED(64))a04)[3] = d.i[ 4]; + + ((int * ALIGNED(64))a05)[0] = a.i[ 5]; + ((int * ALIGNED(64))a05)[1] = b.i[ 5]; + ((int * ALIGNED(64))a05)[2] = c.i[ 5]; + ((int * ALIGNED(64))a05)[3] = d.i[ 5]; + + ((int * ALIGNED(64))a06)[0] = a.i[ 6]; + ((int * ALIGNED(64))a06)[1] = b.i[ 6]; + ((int * ALIGNED(64))a06)[2] = c.i[ 6]; + ((int * ALIGNED(64))a06)[3] = d.i[ 6]; + + ((int * ALIGNED(64))a07)[0] = a.i[ 7]; + ((int * ALIGNED(64))a07)[1] = b.i[ 7]; + ((int * ALIGNED(64))a07)[2] = c.i[ 7]; + ((int * ALIGNED(64))a07)[3] = d.i[ 7]; + + ((int * ALIGNED(64))a08)[0] = a.i[ 8]; + ((int * ALIGNED(64))a08)[1] = b.i[ 8]; + ((int * ALIGNED(64))a08)[2] = c.i[ 8]; + ((int * ALIGNED(64))a08)[3] = d.i[ 8]; + + ((int * ALIGNED(64))a09)[0] = a.i[ 9]; + ((int * ALIGNED(64))a09)[1] = b.i[ 9]; + ((int * ALIGNED(64))a09)[2] = c.i[ 9]; + ((int * ALIGNED(64))a09)[3] = d.i[ 9]; + + ((int * ALIGNED(64))a10)[0] = a.i[10]; + ((int * ALIGNED(64))a10)[1] = b.i[10]; + ((int * ALIGNED(64))a10)[2] = c.i[10]; + ((int * ALIGNED(64))a10)[3] = d.i[10]; + + ((int * ALIGNED(64))a11)[0] = a.i[11]; + ((int * ALIGNED(64))a11)[1] = b.i[11]; + ((int * 
ALIGNED(64))a11)[2] = c.i[11]; + ((int * ALIGNED(64))a11)[3] = d.i[11]; + + ((int * ALIGNED(64))a12)[0] = a.i[12]; + ((int * ALIGNED(64))a12)[1] = b.i[12]; + ((int * ALIGNED(64))a12)[2] = c.i[12]; + ((int * ALIGNED(64))a12)[3] = d.i[12]; + + ((int * ALIGNED(64))a13)[0] = a.i[13]; + ((int * ALIGNED(64))a13)[1] = b.i[13]; + ((int * ALIGNED(64))a13)[2] = c.i[13]; + ((int * ALIGNED(64))a13)[3] = d.i[13]; + + ((int * ALIGNED(64))a14)[0] = a.i[14]; + ((int * ALIGNED(64))a14)[1] = b.i[14]; + ((int * ALIGNED(64))a14)[2] = c.i[14]; + ((int * ALIGNED(64))a14)[3] = d.i[14]; + + ((int * ALIGNED(64))a15)[0] = a.i[15]; + ((int * ALIGNED(64))a15)[1] = b.i[15]; + ((int * ALIGNED(64))a15)[2] = c.i[15]; + ((int * ALIGNED(64))a15)[3] = d.i[15]; + } + + inline void store_16x8_tr( const v16 &a, const v16 &b, const v16 &c, const v16 &d, + const v16 &e, const v16 &f, const v16 &g, const v16 &h, + void * ALIGNED(64) a00, void * ALIGNED(64) a01, + void * ALIGNED(64) a02, void * ALIGNED(64) a03, + void * ALIGNED(64) a04, void * ALIGNED(64) a05, + void * ALIGNED(64) a06, void * ALIGNED(64) a07, + void * ALIGNED(64) a08, void * ALIGNED(64) a09, + void * ALIGNED(64) a10, void * ALIGNED(64) a11, + void * ALIGNED(64) a12, void * ALIGNED(64) a13, + void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) + { + ((int * ALIGNED(64))a00)[0] = a.i[ 0]; + ((int * ALIGNED(64))a00)[1] = b.i[ 0]; + ((int * ALIGNED(64))a00)[2] = c.i[ 0]; + ((int * ALIGNED(64))a00)[3] = d.i[ 0]; + ((int * ALIGNED(64))a00)[4] = e.i[ 0]; + ((int * ALIGNED(64))a00)[5] = f.i[ 0]; + ((int * ALIGNED(64))a00)[6] = g.i[ 0]; + ((int * ALIGNED(64))a00)[7] = h.i[ 0]; + + ((int * ALIGNED(64))a01)[0] = a.i[ 1]; + ((int * ALIGNED(64))a01)[1] = b.i[ 1]; + ((int * ALIGNED(64))a01)[2] = c.i[ 1]; + ((int * ALIGNED(64))a01)[3] = d.i[ 1]; + ((int * ALIGNED(64))a01)[4] = e.i[ 1]; + ((int * ALIGNED(64))a01)[5] = f.i[ 1]; + ((int * ALIGNED(64))a01)[6] = g.i[ 1]; + ((int * ALIGNED(64))a01)[7] = h.i[ 1]; + + ((int * ALIGNED(64))a02)[0] = a.i[ 2]; + ((int * ALIGNED(64))a02)[1] = b.i[ 2]; + ((int * ALIGNED(64))a02)[2] = c.i[ 2]; + ((int * ALIGNED(64))a02)[3] = d.i[ 2]; + ((int * ALIGNED(64))a02)[4] = e.i[ 2]; + ((int * ALIGNED(64))a02)[5] = f.i[ 2]; + ((int * ALIGNED(64))a02)[6] = g.i[ 2]; + ((int * ALIGNED(64))a02)[7] = h.i[ 2]; + + ((int * ALIGNED(64))a03)[0] = a.i[ 3]; + ((int * ALIGNED(64))a03)[1] = b.i[ 3]; + ((int * ALIGNED(64))a03)[2] = c.i[ 3]; + ((int * ALIGNED(64))a03)[3] = d.i[ 3]; + ((int * ALIGNED(64))a03)[4] = e.i[ 3]; + ((int * ALIGNED(64))a03)[5] = f.i[ 3]; + ((int * ALIGNED(64))a03)[6] = g.i[ 3]; + ((int * ALIGNED(64))a03)[7] = h.i[ 3]; + + ((int * ALIGNED(64))a04)[0] = a.i[ 4]; + ((int * ALIGNED(64))a04)[1] = b.i[ 4]; + ((int * ALIGNED(64))a04)[2] = c.i[ 4]; + ((int * ALIGNED(64))a04)[3] = d.i[ 4]; + ((int * ALIGNED(64))a04)[4] = e.i[ 4]; + ((int * ALIGNED(64))a04)[5] = f.i[ 4]; + ((int * ALIGNED(64))a04)[6] = g.i[ 4]; + ((int * ALIGNED(64))a04)[7] = h.i[ 4]; + + ((int * ALIGNED(64))a05)[0] = a.i[ 5]; + ((int * ALIGNED(64))a05)[1] = b.i[ 5]; + ((int * ALIGNED(64))a05)[2] = c.i[ 5]; + ((int * ALIGNED(64))a05)[3] = d.i[ 5]; + ((int * ALIGNED(64))a05)[4] = e.i[ 5]; + ((int * ALIGNED(64))a05)[5] = f.i[ 5]; + ((int * ALIGNED(64))a05)[6] = g.i[ 5]; + ((int * ALIGNED(64))a05)[7] = h.i[ 5]; + + ((int * ALIGNED(64))a06)[0] = a.i[ 6]; + ((int * ALIGNED(64))a06)[1] = b.i[ 6]; + ((int * ALIGNED(64))a06)[2] = c.i[ 6]; + ((int * ALIGNED(64))a06)[3] = d.i[ 6]; + ((int * ALIGNED(64))a06)[4] = e.i[ 6]; + ((int * ALIGNED(64))a06)[5] = f.i[ 6]; + ((int * ALIGNED(64))a06)[6] = g.i[ 
6]; + ((int * ALIGNED(64))a06)[7] = h.i[ 6]; + + ((int * ALIGNED(64))a07)[0] = a.i[ 7]; + ((int * ALIGNED(64))a07)[1] = b.i[ 7]; + ((int * ALIGNED(64))a07)[2] = c.i[ 7]; + ((int * ALIGNED(64))a07)[3] = d.i[ 7]; + ((int * ALIGNED(64))a07)[4] = e.i[ 7]; + ((int * ALIGNED(64))a07)[5] = f.i[ 7]; + ((int * ALIGNED(64))a07)[6] = g.i[ 7]; + ((int * ALIGNED(64))a07)[7] = h.i[ 7]; + + ((int * ALIGNED(64))a08)[0] = a.i[ 8]; + ((int * ALIGNED(64))a08)[1] = b.i[ 8]; + ((int * ALIGNED(64))a08)[2] = c.i[ 8]; + ((int * ALIGNED(64))a08)[3] = d.i[ 8]; + ((int * ALIGNED(64))a08)[4] = e.i[ 8]; + ((int * ALIGNED(64))a08)[5] = f.i[ 8]; + ((int * ALIGNED(64))a08)[6] = g.i[ 8]; + ((int * ALIGNED(64))a08)[7] = h.i[ 8]; + + ((int * ALIGNED(64))a09)[0] = a.i[ 9]; + ((int * ALIGNED(64))a09)[1] = b.i[ 9]; + ((int * ALIGNED(64))a09)[2] = c.i[ 9]; + ((int * ALIGNED(64))a09)[3] = d.i[ 9]; + ((int * ALIGNED(64))a09)[4] = e.i[ 9]; + ((int * ALIGNED(64))a09)[5] = f.i[ 9]; + ((int * ALIGNED(64))a09)[6] = g.i[ 9]; + ((int * ALIGNED(64))a09)[7] = h.i[ 9]; + + ((int * ALIGNED(64))a10)[0] = a.i[10]; + ((int * ALIGNED(64))a10)[1] = b.i[10]; + ((int * ALIGNED(64))a10)[2] = c.i[10]; + ((int * ALIGNED(64))a10)[3] = d.i[10]; + ((int * ALIGNED(64))a10)[4] = e.i[10]; + ((int * ALIGNED(64))a10)[5] = f.i[10]; + ((int * ALIGNED(64))a10)[6] = g.i[10]; + ((int * ALIGNED(64))a10)[7] = h.i[10]; + + ((int * ALIGNED(64))a11)[0] = a.i[11]; + ((int * ALIGNED(64))a11)[1] = b.i[11]; + ((int * ALIGNED(64))a11)[2] = c.i[11]; + ((int * ALIGNED(64))a11)[3] = d.i[11]; + ((int * ALIGNED(64))a11)[4] = e.i[11]; + ((int * ALIGNED(64))a11)[5] = f.i[11]; + ((int * ALIGNED(64))a11)[6] = g.i[11]; + ((int * ALIGNED(64))a11)[7] = h.i[11]; + + ((int * ALIGNED(64))a12)[0] = a.i[12]; + ((int * ALIGNED(64))a12)[1] = b.i[12]; + ((int * ALIGNED(64))a12)[2] = c.i[12]; + ((int * ALIGNED(64))a12)[3] = d.i[12]; + ((int * ALIGNED(64))a12)[4] = e.i[12]; + ((int * ALIGNED(64))a12)[5] = f.i[12]; + ((int * ALIGNED(64))a12)[6] = g.i[12]; + ((int * ALIGNED(64))a12)[7] = h.i[12]; + + ((int * ALIGNED(64))a13)[0] = a.i[13]; + ((int * ALIGNED(64))a13)[1] = b.i[13]; + ((int * ALIGNED(64))a13)[2] = c.i[13]; + ((int * ALIGNED(64))a13)[3] = d.i[13]; + ((int * ALIGNED(64))a13)[4] = e.i[13]; + ((int * ALIGNED(64))a13)[5] = f.i[13]; + ((int * ALIGNED(64))a13)[6] = g.i[13]; + ((int * ALIGNED(64))a13)[7] = h.i[13]; + + ((int * ALIGNED(64))a14)[0] = a.i[14]; + ((int * ALIGNED(64))a14)[1] = b.i[14]; + ((int * ALIGNED(64))a14)[2] = c.i[14]; + ((int * ALIGNED(64))a14)[3] = d.i[14]; + ((int * ALIGNED(64))a14)[4] = e.i[14]; + ((int * ALIGNED(64))a14)[5] = f.i[14]; + ((int * ALIGNED(64))a14)[6] = g.i[14]; + ((int * ALIGNED(64))a14)[7] = h.i[14]; + + ((int * ALIGNED(64))a15)[0] = a.i[15]; + ((int * ALIGNED(64))a15)[1] = b.i[15]; + ((int * ALIGNED(64))a15)[2] = c.i[15]; + ((int * ALIGNED(64))a15)[3] = d.i[15]; + ((int * ALIGNED(64))a15)[4] = e.i[15]; + ((int * ALIGNED(64))a15)[5] = f.i[15]; + ((int * ALIGNED(64))a15)[6] = g.i[15]; + ((int * ALIGNED(64))a15)[7] = h.i[15]; + } + + inline void store_16x16_tr( const v16 &b00, const v16 &b01, const v16 &b02, const v16 &b03, + const v16 &b04, const v16 &b05, const v16 &b06, const v16 &b07, + const v16 &b08, const v16 &b09, const v16 &b10, const v16 &b11, + const v16 &b12, const v16 &b13, const v16 &b14, const v16 &b15, + void * ALIGNED(64) a00, void * ALIGNED(64) a01, + void * ALIGNED(64) a02, void * ALIGNED(64) a03, + void * ALIGNED(64) a04, void * ALIGNED(64) a05, + void * ALIGNED(64) a06, void * ALIGNED(64) a07, + void * ALIGNED(64) a08, void * 
ALIGNED(64) a09, + void * ALIGNED(64) a10, void * ALIGNED(64) a11, + void * ALIGNED(64) a12, void * ALIGNED(64) a13, + void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) + { + ((int * ALIGNED(64))a00)[ 0] = b00.i[ 0]; + ((int * ALIGNED(64))a00)[ 1] = b01.i[ 0]; + ((int * ALIGNED(64))a00)[ 2] = b02.i[ 0]; + ((int * ALIGNED(64))a00)[ 3] = b03.i[ 0]; + ((int * ALIGNED(64))a00)[ 4] = b04.i[ 0]; + ((int * ALIGNED(64))a00)[ 5] = b05.i[ 0]; + ((int * ALIGNED(64))a00)[ 6] = b06.i[ 0]; + ((int * ALIGNED(64))a00)[ 7] = b07.i[ 0]; + ((int * ALIGNED(64))a00)[ 8] = b08.i[ 0]; + ((int * ALIGNED(64))a00)[ 9] = b09.i[ 0]; + ((int * ALIGNED(64))a00)[10] = b10.i[ 0]; + ((int * ALIGNED(64))a00)[11] = b11.i[ 0]; + ((int * ALIGNED(64))a00)[12] = b12.i[ 0]; + ((int * ALIGNED(64))a00)[13] = b13.i[ 0]; + ((int * ALIGNED(64))a00)[14] = b14.i[ 0]; + ((int * ALIGNED(64))a00)[15] = b15.i[ 0]; + + ((int * ALIGNED(64))a01)[ 0] = b00.i[ 1]; + ((int * ALIGNED(64))a01)[ 1] = b01.i[ 1]; + ((int * ALIGNED(64))a01)[ 2] = b02.i[ 1]; + ((int * ALIGNED(64))a01)[ 3] = b03.i[ 1]; + ((int * ALIGNED(64))a01)[ 4] = b04.i[ 1]; + ((int * ALIGNED(64))a01)[ 5] = b05.i[ 1]; + ((int * ALIGNED(64))a01)[ 6] = b06.i[ 1]; + ((int * ALIGNED(64))a01)[ 7] = b07.i[ 1]; + ((int * ALIGNED(64))a01)[ 8] = b08.i[ 1]; + ((int * ALIGNED(64))a01)[ 9] = b09.i[ 1]; + ((int * ALIGNED(64))a01)[10] = b10.i[ 1]; + ((int * ALIGNED(64))a01)[11] = b11.i[ 1]; + ((int * ALIGNED(64))a01)[12] = b12.i[ 1]; + ((int * ALIGNED(64))a01)[13] = b13.i[ 1]; + ((int * ALIGNED(64))a01)[14] = b14.i[ 1]; + ((int * ALIGNED(64))a01)[15] = b15.i[ 1]; + + ((int * ALIGNED(64))a02)[ 0] = b00.i[ 2]; + ((int * ALIGNED(64))a02)[ 1] = b01.i[ 2]; + ((int * ALIGNED(64))a02)[ 2] = b02.i[ 2]; + ((int * ALIGNED(64))a02)[ 3] = b03.i[ 2]; + ((int * ALIGNED(64))a02)[ 4] = b04.i[ 2]; + ((int * ALIGNED(64))a02)[ 5] = b05.i[ 2]; + ((int * ALIGNED(64))a02)[ 6] = b06.i[ 2]; + ((int * ALIGNED(64))a02)[ 7] = b07.i[ 2]; + ((int * ALIGNED(64))a02)[ 8] = b08.i[ 2]; + ((int * ALIGNED(64))a02)[ 9] = b09.i[ 2]; + ((int * ALIGNED(64))a02)[10] = b10.i[ 2]; + ((int * ALIGNED(64))a02)[11] = b11.i[ 2]; + ((int * ALIGNED(64))a02)[12] = b12.i[ 2]; + ((int * ALIGNED(64))a02)[13] = b13.i[ 2]; + ((int * ALIGNED(64))a02)[14] = b14.i[ 2]; + ((int * ALIGNED(64))a02)[15] = b15.i[ 2]; + + ((int * ALIGNED(64))a03)[ 0] = b00.i[ 3]; + ((int * ALIGNED(64))a03)[ 1] = b01.i[ 3]; + ((int * ALIGNED(64))a03)[ 2] = b02.i[ 3]; + ((int * ALIGNED(64))a03)[ 3] = b03.i[ 3]; + ((int * ALIGNED(64))a03)[ 4] = b04.i[ 3]; + ((int * ALIGNED(64))a03)[ 5] = b05.i[ 3]; + ((int * ALIGNED(64))a03)[ 6] = b06.i[ 3]; + ((int * ALIGNED(64))a03)[ 7] = b07.i[ 3]; + ((int * ALIGNED(64))a03)[ 8] = b08.i[ 3]; + ((int * ALIGNED(64))a03)[ 9] = b09.i[ 3]; + ((int * ALIGNED(64))a03)[10] = b10.i[ 3]; + ((int * ALIGNED(64))a03)[11] = b11.i[ 3]; + ((int * ALIGNED(64))a03)[12] = b12.i[ 3]; + ((int * ALIGNED(64))a03)[13] = b13.i[ 3]; + ((int * ALIGNED(64))a03)[14] = b14.i[ 3]; + ((int * ALIGNED(64))a03)[15] = b15.i[ 3]; + + ((int * ALIGNED(64))a04)[ 0] = b00.i[ 4]; + ((int * ALIGNED(64))a04)[ 1] = b01.i[ 4]; + ((int * ALIGNED(64))a04)[ 2] = b02.i[ 4]; + ((int * ALIGNED(64))a04)[ 3] = b03.i[ 4]; + ((int * ALIGNED(64))a04)[ 4] = b04.i[ 4]; + ((int * ALIGNED(64))a04)[ 5] = b05.i[ 4]; + ((int * ALIGNED(64))a04)[ 6] = b06.i[ 4]; + ((int * ALIGNED(64))a04)[ 7] = b07.i[ 4]; + ((int * ALIGNED(64))a04)[ 8] = b08.i[ 4]; + ((int * ALIGNED(64))a04)[ 9] = b09.i[ 4]; + ((int * ALIGNED(64))a04)[10] = b10.i[ 4]; + ((int * ALIGNED(64))a04)[11] = b11.i[ 4]; + ((int * ALIGNED(64))a04)[12] 
= b12.i[ 4]; + ((int * ALIGNED(64))a04)[13] = b13.i[ 4]; + ((int * ALIGNED(64))a04)[14] = b14.i[ 4]; + ((int * ALIGNED(64))a04)[15] = b15.i[ 4]; + + ((int * ALIGNED(64))a05)[ 0] = b00.i[ 5]; + ((int * ALIGNED(64))a05)[ 1] = b01.i[ 5]; + ((int * ALIGNED(64))a05)[ 2] = b02.i[ 5]; + ((int * ALIGNED(64))a05)[ 3] = b03.i[ 5]; + ((int * ALIGNED(64))a05)[ 4] = b04.i[ 5]; + ((int * ALIGNED(64))a05)[ 5] = b05.i[ 5]; + ((int * ALIGNED(64))a05)[ 6] = b06.i[ 5]; + ((int * ALIGNED(64))a05)[ 7] = b07.i[ 5]; + ((int * ALIGNED(64))a05)[ 8] = b08.i[ 5]; + ((int * ALIGNED(64))a05)[ 9] = b09.i[ 5]; + ((int * ALIGNED(64))a05)[10] = b10.i[ 5]; + ((int * ALIGNED(64))a05)[11] = b11.i[ 5]; + ((int * ALIGNED(64))a05)[12] = b12.i[ 5]; + ((int * ALIGNED(64))a05)[13] = b13.i[ 5]; + ((int * ALIGNED(64))a05)[14] = b14.i[ 5]; + ((int * ALIGNED(64))a05)[15] = b15.i[ 5]; + + ((int * ALIGNED(64))a06)[ 0] = b00.i[ 6]; + ((int * ALIGNED(64))a06)[ 1] = b01.i[ 6]; + ((int * ALIGNED(64))a06)[ 2] = b02.i[ 6]; + ((int * ALIGNED(64))a06)[ 3] = b03.i[ 6]; + ((int * ALIGNED(64))a06)[ 4] = b04.i[ 6]; + ((int * ALIGNED(64))a06)[ 5] = b05.i[ 6]; + ((int * ALIGNED(64))a06)[ 6] = b06.i[ 6]; + ((int * ALIGNED(64))a06)[ 7] = b07.i[ 6]; + ((int * ALIGNED(64))a06)[ 8] = b08.i[ 6]; + ((int * ALIGNED(64))a06)[ 9] = b09.i[ 6]; + ((int * ALIGNED(64))a06)[10] = b10.i[ 6]; + ((int * ALIGNED(64))a06)[11] = b11.i[ 6]; + ((int * ALIGNED(64))a06)[12] = b12.i[ 6]; + ((int * ALIGNED(64))a06)[13] = b13.i[ 6]; + ((int * ALIGNED(64))a06)[14] = b14.i[ 6]; + ((int * ALIGNED(64))a06)[15] = b15.i[ 6]; + + ((int * ALIGNED(64))a07)[ 0] = b00.i[ 7]; + ((int * ALIGNED(64))a07)[ 1] = b01.i[ 7]; + ((int * ALIGNED(64))a07)[ 2] = b02.i[ 7]; + ((int * ALIGNED(64))a07)[ 3] = b03.i[ 7]; + ((int * ALIGNED(64))a07)[ 4] = b04.i[ 7]; + ((int * ALIGNED(64))a07)[ 5] = b05.i[ 7]; + ((int * ALIGNED(64))a07)[ 6] = b06.i[ 7]; + ((int * ALIGNED(64))a07)[ 7] = b07.i[ 7]; + ((int * ALIGNED(64))a07)[ 8] = b08.i[ 7]; + ((int * ALIGNED(64))a07)[ 9] = b09.i[ 7]; + ((int * ALIGNED(64))a07)[10] = b10.i[ 7]; + ((int * ALIGNED(64))a07)[11] = b11.i[ 7]; + ((int * ALIGNED(64))a07)[12] = b12.i[ 7]; + ((int * ALIGNED(64))a07)[13] = b13.i[ 7]; + ((int * ALIGNED(64))a07)[14] = b14.i[ 7]; + ((int * ALIGNED(64))a07)[15] = b15.i[ 7]; + + ((int * ALIGNED(64))a08)[ 0] = b00.i[ 8]; + ((int * ALIGNED(64))a08)[ 1] = b01.i[ 8]; + ((int * ALIGNED(64))a08)[ 2] = b02.i[ 8]; + ((int * ALIGNED(64))a08)[ 3] = b03.i[ 8]; + ((int * ALIGNED(64))a08)[ 4] = b04.i[ 8]; + ((int * ALIGNED(64))a08)[ 5] = b05.i[ 8]; + ((int * ALIGNED(64))a08)[ 6] = b06.i[ 8]; + ((int * ALIGNED(64))a08)[ 7] = b07.i[ 8]; + ((int * ALIGNED(64))a08)[ 8] = b08.i[ 8]; + ((int * ALIGNED(64))a08)[ 9] = b09.i[ 8]; + ((int * ALIGNED(64))a08)[10] = b10.i[ 8]; + ((int * ALIGNED(64))a08)[11] = b11.i[ 8]; + ((int * ALIGNED(64))a08)[12] = b12.i[ 8]; + ((int * ALIGNED(64))a08)[13] = b13.i[ 8]; + ((int * ALIGNED(64))a08)[14] = b14.i[ 8]; + ((int * ALIGNED(64))a08)[15] = b15.i[ 8]; + + ((int * ALIGNED(64))a09)[ 0] = b00.i[ 9]; + ((int * ALIGNED(64))a09)[ 1] = b01.i[ 9]; + ((int * ALIGNED(64))a09)[ 2] = b02.i[ 9]; + ((int * ALIGNED(64))a09)[ 3] = b03.i[ 9]; + ((int * ALIGNED(64))a09)[ 4] = b04.i[ 9]; + ((int * ALIGNED(64))a09)[ 5] = b05.i[ 9]; + ((int * ALIGNED(64))a09)[ 6] = b06.i[ 9]; + ((int * ALIGNED(64))a09)[ 7] = b07.i[ 9]; + ((int * ALIGNED(64))a09)[ 8] = b08.i[ 9]; + ((int * ALIGNED(64))a09)[ 9] = b09.i[ 9]; + ((int * ALIGNED(64))a09)[10] = b10.i[ 9]; + ((int * ALIGNED(64))a09)[11] = b11.i[ 9]; + ((int * ALIGNED(64))a09)[12] = b12.i[ 9]; + ((int * 
ALIGNED(64))a09)[13] = b13.i[ 9]; + ((int * ALIGNED(64))a09)[14] = b14.i[ 9]; + ((int * ALIGNED(64))a09)[15] = b15.i[ 9]; + + ((int * ALIGNED(64))a10)[ 0] = b00.i[10]; + ((int * ALIGNED(64))a10)[ 1] = b01.i[10]; + ((int * ALIGNED(64))a10)[ 2] = b02.i[10]; + ((int * ALIGNED(64))a10)[ 3] = b03.i[10]; + ((int * ALIGNED(64))a10)[ 4] = b04.i[10]; + ((int * ALIGNED(64))a10)[ 5] = b05.i[10]; + ((int * ALIGNED(64))a10)[ 6] = b06.i[10]; + ((int * ALIGNED(64))a10)[ 7] = b07.i[10]; + ((int * ALIGNED(64))a10)[ 8] = b08.i[10]; + ((int * ALIGNED(64))a10)[ 9] = b09.i[10]; + ((int * ALIGNED(64))a10)[10] = b10.i[10]; + ((int * ALIGNED(64))a10)[11] = b11.i[10]; + ((int * ALIGNED(64))a10)[12] = b12.i[10]; + ((int * ALIGNED(64))a10)[13] = b13.i[10]; + ((int * ALIGNED(64))a10)[14] = b14.i[10]; + ((int * ALIGNED(64))a10)[15] = b15.i[10]; + + ((int * ALIGNED(64))a11)[ 0] = b00.i[11]; + ((int * ALIGNED(64))a11)[ 1] = b01.i[11]; + ((int * ALIGNED(64))a11)[ 2] = b02.i[11]; + ((int * ALIGNED(64))a11)[ 3] = b03.i[11]; + ((int * ALIGNED(64))a11)[ 4] = b04.i[11]; + ((int * ALIGNED(64))a11)[ 5] = b05.i[11]; + ((int * ALIGNED(64))a11)[ 6] = b06.i[11]; + ((int * ALIGNED(64))a11)[ 7] = b07.i[11]; + ((int * ALIGNED(64))a11)[ 8] = b08.i[11]; + ((int * ALIGNED(64))a11)[ 9] = b09.i[11]; + ((int * ALIGNED(64))a11)[10] = b10.i[11]; + ((int * ALIGNED(64))a11)[11] = b11.i[11]; + ((int * ALIGNED(64))a11)[12] = b12.i[11]; + ((int * ALIGNED(64))a11)[13] = b13.i[11]; + ((int * ALIGNED(64))a11)[14] = b14.i[11]; + ((int * ALIGNED(64))a11)[15] = b15.i[11]; + + ((int * ALIGNED(64))a12)[ 0] = b00.i[12]; + ((int * ALIGNED(64))a12)[ 1] = b01.i[12]; + ((int * ALIGNED(64))a12)[ 2] = b02.i[12]; + ((int * ALIGNED(64))a12)[ 3] = b03.i[12]; + ((int * ALIGNED(64))a12)[ 4] = b04.i[12]; + ((int * ALIGNED(64))a12)[ 5] = b05.i[12]; + ((int * ALIGNED(64))a12)[ 6] = b06.i[12]; + ((int * ALIGNED(64))a12)[ 7] = b07.i[12]; + ((int * ALIGNED(64))a12)[ 8] = b08.i[12]; + ((int * ALIGNED(64))a12)[ 9] = b09.i[12]; + ((int * ALIGNED(64))a12)[10] = b10.i[12]; + ((int * ALIGNED(64))a12)[11] = b11.i[12]; + ((int * ALIGNED(64))a12)[12] = b12.i[12]; + ((int * ALIGNED(64))a12)[13] = b13.i[12]; + ((int * ALIGNED(64))a12)[14] = b14.i[12]; + ((int * ALIGNED(64))a12)[15] = b15.i[12]; + + ((int * ALIGNED(64))a13)[ 0] = b00.i[13]; + ((int * ALIGNED(64))a13)[ 1] = b01.i[13]; + ((int * ALIGNED(64))a13)[ 2] = b02.i[13]; + ((int * ALIGNED(64))a13)[ 3] = b03.i[13]; + ((int * ALIGNED(64))a13)[ 4] = b04.i[13]; + ((int * ALIGNED(64))a13)[ 5] = b05.i[13]; + ((int * ALIGNED(64))a13)[ 6] = b06.i[13]; + ((int * ALIGNED(64))a13)[ 7] = b07.i[13]; + ((int * ALIGNED(64))a13)[ 8] = b08.i[13]; + ((int * ALIGNED(64))a13)[ 9] = b09.i[13]; + ((int * ALIGNED(64))a13)[10] = b10.i[13]; + ((int * ALIGNED(64))a13)[11] = b11.i[13]; + ((int * ALIGNED(64))a13)[12] = b12.i[13]; + ((int * ALIGNED(64))a13)[13] = b13.i[13]; + ((int * ALIGNED(64))a13)[14] = b14.i[13]; + ((int * ALIGNED(64))a13)[15] = b15.i[13]; + + ((int * ALIGNED(64))a14)[ 0] = b00.i[14]; + ((int * ALIGNED(64))a14)[ 1] = b01.i[14]; + ((int * ALIGNED(64))a14)[ 2] = b02.i[14]; + ((int * ALIGNED(64))a14)[ 3] = b03.i[14]; + ((int * ALIGNED(64))a14)[ 4] = b04.i[14]; + ((int * ALIGNED(64))a14)[ 5] = b05.i[14]; + ((int * ALIGNED(64))a14)[ 6] = b06.i[14]; + ((int * ALIGNED(64))a14)[ 7] = b07.i[14]; + ((int * ALIGNED(64))a14)[ 8] = b08.i[14]; + ((int * ALIGNED(64))a14)[ 9] = b09.i[14]; + ((int * ALIGNED(64))a14)[10] = b10.i[14]; + ((int * ALIGNED(64))a14)[11] = b11.i[14]; + ((int * ALIGNED(64))a14)[12] = b12.i[14]; + ((int * ALIGNED(64))a14)[13] = 
b13.i[14]; + ((int * ALIGNED(64))a14)[14] = b14.i[14]; + ((int * ALIGNED(64))a14)[15] = b15.i[14]; + + ((int * ALIGNED(64))a15)[ 0] = b00.i[15]; + ((int * ALIGNED(64))a15)[ 1] = b01.i[15]; + ((int * ALIGNED(64))a15)[ 2] = b02.i[15]; + ((int * ALIGNED(64))a15)[ 3] = b03.i[15]; + ((int * ALIGNED(64))a15)[ 4] = b04.i[15]; + ((int * ALIGNED(64))a15)[ 5] = b05.i[15]; + ((int * ALIGNED(64))a15)[ 6] = b06.i[15]; + ((int * ALIGNED(64))a15)[ 7] = b07.i[15]; + ((int * ALIGNED(64))a15)[ 8] = b08.i[15]; + ((int * ALIGNED(64))a15)[ 9] = b09.i[15]; + ((int * ALIGNED(64))a15)[10] = b10.i[15]; + ((int * ALIGNED(64))a15)[11] = b11.i[15]; + ((int * ALIGNED(64))a15)[12] = b12.i[15]; + ((int * ALIGNED(64))a15)[13] = b13.i[15]; + ((int * ALIGNED(64))a15)[14] = b14.i[15]; + ((int * ALIGNED(64))a15)[15] = b15.i[15]; + } + + inline void store_16x8_tr_p( const v16 &b00, + const v16 &b01, + const v16 &b02, + const v16 &b03, + const v16 &b04, + const v16 &b05, + const v16 &b06, + const v16 &b07, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07 ) + { + ((int * ALIGNED(64))a00)[ 0] = b00.i[ 0]; + ((int * ALIGNED(64))a00)[ 1] = b01.i[ 0]; + ((int * ALIGNED(64))a00)[ 2] = b02.i[ 0]; + ((int * ALIGNED(64))a00)[ 3] = b03.i[ 0]; + ((int * ALIGNED(64))a00)[ 4] = b04.i[ 0]; + ((int * ALIGNED(64))a00)[ 5] = b05.i[ 0]; + ((int * ALIGNED(64))a00)[ 6] = b06.i[ 0]; + ((int * ALIGNED(64))a00)[ 7] = b07.i[ 0]; + ((int * ALIGNED(64))a00)[ 8] = b00.i[ 1]; + ((int * ALIGNED(64))a00)[ 9] = b01.i[ 1]; + ((int * ALIGNED(64))a00)[10] = b02.i[ 1]; + ((int * ALIGNED(64))a00)[11] = b03.i[ 1]; + ((int * ALIGNED(64))a00)[12] = b04.i[ 1]; + ((int * ALIGNED(64))a00)[13] = b05.i[ 1]; + ((int * ALIGNED(64))a00)[14] = b06.i[ 1]; + ((int * ALIGNED(64))a00)[15] = b07.i[ 1]; + + ((int * ALIGNED(64))a01)[ 0] = b00.i[ 2]; + ((int * ALIGNED(64))a01)[ 1] = b01.i[ 2]; + ((int * ALIGNED(64))a01)[ 2] = b02.i[ 2]; + ((int * ALIGNED(64))a01)[ 3] = b03.i[ 2]; + ((int * ALIGNED(64))a01)[ 4] = b04.i[ 2]; + ((int * ALIGNED(64))a01)[ 5] = b05.i[ 2]; + ((int * ALIGNED(64))a01)[ 6] = b06.i[ 2]; + ((int * ALIGNED(64))a01)[ 7] = b07.i[ 2]; + ((int * ALIGNED(64))a01)[ 8] = b00.i[ 3]; + ((int * ALIGNED(64))a01)[ 9] = b01.i[ 3]; + ((int * ALIGNED(64))a01)[10] = b02.i[ 3]; + ((int * ALIGNED(64))a01)[11] = b03.i[ 3]; + ((int * ALIGNED(64))a01)[12] = b04.i[ 3]; + ((int * ALIGNED(64))a01)[13] = b05.i[ 3]; + ((int * ALIGNED(64))a01)[14] = b06.i[ 3]; + ((int * ALIGNED(64))a01)[15] = b07.i[ 3]; + + ((int * ALIGNED(64))a02)[ 0] = b00.i[ 4]; + ((int * ALIGNED(64))a02)[ 1] = b01.i[ 4]; + ((int * ALIGNED(64))a02)[ 2] = b02.i[ 4]; + ((int * ALIGNED(64))a02)[ 3] = b03.i[ 4]; + ((int * ALIGNED(64))a02)[ 4] = b04.i[ 4]; + ((int * ALIGNED(64))a02)[ 5] = b05.i[ 4]; + ((int * ALIGNED(64))a02)[ 6] = b06.i[ 4]; + ((int * ALIGNED(64))a02)[ 7] = b07.i[ 4]; + ((int * ALIGNED(64))a02)[ 8] = b00.i[ 5]; + ((int * ALIGNED(64))a02)[ 9] = b01.i[ 5]; + ((int * ALIGNED(64))a02)[10] = b02.i[ 5]; + ((int * ALIGNED(64))a02)[11] = b03.i[ 5]; + ((int * ALIGNED(64))a02)[12] = b04.i[ 5]; + ((int * ALIGNED(64))a02)[13] = b05.i[ 5]; + ((int * ALIGNED(64))a02)[14] = b06.i[ 5]; + ((int * ALIGNED(64))a02)[15] = b07.i[ 5]; + + ((int * ALIGNED(64))a03)[ 0] = b00.i[ 6]; + ((int * ALIGNED(64))a03)[ 1] = b01.i[ 6]; + ((int * ALIGNED(64))a03)[ 2] = b02.i[ 6]; + ((int * ALIGNED(64))a03)[ 3] = b03.i[ 6]; + ((int * ALIGNED(64))a03)[ 4] = b04.i[ 6]; + ((int * 
ALIGNED(64))a03)[ 5] = b05.i[ 6]; + ((int * ALIGNED(64))a03)[ 6] = b06.i[ 6]; + ((int * ALIGNED(64))a03)[ 7] = b07.i[ 6]; + ((int * ALIGNED(64))a03)[ 8] = b00.i[ 7]; + ((int * ALIGNED(64))a03)[ 9] = b01.i[ 7]; + ((int * ALIGNED(64))a03)[10] = b02.i[ 7]; + ((int * ALIGNED(64))a03)[11] = b03.i[ 7]; + ((int * ALIGNED(64))a03)[12] = b04.i[ 7]; + ((int * ALIGNED(64))a03)[13] = b05.i[ 7]; + ((int * ALIGNED(64))a03)[14] = b06.i[ 7]; + ((int * ALIGNED(64))a03)[15] = b07.i[ 7]; + + ((int * ALIGNED(64))a04)[ 0] = b00.i[ 8]; + ((int * ALIGNED(64))a04)[ 1] = b01.i[ 8]; + ((int * ALIGNED(64))a04)[ 2] = b02.i[ 8]; + ((int * ALIGNED(64))a04)[ 3] = b03.i[ 8]; + ((int * ALIGNED(64))a04)[ 4] = b04.i[ 8]; + ((int * ALIGNED(64))a04)[ 5] = b05.i[ 8]; + ((int * ALIGNED(64))a04)[ 6] = b06.i[ 8]; + ((int * ALIGNED(64))a04)[ 7] = b07.i[ 8]; + ((int * ALIGNED(64))a04)[ 8] = b00.i[ 9]; + ((int * ALIGNED(64))a04)[ 9] = b01.i[ 9]; + ((int * ALIGNED(64))a04)[10] = b02.i[ 9]; + ((int * ALIGNED(64))a04)[11] = b03.i[ 9]; + ((int * ALIGNED(64))a04)[12] = b04.i[ 9]; + ((int * ALIGNED(64))a04)[13] = b05.i[ 9]; + ((int * ALIGNED(64))a04)[14] = b06.i[ 9]; + ((int * ALIGNED(64))a04)[15] = b07.i[ 9]; + + ((int * ALIGNED(64))a05)[ 0] = b00.i[10]; + ((int * ALIGNED(64))a05)[ 1] = b01.i[10]; + ((int * ALIGNED(64))a05)[ 2] = b02.i[10]; + ((int * ALIGNED(64))a05)[ 3] = b03.i[10]; + ((int * ALIGNED(64))a05)[ 4] = b04.i[10]; + ((int * ALIGNED(64))a05)[ 5] = b05.i[10]; + ((int * ALIGNED(64))a05)[ 6] = b06.i[10]; + ((int * ALIGNED(64))a05)[ 7] = b07.i[10]; + ((int * ALIGNED(64))a05)[ 8] = b00.i[11]; + ((int * ALIGNED(64))a05)[ 9] = b01.i[11]; + ((int * ALIGNED(64))a05)[10] = b02.i[11]; + ((int * ALIGNED(64))a05)[11] = b03.i[11]; + ((int * ALIGNED(64))a05)[12] = b04.i[11]; + ((int * ALIGNED(64))a05)[13] = b05.i[11]; + ((int * ALIGNED(64))a05)[14] = b06.i[11]; + ((int * ALIGNED(64))a05)[15] = b07.i[11]; + + ((int * ALIGNED(64))a06)[ 0] = b00.i[12]; + ((int * ALIGNED(64))a06)[ 1] = b01.i[12]; + ((int * ALIGNED(64))a06)[ 2] = b02.i[12]; + ((int * ALIGNED(64))a06)[ 3] = b03.i[12]; + ((int * ALIGNED(64))a06)[ 4] = b04.i[12]; + ((int * ALIGNED(64))a06)[ 5] = b05.i[12]; + ((int * ALIGNED(64))a06)[ 6] = b06.i[12]; + ((int * ALIGNED(64))a06)[ 7] = b07.i[12]; + ((int * ALIGNED(64))a06)[ 8] = b00.i[13]; + ((int * ALIGNED(64))a06)[ 9] = b01.i[13]; + ((int * ALIGNED(64))a06)[10] = b02.i[13]; + ((int * ALIGNED(64))a06)[11] = b03.i[13]; + ((int * ALIGNED(64))a06)[12] = b04.i[13]; + ((int * ALIGNED(64))a06)[13] = b05.i[13]; + ((int * ALIGNED(64))a06)[14] = b06.i[13]; + ((int * ALIGNED(64))a06)[15] = b07.i[13]; + + ((int * ALIGNED(64))a07)[ 0] = b00.i[14]; + ((int * ALIGNED(64))a07)[ 1] = b01.i[14]; + ((int * ALIGNED(64))a07)[ 2] = b02.i[14]; + ((int * ALIGNED(64))a07)[ 3] = b03.i[14]; + ((int * ALIGNED(64))a07)[ 4] = b04.i[14]; + ((int * ALIGNED(64))a07)[ 5] = b05.i[14]; + ((int * ALIGNED(64))a07)[ 6] = b06.i[14]; + ((int * ALIGNED(64))a07)[ 7] = b07.i[14]; + ((int * ALIGNED(64))a07)[ 8] = b00.i[15]; + ((int * ALIGNED(64))a07)[ 9] = b01.i[15]; + ((int * ALIGNED(64))a07)[10] = b02.i[15]; + ((int * ALIGNED(64))a07)[11] = b03.i[15]; + ((int * ALIGNED(64))a07)[12] = b04.i[15]; + ((int * ALIGNED(64))a07)[13] = b05.i[15]; + ((int * ALIGNED(64))a07)[14] = b06.i[15]; + ((int * ALIGNED(64))a07)[15] = b07.i[15]; + } + + inline void store_16x16_tr_p( const v16 &b00, const v16 &b01, const v16 &b02, const v16 &b03, + const v16 &b04, const v16 &b05, const v16 &b06, const v16 &b07, + const v16 &b08, const v16 &b09, const v16 &b10, const v16 &b11, + const v16 &b12, 
const v16 &b13, const v16 &b14, const v16 &b15, + void * ALIGNED(64) a00, void * ALIGNED(64) a01, + void * ALIGNED(64) a02, void * ALIGNED(64) a03, + void * ALIGNED(64) a04, void * ALIGNED(64) a05, + void * ALIGNED(64) a06, void * ALIGNED(64) a07, + void * ALIGNED(64) a08, void * ALIGNED(64) a09, + void * ALIGNED(64) a10, void * ALIGNED(64) a11, + void * ALIGNED(64) a12, void * ALIGNED(64) a13, + void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) + { + ((int * ALIGNED(64))a00)[ 0] = b00.i[ 0]; + ((int * ALIGNED(64))a00)[ 1] = b01.i[ 0]; + ((int * ALIGNED(64))a00)[ 2] = b02.i[ 0]; + ((int * ALIGNED(64))a00)[ 3] = b03.i[ 0]; + ((int * ALIGNED(64))a00)[ 4] = b04.i[ 0]; + ((int * ALIGNED(64))a00)[ 5] = b05.i[ 0]; + ((int * ALIGNED(64))a00)[ 6] = b06.i[ 0]; + ((int * ALIGNED(64))a00)[ 7] = b07.i[ 0]; + ((int * ALIGNED(64))a00)[ 8] = b00.i[ 1]; + ((int * ALIGNED(64))a00)[ 9] = b01.i[ 1]; + ((int * ALIGNED(64))a00)[10] = b02.i[ 1]; + ((int * ALIGNED(64))a00)[11] = b03.i[ 1]; + ((int * ALIGNED(64))a00)[12] = b04.i[ 1]; + ((int * ALIGNED(64))a00)[13] = b05.i[ 1]; + ((int * ALIGNED(64))a00)[14] = b06.i[ 1]; + ((int * ALIGNED(64))a00)[15] = b07.i[ 1]; + + ((int * ALIGNED(64))a01)[ 0] = b00.i[ 2]; + ((int * ALIGNED(64))a01)[ 1] = b01.i[ 2]; + ((int * ALIGNED(64))a01)[ 2] = b02.i[ 2]; + ((int * ALIGNED(64))a01)[ 3] = b03.i[ 2]; + ((int * ALIGNED(64))a01)[ 4] = b04.i[ 2]; + ((int * ALIGNED(64))a01)[ 5] = b05.i[ 2]; + ((int * ALIGNED(64))a01)[ 6] = b06.i[ 2]; + ((int * ALIGNED(64))a01)[ 7] = b07.i[ 2]; + ((int * ALIGNED(64))a01)[ 8] = b00.i[ 3]; + ((int * ALIGNED(64))a01)[ 9] = b01.i[ 3]; + ((int * ALIGNED(64))a01)[10] = b02.i[ 3]; + ((int * ALIGNED(64))a01)[11] = b03.i[ 3]; + ((int * ALIGNED(64))a01)[12] = b04.i[ 3]; + ((int * ALIGNED(64))a01)[13] = b05.i[ 3]; + ((int * ALIGNED(64))a01)[14] = b06.i[ 3]; + ((int * ALIGNED(64))a01)[15] = b07.i[ 3]; + + ((int * ALIGNED(64))a02)[ 0] = b00.i[ 4]; + ((int * ALIGNED(64))a02)[ 1] = b01.i[ 4]; + ((int * ALIGNED(64))a02)[ 2] = b02.i[ 4]; + ((int * ALIGNED(64))a02)[ 3] = b03.i[ 4]; + ((int * ALIGNED(64))a02)[ 4] = b04.i[ 4]; + ((int * ALIGNED(64))a02)[ 5] = b05.i[ 4]; + ((int * ALIGNED(64))a02)[ 6] = b06.i[ 4]; + ((int * ALIGNED(64))a02)[ 7] = b07.i[ 4]; + ((int * ALIGNED(64))a02)[ 8] = b00.i[ 5]; + ((int * ALIGNED(64))a02)[ 9] = b01.i[ 5]; + ((int * ALIGNED(64))a02)[10] = b02.i[ 5]; + ((int * ALIGNED(64))a02)[11] = b03.i[ 5]; + ((int * ALIGNED(64))a02)[12] = b04.i[ 5]; + ((int * ALIGNED(64))a02)[13] = b05.i[ 5]; + ((int * ALIGNED(64))a02)[14] = b06.i[ 5]; + ((int * ALIGNED(64))a02)[15] = b07.i[ 5]; + + ((int * ALIGNED(64))a03)[ 0] = b00.i[ 6]; + ((int * ALIGNED(64))a03)[ 1] = b01.i[ 6]; + ((int * ALIGNED(64))a03)[ 2] = b02.i[ 6]; + ((int * ALIGNED(64))a03)[ 3] = b03.i[ 6]; + ((int * ALIGNED(64))a03)[ 4] = b04.i[ 6]; + ((int * ALIGNED(64))a03)[ 5] = b05.i[ 6]; + ((int * ALIGNED(64))a03)[ 6] = b06.i[ 6]; + ((int * ALIGNED(64))a03)[ 7] = b07.i[ 6]; + ((int * ALIGNED(64))a03)[ 8] = b00.i[ 7]; + ((int * ALIGNED(64))a03)[ 9] = b01.i[ 7]; + ((int * ALIGNED(64))a03)[10] = b02.i[ 7]; + ((int * ALIGNED(64))a03)[11] = b03.i[ 7]; + ((int * ALIGNED(64))a03)[12] = b04.i[ 7]; + ((int * ALIGNED(64))a03)[13] = b05.i[ 7]; + ((int * ALIGNED(64))a03)[14] = b06.i[ 7]; + ((int * ALIGNED(64))a03)[15] = b07.i[ 7]; + + ((int * ALIGNED(64))a04)[ 0] = b00.i[ 8]; + ((int * ALIGNED(64))a04)[ 1] = b01.i[ 8]; + ((int * ALIGNED(64))a04)[ 2] = b02.i[ 8]; + ((int * ALIGNED(64))a04)[ 3] = b03.i[ 8]; + ((int * ALIGNED(64))a04)[ 4] = b04.i[ 8]; + ((int * ALIGNED(64))a04)[ 5] = b05.i[ 8]; + ((int * 
ALIGNED(64))a04)[ 6] = b06.i[ 8]; + ((int * ALIGNED(64))a04)[ 7] = b07.i[ 8]; + ((int * ALIGNED(64))a04)[ 8] = b00.i[ 9]; + ((int * ALIGNED(64))a04)[ 9] = b01.i[ 9]; + ((int * ALIGNED(64))a04)[10] = b02.i[ 9]; + ((int * ALIGNED(64))a04)[11] = b03.i[ 9]; + ((int * ALIGNED(64))a04)[12] = b04.i[ 9]; + ((int * ALIGNED(64))a04)[13] = b05.i[ 9]; + ((int * ALIGNED(64))a04)[14] = b06.i[ 9]; + ((int * ALIGNED(64))a04)[15] = b07.i[ 9]; + + ((int * ALIGNED(64))a05)[ 0] = b00.i[10]; + ((int * ALIGNED(64))a05)[ 1] = b01.i[10]; + ((int * ALIGNED(64))a05)[ 2] = b02.i[10]; + ((int * ALIGNED(64))a05)[ 3] = b03.i[10]; + ((int * ALIGNED(64))a05)[ 4] = b04.i[10]; + ((int * ALIGNED(64))a05)[ 5] = b05.i[10]; + ((int * ALIGNED(64))a05)[ 6] = b06.i[10]; + ((int * ALIGNED(64))a05)[ 7] = b07.i[10]; + ((int * ALIGNED(64))a05)[ 8] = b00.i[11]; + ((int * ALIGNED(64))a05)[ 9] = b01.i[11]; + ((int * ALIGNED(64))a05)[10] = b02.i[11]; + ((int * ALIGNED(64))a05)[11] = b03.i[11]; + ((int * ALIGNED(64))a05)[12] = b04.i[11]; + ((int * ALIGNED(64))a05)[13] = b05.i[11]; + ((int * ALIGNED(64))a05)[14] = b06.i[11]; + ((int * ALIGNED(64))a05)[15] = b07.i[11]; + + ((int * ALIGNED(64))a06)[ 0] = b00.i[12]; + ((int * ALIGNED(64))a06)[ 1] = b01.i[12]; + ((int * ALIGNED(64))a06)[ 2] = b02.i[12]; + ((int * ALIGNED(64))a06)[ 3] = b03.i[12]; + ((int * ALIGNED(64))a06)[ 4] = b04.i[12]; + ((int * ALIGNED(64))a06)[ 5] = b05.i[12]; + ((int * ALIGNED(64))a06)[ 6] = b06.i[12]; + ((int * ALIGNED(64))a06)[ 7] = b07.i[12]; + ((int * ALIGNED(64))a06)[ 8] = b00.i[13]; + ((int * ALIGNED(64))a06)[ 9] = b01.i[13]; + ((int * ALIGNED(64))a06)[10] = b02.i[13]; + ((int * ALIGNED(64))a06)[11] = b03.i[13]; + ((int * ALIGNED(64))a06)[12] = b04.i[13]; + ((int * ALIGNED(64))a06)[13] = b05.i[13]; + ((int * ALIGNED(64))a06)[14] = b06.i[13]; + ((int * ALIGNED(64))a06)[15] = b07.i[13]; + + ((int * ALIGNED(64))a07)[ 0] = b00.i[14]; + ((int * ALIGNED(64))a07)[ 1] = b01.i[14]; + ((int * ALIGNED(64))a07)[ 2] = b02.i[14]; + ((int * ALIGNED(64))a07)[ 3] = b03.i[14]; + ((int * ALIGNED(64))a07)[ 4] = b04.i[14]; + ((int * ALIGNED(64))a07)[ 5] = b05.i[14]; + ((int * ALIGNED(64))a07)[ 6] = b06.i[14]; + ((int * ALIGNED(64))a07)[ 7] = b07.i[14]; + ((int * ALIGNED(64))a07)[ 8] = b00.i[15]; + ((int * ALIGNED(64))a07)[ 9] = b01.i[15]; + ((int * ALIGNED(64))a07)[10] = b02.i[15]; + ((int * ALIGNED(64))a07)[11] = b03.i[15]; + ((int * ALIGNED(64))a07)[12] = b04.i[15]; + ((int * ALIGNED(64))a07)[13] = b05.i[15]; + ((int * ALIGNED(64))a07)[14] = b06.i[15]; + ((int * ALIGNED(64))a07)[15] = b07.i[15]; + + ((int * ALIGNED(64))a08)[ 0] = b08.i[ 0]; + ((int * ALIGNED(64))a08)[ 1] = b09.i[ 0]; + ((int * ALIGNED(64))a08)[ 2] = b10.i[ 0]; + ((int * ALIGNED(64))a08)[ 3] = b11.i[ 0]; + ((int * ALIGNED(64))a08)[ 4] = b12.i[ 0]; + ((int * ALIGNED(64))a08)[ 5] = b13.i[ 0]; + ((int * ALIGNED(64))a08)[ 6] = b14.i[ 0]; + ((int * ALIGNED(64))a08)[ 7] = b15.i[ 0]; + ((int * ALIGNED(64))a08)[ 8] = b08.i[ 1]; + ((int * ALIGNED(64))a08)[ 9] = b09.i[ 1]; + ((int * ALIGNED(64))a08)[10] = b10.i[ 1]; + ((int * ALIGNED(64))a08)[11] = b11.i[ 1]; + ((int * ALIGNED(64))a08)[12] = b12.i[ 1]; + ((int * ALIGNED(64))a08)[13] = b13.i[ 1]; + ((int * ALIGNED(64))a08)[14] = b14.i[ 1]; + ((int * ALIGNED(64))a08)[15] = b15.i[ 1]; + + ((int * ALIGNED(64))a09)[ 0] = b08.i[ 2]; + ((int * ALIGNED(64))a09)[ 1] = b09.i[ 2]; + ((int * ALIGNED(64))a09)[ 2] = b10.i[ 2]; + ((int * ALIGNED(64))a09)[ 3] = b11.i[ 2]; + ((int * ALIGNED(64))a09)[ 4] = b12.i[ 2]; + ((int * ALIGNED(64))a09)[ 5] = b13.i[ 2]; + ((int * ALIGNED(64))a09)[ 6] = 
b14.i[ 2]; + ((int * ALIGNED(64))a09)[ 7] = b15.i[ 2]; + ((int * ALIGNED(64))a09)[ 8] = b08.i[ 3]; + ((int * ALIGNED(64))a09)[ 9] = b09.i[ 3]; + ((int * ALIGNED(64))a09)[10] = b10.i[ 3]; + ((int * ALIGNED(64))a09)[11] = b11.i[ 3]; + ((int * ALIGNED(64))a09)[12] = b12.i[ 3]; + ((int * ALIGNED(64))a09)[13] = b13.i[ 3]; + ((int * ALIGNED(64))a09)[14] = b14.i[ 3]; + ((int * ALIGNED(64))a09)[15] = b15.i[ 3]; + + ((int * ALIGNED(64))a10)[ 0] = b08.i[ 4]; + ((int * ALIGNED(64))a10)[ 1] = b09.i[ 4]; + ((int * ALIGNED(64))a10)[ 2] = b10.i[ 4]; + ((int * ALIGNED(64))a10)[ 3] = b11.i[ 4]; + ((int * ALIGNED(64))a10)[ 4] = b12.i[ 4]; + ((int * ALIGNED(64))a10)[ 5] = b13.i[ 4]; + ((int * ALIGNED(64))a10)[ 6] = b14.i[ 4]; + ((int * ALIGNED(64))a10)[ 7] = b15.i[ 4]; + ((int * ALIGNED(64))a10)[ 8] = b08.i[ 5]; + ((int * ALIGNED(64))a10)[ 9] = b09.i[ 5]; + ((int * ALIGNED(64))a10)[10] = b10.i[ 5]; + ((int * ALIGNED(64))a10)[11] = b11.i[ 5]; + ((int * ALIGNED(64))a10)[12] = b12.i[ 5]; + ((int * ALIGNED(64))a10)[13] = b13.i[ 5]; + ((int * ALIGNED(64))a10)[14] = b14.i[ 5]; + ((int * ALIGNED(64))a10)[15] = b15.i[ 5]; + + ((int * ALIGNED(64))a11)[ 0] = b08.i[ 6]; + ((int * ALIGNED(64))a11)[ 1] = b09.i[ 6]; + ((int * ALIGNED(64))a11)[ 2] = b10.i[ 6]; + ((int * ALIGNED(64))a11)[ 3] = b11.i[ 6]; + ((int * ALIGNED(64))a11)[ 4] = b12.i[ 6]; + ((int * ALIGNED(64))a11)[ 5] = b13.i[ 6]; + ((int * ALIGNED(64))a11)[ 6] = b14.i[ 6]; + ((int * ALIGNED(64))a11)[ 7] = b15.i[ 6]; + ((int * ALIGNED(64))a11)[ 8] = b08.i[ 7]; + ((int * ALIGNED(64))a11)[ 9] = b09.i[ 7]; + ((int * ALIGNED(64))a11)[10] = b10.i[ 7]; + ((int * ALIGNED(64))a11)[11] = b11.i[ 7]; + ((int * ALIGNED(64))a11)[12] = b12.i[ 7]; + ((int * ALIGNED(64))a11)[13] = b13.i[ 7]; + ((int * ALIGNED(64))a11)[14] = b14.i[ 7]; + ((int * ALIGNED(64))a11)[15] = b15.i[ 7]; + + ((int * ALIGNED(64))a12)[ 0] = b08.i[ 8]; + ((int * ALIGNED(64))a12)[ 1] = b09.i[ 8]; + ((int * ALIGNED(64))a12)[ 2] = b10.i[ 8]; + ((int * ALIGNED(64))a12)[ 3] = b11.i[ 8]; + ((int * ALIGNED(64))a12)[ 4] = b12.i[ 8]; + ((int * ALIGNED(64))a12)[ 5] = b13.i[ 8]; + ((int * ALIGNED(64))a12)[ 6] = b14.i[ 8]; + ((int * ALIGNED(64))a12)[ 7] = b15.i[ 8]; + ((int * ALIGNED(64))a12)[ 8] = b08.i[ 9]; + ((int * ALIGNED(64))a12)[ 9] = b09.i[ 9]; + ((int * ALIGNED(64))a12)[10] = b10.i[ 9]; + ((int * ALIGNED(64))a12)[11] = b11.i[ 9]; + ((int * ALIGNED(64))a12)[12] = b12.i[ 9]; + ((int * ALIGNED(64))a12)[13] = b13.i[ 9]; + ((int * ALIGNED(64))a12)[14] = b14.i[ 9]; + ((int * ALIGNED(64))a12)[15] = b15.i[ 9]; + + ((int * ALIGNED(64))a13)[ 0] = b08.i[10]; + ((int * ALIGNED(64))a13)[ 1] = b09.i[10]; + ((int * ALIGNED(64))a13)[ 2] = b10.i[10]; + ((int * ALIGNED(64))a13)[ 3] = b11.i[10]; + ((int * ALIGNED(64))a13)[ 4] = b12.i[10]; + ((int * ALIGNED(64))a13)[ 5] = b13.i[10]; + ((int * ALIGNED(64))a13)[ 6] = b14.i[10]; + ((int * ALIGNED(64))a13)[ 7] = b15.i[10]; + ((int * ALIGNED(64))a13)[ 8] = b08.i[11]; + ((int * ALIGNED(64))a13)[ 9] = b09.i[11]; + ((int * ALIGNED(64))a13)[10] = b10.i[11]; + ((int * ALIGNED(64))a13)[11] = b11.i[11]; + ((int * ALIGNED(64))a13)[12] = b12.i[11]; + ((int * ALIGNED(64))a13)[13] = b13.i[11]; + ((int * ALIGNED(64))a13)[14] = b14.i[11]; + ((int * ALIGNED(64))a13)[15] = b15.i[11]; + + ((int * ALIGNED(64))a14)[ 0] = b08.i[12]; + ((int * ALIGNED(64))a14)[ 1] = b09.i[12]; + ((int * ALIGNED(64))a14)[ 2] = b10.i[12]; + ((int * ALIGNED(64))a14)[ 3] = b11.i[12]; + ((int * ALIGNED(64))a14)[ 4] = b12.i[12]; + ((int * ALIGNED(64))a14)[ 5] = b13.i[12]; + ((int * ALIGNED(64))a14)[ 6] = b14.i[12]; + ((int * 
ALIGNED(64))a14)[ 7] = b15.i[12]; + ((int * ALIGNED(64))a14)[ 8] = b08.i[13]; + ((int * ALIGNED(64))a14)[ 9] = b09.i[13]; + ((int * ALIGNED(64))a14)[10] = b10.i[13]; + ((int * ALIGNED(64))a14)[11] = b11.i[13]; + ((int * ALIGNED(64))a14)[12] = b12.i[13]; + ((int * ALIGNED(64))a14)[13] = b13.i[13]; + ((int * ALIGNED(64))a14)[14] = b14.i[13]; + ((int * ALIGNED(64))a14)[15] = b15.i[13]; + + ((int * ALIGNED(64))a15)[ 0] = b08.i[14]; + ((int * ALIGNED(64))a15)[ 1] = b09.i[14]; + ((int * ALIGNED(64))a15)[ 2] = b10.i[14]; + ((int * ALIGNED(64))a15)[ 3] = b11.i[14]; + ((int * ALIGNED(64))a15)[ 4] = b12.i[14]; + ((int * ALIGNED(64))a15)[ 5] = b13.i[14]; + ((int * ALIGNED(64))a15)[ 6] = b14.i[14]; + ((int * ALIGNED(64))a15)[ 7] = b15.i[14]; + ((int * ALIGNED(64))a15)[ 8] = b08.i[15]; + ((int * ALIGNED(64))a15)[ 9] = b09.i[15]; + ((int * ALIGNED(64))a15)[10] = b10.i[15]; + ((int * ALIGNED(64))a15)[11] = b11.i[15]; + ((int * ALIGNED(64))a15)[12] = b12.i[15]; + ((int * ALIGNED(64))a15)[13] = b13.i[15]; + ((int * ALIGNED(64))a15)[14] = b14.i[15]; + ((int * ALIGNED(64))a15)[15] = b15.i[15]; + } + + ////////////// + // v16int class + + class v16int : public v16 + { + // v16int prefix unary operator friends + + friend inline v16int operator +( const v16int & a ) ALWAYS_INLINE; + friend inline v16int operator -( const v16int & a ) ALWAYS_INLINE; + friend inline v16int operator ~( const v16int & a ) ALWAYS_INLINE; + friend inline v16int operator !( const v16int & a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v16int prefix increment / decrement operator friends + + friend inline v16int operator ++( v16int & a ) ALWAYS_INLINE; + friend inline v16int operator --( v16int & a ) ALWAYS_INLINE; + + // v16int postfix increment / decrement operator friends + + friend inline v16int operator ++( v16int & a, int ) ALWAYS_INLINE; + friend inline v16int operator --( v16int & a, int ) ALWAYS_INLINE; + + // v16int binary operator friends + + friend inline v16int operator +( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator -( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator *( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator /( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator %( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator ^( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator &( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator |( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator <<( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator >>( const v16int &a, const v16int &b ) ALWAYS_INLINE; + + // v16int logical operator friends + + friend inline v16int operator <( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator >( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator ==( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator !=( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator <=( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator >=( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator &&( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator ||( const v16int &a, const v16int &b 
) ALWAYS_INLINE; + + // v16int miscellaneous friends + + friend inline v16int abs( const v16int &a ) ALWAYS_INLINE; + friend inline v16 czero( const v16int &c, const v16 &a ) ALWAYS_INLINE; + friend inline v16 notczero( const v16int &c, const v16 &a ) ALWAYS_INLINE; + // FIXME: cswap, notcswap! + friend inline v16 merge( const v16int &c, const v16 &t, const v16 &f ) ALWAYS_INLINE; + + // v16float unary operator friends + + friend inline v16int operator !( const v16float & a ) ALWAYS_INLINE; + + // v16float logical operator friends + + friend inline v16int operator <( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator >( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator ==( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator !=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator <=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator >=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator &&( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator ||( const v16float &a, const v16float &b ) ALWAYS_INLINE; + + // v16float miscellaneous friends + + friend inline v16float clear_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline v16float set_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline v16float toggle_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + + public: + + // v16int constructors / destructors + + v16int() {} // Default constructor + + v16int( const v16int &a ) // Copy constructor + { + i[ 0] = a.i[ 0]; i[ 1] = a.i[ 1]; i[ 2] = a.i[ 2]; i[ 3] = a.i[ 3]; + i[ 4] = a.i[ 4]; i[ 5] = a.i[ 5]; i[ 6] = a.i[ 6]; i[ 7] = a.i[ 7]; + i[ 8] = a.i[ 8]; i[ 9] = a.i[ 9]; i[10] = a.i[10]; i[11] = a.i[11]; + i[12] = a.i[12]; i[13] = a.i[13]; i[14] = a.i[14]; i[15] = a.i[15]; + } + + v16int( const v16 &a ) // Init from mixed + { + i[ 0] = a.i[ 0]; i[ 1] = a.i[ 1]; i[ 2] = a.i[ 2]; i[ 3] = a.i[ 3]; + i[ 4] = a.i[ 4]; i[ 5] = a.i[ 5]; i[ 6] = a.i[ 6]; i[ 7] = a.i[ 7]; + i[ 8] = a.i[ 8]; i[ 9] = a.i[ 9]; i[10] = a.i[10]; i[11] = a.i[11]; + i[12] = a.i[12]; i[13] = a.i[13]; i[14] = a.i[14]; i[15] = a.i[15]; + } + + v16int( int a ) // Init from scalar + { + i[ 0] = a; i[ 1] = a; i[ 2] = a; i[ 3] = a; + i[ 4] = a; i[ 5] = a; i[ 6] = a; i[ 7] = a; + i[ 8] = a; i[ 9] = a; i[10] = a; i[11] = a; + i[12] = a; i[13] = a; i[14] = a; i[15] = a; + } + + v16int( int i00, int i01, int i02, int i03, + int i04, int i05, int i06, int i07, + int i08, int i09, int i10, int i11, + int i12, int i13, int i14, int i15 ) // Init from scalars + { + i[ 0] = i00; i[ 1] = i01; i[ 2] = i02; i[ 3] = i03; + i[ 4] = i04; i[ 5] = i05; i[ 6] = i06; i[ 7] = i07; + i[ 8] = i08; i[ 9] = i09; i[10] = i10; i[11] = i11; + i[12] = i12; i[13] = i13; i[14] = i14; i[15] = i15; + } + + ~v16int() {} // Destructor + + // v16int assignment operators + +# define ASSIGN(op) \ + inline v16int &operator op( const v16int &b ) \ + { \ + i[ 0] op b.i[ 0]; \ + i[ 1] op b.i[ 1]; \ + i[ 2] op b.i[ 2]; \ + i[ 3] op b.i[ 3]; \ + i[ 4] op b.i[ 4]; \ + i[ 5] op b.i[ 5]; \ + i[ 6] op b.i[ 6]; \ + i[ 7] op b.i[ 7]; \ + i[ 8] op b.i[ 8]; \ + i[ 9] op b.i[ 9]; \ + i[10] op b.i[10]; \ + i[11] op b.i[11]; \ + i[12] op b.i[12]; \ + i[13] op b.i[13]; \ + i[14] op b.i[14]; \ + i[15] op b.i[15]; \ + return *this; \ + } + + ASSIGN( =) + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) 
+ ASSIGN(%=) + ASSIGN(^=) + ASSIGN(&=) + ASSIGN(|=) + ASSIGN(<<=) + ASSIGN(>>=) + +# undef ASSIGN + + // v16int member access operator + + inline int &operator []( int n ) + { + return i[n]; + } + + inline int operator ()( int n ) + { + return i[n]; + } + }; + + // v16int prefix unary operators + +# define PREFIX_UNARY(op) \ + inline v16int operator op( const v16int & a ) \ + { \ + v16int b; \ + b.i[ 0] = (op a.i[ 0]); \ + b.i[ 1] = (op a.i[ 1]); \ + b.i[ 2] = (op a.i[ 2]); \ + b.i[ 3] = (op a.i[ 3]); \ + b.i[ 4] = (op a.i[ 4]); \ + b.i[ 5] = (op a.i[ 5]); \ + b.i[ 6] = (op a.i[ 6]); \ + b.i[ 7] = (op a.i[ 7]); \ + b.i[ 8] = (op a.i[ 8]); \ + b.i[ 9] = (op a.i[ 9]); \ + b.i[10] = (op a.i[10]); \ + b.i[11] = (op a.i[11]); \ + b.i[12] = (op a.i[12]); \ + b.i[13] = (op a.i[13]); \ + b.i[14] = (op a.i[14]); \ + b.i[15] = (op a.i[15]); \ + return b; \ + } + + PREFIX_UNARY(+) + PREFIX_UNARY(-) + + inline v16int operator !( const v16int & a ) + { + v16int b; + b.i[ 0] = - ( !a.i[ 0] ); + b.i[ 1] = - ( !a.i[ 1] ); + b.i[ 2] = - ( !a.i[ 2] ); + b.i[ 3] = - ( !a.i[ 3] ); + b.i[ 4] = - ( !a.i[ 4] ); + b.i[ 5] = - ( !a.i[ 5] ); + b.i[ 6] = - ( !a.i[ 6] ); + b.i[ 7] = - ( !a.i[ 7] ); + b.i[ 8] = - ( !a.i[ 8] ); + b.i[ 9] = - ( !a.i[ 9] ); + b.i[10] = - ( !a.i[10] ); + b.i[11] = - ( !a.i[11] ); + b.i[12] = - ( !a.i[12] ); + b.i[13] = - ( !a.i[13] ); + b.i[14] = - ( !a.i[14] ); + b.i[15] = - ( !a.i[15] ); + return b; + } + + PREFIX_UNARY(~) + +# undef PREFIX_UNARY + + // v16int prefix increment / decrement + +# define PREFIX_INCDEC(op) \ + inline v16int operator op( v16int & a ) \ + { \ + v16int b; \ + b.i[ 0] = ( op a.i[ 0] ); \ + b.i[ 1] = ( op a.i[ 1] ); \ + b.i[ 2] = ( op a.i[ 2] ); \ + b.i[ 3] = ( op a.i[ 3] ); \ + b.i[ 4] = ( op a.i[ 4] ); \ + b.i[ 5] = ( op a.i[ 5] ); \ + b.i[ 6] = ( op a.i[ 6] ); \ + b.i[ 7] = ( op a.i[ 7] ); \ + b.i[ 8] = ( op a.i[ 8] ); \ + b.i[ 9] = ( op a.i[ 9] ); \ + b.i[10] = ( op a.i[10] ); \ + b.i[11] = ( op a.i[11] ); \ + b.i[12] = ( op a.i[12] ); \ + b.i[13] = ( op a.i[13] ); \ + b.i[14] = ( op a.i[14] ); \ + b.i[15] = ( op a.i[15] ); \ + return b; \ + } + + PREFIX_INCDEC(++) + PREFIX_INCDEC(--) + +# undef PREFIX_INCDEC + + // v16int postfix increment / decrement + +# define POSTFIX_INCDEC(op) \ + inline v16int operator op( v16int & a, int ) \ + { \ + v16int b; \ + b.i[ 0] = ( a.i[ 0] op ); \ + b.i[ 1] = ( a.i[ 1] op ); \ + b.i[ 2] = ( a.i[ 2] op ); \ + b.i[ 3] = ( a.i[ 3] op ); \ + b.i[ 4] = ( a.i[ 4] op ); \ + b.i[ 5] = ( a.i[ 5] op ); \ + b.i[ 6] = ( a.i[ 6] op ); \ + b.i[ 7] = ( a.i[ 7] op ); \ + b.i[ 8] = ( a.i[ 8] op ); \ + b.i[ 9] = ( a.i[ 9] op ); \ + b.i[10] = ( a.i[10] op ); \ + b.i[11] = ( a.i[11] op ); \ + b.i[12] = ( a.i[12] op ); \ + b.i[13] = ( a.i[13] op ); \ + b.i[14] = ( a.i[14] op ); \ + b.i[15] = ( a.i[15] op ); \ + return b; \ + } + + POSTFIX_INCDEC(++) + POSTFIX_INCDEC(--) + +# undef POSTFIX_INCDEC + + // v16int binary operators + +# define BINARY(op) \ + inline v16int operator op( const v16int &a, const v16int &b ) \ + { \ + v16int c; \ + c.i[ 0] = a.i[ 0] op b.i[ 0]; \ + c.i[ 1] = a.i[ 1] op b.i[ 1]; \ + c.i[ 2] = a.i[ 2] op b.i[ 2]; \ + c.i[ 3] = a.i[ 3] op b.i[ 3]; \ + c.i[ 4] = a.i[ 4] op b.i[ 4]; \ + c.i[ 5] = a.i[ 5] op b.i[ 5]; \ + c.i[ 6] = a.i[ 6] op b.i[ 6]; \ + c.i[ 7] = a.i[ 7] op b.i[ 7]; \ + c.i[ 8] = a.i[ 8] op b.i[ 8]; \ + c.i[ 9] = a.i[ 9] op b.i[ 9]; \ + c.i[10] = a.i[10] op b.i[10]; \ + c.i[11] = a.i[11] op b.i[11]; \ + c.i[12] = a.i[12] op b.i[12]; \ + c.i[13] = a.i[13] op b.i[13]; \ + c.i[14] = a.i[14] op b.i[14]; \ + c.i[15] 
= a.i[15] op b.i[15]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + BINARY(%) + BINARY(^) + BINARY(&) + BINARY(|) + BINARY(<<) + BINARY(>>) + +# undef BINARY + + // v16int logical operators + +# define LOGICAL(op) \ + inline v16int operator op( const v16int &a, const v16int &b ) \ + { \ + v16int c; \ + c.i[ 0] = - ( a.i[ 0] op b.i[ 0] ); \ + c.i[ 1] = - ( a.i[ 1] op b.i[ 1] ); \ + c.i[ 2] = - ( a.i[ 2] op b.i[ 2] ); \ + c.i[ 3] = - ( a.i[ 3] op b.i[ 3] ); \ + c.i[ 4] = - ( a.i[ 4] op b.i[ 4] ); \ + c.i[ 5] = - ( a.i[ 5] op b.i[ 5] ); \ + c.i[ 6] = - ( a.i[ 6] op b.i[ 6] ); \ + c.i[ 7] = - ( a.i[ 7] op b.i[ 7] ); \ + c.i[ 8] = - ( a.i[ 8] op b.i[ 8] ); \ + c.i[ 9] = - ( a.i[ 9] op b.i[ 9] ); \ + c.i[10] = - ( a.i[10] op b.i[10] ); \ + c.i[11] = - ( a.i[11] op b.i[11] ); \ + c.i[12] = - ( a.i[12] op b.i[12] ); \ + c.i[13] = - ( a.i[13] op b.i[13] ); \ + c.i[14] = - ( a.i[14] op b.i[14] ); \ + c.i[15] = - ( a.i[15] op b.i[15] ); \ + return c; \ + } + + LOGICAL(<) + LOGICAL(>) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v16int miscellaneous functions + + inline v16int abs( const v16int &a ) + { + v16int b; + + b.i[ 0] = ( a.i[ 0] >= 0 ) ? a.i[ 0] : - a.i[ 0]; + b.i[ 1] = ( a.i[ 1] >= 0 ) ? a.i[ 1] : - a.i[ 1]; + b.i[ 2] = ( a.i[ 2] >= 0 ) ? a.i[ 2] : - a.i[ 2]; + b.i[ 3] = ( a.i[ 3] >= 0 ) ? a.i[ 3] : - a.i[ 3]; + b.i[ 4] = ( a.i[ 4] >= 0 ) ? a.i[ 4] : - a.i[ 4]; + b.i[ 5] = ( a.i[ 5] >= 0 ) ? a.i[ 5] : - a.i[ 5]; + b.i[ 6] = ( a.i[ 6] >= 0 ) ? a.i[ 6] : - a.i[ 6]; + b.i[ 7] = ( a.i[ 7] >= 0 ) ? a.i[ 7] : - a.i[ 7]; + b.i[ 8] = ( a.i[ 8] >= 0 ) ? a.i[ 8] : - a.i[ 8]; + b.i[ 9] = ( a.i[ 9] >= 0 ) ? a.i[ 9] : - a.i[ 9]; + b.i[10] = ( a.i[10] >= 0 ) ? a.i[10] : - a.i[10]; + b.i[11] = ( a.i[11] >= 0 ) ? a.i[11] : - a.i[11]; + b.i[12] = ( a.i[12] >= 0 ) ? a.i[12] : - a.i[12]; + b.i[13] = ( a.i[13] >= 0 ) ? a.i[13] : - a.i[13]; + b.i[14] = ( a.i[14] >= 0 ) ? a.i[14] : - a.i[14]; + b.i[15] = ( a.i[15] >= 0 ) ? 
a.i[15] : - a.i[15]; + + return b; + } + + inline v16 czero( const v16int &c, const v16 &a ) + { + v16 b; + + b.i[ 0] = a.i[ 0] & ~c.i[ 0]; + b.i[ 1] = a.i[ 1] & ~c.i[ 1]; + b.i[ 2] = a.i[ 2] & ~c.i[ 2]; + b.i[ 3] = a.i[ 3] & ~c.i[ 3]; + b.i[ 4] = a.i[ 4] & ~c.i[ 4]; + b.i[ 5] = a.i[ 5] & ~c.i[ 5]; + b.i[ 6] = a.i[ 6] & ~c.i[ 6]; + b.i[ 7] = a.i[ 7] & ~c.i[ 7]; + b.i[ 8] = a.i[ 8] & ~c.i[ 8]; + b.i[ 9] = a.i[ 9] & ~c.i[ 9]; + b.i[10] = a.i[10] & ~c.i[10]; + b.i[11] = a.i[11] & ~c.i[11]; + b.i[12] = a.i[12] & ~c.i[12]; + b.i[13] = a.i[13] & ~c.i[13]; + b.i[14] = a.i[14] & ~c.i[14]; + b.i[15] = a.i[15] & ~c.i[15]; + + return b; + } + + inline v16 notczero( const v16int &c, const v16 &a ) + { + v16 b; + + b.i[ 0] = a.i[ 0] & c.i[ 0]; + b.i[ 1] = a.i[ 1] & c.i[ 1]; + b.i[ 2] = a.i[ 2] & c.i[ 2]; + b.i[ 3] = a.i[ 3] & c.i[ 3]; + b.i[ 4] = a.i[ 4] & c.i[ 4]; + b.i[ 5] = a.i[ 5] & c.i[ 5]; + b.i[ 6] = a.i[ 6] & c.i[ 6]; + b.i[ 7] = a.i[ 7] & c.i[ 7]; + b.i[ 8] = a.i[ 8] & c.i[ 8]; + b.i[ 9] = a.i[ 9] & c.i[ 9]; + b.i[10] = a.i[10] & c.i[10]; + b.i[11] = a.i[11] & c.i[11]; + b.i[12] = a.i[12] & c.i[12]; + b.i[13] = a.i[13] & c.i[13]; + b.i[14] = a.i[14] & c.i[14]; + b.i[15] = a.i[15] & c.i[15]; + + return b; + } + + inline v16 merge( const v16int &c, const v16 &t, const v16 &f ) + { + v16 m; + + m.i[ 0] = ( f.i[ 0] & ~c.i[ 0] ) | ( t.i[ 0] & c.i[ 0] ); + m.i[ 1] = ( f.i[ 1] & ~c.i[ 1] ) | ( t.i[ 1] & c.i[ 1] ); + m.i[ 2] = ( f.i[ 2] & ~c.i[ 2] ) | ( t.i[ 2] & c.i[ 2] ); + m.i[ 3] = ( f.i[ 3] & ~c.i[ 3] ) | ( t.i[ 3] & c.i[ 3] ); + m.i[ 4] = ( f.i[ 4] & ~c.i[ 4] ) | ( t.i[ 4] & c.i[ 4] ); + m.i[ 5] = ( f.i[ 5] & ~c.i[ 5] ) | ( t.i[ 5] & c.i[ 5] ); + m.i[ 6] = ( f.i[ 6] & ~c.i[ 6] ) | ( t.i[ 6] & c.i[ 6] ); + m.i[ 7] = ( f.i[ 7] & ~c.i[ 7] ) | ( t.i[ 7] & c.i[ 7] ); + m.i[ 8] = ( f.i[ 8] & ~c.i[ 8] ) | ( t.i[ 8] & c.i[ 8] ); + m.i[ 9] = ( f.i[ 9] & ~c.i[ 9] ) | ( t.i[ 9] & c.i[ 9] ); + m.i[10] = ( f.i[10] & ~c.i[10] ) | ( t.i[10] & c.i[10] ); + m.i[11] = ( f.i[11] & ~c.i[11] ) | ( t.i[11] & c.i[11] ); + m.i[12] = ( f.i[12] & ~c.i[12] ) | ( t.i[12] & c.i[12] ); + m.i[13] = ( f.i[13] & ~c.i[13] ) | ( t.i[13] & c.i[13] ); + m.i[14] = ( f.i[14] & ~c.i[14] ) | ( t.i[14] & c.i[14] ); + m.i[15] = ( f.i[15] & ~c.i[15] ) | ( t.i[15] & c.i[15] ); + + return m; + } + + //////////////// + // v16float class + + class v16float : public v16 + { + // v16float prefix unary operator friends + + friend inline v16float operator +( const v16float &a ) ALWAYS_INLINE; + friend inline v16float operator -( const v16float &a ) ALWAYS_INLINE; + friend inline v16float operator ~( const v16float &a ) ALWAYS_INLINE; + friend inline v16int operator !( const v16float &a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v16float prefix increment / decrement operator friends + + friend inline v16float operator ++( v16float &a ) ALWAYS_INLINE; + friend inline v16float operator --( v16float &a ) ALWAYS_INLINE; + + // v16float postfix increment / decrement operator friends + + friend inline v16float operator ++( v16float &a, int ) ALWAYS_INLINE; + friend inline v16float operator --( v16float &a, int ) ALWAYS_INLINE; + + // v16float binary operator friends + + friend inline v16float operator +( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16float operator -( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16float operator *( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16float operator /( const v16float &a, const 
v16float &b ) ALWAYS_INLINE; + + // v16float logical operator friends + + friend inline v16int operator <( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator >( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator ==( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator !=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator <=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator >=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator &&( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator ||( const v16float &a, const v16float &b ) ALWAYS_INLINE; + + // v16float math library friends + +# define CMATH_FR1(fn) friend inline v16float fn( const v16float &a ) ALWAYS_INLINE +# define CMATH_FR2(fn) friend inline v16float fn( const v16float &a, \ + const v16float &b ) ALWAYS_INLINE + + CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); + CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); + CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); + CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); + CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); + + CMATH_FR2(copysign); + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v16float miscellaneous friends + + friend inline v16float rsqrt_approx( const v16float &a ) ALWAYS_INLINE; + friend inline v16float rsqrt ( const v16float &a ) ALWAYS_INLINE; + friend inline v16float rcp_approx( const v16float &a ) ALWAYS_INLINE; + friend inline v16float rcp ( const v16float &a ) ALWAYS_INLINE; + friend inline v16float fma ( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; + friend inline v16float fms ( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; + friend inline v16float fnms( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; + friend inline v16float clear_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline v16float set_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline v16float toggle_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline void increment_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; + friend inline void decrement_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; + friend inline void scale_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; + + public: + + // v16float constructors / destructors + + v16float() {} // Default constructor + + v16float( const v16float &a ) // Copy constructor + { + f[ 0] = a.f[ 0]; f[ 1] = a.f[ 1]; f[ 2] = a.f[ 2]; f[ 3] = a.f[ 3]; + f[ 4] = a.f[ 4]; f[ 5] = a.f[ 5]; f[ 6] = a.f[ 6]; f[ 7] = a.f[ 7]; + f[ 8] = a.f[ 8]; f[ 9] = a.f[ 9]; f[10] = a.f[10]; f[11] = a.f[11]; + f[12] = a.f[12]; f[13] = a.f[13]; f[14] = a.f[14]; f[15] = a.f[15]; + } + + v16float( const v16 &a ) // Init from mixed + { + f[ 0] = a.f[ 0]; f[ 1] = a.f[ 1]; f[ 2] = a.f[ 2]; f[ 3] = a.f[ 3]; + f[ 4] = a.f[ 4]; f[ 5] = a.f[ 5]; f[ 6] = a.f[ 6]; f[ 7] = a.f[ 7]; + f[ 8] = a.f[ 8]; f[ 9] = a.f[ 9]; f[10] = a.f[10]; f[11] = a.f[11]; + f[12] = a.f[12]; f[13] = a.f[13]; f[14] = a.f[14]; f[15] = a.f[15]; + } + + v16float( float a ) // Init from scalar + { + f[ 0] = a; f[ 1] = a; f[ 2] = a; f[ 3] = a; + f[ 4] = a; f[ 5] = a; f[ 6] = a; f[ 7] = a; + f[ 8] = a; f[ 9] = a; f[10] = a; f[11] = 
a; + f[12] = a; f[13] = a; f[14] = a; f[15] = a; + } + + v16float( float f00, float f01, float f02, float f03, + float f04, float f05, float f06, float f07, + float f08, float f09, float f10, float f11, + float f12, float f13, float f14, float f15 ) // Init from scalars + { + f[ 0] = f00; f[ 1] = f01; f[ 2] = f02; f[ 3] = f03; + f[ 4] = f04; f[ 5] = f05; f[ 6] = f06; f[ 7] = f07; + f[ 8] = f08; f[ 9] = f09; f[10] = f10; f[11] = f11; + f[12] = f12; f[13] = f13; f[14] = f14; f[15] = f15; + } + + ~v16float() {} // Destructor + + // v16float assignment operators + +# define ASSIGN(op) \ + inline v16float &operator op( const v16float &b ) \ + { \ + f[ 0] op b.f[ 0]; \ + f[ 1] op b.f[ 1]; \ + f[ 2] op b.f[ 2]; \ + f[ 3] op b.f[ 3]; \ + f[ 4] op b.f[ 4]; \ + f[ 5] op b.f[ 5]; \ + f[ 6] op b.f[ 6]; \ + f[ 7] op b.f[ 7]; \ + f[ 8] op b.f[ 8]; \ + f[ 9] op b.f[ 9]; \ + f[10] op b.f[10]; \ + f[11] op b.f[11]; \ + f[12] op b.f[12]; \ + f[13] op b.f[13]; \ + f[14] op b.f[14]; \ + f[15] op b.f[15]; \ + return *this; \ + } + + ASSIGN(=) + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + +# undef ASSIGN + + // v16float member access operator + + inline float &operator []( int n ) + { + return f[n]; + } + + inline float operator ()( int n ) + { + return f[n]; + } + }; + + // v16float prefix unary operators + + inline v16float operator +( const v16float &a ) + { + v16float b; + + b.f[ 0] = +a.f[ 0]; + b.f[ 1] = +a.f[ 1]; + b.f[ 2] = +a.f[ 2]; + b.f[ 3] = +a.f[ 3]; + b.f[ 4] = +a.f[ 4]; + b.f[ 5] = +a.f[ 5]; + b.f[ 6] = +a.f[ 6]; + b.f[ 7] = +a.f[ 7]; + b.f[ 8] = +a.f[ 8]; + b.f[ 9] = +a.f[ 9]; + b.f[10] = +a.f[10]; + b.f[11] = +a.f[11]; + b.f[12] = +a.f[12]; + b.f[13] = +a.f[13]; + b.f[14] = +a.f[14]; + b.f[15] = +a.f[15]; + + return b; + } + + inline v16float operator -( const v16float &a ) + { + v16float b; + + b.f[ 0] = -a.f[ 0]; + b.f[ 1] = -a.f[ 1]; + b.f[ 2] = -a.f[ 2]; + b.f[ 3] = -a.f[ 3]; + b.f[ 4] = -a.f[ 4]; + b.f[ 5] = -a.f[ 5]; + b.f[ 6] = -a.f[ 6]; + b.f[ 7] = -a.f[ 7]; + b.f[ 8] = -a.f[ 8]; + b.f[ 9] = -a.f[ 9]; + b.f[10] = -a.f[10]; + b.f[11] = -a.f[11]; + b.f[12] = -a.f[12]; + b.f[13] = -a.f[13]; + b.f[14] = -a.f[14]; + b.f[15] = -a.f[15]; + + return b; + } + + inline v16int operator !( const v16float &a ) + { + v16int b; + + b.i[ 0] = a.i[ 0] ? 0 : -1; + b.i[ 1] = a.i[ 1] ? 0 : -1; + b.i[ 2] = a.i[ 2] ? 0 : -1; + b.i[ 3] = a.i[ 3] ? 0 : -1; + b.i[ 4] = a.i[ 4] ? 0 : -1; + b.i[ 5] = a.i[ 5] ? 0 : -1; + b.i[ 6] = a.i[ 6] ? 0 : -1; + b.i[ 7] = a.i[ 7] ? 0 : -1; + b.i[ 8] = a.i[ 8] ? 0 : -1; + b.i[ 9] = a.i[ 9] ? 0 : -1; + b.i[10] = a.i[10] ? 0 : -1; + b.i[11] = a.i[11] ? 0 : -1; + b.i[12] = a.i[12] ? 0 : -1; + b.i[13] = a.i[13] ? 0 : -1; + b.i[14] = a.i[14] ? 0 : -1; + b.i[15] = a.i[15] ? 
0 : -1; + + return b; + } + + // v16float prefix increment / decrement operators + + inline v16float operator ++( v16float &a ) + { + v16float b; + + b.f[ 0] = ++a.f[ 0]; + b.f[ 1] = ++a.f[ 1]; + b.f[ 2] = ++a.f[ 2]; + b.f[ 3] = ++a.f[ 3]; + b.f[ 4] = ++a.f[ 4]; + b.f[ 5] = ++a.f[ 5]; + b.f[ 6] = ++a.f[ 6]; + b.f[ 7] = ++a.f[ 7]; + b.f[ 8] = ++a.f[ 8]; + b.f[ 9] = ++a.f[ 9]; + b.f[10] = ++a.f[10]; + b.f[11] = ++a.f[11]; + b.f[12] = ++a.f[12]; + b.f[13] = ++a.f[13]; + b.f[14] = ++a.f[14]; + b.f[15] = ++a.f[15]; + + return b; + } + + inline v16float operator --( v16float &a ) + { + v16float b; + + b.f[ 0] = --a.f[ 0]; + b.f[ 1] = --a.f[ 1]; + b.f[ 2] = --a.f[ 2]; + b.f[ 3] = --a.f[ 3]; + b.f[ 4] = --a.f[ 4]; + b.f[ 5] = --a.f[ 5]; + b.f[ 6] = --a.f[ 6]; + b.f[ 7] = --a.f[ 7]; + b.f[ 8] = --a.f[ 8]; + b.f[ 9] = --a.f[ 9]; + b.f[10] = --a.f[10]; + b.f[11] = --a.f[11]; + b.f[12] = --a.f[12]; + b.f[13] = --a.f[13]; + b.f[14] = --a.f[14]; + b.f[15] = --a.f[15]; + + return b; + } + + // v16float postfix increment / decrement operators + + inline v16float operator ++( v16float &a, int ) + { + v16float b; + + b.f[ 0] = a.f[ 0]++; + b.f[ 1] = a.f[ 1]++; + b.f[ 2] = a.f[ 2]++; + b.f[ 3] = a.f[ 3]++; + b.f[ 4] = a.f[ 4]++; + b.f[ 5] = a.f[ 5]++; + b.f[ 6] = a.f[ 6]++; + b.f[ 7] = a.f[ 7]++; + b.f[ 8] = a.f[ 8]++; + b.f[ 9] = a.f[ 9]++; + b.f[10] = a.f[10]++; + b.f[11] = a.f[11]++; + b.f[12] = a.f[12]++; + b.f[13] = a.f[13]++; + b.f[14] = a.f[14]++; + b.f[15] = a.f[15]++; + + return b; + } + + inline v16float operator --( v16float &a, int ) + { + v16float b; + + b.f[ 0] = a.f[ 0]--; + b.f[ 1] = a.f[ 1]--; + b.f[ 2] = a.f[ 2]--; + b.f[ 3] = a.f[ 3]--; + b.f[ 4] = a.f[ 4]--; + b.f[ 5] = a.f[ 5]--; + b.f[ 6] = a.f[ 6]--; + b.f[ 7] = a.f[ 7]--; + b.f[ 8] = a.f[ 8]--; + b.f[ 9] = a.f[ 9]--; + b.f[10] = a.f[10]--; + b.f[11] = a.f[11]--; + b.f[12] = a.f[12]--; + b.f[13] = a.f[13]--; + b.f[14] = a.f[14]--; + b.f[15] = a.f[15]--; + + return b; + } + + // v16float binary operators + +# define BINARY(op) \ + inline v16float operator op( const v16float &a, const v16float &b ) \ + { \ + v16float c; \ + c.f[ 0] = a.f[ 0] op b.f[ 0]; \ + c.f[ 1] = a.f[ 1] op b.f[ 1]; \ + c.f[ 2] = a.f[ 2] op b.f[ 2]; \ + c.f[ 3] = a.f[ 3] op b.f[ 3]; \ + c.f[ 4] = a.f[ 4] op b.f[ 4]; \ + c.f[ 5] = a.f[ 5] op b.f[ 5]; \ + c.f[ 6] = a.f[ 6] op b.f[ 6]; \ + c.f[ 7] = a.f[ 7] op b.f[ 7]; \ + c.f[ 8] = a.f[ 8] op b.f[ 8]; \ + c.f[ 9] = a.f[ 9] op b.f[ 9]; \ + c.f[10] = a.f[10] op b.f[10]; \ + c.f[11] = a.f[11] op b.f[11]; \ + c.f[12] = a.f[12] op b.f[12]; \ + c.f[13] = a.f[13] op b.f[13]; \ + c.f[14] = a.f[14] op b.f[14]; \ + c.f[15] = a.f[15] op b.f[15]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + +# undef BINARY + + // v16float logical operators + +# define LOGICAL(op) \ + inline v16int operator op( const v16float &a, const v16float &b ) \ + { \ + v16int c; \ + c.i[ 0] = -( a.f[ 0] op b.f[ 0] ); \ + c.i[ 1] = -( a.f[ 1] op b.f[ 1] ); \ + c.i[ 2] = -( a.f[ 2] op b.f[ 2] ); \ + c.i[ 3] = -( a.f[ 3] op b.f[ 3] ); \ + c.i[ 4] = -( a.f[ 4] op b.f[ 4] ); \ + c.i[ 5] = -( a.f[ 5] op b.f[ 5] ); \ + c.i[ 6] = -( a.f[ 6] op b.f[ 6] ); \ + c.i[ 7] = -( a.f[ 7] op b.f[ 7] ); \ + c.i[ 8] = -( a.f[ 8] op b.f[ 8] ); \ + c.i[ 9] = -( a.f[ 9] op b.f[ 9] ); \ + c.i[10] = -( a.f[10] op b.f[10] ); \ + c.i[11] = -( a.f[11] op b.f[11] ); \ + c.i[12] = -( a.f[12] op b.f[12] ); \ + c.i[13] = -( a.f[13] op b.f[13] ); \ + c.i[14] = -( a.f[14] op b.f[14] ); \ + c.i[15] = -( a.f[15] op b.f[15] ); \ + return c; \ + } + + LOGICAL(< ) + 
LOGICAL(> ) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v16float math library functions + +# define CMATH_FR1(fn) \ + inline v16float fn( const v16float &a ) \ + { \ + v16float b; \ + b.f[ 0] = ::fn( a.f[ 0] ); \ + b.f[ 1] = ::fn( a.f[ 1] ); \ + b.f[ 2] = ::fn( a.f[ 2] ); \ + b.f[ 3] = ::fn( a.f[ 3] ); \ + b.f[ 4] = ::fn( a.f[ 4] ); \ + b.f[ 5] = ::fn( a.f[ 5] ); \ + b.f[ 6] = ::fn( a.f[ 6] ); \ + b.f[ 7] = ::fn( a.f[ 7] ); \ + b.f[ 8] = ::fn( a.f[ 8] ); \ + b.f[ 9] = ::fn( a.f[ 9] ); \ + b.f[10] = ::fn( a.f[10] ); \ + b.f[11] = ::fn( a.f[11] ); \ + b.f[12] = ::fn( a.f[12] ); \ + b.f[13] = ::fn( a.f[13] ); \ + b.f[14] = ::fn( a.f[14] ); \ + b.f[15] = ::fn( a.f[15] ); \ + return b; \ + } + +# define CMATH_FR2(fn) \ + inline v16float fn( const v16float &a, const v16float &b ) \ + { \ + v16float c; \ + c.f[ 0] = ::fn( a.f[ 0], b.f[ 0] ); \ + c.f[ 1] = ::fn( a.f[ 1], b.f[ 1] ); \ + c.f[ 2] = ::fn( a.f[ 2], b.f[ 2] ); \ + c.f[ 3] = ::fn( a.f[ 3], b.f[ 3] ); \ + c.f[ 4] = ::fn( a.f[ 4], b.f[ 4] ); \ + c.f[ 5] = ::fn( a.f[ 5], b.f[ 5] ); \ + c.f[ 6] = ::fn( a.f[ 6], b.f[ 6] ); \ + c.f[ 7] = ::fn( a.f[ 7], b.f[ 7] ); \ + c.f[ 8] = ::fn( a.f[ 8], b.f[ 8] ); \ + c.f[ 9] = ::fn( a.f[ 9], b.f[ 9] ); \ + c.f[10] = ::fn( a.f[10], b.f[10] ); \ + c.f[11] = ::fn( a.f[11], b.f[11] ); \ + c.f[12] = ::fn( a.f[12], b.f[12] ); \ + c.f[13] = ::fn( a.f[13], b.f[13] ); \ + c.f[14] = ::fn( a.f[14], b.f[14] ); \ + c.f[15] = ::fn( a.f[15], b.f[15] ); \ + return c; \ + } + + CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) + CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) + CMATH_FR1(fabs) CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) + CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) + CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) + + inline v16float copysign( const v16float &a, const v16float &b ) + { + v16float c; + float t; + + t = ::fabs( a.f[ 0] ); + if( b.f[ 0] < 0 ) t = -t; + c.f[ 0] = t; + + t = ::fabs( a.f[ 1] ); + if( b.f[ 1] < 0 ) t = -t; + c.f[ 1] = t; + + t = ::fabs( a.f[ 2] ); + if( b.f[ 2] < 0 ) t = -t; + c.f[ 2] = t; + + t = ::fabs( a.f[ 3] ); + if( b.f[ 3] < 0 ) t = -t; + c.f[ 3] = t; + + t = ::fabs( a.f[ 4] ); + if( b.f[ 4] < 0 ) t = -t; + c.f[ 4] = t; + + t = ::fabs( a.f[ 5] ); + if( b.f[ 5] < 0 ) t = -t; + c.f[ 5] = t; + + t = ::fabs( a.f[ 6] ); + if( b.f[ 6] < 0 ) t = -t; + c.f[ 6] = t; + + t = ::fabs( a.f[ 7] ); + if( b.f[ 7] < 0 ) t = -t; + c.f[ 7] = t; + + t = ::fabs( a.f[ 8] ); + if( b.f[ 8] < 0 ) t = -t; + c.f[ 8] = t; + + t = ::fabs( a.f[ 9] ); + if( b.f[ 9] < 0 ) t = -t; + c.f[ 9] = t; + + t = ::fabs( a.f[10] ); + if( b.f[10] < 0 ) t = -t; + c.f[10] = t; + + t = ::fabs( a.f[11] ); + if( b.f[11] < 0 ) t = -t; + c.f[11] = t; + + t = ::fabs( a.f[12] ); + if( b.f[12] < 0 ) t = -t; + c.f[12] = t; + + t = ::fabs( a.f[13] ); + if( b.f[13] < 0 ) t = -t; + c.f[13] = t; + + t = ::fabs( a.f[14] ); + if( b.f[14] < 0 ) t = -t; + c.f[14] = t; + + t = ::fabs( a.f[15] ); + if( b.f[15] < 0 ) t = -t; + c.f[15] = t; + + return c; + } + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v16float miscellaneous functions + + inline v16float rsqrt_approx( const v16float &a ) + { + v16float b; + + b.f[ 0] = ::sqrt( 1.0f/a.f[ 0] ); + b.f[ 1] = ::sqrt( 1.0f/a.f[ 1] ); + b.f[ 2] = ::sqrt( 1.0f/a.f[ 2] ); + b.f[ 3] = ::sqrt( 1.0f/a.f[ 3] ); + b.f[ 4] = ::sqrt( 1.0f/a.f[ 4] ); + b.f[ 5] = ::sqrt( 1.0f/a.f[ 5] ); + b.f[ 6] = ::sqrt( 1.0f/a.f[ 6] ); + b.f[ 7] = ::sqrt( 1.0f/a.f[ 7] ); + b.f[ 8] = ::sqrt( 1.0f/a.f[ 
8] ); + b.f[ 9] = ::sqrt( 1.0f/a.f[ 9] ); + b.f[10] = ::sqrt( 1.0f/a.f[10] ); + b.f[11] = ::sqrt( 1.0f/a.f[11] ); + b.f[12] = ::sqrt( 1.0f/a.f[12] ); + b.f[13] = ::sqrt( 1.0f/a.f[13] ); + b.f[14] = ::sqrt( 1.0f/a.f[14] ); + b.f[15] = ::sqrt( 1.0f/a.f[15] ); + + return b; + } + + inline v16float rsqrt( const v16float &a ) + { + v16float b; + + b.f[ 0] = ::sqrt( 1.0f/a.f[ 0] ); + b.f[ 1] = ::sqrt( 1.0f/a.f[ 1] ); + b.f[ 2] = ::sqrt( 1.0f/a.f[ 2] ); + b.f[ 3] = ::sqrt( 1.0f/a.f[ 3] ); + b.f[ 4] = ::sqrt( 1.0f/a.f[ 4] ); + b.f[ 5] = ::sqrt( 1.0f/a.f[ 5] ); + b.f[ 6] = ::sqrt( 1.0f/a.f[ 6] ); + b.f[ 7] = ::sqrt( 1.0f/a.f[ 7] ); + b.f[ 8] = ::sqrt( 1.0f/a.f[ 8] ); + b.f[ 9] = ::sqrt( 1.0f/a.f[ 9] ); + b.f[10] = ::sqrt( 1.0f/a.f[10] ); + b.f[11] = ::sqrt( 1.0f/a.f[11] ); + b.f[12] = ::sqrt( 1.0f/a.f[12] ); + b.f[13] = ::sqrt( 1.0f/a.f[13] ); + b.f[14] = ::sqrt( 1.0f/a.f[14] ); + b.f[15] = ::sqrt( 1.0f/a.f[15] ); + + return b; + } + + inline v16float rcp_approx( const v16float &a ) + { + v16float b; + + b.f[ 0] = 1.0f/a.f[ 0]; + b.f[ 1] = 1.0f/a.f[ 1]; + b.f[ 2] = 1.0f/a.f[ 2]; + b.f[ 3] = 1.0f/a.f[ 3]; + b.f[ 4] = 1.0f/a.f[ 4]; + b.f[ 5] = 1.0f/a.f[ 5]; + b.f[ 6] = 1.0f/a.f[ 6]; + b.f[ 7] = 1.0f/a.f[ 7]; + b.f[ 8] = 1.0f/a.f[ 8]; + b.f[ 9] = 1.0f/a.f[ 9]; + b.f[10] = 1.0f/a.f[10]; + b.f[11] = 1.0f/a.f[11]; + b.f[12] = 1.0f/a.f[12]; + b.f[13] = 1.0f/a.f[13]; + b.f[14] = 1.0f/a.f[14]; + b.f[15] = 1.0f/a.f[15]; + + return b; + } + + inline v16float rcp( const v16float &a ) + { + v16float b; + + b.f[ 0] = 1.0f/a.f[ 0]; + b.f[ 1] = 1.0f/a.f[ 1]; + b.f[ 2] = 1.0f/a.f[ 2]; + b.f[ 3] = 1.0f/a.f[ 3]; + b.f[ 4] = 1.0f/a.f[ 4]; + b.f[ 5] = 1.0f/a.f[ 5]; + b.f[ 6] = 1.0f/a.f[ 6]; + b.f[ 7] = 1.0f/a.f[ 7]; + b.f[ 8] = 1.0f/a.f[ 8]; + b.f[ 9] = 1.0f/a.f[ 9]; + b.f[10] = 1.0f/a.f[10]; + b.f[11] = 1.0f/a.f[11]; + b.f[12] = 1.0f/a.f[12]; + b.f[13] = 1.0f/a.f[13]; + b.f[14] = 1.0f/a.f[14]; + b.f[15] = 1.0f/a.f[15]; + + return b; + } + + inline v16float fma( const v16float &a, const v16float &b, const v16float &c ) + { + v16float d; + + d.f[ 0] = a.f[ 0] * b.f[ 0] + c.f[ 0]; + d.f[ 1] = a.f[ 1] * b.f[ 1] + c.f[ 1]; + d.f[ 2] = a.f[ 2] * b.f[ 2] + c.f[ 2]; + d.f[ 3] = a.f[ 3] * b.f[ 3] + c.f[ 3]; + d.f[ 4] = a.f[ 4] * b.f[ 4] + c.f[ 4]; + d.f[ 5] = a.f[ 5] * b.f[ 5] + c.f[ 5]; + d.f[ 6] = a.f[ 6] * b.f[ 6] + c.f[ 6]; + d.f[ 7] = a.f[ 7] * b.f[ 7] + c.f[ 7]; + d.f[ 8] = a.f[ 8] * b.f[ 8] + c.f[ 8]; + d.f[ 9] = a.f[ 9] * b.f[ 9] + c.f[ 9]; + d.f[10] = a.f[10] * b.f[10] + c.f[10]; + d.f[11] = a.f[11] * b.f[11] + c.f[11]; + d.f[12] = a.f[12] * b.f[12] + c.f[12]; + d.f[13] = a.f[13] * b.f[13] + c.f[13]; + d.f[14] = a.f[14] * b.f[14] + c.f[14]; + d.f[15] = a.f[15] * b.f[15] + c.f[15]; + + return d; + } + + inline v16float fms( const v16float &a, const v16float &b, const v16float &c ) + { + v16float d; + + d.f[ 0] = a.f[ 0] * b.f[ 0] - c.f[ 0]; + d.f[ 1] = a.f[ 1] * b.f[ 1] - c.f[ 1]; + d.f[ 2] = a.f[ 2] * b.f[ 2] - c.f[ 2]; + d.f[ 3] = a.f[ 3] * b.f[ 3] - c.f[ 3]; + d.f[ 4] = a.f[ 4] * b.f[ 4] - c.f[ 4]; + d.f[ 5] = a.f[ 5] * b.f[ 5] - c.f[ 5]; + d.f[ 6] = a.f[ 6] * b.f[ 6] - c.f[ 6]; + d.f[ 7] = a.f[ 7] * b.f[ 7] - c.f[ 7]; + d.f[ 8] = a.f[ 8] * b.f[ 8] - c.f[ 8]; + d.f[ 9] = a.f[ 9] * b.f[ 9] - c.f[ 9]; + d.f[10] = a.f[10] * b.f[10] - c.f[10]; + d.f[11] = a.f[11] * b.f[11] - c.f[11]; + d.f[12] = a.f[12] * b.f[12] - c.f[12]; + d.f[13] = a.f[13] * b.f[13] - c.f[13]; + d.f[14] = a.f[14] * b.f[14] - c.f[14]; + d.f[15] = a.f[15] * b.f[15] - c.f[15]; + + return d; + } + + inline v16float fnms( const v16float &a, const 
v16float &b, const v16float &c ) + { + v16float d; + + d.f[ 0] = c.f[ 0] - a.f[ 0] * b.f[ 0]; + d.f[ 1] = c.f[ 1] - a.f[ 1] * b.f[ 1]; + d.f[ 2] = c.f[ 2] - a.f[ 2] * b.f[ 2]; + d.f[ 3] = c.f[ 3] - a.f[ 3] * b.f[ 3]; + d.f[ 4] = c.f[ 4] - a.f[ 4] * b.f[ 4]; + d.f[ 5] = c.f[ 5] - a.f[ 5] * b.f[ 5]; + d.f[ 6] = c.f[ 6] - a.f[ 6] * b.f[ 6]; + d.f[ 7] = c.f[ 7] - a.f[ 7] * b.f[ 7]; + d.f[ 8] = c.f[ 8] - a.f[ 8] * b.f[ 8]; + d.f[ 9] = c.f[ 9] - a.f[ 9] * b.f[ 9]; + d.f[10] = c.f[10] - a.f[10] * b.f[10]; + d.f[11] = c.f[11] - a.f[11] * b.f[11]; + d.f[12] = c.f[12] - a.f[12] * b.f[12]; + d.f[13] = c.f[13] - a.f[13] * b.f[13]; + d.f[14] = c.f[14] - a.f[14] * b.f[14]; + d.f[15] = c.f[15] - a.f[15] * b.f[15]; + + return d; + } + + inline v16float clear_bits( const v16int &m, const v16float &a ) + { + v16float b; + + b.i[ 0] = ( ~m.i[ 0] ) & a.i[ 0]; + b.i[ 1] = ( ~m.i[ 1] ) & a.i[ 1]; + b.i[ 2] = ( ~m.i[ 2] ) & a.i[ 2]; + b.i[ 3] = ( ~m.i[ 3] ) & a.i[ 3]; + b.i[ 4] = ( ~m.i[ 4] ) & a.i[ 4]; + b.i[ 5] = ( ~m.i[ 5] ) & a.i[ 5]; + b.i[ 6] = ( ~m.i[ 6] ) & a.i[ 6]; + b.i[ 7] = ( ~m.i[ 7] ) & a.i[ 7]; + b.i[ 8] = ( ~m.i[ 8] ) & a.i[ 8]; + b.i[ 9] = ( ~m.i[ 9] ) & a.i[ 9]; + b.i[10] = ( ~m.i[10] ) & a.i[10]; + b.i[11] = ( ~m.i[11] ) & a.i[11]; + b.i[12] = ( ~m.i[12] ) & a.i[12]; + b.i[13] = ( ~m.i[13] ) & a.i[13]; + b.i[14] = ( ~m.i[14] ) & a.i[14]; + b.i[15] = ( ~m.i[15] ) & a.i[15]; + + return b; + } + + inline v16float set_bits( const v16int &m, const v16float &a ) + { + v16float b; + + b.i[ 0] = m.i[ 0] | a.i[ 0]; + b.i[ 1] = m.i[ 1] | a.i[ 1]; + b.i[ 2] = m.i[ 2] | a.i[ 2]; + b.i[ 3] = m.i[ 3] | a.i[ 3]; + b.i[ 4] = m.i[ 4] | a.i[ 4]; + b.i[ 5] = m.i[ 5] | a.i[ 5]; + b.i[ 6] = m.i[ 6] | a.i[ 6]; + b.i[ 7] = m.i[ 7] | a.i[ 7]; + b.i[ 8] = m.i[ 8] | a.i[ 8]; + b.i[ 9] = m.i[ 9] | a.i[ 9]; + b.i[10] = m.i[10] | a.i[10]; + b.i[11] = m.i[11] | a.i[11]; + b.i[12] = m.i[12] | a.i[12]; + b.i[13] = m.i[13] | a.i[13]; + b.i[14] = m.i[14] | a.i[14]; + b.i[15] = m.i[15] | a.i[15]; + + return b; + } + + inline v16float toggle_bits( const v16int &m, const v16float &a ) + { + v16float b; + + b.i[ 0] = m.i[ 0] ^ a.i[ 0]; + b.i[ 1] = m.i[ 1] ^ a.i[ 1]; + b.i[ 2] = m.i[ 2] ^ a.i[ 2]; + b.i[ 3] = m.i[ 3] ^ a.i[ 3]; + b.i[ 4] = m.i[ 4] ^ a.i[ 4]; + b.i[ 5] = m.i[ 5] ^ a.i[ 5]; + b.i[ 6] = m.i[ 6] ^ a.i[ 6]; + b.i[ 7] = m.i[ 7] ^ a.i[ 7]; + b.i[ 8] = m.i[ 8] ^ a.i[ 8]; + b.i[ 9] = m.i[ 9] ^ a.i[ 9]; + b.i[10] = m.i[10] ^ a.i[10]; + b.i[11] = m.i[11] ^ a.i[11]; + b.i[12] = m.i[12] ^ a.i[12]; + b.i[13] = m.i[13] ^ a.i[13]; + b.i[14] = m.i[14] ^ a.i[14]; + b.i[15] = m.i[15] ^ a.i[15]; + + return b; + } + + inline void increment_16x1( float * ALIGNED(64) p, const v16float &a ) + { + p[ 0] += a.f[ 0]; + p[ 1] += a.f[ 1]; + p[ 2] += a.f[ 2]; + p[ 3] += a.f[ 3]; + p[ 4] += a.f[ 4]; + p[ 5] += a.f[ 5]; + p[ 6] += a.f[ 6]; + p[ 7] += a.f[ 7]; + p[ 8] += a.f[ 8]; + p[ 9] += a.f[ 9]; + p[10] += a.f[10]; + p[11] += a.f[11]; + p[12] += a.f[12]; + p[13] += a.f[13]; + p[14] += a.f[14]; + p[15] += a.f[15]; + } + + inline void decrement_16x1( float * ALIGNED(64) p, const v16float &a ) + { + p[ 0] -= a.f[ 0]; + p[ 1] -= a.f[ 1]; + p[ 2] -= a.f[ 2]; + p[ 3] -= a.f[ 3]; + p[ 4] -= a.f[ 4]; + p[ 5] -= a.f[ 5]; + p[ 6] -= a.f[ 6]; + p[ 7] -= a.f[ 7]; + p[ 8] -= a.f[ 8]; + p[ 9] -= a.f[ 9]; + p[10] -= a.f[10]; + p[11] -= a.f[11]; + p[12] -= a.f[12]; + p[13] -= a.f[13]; + p[14] -= a.f[14]; + p[15] -= a.f[15]; + } + + inline void scale_16x1( float * ALIGNED(64) p, const v16float &a ) + { + p[ 0] *= a.f[ 0]; + p[ 1] *= a.f[ 1]; + p[ 2] *= a.f[ 
2]; + p[ 3] *= a.f[ 3]; + p[ 4] *= a.f[ 4]; + p[ 5] *= a.f[ 5]; + p[ 6] *= a.f[ 6]; + p[ 7] *= a.f[ 7]; + p[ 8] *= a.f[ 8]; + p[ 9] *= a.f[ 9]; + p[10] *= a.f[10]; + p[11] *= a.f[11]; + p[12] *= a.f[12]; + p[13] *= a.f[13]; + p[14] *= a.f[14]; + p[15] *= a.f[15]; + } + +} // namespace v16 + +#endif // _v16_portable_h_ diff --git a/src/util/v16/v16_portable_v1.h b/src/util/v16/v16_portable_v1.h new file mode 100644 index 00000000..5f798341 --- /dev/null +++ b/src/util/v16/v16_portable_v1.h @@ -0,0 +1,3589 @@ +#ifndef _v16_portable_h_ +#define _v16_portable_h_ + +#ifndef IN_v16_h +#error "Do not include v16_portable.h directly; use v16.h" +#endif + +#define V16_ACCELERATION +#define V16_PORTABLE_ACCELERATION + +#include <math.h> + +#ifndef ALIGNED +#define ALIGNED(n) +#endif + +// This does not work with gcc 5.3.1 and the -fopenmp-simd +// flag. Does not seem to work with -fopenmp either. Not +// sure why. It does work with the Intel compiler. Need +// to try later versions of gcc. +// #define ALWAYS_VECTORIZE _Pragma( "omp simd" ) + +// #define ALWAYS_VECTORIZE _Pragma( "simd" ) + +#define ALWAYS_VECTORIZE \ + _Pragma( "simd" ) \ + _Pragma( "vector aligned" ) + +#define ALWAYS_INLINE __attribute__((always_inline)) + +namespace v16 +{ + class v16; + class v16int; + class v16float; + + //////////////// + // v16 base class + + class v16 + { + friend class v16int; + friend class v16float; + + // v16 miscellaneous friends + + friend inline int any( const v16 &a ) ALWAYS_INLINE; + friend inline int all( const v16 &a ) ALWAYS_INLINE; + + template<int n> + friend inline v16 splat( const v16 &a ) ALWAYS_INLINE; + + template<int i00, int i01, int i02, int i03, int i04, int i05, int i06, int i07, int i08, int i09, int i10, int i11, int i12, int i13, int i14, int i15> + friend inline v16 shuffle( const v16 &a ) ALWAYS_INLINE; + + friend inline void swap( v16 &a, v16 &b ) ALWAYS_INLINE; + friend inline void transpose( v16 &a00, v16 &a01, v16 &a02, v16 &a03, + v16 &a04, v16 &a05, v16 &a06, v16 &a07, + v16 &a08, v16 &a09, v16 &a10, v16 &a11, + v16 &a12, v16 &a13, v16 &a14, v16 &a15 ) ALWAYS_INLINE; + + // v16int miscellaneous friends + + friend inline v16 czero( const v16int &c, const v16 &a ) ALWAYS_INLINE; + friend inline v16 notczero( const v16int &c, const v16 &a ) ALWAYS_INLINE; + friend inline v16 merge( const v16int &c, const v16 &a, const v16 &b ) ALWAYS_INLINE; + + // v16 memory manipulation friends + + friend inline void load_16x1( const void * ALIGNED(64) p, v16 &a ) ALWAYS_INLINE; + friend inline void store_16x1( const v16 &a, void * ALIGNED(64) p ) ALWAYS_INLINE; + friend inline void stream_16x1( const v16 &a, void * ALIGNED(64) p ) ALWAYS_INLINE; + friend inline void clear_16x1( void * ALIGNED(64) dst ) ALWAYS_INLINE; + friend inline void copy_16x1( void * ALIGNED(64) dst, + const void * ALIGNED(64) src ) ALWAYS_INLINE; + friend inline void swap_16x1( void * ALIGNED(64) a, void * ALIGNED(64) b ) ALWAYS_INLINE; + + // v16 transposed memory manipulation friends + // Note: Half aligned values are permissible in the 16x2_tr variants.
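+ // The _tr loads gather from 16 separate pointers, putting element k of pointer j into lane j of the k-th destination vector (an array-of-structures to structure-of-arrays transpose); the _tr stores do the reverse. + // The 16x2_tr forms only touch two consecutive values per pointer, hence the relaxed ALIGNED(8) requirement; the wider 16x3/4/8/16 variants expect full 64-byte alignment.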
+ + friend inline void load_16x1_tr( const void *a00, const void *a01, + const void *a02, const void *a03, + const void *a04, const void *a05, + const void *a06, const void *a07, + const void *a08, const void *a09, + const void *a10, const void *a11, + const void *a12, const void *a13, + const void *a14, const void *a15, + v16 &a ) ALWAYS_INLINE; + friend inline void load_16x2_tr( const void * ALIGNED(8) a00, + const void * ALIGNED(8) a01, + const void * ALIGNED(8) a02, + const void * ALIGNED(8) a03, + const void * ALIGNED(8) a04, + const void * ALIGNED(8) a05, + const void * ALIGNED(8) a06, + const void * ALIGNED(8) a07, + const void * ALIGNED(8) a08, + const void * ALIGNED(8) a09, + const void * ALIGNED(8) a10, + const void * ALIGNED(8) a11, + const void * ALIGNED(8) a12, + const void * ALIGNED(8) a13, + const void * ALIGNED(8) a14, + const void * ALIGNED(8) a15, + v16 &a, v16 &b ) ALWAYS_INLINE; + friend inline void load_16x3_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c ) ALWAYS_INLINE; + friend inline void load_16x4_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c, v16 &d ) ALWAYS_INLINE; + friend inline void load_16x8_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c, v16 &d, + v16 &e, v16 &f, v16 &g, v16 &h ) ALWAYS_INLINE; + friend inline void load_16x16_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07, + v16 &b08, v16 &b09, v16 &b10, v16 &b11, + v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) ALWAYS_INLINE; + friend inline void load_16x8_tr_p( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + 
const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + v16 &a, v16 &b, v16 &c, v16 &d, + v16 &e, v16 &f, v16 &g, v16 &h ) ALWAYS_INLINE; + friend inline void load_16x16_tr_p( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07, + v16 &b08, v16 &b09, v16 &b10, v16 &b11, + v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) ALWAYS_INLINE; + + friend inline void store_16x1_tr( const v16 &a, + void *a00, void *a01, void *a02, void *a03, + void *a04, void *a05, void *a06, void *a07, + void *a08, void *a09, void *a10, void *a11, + void *a12, void *a13, void *a14, void *a15 ) ALWAYS_INLINE; + friend inline void store_16x2_tr( const v16 &a, const v16 &b, + void * ALIGNED(8) a00, + void * ALIGNED(8) a01, + void * ALIGNED(8) a02, + void * ALIGNED(8) a03, + void * ALIGNED(8) a04, + void * ALIGNED(8) a05, + void * ALIGNED(8) a06, + void * ALIGNED(8) a07, + void * ALIGNED(8) a08, + void * ALIGNED(8) a09, + void * ALIGNED(8) a10, + void * ALIGNED(8) a11, + void * ALIGNED(8) a12, + void * ALIGNED(8) a13, + void * ALIGNED(8) a14, + void * ALIGNED(8) a15 ) ALWAYS_INLINE; + friend inline void store_16x3_tr( const v16 &a, const v16 &b, const v16 &c, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07, + void * ALIGNED(64) a08, + void * ALIGNED(64) a09, + void * ALIGNED(64) a10, + void * ALIGNED(64) a11, + void * ALIGNED(64) a12, + void * ALIGNED(64) a13, + void * ALIGNED(64) a14, + void * ALIGNED(64) a15 ) ALWAYS_INLINE; + friend inline void store_16x4_tr( const v16 &a, const v16 &b, + const v16 &c, const v16 &d, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07, + void * ALIGNED(64) a08, + void * ALIGNED(64) a09, + void * ALIGNED(64) a10, + void * ALIGNED(64) a11, + void * ALIGNED(64) a12, + void * ALIGNED(64) a13, + void * ALIGNED(64) a14, + void * ALIGNED(64) a15 ) ALWAYS_INLINE; + friend inline void store_16x8_tr( const v16 &a, const v16 &b, + const v16 &c, const v16 &d, + const v16 &e, const v16 &f, + const v16 &g, const v16 &h, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07, + void * ALIGNED(64) a08, + void * ALIGNED(64) a09, + void * ALIGNED(64) a10, + void * ALIGNED(64) a11, + void * ALIGNED(64) a12, + void * ALIGNED(64) a13, + void * ALIGNED(64) a14, + void * ALIGNED(64) a15 ) ALWAYS_INLINE; + friend inline void store_16x16_tr( const v16 &b00, const v16 &b01, + const v16 &b02, const v16 &b03, + const v16 &b04, const v16 &b05, + const v16 &b06, const v16 &b07, + const v16 &b08, const 
v16 &b09, + const v16 &b10, const v16 &b11, + const v16 &b12, const v16 &b13, + const v16 &b14, const v16 &b15, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07, + void * ALIGNED(64) a08, + void * ALIGNED(64) a09, + void * ALIGNED(64) a10, + void * ALIGNED(64) a11, + void * ALIGNED(64) a12, + void * ALIGNED(64) a13, + void * ALIGNED(64) a14, + void * ALIGNED(64) a15 ) ALWAYS_INLINE; + friend inline void store_16x8_tr_p( const v16 &a, const v16 &b, + const v16 &c, const v16 &d, + const v16 &e, const v16 &f, + const v16 &g, const v16 &h, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07 ) ALWAYS_INLINE; + friend inline void store_16x16_tr_p( const v16 &b00, const v16 &b01, + const v16 &b02, const v16 &b03, + const v16 &b04, const v16 &b05, + const v16 &b06, const v16 &b07, + const v16 &b08, const v16 &b09, + const v16 &b10, const v16 &b11, + const v16 &b12, const v16 &b13, + const v16 &b14, const v16 &b15, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07, + void * ALIGNED(64) a08, + void * ALIGNED(64) a09, + void * ALIGNED(64) a10, + void * ALIGNED(64) a11, + void * ALIGNED(64) a12, + void * ALIGNED(64) a13, + void * ALIGNED(64) a14, + void * ALIGNED(64) a15 ) ALWAYS_INLINE; + + protected: + + union + { + int i[16]; + float f[16]; + }; + + public: + + v16() {} // Default constructor + + v16( const v16 &a ) // Copy constructor + { + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + i[j] = a.i[j]; + } + + ~v16() {} // Default destructor + }; + + // v16 miscellaneous functions + + inline int any( const v16 &a ) + { + return a.i[ 0] || a.i[ 1] || a.i[ 2] || a.i[ 3] || + a.i[ 4] || a.i[ 5] || a.i[ 6] || a.i[ 7] || + a.i[ 8] || a.i[ 9] || a.i[10] || a.i[11] || + a.i[12] || a.i[13] || a.i[14] || a.i[15]; + } + + inline int all( const v16 &a ) + { + return a.i[ 0] && a.i[ 1] && a.i[ 2] && a.i[ 3] && + a.i[ 4] && a.i[ 5] && a.i[ 6] && a.i[ 7] && + a.i[ 8] && a.i[ 9] && a.i[10] && a.i[11] && + a.i[12] && a.i[13] && a.i[14] && a.i[15]; + } + + template<int n> + inline v16 splat( const v16 & a ) + { + v16 b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + b.i[j] = a.i[n]; + + return b; + } + + template<int i00, int i01, int i02, int i03, int i04, int i05, int i06, int i07, int i08, int i09, int i10, int i11, int i12, int i13, int i14, int i15> + inline v16 shuffle( const v16 & a ) + { + v16 b; + + b.i[ 0] = a.i[i00]; + b.i[ 1] = a.i[i01]; + b.i[ 2] = a.i[i02]; + b.i[ 3] = a.i[i03]; + b.i[ 4] = a.i[i04]; + b.i[ 5] = a.i[i05]; + b.i[ 6] = a.i[i06]; + b.i[ 7] = a.i[i07]; + b.i[ 8] = a.i[i08]; + b.i[ 9] = a.i[i09]; + b.i[10] = a.i[i10]; + b.i[11] = a.i[i11]; + b.i[12] = a.i[i12]; + b.i[13] = a.i[i13]; + b.i[14] = a.i[i14]; + b.i[15] = a.i[i15]; + + return b; + } + +# define sw(x,y) x^=y, y^=x, x^=y + + inline void swap( v16 &a, v16 &b ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + sw( a.i[j], b.i[j] ); + } + + inline void transpose( v16 &a00, v16 &a01, v16 &a02, v16 &a03, + v16 &a04, v16 &a05, v16 &a06, v16 &a07, + v16 &a08, v16 &a09, v16 &a10, v16 &a11, + v16 &a12, v16 &a13, v16 &a14, v16 &a15 ) + { + sw( a00.i[1],a01.i[0] ); sw( a00.i[2],a02.i[0] ); sw( a00.i[3],a03.i[0] ); sw( a00.i[4],a04.i[0] ); sw( a00.i[5],a05.i[0] ); sw( a00.i[6],a06.i[0] ); sw( a00.i[7],a07.i[0]
); sw( a00.i[8],a08.i[0] ); sw( a00.i[9],a09.i[0] ); sw( a00.i[10],a10.i[0] ); sw( a00.i[11],a11.i[ 0] ); sw( a00.i[12],a12.i[ 0] ); sw( a00.i[13],a13.i[ 0] ); sw( a00.i[14],a14.i[ 0] ); sw( a00.i[15],a15.i[ 0] ); + sw( a01.i[2],a02.i[1] ); sw( a01.i[3],a03.i[1] ); sw( a01.i[4],a04.i[1] ); sw( a01.i[5],a05.i[1] ); sw( a01.i[6],a06.i[1] ); sw( a01.i[7],a07.i[1] ); sw( a01.i[8],a08.i[1] ); sw( a01.i[9],a09.i[1] ); sw( a01.i[10],a10.i[1] ); sw( a01.i[11],a11.i[ 1] ); sw( a01.i[12],a12.i[ 1] ); sw( a01.i[13],a13.i[ 1] ); sw( a01.i[14],a14.i[ 1] ); sw( a01.i[15],a15.i[ 1] ); + sw( a02.i[3],a03.i[2] ); sw( a02.i[4],a04.i[2] ); sw( a02.i[5],a05.i[2] ); sw( a02.i[6],a06.i[2] ); sw( a02.i[7],a07.i[2] ); sw( a02.i[8],a08.i[2] ); sw( a02.i[9],a09.i[2] ); sw( a02.i[10],a10.i[2] ); sw( a02.i[11],a11.i[ 2] ); sw( a02.i[12],a12.i[ 2] ); sw( a02.i[13],a13.i[ 2] ); sw( a02.i[14],a14.i[ 2] ); sw( a02.i[15],a15.i[ 2] ); + sw( a03.i[4],a04.i[3] ); sw( a03.i[5],a05.i[3] ); sw( a03.i[6],a06.i[3] ); sw( a03.i[7],a07.i[3] ); sw( a03.i[8],a08.i[3] ); sw( a03.i[9],a09.i[3] ); sw( a03.i[10],a10.i[3] ); sw( a03.i[11],a11.i[ 3] ); sw( a03.i[12],a12.i[ 3] ); sw( a03.i[13],a13.i[ 3] ); sw( a03.i[14],a14.i[ 3] ); sw( a03.i[15],a15.i[ 3] ); + sw( a04.i[5],a05.i[4] ); sw( a04.i[6],a06.i[4] ); sw( a04.i[7],a07.i[4] ); sw( a04.i[8],a08.i[4] ); sw( a04.i[9],a09.i[4] ); sw( a04.i[10],a10.i[4] ); sw( a04.i[11],a11.i[ 4] ); sw( a04.i[12],a12.i[ 4] ); sw( a04.i[13],a13.i[ 4] ); sw( a04.i[14],a14.i[ 4] ); sw( a04.i[15],a15.i[ 4] ); + sw( a05.i[6],a06.i[5] ); sw( a05.i[7],a07.i[5] ); sw( a05.i[8],a08.i[5] ); sw( a05.i[9],a09.i[5] ); sw( a05.i[10],a10.i[5] ); sw( a05.i[11],a11.i[ 5] ); sw( a05.i[12],a12.i[ 5] ); sw( a05.i[13],a13.i[ 5] ); sw( a05.i[14],a14.i[ 5] ); sw( a05.i[15],a15.i[ 5] ); + sw( a06.i[7],a07.i[6] ); sw( a06.i[8],a08.i[6] ); sw( a06.i[9],a09.i[6] ); sw( a06.i[10],a10.i[6] ); sw( a06.i[11],a11.i[ 6] ); sw( a06.i[12],a12.i[ 6] ); sw( a06.i[13],a13.i[ 6] ); sw( a06.i[14],a14.i[ 6] ); sw( a06.i[15],a15.i[ 6] ); + sw( a07.i[8],a08.i[7] ); sw( a07.i[9],a09.i[7] ); sw( a07.i[10],a10.i[7] ); sw( a07.i[11],a11.i[ 7] ); sw( a07.i[12],a12.i[ 7] ); sw( a07.i[13],a13.i[ 7] ); sw( a07.i[14],a14.i[ 7] ); sw( a07.i[15],a15.i[ 7] ); + sw( a08.i[9],a09.i[8] ); sw( a08.i[10],a10.i[8] ); sw( a08.i[11],a11.i[ 8] ); sw( a08.i[12],a12.i[ 8] ); sw( a08.i[13],a13.i[ 8] ); sw( a08.i[14],a14.i[ 8] ); sw( a08.i[15],a15.i[ 8] ); + sw( a09.i[10],a10.i[9] ); sw( a09.i[11],a11.i[ 9] ); sw( a09.i[12],a12.i[ 9] ); sw( a09.i[13],a13.i[ 9] ); sw( a09.i[14],a14.i[ 9] ); sw( a09.i[15],a15.i[ 9] ); + sw( a10.i[11],a11.i[10] ); sw( a10.i[12],a12.i[10] ); sw( a10.i[13],a13.i[10] ); sw( a10.i[14],a14.i[10] ); sw( a10.i[15],a15.i[10] ); + sw( a11.i[12],a12.i[11] ); sw( a11.i[13],a13.i[11] ); sw( a11.i[14],a14.i[11] ); sw( a11.i[15],a15.i[11] ); + sw( a12.i[13],a13.i[12] ); sw( a12.i[14],a14.i[12] ); sw( a12.i[15],a15.i[12] ); + sw( a13.i[14],a14.i[13] ); sw( a13.i[15],a15.i[13] ); + sw( a14.i[15],a15.i[14] ); + } + +# undef sw + + // v16 memory manipulation functions + + inline void load_16x1( const void * ALIGNED(64) p, + v16 &a ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + a.i[j] = ((const int * ALIGNED(64))p)[j]; + } + + inline void store_16x1( const v16 &a, + void * ALIGNED(64) p ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + ((int * ALIGNED(64))p)[j] = a.i[j]; + } + + inline void stream_16x1( const v16 &a, + void * ALIGNED(64) p ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + ((int * ALIGNED(64))p)[j] = a.i[j]; + } + 
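For reference, a minimal usage sketch of the portable API defined above; it is illustrative only and not part of the patch. It loads a 64-byte aligned buffer into a v16float, applies an FMA and the reciprocal square root, and stores the result. The function name example_v16_usage and the buffers in/out are hypothetical, and the direct include (with IN_v16_h pre-defined) stands in for the usual inclusion through v16.h.

#define IN_v16_h               // normally defined by v16.h before it pulls in an implementation
#include "v16_portable_v1.h"

using namespace v16;

void example_v16_usage()
{
  float in [16] __attribute__((aligned(64)));   // 64-byte alignment, as load_16x1/store_16x1 expect
  float out[16] __attribute__((aligned(64)));

  for( int j = 0; j < 16; j++ ) in[j] = (float)( j + 1 );

  v16float a, b;

  load_16x1( in, a );                 // a.f[j] = in[j]
  b = fma( a, a, v16float( 1.0f ) );  // b.f[j] = in[j]*in[j] + 1
  store_16x1( rsqrt( b ), out );      // out[j] ~= 1/sqrt( in[j]*in[j] + 1 )
}

Because v16float derives from v16, the generic 16x1 load/store helpers accept it directly; the same pattern extends to the transposed load/store variants.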
+ inline void clear_16x1( void * ALIGNED(64) p ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + ((int * ALIGNED(64))p)[j] = 0; + } + + // FIXME: Ordering semantics + inline void copy_16x1( void * ALIGNED(64) dst, + const void * ALIGNED(64) src ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + ((int * ALIGNED(64))dst)[j] = ((const int * ALIGNED(64))src)[j]; + } + + inline void swap_16x1( void * ALIGNED(64) a, + void * ALIGNED(64) b ) + { + int t; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + { + t = ((int * ALIGNED(64))a)[j]; + ((int * ALIGNED(64))a)[j] = ((int * ALIGNED(64))b)[j]; + ((int * ALIGNED(64))b)[j] = t; + } + } + + // v16 transposed memory manipulation functions + + inline void load_16x1_tr( const void *a00, const void *a01, + const void *a02, const void *a03, + const void *a04, const void *a05, + const void *a06, const void *a07, + const void *a08, const void *a09, + const void *a10, const void *a11, + const void *a12, const void *a13, + const void *a14, const void *a15, + v16 &a ) + { + a.i[ 0] = ((const int *)a00)[0]; + a.i[ 1] = ((const int *)a01)[0]; + a.i[ 2] = ((const int *)a02)[0]; + a.i[ 3] = ((const int *)a03)[0]; + a.i[ 4] = ((const int *)a04)[0]; + a.i[ 5] = ((const int *)a05)[0]; + a.i[ 6] = ((const int *)a06)[0]; + a.i[ 7] = ((const int *)a07)[0]; + a.i[ 8] = ((const int *)a08)[0]; + a.i[ 9] = ((const int *)a09)[0]; + a.i[10] = ((const int *)a10)[0]; + a.i[11] = ((const int *)a11)[0]; + a.i[12] = ((const int *)a12)[0]; + a.i[13] = ((const int *)a13)[0]; + a.i[14] = ((const int *)a14)[0]; + a.i[15] = ((const int *)a15)[0]; + } + + inline void load_16x2_tr( const void * ALIGNED(8) a00, + const void * ALIGNED(8) a01, + const void * ALIGNED(8) a02, + const void * ALIGNED(8) a03, + const void * ALIGNED(8) a04, + const void * ALIGNED(8) a05, + const void * ALIGNED(8) a06, + const void * ALIGNED(8) a07, + const void * ALIGNED(8) a08, + const void * ALIGNED(8) a09, + const void * ALIGNED(8) a10, + const void * ALIGNED(8) a11, + const void * ALIGNED(8) a12, + const void * ALIGNED(8) a13, + const void * ALIGNED(8) a14, + const void * ALIGNED(8) a15, + v16 &a, v16 &b ) + { + a.i[ 0] = ((const int * ALIGNED(8))a00)[0]; + b.i[ 0] = ((const int * ALIGNED(8))a00)[1]; + + a.i[ 1] = ((const int * ALIGNED(8))a01)[0]; + b.i[ 1] = ((const int * ALIGNED(8))a01)[1]; + + a.i[ 2] = ((const int * ALIGNED(8))a02)[0]; + b.i[ 2] = ((const int * ALIGNED(8))a02)[1]; + + a.i[ 3] = ((const int * ALIGNED(8))a03)[0]; + b.i[ 3] = ((const int * ALIGNED(8))a03)[1]; + + a.i[ 4] = ((const int * ALIGNED(8))a04)[0]; + b.i[ 4] = ((const int * ALIGNED(8))a04)[1]; + + a.i[ 5] = ((const int * ALIGNED(8))a05)[0]; + b.i[ 5] = ((const int * ALIGNED(8))a05)[1]; + + a.i[ 6] = ((const int * ALIGNED(8))a06)[0]; + b.i[ 6] = ((const int * ALIGNED(8))a06)[1]; + + a.i[ 7] = ((const int * ALIGNED(8))a07)[0]; + b.i[ 7] = ((const int * ALIGNED(8))a07)[1]; + + a.i[ 8] = ((const int * ALIGNED(8))a08)[0]; + b.i[ 8] = ((const int * ALIGNED(8))a08)[1]; + + a.i[ 9] = ((const int * ALIGNED(8))a09)[0]; + b.i[ 9] = ((const int * ALIGNED(8))a09)[1]; + + a.i[10] = ((const int * ALIGNED(8))a10)[0]; + b.i[10] = ((const int * ALIGNED(8))a10)[1]; + + a.i[11] = ((const int * ALIGNED(8))a11)[0]; + b.i[11] = ((const int * ALIGNED(8))a11)[1]; + + a.i[12] = ((const int * ALIGNED(8))a12)[0]; + b.i[12] = ((const int * ALIGNED(8))a12)[1]; + + a.i[13] = ((const int * ALIGNED(8))a13)[0]; + b.i[13] = ((const int * ALIGNED(8))a13)[1]; + + a.i[14] = ((const int * ALIGNED(8))a14)[0]; + b.i[14] = ((const int * 
ALIGNED(8))a14)[1]; + + a.i[15] = ((const int * ALIGNED(8))a15)[0]; + b.i[15] = ((const int * ALIGNED(8))a15)[1]; + } + + inline void load_16x3_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c ) + { + a.i[ 0] = ((const int * ALIGNED(64))a00)[0]; + b.i[ 0] = ((const int * ALIGNED(64))a00)[1]; + c.i[ 0] = ((const int * ALIGNED(64))a00)[2]; + + a.i[ 1] = ((const int * ALIGNED(64))a01)[0]; + b.i[ 1] = ((const int * ALIGNED(64))a01)[1]; + c.i[ 1] = ((const int * ALIGNED(64))a01)[2]; + + a.i[ 2] = ((const int * ALIGNED(64))a02)[0]; + b.i[ 2] = ((const int * ALIGNED(64))a02)[1]; + c.i[ 2] = ((const int * ALIGNED(64))a02)[2]; + + a.i[ 3] = ((const int * ALIGNED(64))a03)[0]; + b.i[ 3] = ((const int * ALIGNED(64))a03)[1]; + c.i[ 3] = ((const int * ALIGNED(64))a03)[2]; + + a.i[ 4] = ((const int * ALIGNED(64))a04)[0]; + b.i[ 4] = ((const int * ALIGNED(64))a04)[1]; + c.i[ 4] = ((const int * ALIGNED(64))a04)[2]; + + a.i[ 5] = ((const int * ALIGNED(64))a05)[0]; + b.i[ 5] = ((const int * ALIGNED(64))a05)[1]; + c.i[ 5] = ((const int * ALIGNED(64))a05)[2]; + + a.i[ 6] = ((const int * ALIGNED(64))a06)[0]; + b.i[ 6] = ((const int * ALIGNED(64))a06)[1]; + c.i[ 6] = ((const int * ALIGNED(64))a06)[2]; + + a.i[ 7] = ((const int * ALIGNED(64))a07)[0]; + b.i[ 7] = ((const int * ALIGNED(64))a07)[1]; + c.i[ 7] = ((const int * ALIGNED(64))a07)[2]; + + a.i[ 8] = ((const int * ALIGNED(64))a08)[0]; + b.i[ 8] = ((const int * ALIGNED(64))a08)[1]; + c.i[ 8] = ((const int * ALIGNED(64))a08)[2]; + + a.i[ 9] = ((const int * ALIGNED(64))a09)[0]; + b.i[ 9] = ((const int * ALIGNED(64))a09)[1]; + c.i[ 9] = ((const int * ALIGNED(64))a09)[2]; + + a.i[10] = ((const int * ALIGNED(64))a10)[0]; + b.i[10] = ((const int * ALIGNED(64))a10)[1]; + c.i[10] = ((const int * ALIGNED(64))a10)[2]; + + a.i[11] = ((const int * ALIGNED(64))a11)[0]; + b.i[11] = ((const int * ALIGNED(64))a11)[1]; + c.i[11] = ((const int * ALIGNED(64))a11)[2]; + + a.i[12] = ((const int * ALIGNED(64))a12)[0]; + b.i[12] = ((const int * ALIGNED(64))a12)[1]; + c.i[12] = ((const int * ALIGNED(64))a12)[2]; + + a.i[13] = ((const int * ALIGNED(64))a13)[0]; + b.i[13] = ((const int * ALIGNED(64))a13)[1]; + c.i[13] = ((const int * ALIGNED(64))a13)[2]; + + a.i[14] = ((const int * ALIGNED(64))a14)[0]; + b.i[14] = ((const int * ALIGNED(64))a14)[1]; + c.i[14] = ((const int * ALIGNED(64))a14)[2]; + + a.i[15] = ((const int * ALIGNED(64))a15)[0]; + b.i[15] = ((const int * ALIGNED(64))a15)[1]; + c.i[15] = ((const int * ALIGNED(64))a15)[2]; + } + + inline void load_16x4_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c, v16 &d ) 
+ { + a.i[ 0] = ((const int * ALIGNED(64))a00)[0]; + b.i[ 0] = ((const int * ALIGNED(64))a00)[1]; + c.i[ 0] = ((const int * ALIGNED(64))a00)[2]; + d.i[ 0] = ((const int * ALIGNED(64))a00)[3]; + + a.i[ 1] = ((const int * ALIGNED(64))a01)[0]; + b.i[ 1] = ((const int * ALIGNED(64))a01)[1]; + c.i[ 1] = ((const int * ALIGNED(64))a01)[2]; + d.i[ 1] = ((const int * ALIGNED(64))a01)[3]; + + a.i[ 2] = ((const int * ALIGNED(64))a02)[0]; + b.i[ 2] = ((const int * ALIGNED(64))a02)[1]; + c.i[ 2] = ((const int * ALIGNED(64))a02)[2]; + d.i[ 2] = ((const int * ALIGNED(64))a02)[3]; + + a.i[ 3] = ((const int * ALIGNED(64))a03)[0]; + b.i[ 3] = ((const int * ALIGNED(64))a03)[1]; + c.i[ 3] = ((const int * ALIGNED(64))a03)[2]; + d.i[ 3] = ((const int * ALIGNED(64))a03)[3]; + + a.i[ 4] = ((const int * ALIGNED(64))a04)[0]; + b.i[ 4] = ((const int * ALIGNED(64))a04)[1]; + c.i[ 4] = ((const int * ALIGNED(64))a04)[2]; + d.i[ 4] = ((const int * ALIGNED(64))a04)[3]; + + a.i[ 5] = ((const int * ALIGNED(64))a05)[0]; + b.i[ 5] = ((const int * ALIGNED(64))a05)[1]; + c.i[ 5] = ((const int * ALIGNED(64))a05)[2]; + d.i[ 5] = ((const int * ALIGNED(64))a05)[3]; + + a.i[ 6] = ((const int * ALIGNED(64))a06)[0]; + b.i[ 6] = ((const int * ALIGNED(64))a06)[1]; + c.i[ 6] = ((const int * ALIGNED(64))a06)[2]; + d.i[ 6] = ((const int * ALIGNED(64))a06)[3]; + + a.i[ 7] = ((const int * ALIGNED(64))a07)[0]; + b.i[ 7] = ((const int * ALIGNED(64))a07)[1]; + c.i[ 7] = ((const int * ALIGNED(64))a07)[2]; + d.i[ 7] = ((const int * ALIGNED(64))a07)[3]; + + a.i[ 8] = ((const int * ALIGNED(64))a08)[0]; + b.i[ 8] = ((const int * ALIGNED(64))a08)[1]; + c.i[ 8] = ((const int * ALIGNED(64))a08)[2]; + d.i[ 8] = ((const int * ALIGNED(64))a08)[3]; + + a.i[ 9] = ((const int * ALIGNED(64))a09)[0]; + b.i[ 9] = ((const int * ALIGNED(64))a09)[1]; + c.i[ 9] = ((const int * ALIGNED(64))a09)[2]; + d.i[ 9] = ((const int * ALIGNED(64))a09)[3]; + + a.i[10] = ((const int * ALIGNED(64))a10)[0]; + b.i[10] = ((const int * ALIGNED(64))a10)[1]; + c.i[10] = ((const int * ALIGNED(64))a10)[2]; + d.i[10] = ((const int * ALIGNED(64))a10)[3]; + + a.i[11] = ((const int * ALIGNED(64))a11)[0]; + b.i[11] = ((const int * ALIGNED(64))a11)[1]; + c.i[11] = ((const int * ALIGNED(64))a11)[2]; + d.i[11] = ((const int * ALIGNED(64))a11)[3]; + + a.i[12] = ((const int * ALIGNED(64))a12)[0]; + b.i[12] = ((const int * ALIGNED(64))a12)[1]; + c.i[12] = ((const int * ALIGNED(64))a12)[2]; + d.i[12] = ((const int * ALIGNED(64))a12)[3]; + + a.i[13] = ((const int * ALIGNED(64))a13)[0]; + b.i[13] = ((const int * ALIGNED(64))a13)[1]; + c.i[13] = ((const int * ALIGNED(64))a13)[2]; + d.i[13] = ((const int * ALIGNED(64))a13)[3]; + + a.i[14] = ((const int * ALIGNED(64))a14)[0]; + b.i[14] = ((const int * ALIGNED(64))a14)[1]; + c.i[14] = ((const int * ALIGNED(64))a14)[2]; + d.i[14] = ((const int * ALIGNED(64))a14)[3]; + + a.i[15] = ((const int * ALIGNED(64))a15)[0]; + b.i[15] = ((const int * ALIGNED(64))a15)[1]; + c.i[15] = ((const int * ALIGNED(64))a15)[2]; + d.i[15] = ((const int * ALIGNED(64))a15)[3]; + } + + inline void load_16x8_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, 
+ const void * ALIGNED(64) a15, + v16 &a, v16 &b, v16 &c, v16 &d, + v16 &e, v16 &f, v16 &g, v16 &h ) + { + a.i[ 0] = ((const int * ALIGNED(64))a00)[0]; + b.i[ 0] = ((const int * ALIGNED(64))a00)[1]; + c.i[ 0] = ((const int * ALIGNED(64))a00)[2]; + d.i[ 0] = ((const int * ALIGNED(64))a00)[3]; + e.i[ 0] = ((const int * ALIGNED(64))a00)[4]; + f.i[ 0] = ((const int * ALIGNED(64))a00)[5]; + g.i[ 0] = ((const int * ALIGNED(64))a00)[6]; + h.i[ 0] = ((const int * ALIGNED(64))a00)[7]; + + a.i[ 1] = ((const int * ALIGNED(64))a01)[0]; + b.i[ 1] = ((const int * ALIGNED(64))a01)[1]; + c.i[ 1] = ((const int * ALIGNED(64))a01)[2]; + d.i[ 1] = ((const int * ALIGNED(64))a01)[3]; + e.i[ 1] = ((const int * ALIGNED(64))a01)[4]; + f.i[ 1] = ((const int * ALIGNED(64))a01)[5]; + g.i[ 1] = ((const int * ALIGNED(64))a01)[6]; + h.i[ 1] = ((const int * ALIGNED(64))a01)[7]; + + a.i[ 2] = ((const int * ALIGNED(64))a02)[0]; + b.i[ 2] = ((const int * ALIGNED(64))a02)[1]; + c.i[ 2] = ((const int * ALIGNED(64))a02)[2]; + d.i[ 2] = ((const int * ALIGNED(64))a02)[3]; + e.i[ 2] = ((const int * ALIGNED(64))a02)[4]; + f.i[ 2] = ((const int * ALIGNED(64))a02)[5]; + g.i[ 2] = ((const int * ALIGNED(64))a02)[6]; + h.i[ 2] = ((const int * ALIGNED(64))a02)[7]; + + a.i[ 3] = ((const int * ALIGNED(64))a03)[0]; + b.i[ 3] = ((const int * ALIGNED(64))a03)[1]; + c.i[ 3] = ((const int * ALIGNED(64))a03)[2]; + d.i[ 3] = ((const int * ALIGNED(64))a03)[3]; + e.i[ 3] = ((const int * ALIGNED(64))a03)[4]; + f.i[ 3] = ((const int * ALIGNED(64))a03)[5]; + g.i[ 3] = ((const int * ALIGNED(64))a03)[6]; + h.i[ 3] = ((const int * ALIGNED(64))a03)[7]; + + a.i[ 4] = ((const int * ALIGNED(64))a04)[0]; + b.i[ 4] = ((const int * ALIGNED(64))a04)[1]; + c.i[ 4] = ((const int * ALIGNED(64))a04)[2]; + d.i[ 4] = ((const int * ALIGNED(64))a04)[3]; + e.i[ 4] = ((const int * ALIGNED(64))a04)[4]; + f.i[ 4] = ((const int * ALIGNED(64))a04)[5]; + g.i[ 4] = ((const int * ALIGNED(64))a04)[6]; + h.i[ 4] = ((const int * ALIGNED(64))a04)[7]; + + a.i[ 5] = ((const int * ALIGNED(64))a05)[0]; + b.i[ 5] = ((const int * ALIGNED(64))a05)[1]; + c.i[ 5] = ((const int * ALIGNED(64))a05)[2]; + d.i[ 5] = ((const int * ALIGNED(64))a05)[3]; + e.i[ 5] = ((const int * ALIGNED(64))a05)[4]; + f.i[ 5] = ((const int * ALIGNED(64))a05)[5]; + g.i[ 5] = ((const int * ALIGNED(64))a05)[6]; + h.i[ 5] = ((const int * ALIGNED(64))a05)[7]; + + a.i[ 6] = ((const int * ALIGNED(64))a06)[0]; + b.i[ 6] = ((const int * ALIGNED(64))a06)[1]; + c.i[ 6] = ((const int * ALIGNED(64))a06)[2]; + d.i[ 6] = ((const int * ALIGNED(64))a06)[3]; + e.i[ 6] = ((const int * ALIGNED(64))a06)[4]; + f.i[ 6] = ((const int * ALIGNED(64))a06)[5]; + g.i[ 6] = ((const int * ALIGNED(64))a06)[6]; + h.i[ 6] = ((const int * ALIGNED(64))a06)[7]; + + a.i[ 7] = ((const int * ALIGNED(64))a07)[0]; + b.i[ 7] = ((const int * ALIGNED(64))a07)[1]; + c.i[ 7] = ((const int * ALIGNED(64))a07)[2]; + d.i[ 7] = ((const int * ALIGNED(64))a07)[3]; + e.i[ 7] = ((const int * ALIGNED(64))a07)[4]; + f.i[ 7] = ((const int * ALIGNED(64))a07)[5]; + g.i[ 7] = ((const int * ALIGNED(64))a07)[6]; + h.i[ 7] = ((const int * ALIGNED(64))a07)[7]; + + a.i[ 8] = ((const int * ALIGNED(64))a08)[0]; + b.i[ 8] = ((const int * ALIGNED(64))a08)[1]; + c.i[ 8] = ((const int * ALIGNED(64))a08)[2]; + d.i[ 8] = ((const int * ALIGNED(64))a08)[3]; + e.i[ 8] = ((const int * ALIGNED(64))a08)[4]; + f.i[ 8] = ((const int * ALIGNED(64))a08)[5]; + g.i[ 8] = ((const int * ALIGNED(64))a08)[6]; + h.i[ 8] = ((const int * ALIGNED(64))a08)[7]; + + a.i[ 9] = ((const int * ALIGNED(64))a09)[0]; 
+ b.i[ 9] = ((const int * ALIGNED(64))a09)[1]; + c.i[ 9] = ((const int * ALIGNED(64))a09)[2]; + d.i[ 9] = ((const int * ALIGNED(64))a09)[3]; + e.i[ 9] = ((const int * ALIGNED(64))a09)[4]; + f.i[ 9] = ((const int * ALIGNED(64))a09)[5]; + g.i[ 9] = ((const int * ALIGNED(64))a09)[6]; + h.i[ 9] = ((const int * ALIGNED(64))a09)[7]; + + a.i[10] = ((const int * ALIGNED(64))a10)[0]; + b.i[10] = ((const int * ALIGNED(64))a10)[1]; + c.i[10] = ((const int * ALIGNED(64))a10)[2]; + d.i[10] = ((const int * ALIGNED(64))a10)[3]; + e.i[10] = ((const int * ALIGNED(64))a10)[4]; + f.i[10] = ((const int * ALIGNED(64))a10)[5]; + g.i[10] = ((const int * ALIGNED(64))a10)[6]; + h.i[10] = ((const int * ALIGNED(64))a10)[7]; + + a.i[11] = ((const int * ALIGNED(64))a11)[0]; + b.i[11] = ((const int * ALIGNED(64))a11)[1]; + c.i[11] = ((const int * ALIGNED(64))a11)[2]; + d.i[11] = ((const int * ALIGNED(64))a11)[3]; + e.i[11] = ((const int * ALIGNED(64))a11)[4]; + f.i[11] = ((const int * ALIGNED(64))a11)[5]; + g.i[11] = ((const int * ALIGNED(64))a11)[6]; + h.i[11] = ((const int * ALIGNED(64))a11)[7]; + + a.i[12] = ((const int * ALIGNED(64))a12)[0]; + b.i[12] = ((const int * ALIGNED(64))a12)[1]; + c.i[12] = ((const int * ALIGNED(64))a12)[2]; + d.i[12] = ((const int * ALIGNED(64))a12)[3]; + e.i[12] = ((const int * ALIGNED(64))a12)[4]; + f.i[12] = ((const int * ALIGNED(64))a12)[5]; + g.i[12] = ((const int * ALIGNED(64))a12)[6]; + h.i[12] = ((const int * ALIGNED(64))a12)[7]; + + a.i[13] = ((const int * ALIGNED(64))a13)[0]; + b.i[13] = ((const int * ALIGNED(64))a13)[1]; + c.i[13] = ((const int * ALIGNED(64))a13)[2]; + d.i[13] = ((const int * ALIGNED(64))a13)[3]; + e.i[13] = ((const int * ALIGNED(64))a13)[4]; + f.i[13] = ((const int * ALIGNED(64))a13)[5]; + g.i[13] = ((const int * ALIGNED(64))a13)[6]; + h.i[13] = ((const int * ALIGNED(64))a13)[7]; + + a.i[14] = ((const int * ALIGNED(64))a14)[0]; + b.i[14] = ((const int * ALIGNED(64))a14)[1]; + c.i[14] = ((const int * ALIGNED(64))a14)[2]; + d.i[14] = ((const int * ALIGNED(64))a14)[3]; + e.i[14] = ((const int * ALIGNED(64))a14)[4]; + f.i[14] = ((const int * ALIGNED(64))a14)[5]; + g.i[14] = ((const int * ALIGNED(64))a14)[6]; + h.i[14] = ((const int * ALIGNED(64))a14)[7]; + + a.i[15] = ((const int * ALIGNED(64))a15)[0]; + b.i[15] = ((const int * ALIGNED(64))a15)[1]; + c.i[15] = ((const int * ALIGNED(64))a15)[2]; + d.i[15] = ((const int * ALIGNED(64))a15)[3]; + e.i[15] = ((const int * ALIGNED(64))a15)[4]; + f.i[15] = ((const int * ALIGNED(64))a15)[5]; + g.i[15] = ((const int * ALIGNED(64))a15)[6]; + h.i[15] = ((const int * ALIGNED(64))a15)[7]; + } + + inline void load_16x16_tr( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07, + v16 &b08, v16 &b09, v16 &b10, v16 &b11, + v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) + { + b00.i[ 0] = ((const int * ALIGNED(64))a00)[ 0]; + b01.i[ 0] = ((const int * ALIGNED(64))a00)[ 1]; + b02.i[ 0] = ((const int * ALIGNED(64))a00)[ 2]; + b03.i[ 0] = ((const int * ALIGNED(64))a00)[ 3]; + b04.i[ 0] = ((const int * 
ALIGNED(64))a00)[ 4]; + b05.i[ 0] = ((const int * ALIGNED(64))a00)[ 5]; + b06.i[ 0] = ((const int * ALIGNED(64))a00)[ 6]; + b07.i[ 0] = ((const int * ALIGNED(64))a00)[ 7]; + b08.i[ 0] = ((const int * ALIGNED(64))a00)[ 8]; + b09.i[ 0] = ((const int * ALIGNED(64))a00)[ 9]; + b10.i[ 0] = ((const int * ALIGNED(64))a00)[10]; + b11.i[ 0] = ((const int * ALIGNED(64))a00)[11]; + b12.i[ 0] = ((const int * ALIGNED(64))a00)[12]; + b13.i[ 0] = ((const int * ALIGNED(64))a00)[13]; + b14.i[ 0] = ((const int * ALIGNED(64))a00)[14]; + b15.i[ 0] = ((const int * ALIGNED(64))a00)[15]; + + b00.i[ 1] = ((const int * ALIGNED(64))a01)[ 0]; + b01.i[ 1] = ((const int * ALIGNED(64))a01)[ 1]; + b02.i[ 1] = ((const int * ALIGNED(64))a01)[ 2]; + b03.i[ 1] = ((const int * ALIGNED(64))a01)[ 3]; + b04.i[ 1] = ((const int * ALIGNED(64))a01)[ 4]; + b05.i[ 1] = ((const int * ALIGNED(64))a01)[ 5]; + b06.i[ 1] = ((const int * ALIGNED(64))a01)[ 6]; + b07.i[ 1] = ((const int * ALIGNED(64))a01)[ 7]; + b08.i[ 1] = ((const int * ALIGNED(64))a01)[ 8]; + b09.i[ 1] = ((const int * ALIGNED(64))a01)[ 9]; + b10.i[ 1] = ((const int * ALIGNED(64))a01)[10]; + b11.i[ 1] = ((const int * ALIGNED(64))a01)[11]; + b12.i[ 1] = ((const int * ALIGNED(64))a01)[12]; + b13.i[ 1] = ((const int * ALIGNED(64))a01)[13]; + b14.i[ 1] = ((const int * ALIGNED(64))a01)[14]; + b15.i[ 1] = ((const int * ALIGNED(64))a01)[15]; + + b00.i[ 2] = ((const int * ALIGNED(64))a02)[ 0]; + b01.i[ 2] = ((const int * ALIGNED(64))a02)[ 1]; + b02.i[ 2] = ((const int * ALIGNED(64))a02)[ 2]; + b03.i[ 2] = ((const int * ALIGNED(64))a02)[ 3]; + b04.i[ 2] = ((const int * ALIGNED(64))a02)[ 4]; + b05.i[ 2] = ((const int * ALIGNED(64))a02)[ 5]; + b06.i[ 2] = ((const int * ALIGNED(64))a02)[ 6]; + b07.i[ 2] = ((const int * ALIGNED(64))a02)[ 7]; + b08.i[ 2] = ((const int * ALIGNED(64))a02)[ 8]; + b09.i[ 2] = ((const int * ALIGNED(64))a02)[ 9]; + b10.i[ 2] = ((const int * ALIGNED(64))a02)[10]; + b11.i[ 2] = ((const int * ALIGNED(64))a02)[11]; + b12.i[ 2] = ((const int * ALIGNED(64))a02)[12]; + b13.i[ 2] = ((const int * ALIGNED(64))a02)[13]; + b14.i[ 2] = ((const int * ALIGNED(64))a02)[14]; + b15.i[ 2] = ((const int * ALIGNED(64))a02)[15]; + + b00.i[ 3] = ((const int * ALIGNED(64))a03)[ 0]; + b01.i[ 3] = ((const int * ALIGNED(64))a03)[ 1]; + b02.i[ 3] = ((const int * ALIGNED(64))a03)[ 2]; + b03.i[ 3] = ((const int * ALIGNED(64))a03)[ 3]; + b04.i[ 3] = ((const int * ALIGNED(64))a03)[ 4]; + b05.i[ 3] = ((const int * ALIGNED(64))a03)[ 5]; + b06.i[ 3] = ((const int * ALIGNED(64))a03)[ 6]; + b07.i[ 3] = ((const int * ALIGNED(64))a03)[ 7]; + b08.i[ 3] = ((const int * ALIGNED(64))a03)[ 8]; + b09.i[ 3] = ((const int * ALIGNED(64))a03)[ 9]; + b10.i[ 3] = ((const int * ALIGNED(64))a03)[10]; + b11.i[ 3] = ((const int * ALIGNED(64))a03)[11]; + b12.i[ 3] = ((const int * ALIGNED(64))a03)[12]; + b13.i[ 3] = ((const int * ALIGNED(64))a03)[13]; + b14.i[ 3] = ((const int * ALIGNED(64))a03)[14]; + b15.i[ 3] = ((const int * ALIGNED(64))a03)[15]; + + b00.i[ 4] = ((const int * ALIGNED(64))a04)[ 0]; + b01.i[ 4] = ((const int * ALIGNED(64))a04)[ 1]; + b02.i[ 4] = ((const int * ALIGNED(64))a04)[ 2]; + b03.i[ 4] = ((const int * ALIGNED(64))a04)[ 3]; + b04.i[ 4] = ((const int * ALIGNED(64))a04)[ 4]; + b05.i[ 4] = ((const int * ALIGNED(64))a04)[ 5]; + b06.i[ 4] = ((const int * ALIGNED(64))a04)[ 6]; + b07.i[ 4] = ((const int * ALIGNED(64))a04)[ 7]; + b08.i[ 4] = ((const int * ALIGNED(64))a04)[ 8]; + b09.i[ 4] = ((const int * ALIGNED(64))a04)[ 9]; + b10.i[ 4] = ((const int * ALIGNED(64))a04)[10]; + b11.i[ 4] = ((const 
int * ALIGNED(64))a04)[11]; + b12.i[ 4] = ((const int * ALIGNED(64))a04)[12]; + b13.i[ 4] = ((const int * ALIGNED(64))a04)[13]; + b14.i[ 4] = ((const int * ALIGNED(64))a04)[14]; + b15.i[ 4] = ((const int * ALIGNED(64))a04)[15]; + + b00.i[ 5] = ((const int * ALIGNED(64))a05)[ 0]; + b01.i[ 5] = ((const int * ALIGNED(64))a05)[ 1]; + b02.i[ 5] = ((const int * ALIGNED(64))a05)[ 2]; + b03.i[ 5] = ((const int * ALIGNED(64))a05)[ 3]; + b04.i[ 5] = ((const int * ALIGNED(64))a05)[ 4]; + b05.i[ 5] = ((const int * ALIGNED(64))a05)[ 5]; + b06.i[ 5] = ((const int * ALIGNED(64))a05)[ 6]; + b07.i[ 5] = ((const int * ALIGNED(64))a05)[ 7]; + b08.i[ 5] = ((const int * ALIGNED(64))a05)[ 8]; + b09.i[ 5] = ((const int * ALIGNED(64))a05)[ 9]; + b10.i[ 5] = ((const int * ALIGNED(64))a05)[10]; + b11.i[ 5] = ((const int * ALIGNED(64))a05)[11]; + b12.i[ 5] = ((const int * ALIGNED(64))a05)[12]; + b13.i[ 5] = ((const int * ALIGNED(64))a05)[13]; + b14.i[ 5] = ((const int * ALIGNED(64))a05)[14]; + b15.i[ 5] = ((const int * ALIGNED(64))a05)[15]; + + b00.i[ 6] = ((const int * ALIGNED(64))a06)[ 0]; + b01.i[ 6] = ((const int * ALIGNED(64))a06)[ 1]; + b02.i[ 6] = ((const int * ALIGNED(64))a06)[ 2]; + b03.i[ 6] = ((const int * ALIGNED(64))a06)[ 3]; + b04.i[ 6] = ((const int * ALIGNED(64))a06)[ 4]; + b05.i[ 6] = ((const int * ALIGNED(64))a06)[ 5]; + b06.i[ 6] = ((const int * ALIGNED(64))a06)[ 6]; + b07.i[ 6] = ((const int * ALIGNED(64))a06)[ 7]; + b08.i[ 6] = ((const int * ALIGNED(64))a06)[ 8]; + b09.i[ 6] = ((const int * ALIGNED(64))a06)[ 9]; + b10.i[ 6] = ((const int * ALIGNED(64))a06)[10]; + b11.i[ 6] = ((const int * ALIGNED(64))a06)[11]; + b12.i[ 6] = ((const int * ALIGNED(64))a06)[12]; + b13.i[ 6] = ((const int * ALIGNED(64))a06)[13]; + b14.i[ 6] = ((const int * ALIGNED(64))a06)[14]; + b15.i[ 6] = ((const int * ALIGNED(64))a06)[15]; + + b00.i[ 7] = ((const int * ALIGNED(64))a07)[ 0]; + b01.i[ 7] = ((const int * ALIGNED(64))a07)[ 1]; + b02.i[ 7] = ((const int * ALIGNED(64))a07)[ 2]; + b03.i[ 7] = ((const int * ALIGNED(64))a07)[ 3]; + b04.i[ 7] = ((const int * ALIGNED(64))a07)[ 4]; + b05.i[ 7] = ((const int * ALIGNED(64))a07)[ 5]; + b06.i[ 7] = ((const int * ALIGNED(64))a07)[ 6]; + b07.i[ 7] = ((const int * ALIGNED(64))a07)[ 7]; + b08.i[ 7] = ((const int * ALIGNED(64))a07)[ 8]; + b09.i[ 7] = ((const int * ALIGNED(64))a07)[ 9]; + b10.i[ 7] = ((const int * ALIGNED(64))a07)[10]; + b11.i[ 7] = ((const int * ALIGNED(64))a07)[11]; + b12.i[ 7] = ((const int * ALIGNED(64))a07)[12]; + b13.i[ 7] = ((const int * ALIGNED(64))a07)[13]; + b14.i[ 7] = ((const int * ALIGNED(64))a07)[14]; + b15.i[ 7] = ((const int * ALIGNED(64))a07)[15]; + + b00.i[ 8] = ((const int * ALIGNED(64))a08)[ 0]; + b01.i[ 8] = ((const int * ALIGNED(64))a08)[ 1]; + b02.i[ 8] = ((const int * ALIGNED(64))a08)[ 2]; + b03.i[ 8] = ((const int * ALIGNED(64))a08)[ 3]; + b04.i[ 8] = ((const int * ALIGNED(64))a08)[ 4]; + b05.i[ 8] = ((const int * ALIGNED(64))a08)[ 5]; + b06.i[ 8] = ((const int * ALIGNED(64))a08)[ 6]; + b07.i[ 8] = ((const int * ALIGNED(64))a08)[ 7]; + b08.i[ 8] = ((const int * ALIGNED(64))a08)[ 8]; + b09.i[ 8] = ((const int * ALIGNED(64))a08)[ 9]; + b10.i[ 8] = ((const int * ALIGNED(64))a08)[10]; + b11.i[ 8] = ((const int * ALIGNED(64))a08)[11]; + b12.i[ 8] = ((const int * ALIGNED(64))a08)[12]; + b13.i[ 8] = ((const int * ALIGNED(64))a08)[13]; + b14.i[ 8] = ((const int * ALIGNED(64))a08)[14]; + b15.i[ 8] = ((const int * ALIGNED(64))a08)[15]; + + b00.i[ 9] = ((const int * ALIGNED(64))a09)[ 0]; + b01.i[ 9] = ((const int * ALIGNED(64))a09)[ 1]; + b02.i[ 9] = 
((const int * ALIGNED(64))a09)[ 2]; + b03.i[ 9] = ((const int * ALIGNED(64))a09)[ 3]; + b04.i[ 9] = ((const int * ALIGNED(64))a09)[ 4]; + b05.i[ 9] = ((const int * ALIGNED(64))a09)[ 5]; + b06.i[ 9] = ((const int * ALIGNED(64))a09)[ 6]; + b07.i[ 9] = ((const int * ALIGNED(64))a09)[ 7]; + b08.i[ 9] = ((const int * ALIGNED(64))a09)[ 8]; + b09.i[ 9] = ((const int * ALIGNED(64))a09)[ 9]; + b10.i[ 9] = ((const int * ALIGNED(64))a09)[10]; + b11.i[ 9] = ((const int * ALIGNED(64))a09)[11]; + b12.i[ 9] = ((const int * ALIGNED(64))a09)[12]; + b13.i[ 9] = ((const int * ALIGNED(64))a09)[13]; + b14.i[ 9] = ((const int * ALIGNED(64))a09)[14]; + b15.i[ 9] = ((const int * ALIGNED(64))a09)[15]; + + b00.i[10] = ((const int * ALIGNED(64))a10)[ 0]; + b01.i[10] = ((const int * ALIGNED(64))a10)[ 1]; + b02.i[10] = ((const int * ALIGNED(64))a10)[ 2]; + b03.i[10] = ((const int * ALIGNED(64))a10)[ 3]; + b04.i[10] = ((const int * ALIGNED(64))a10)[ 4]; + b05.i[10] = ((const int * ALIGNED(64))a10)[ 5]; + b06.i[10] = ((const int * ALIGNED(64))a10)[ 6]; + b07.i[10] = ((const int * ALIGNED(64))a10)[ 7]; + b08.i[10] = ((const int * ALIGNED(64))a10)[ 8]; + b09.i[10] = ((const int * ALIGNED(64))a10)[ 9]; + b10.i[10] = ((const int * ALIGNED(64))a10)[10]; + b11.i[10] = ((const int * ALIGNED(64))a10)[11]; + b12.i[10] = ((const int * ALIGNED(64))a10)[12]; + b13.i[10] = ((const int * ALIGNED(64))a10)[13]; + b14.i[10] = ((const int * ALIGNED(64))a10)[14]; + b15.i[10] = ((const int * ALIGNED(64))a10)[15]; + + b00.i[11] = ((const int * ALIGNED(64))a11)[ 0]; + b01.i[11] = ((const int * ALIGNED(64))a11)[ 1]; + b02.i[11] = ((const int * ALIGNED(64))a11)[ 2]; + b03.i[11] = ((const int * ALIGNED(64))a11)[ 3]; + b04.i[11] = ((const int * ALIGNED(64))a11)[ 4]; + b05.i[11] = ((const int * ALIGNED(64))a11)[ 5]; + b06.i[11] = ((const int * ALIGNED(64))a11)[ 6]; + b07.i[11] = ((const int * ALIGNED(64))a11)[ 7]; + b08.i[11] = ((const int * ALIGNED(64))a11)[ 8]; + b09.i[11] = ((const int * ALIGNED(64))a11)[ 9]; + b10.i[11] = ((const int * ALIGNED(64))a11)[10]; + b11.i[11] = ((const int * ALIGNED(64))a11)[11]; + b12.i[11] = ((const int * ALIGNED(64))a11)[12]; + b13.i[11] = ((const int * ALIGNED(64))a11)[13]; + b14.i[11] = ((const int * ALIGNED(64))a11)[14]; + b15.i[11] = ((const int * ALIGNED(64))a11)[15]; + + b00.i[12] = ((const int * ALIGNED(64))a12)[ 0]; + b01.i[12] = ((const int * ALIGNED(64))a12)[ 1]; + b02.i[12] = ((const int * ALIGNED(64))a12)[ 2]; + b03.i[12] = ((const int * ALIGNED(64))a12)[ 3]; + b04.i[12] = ((const int * ALIGNED(64))a12)[ 4]; + b05.i[12] = ((const int * ALIGNED(64))a12)[ 5]; + b06.i[12] = ((const int * ALIGNED(64))a12)[ 6]; + b07.i[12] = ((const int * ALIGNED(64))a12)[ 7]; + b08.i[12] = ((const int * ALIGNED(64))a12)[ 8]; + b09.i[12] = ((const int * ALIGNED(64))a12)[ 9]; + b10.i[12] = ((const int * ALIGNED(64))a12)[10]; + b11.i[12] = ((const int * ALIGNED(64))a12)[11]; + b12.i[12] = ((const int * ALIGNED(64))a12)[12]; + b13.i[12] = ((const int * ALIGNED(64))a12)[13]; + b14.i[12] = ((const int * ALIGNED(64))a12)[14]; + b15.i[12] = ((const int * ALIGNED(64))a12)[15]; + + b00.i[13] = ((const int * ALIGNED(64))a13)[ 0]; + b01.i[13] = ((const int * ALIGNED(64))a13)[ 1]; + b02.i[13] = ((const int * ALIGNED(64))a13)[ 2]; + b03.i[13] = ((const int * ALIGNED(64))a13)[ 3]; + b04.i[13] = ((const int * ALIGNED(64))a13)[ 4]; + b05.i[13] = ((const int * ALIGNED(64))a13)[ 5]; + b06.i[13] = ((const int * ALIGNED(64))a13)[ 6]; + b07.i[13] = ((const int * ALIGNED(64))a13)[ 7]; + b08.i[13] = ((const int * ALIGNED(64))a13)[ 8]; + 
b09.i[13] = ((const int * ALIGNED(64))a13)[ 9]; + b10.i[13] = ((const int * ALIGNED(64))a13)[10]; + b11.i[13] = ((const int * ALIGNED(64))a13)[11]; + b12.i[13] = ((const int * ALIGNED(64))a13)[12]; + b13.i[13] = ((const int * ALIGNED(64))a13)[13]; + b14.i[13] = ((const int * ALIGNED(64))a13)[14]; + b15.i[13] = ((const int * ALIGNED(64))a13)[15]; + + b00.i[14] = ((const int * ALIGNED(64))a14)[ 0]; + b01.i[14] = ((const int * ALIGNED(64))a14)[ 1]; + b02.i[14] = ((const int * ALIGNED(64))a14)[ 2]; + b03.i[14] = ((const int * ALIGNED(64))a14)[ 3]; + b04.i[14] = ((const int * ALIGNED(64))a14)[ 4]; + b05.i[14] = ((const int * ALIGNED(64))a14)[ 5]; + b06.i[14] = ((const int * ALIGNED(64))a14)[ 6]; + b07.i[14] = ((const int * ALIGNED(64))a14)[ 7]; + b08.i[14] = ((const int * ALIGNED(64))a14)[ 8]; + b09.i[14] = ((const int * ALIGNED(64))a14)[ 9]; + b10.i[14] = ((const int * ALIGNED(64))a14)[10]; + b11.i[14] = ((const int * ALIGNED(64))a14)[11]; + b12.i[14] = ((const int * ALIGNED(64))a14)[12]; + b13.i[14] = ((const int * ALIGNED(64))a14)[13]; + b14.i[14] = ((const int * ALIGNED(64))a14)[14]; + b15.i[14] = ((const int * ALIGNED(64))a14)[15]; + + b00.i[15] = ((const int * ALIGNED(64))a15)[ 0]; + b01.i[15] = ((const int * ALIGNED(64))a15)[ 1]; + b02.i[15] = ((const int * ALIGNED(64))a15)[ 2]; + b03.i[15] = ((const int * ALIGNED(64))a15)[ 3]; + b04.i[15] = ((const int * ALIGNED(64))a15)[ 4]; + b05.i[15] = ((const int * ALIGNED(64))a15)[ 5]; + b06.i[15] = ((const int * ALIGNED(64))a15)[ 6]; + b07.i[15] = ((const int * ALIGNED(64))a15)[ 7]; + b08.i[15] = ((const int * ALIGNED(64))a15)[ 8]; + b09.i[15] = ((const int * ALIGNED(64))a15)[ 9]; + b10.i[15] = ((const int * ALIGNED(64))a15)[10]; + b11.i[15] = ((const int * ALIGNED(64))a15)[11]; + b12.i[15] = ((const int * ALIGNED(64))a15)[12]; + b13.i[15] = ((const int * ALIGNED(64))a15)[13]; + b14.i[15] = ((const int * ALIGNED(64))a15)[14]; + b15.i[15] = ((const int * ALIGNED(64))a15)[15]; + } + + inline void load_16x8_tr_p( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07 ) + { + b00.i[ 0] = ((const int * ALIGNED(64))a00)[ 0]; + b01.i[ 0] = ((const int * ALIGNED(64))a00)[ 1]; + b02.i[ 0] = ((const int * ALIGNED(64))a00)[ 2]; + b03.i[ 0] = ((const int * ALIGNED(64))a00)[ 3]; + b04.i[ 0] = ((const int * ALIGNED(64))a00)[ 4]; + b05.i[ 0] = ((const int * ALIGNED(64))a00)[ 5]; + b06.i[ 0] = ((const int * ALIGNED(64))a00)[ 6]; + b07.i[ 0] = ((const int * ALIGNED(64))a00)[ 7]; + b00.i[ 1] = ((const int * ALIGNED(64))a00)[ 8]; + b01.i[ 1] = ((const int * ALIGNED(64))a00)[ 9]; + b02.i[ 1] = ((const int * ALIGNED(64))a00)[10]; + b03.i[ 1] = ((const int * ALIGNED(64))a00)[11]; + b04.i[ 1] = ((const int * ALIGNED(64))a00)[12]; + b05.i[ 1] = ((const int * ALIGNED(64))a00)[13]; + b06.i[ 1] = ((const int * ALIGNED(64))a00)[14]; + b07.i[ 1] = ((const int * ALIGNED(64))a00)[15]; + + b00.i[ 2] = ((const int * ALIGNED(64))a01)[ 0]; + b01.i[ 2] = ((const int * ALIGNED(64))a01)[ 1]; + b02.i[ 2] = ((const int * ALIGNED(64))a01)[ 2]; + b03.i[ 2] = ((const int * ALIGNED(64))a01)[ 3]; + b04.i[ 2] = ((const int * ALIGNED(64))a01)[ 4]; + b05.i[ 2] = ((const int * ALIGNED(64))a01)[ 5]; + b06.i[ 2] = ((const int * ALIGNED(64))a01)[ 6]; + b07.i[ 2] = ((const int * ALIGNED(64))a01)[ 7]; + b00.i[ 3] = ((const 
int * ALIGNED(64))a01)[ 8]; + b01.i[ 3] = ((const int * ALIGNED(64))a01)[ 9]; + b02.i[ 3] = ((const int * ALIGNED(64))a01)[10]; + b03.i[ 3] = ((const int * ALIGNED(64))a01)[11]; + b04.i[ 3] = ((const int * ALIGNED(64))a01)[12]; + b05.i[ 3] = ((const int * ALIGNED(64))a01)[13]; + b06.i[ 3] = ((const int * ALIGNED(64))a01)[14]; + b07.i[ 3] = ((const int * ALIGNED(64))a01)[15]; + + b00.i[ 4] = ((const int * ALIGNED(64))a02)[ 0]; + b01.i[ 4] = ((const int * ALIGNED(64))a02)[ 1]; + b02.i[ 4] = ((const int * ALIGNED(64))a02)[ 2]; + b03.i[ 4] = ((const int * ALIGNED(64))a02)[ 3]; + b04.i[ 4] = ((const int * ALIGNED(64))a02)[ 4]; + b05.i[ 4] = ((const int * ALIGNED(64))a02)[ 5]; + b06.i[ 4] = ((const int * ALIGNED(64))a02)[ 6]; + b07.i[ 4] = ((const int * ALIGNED(64))a02)[ 7]; + b00.i[ 5] = ((const int * ALIGNED(64))a02)[ 8]; + b01.i[ 5] = ((const int * ALIGNED(64))a02)[ 9]; + b02.i[ 5] = ((const int * ALIGNED(64))a02)[10]; + b03.i[ 5] = ((const int * ALIGNED(64))a02)[11]; + b04.i[ 5] = ((const int * ALIGNED(64))a02)[12]; + b05.i[ 5] = ((const int * ALIGNED(64))a02)[13]; + b06.i[ 5] = ((const int * ALIGNED(64))a02)[14]; + b07.i[ 5] = ((const int * ALIGNED(64))a02)[15]; + + b00.i[ 6] = ((const int * ALIGNED(64))a03)[ 0]; + b01.i[ 6] = ((const int * ALIGNED(64))a03)[ 1]; + b02.i[ 6] = ((const int * ALIGNED(64))a03)[ 2]; + b03.i[ 6] = ((const int * ALIGNED(64))a03)[ 3]; + b04.i[ 6] = ((const int * ALIGNED(64))a03)[ 4]; + b05.i[ 6] = ((const int * ALIGNED(64))a03)[ 5]; + b06.i[ 6] = ((const int * ALIGNED(64))a03)[ 6]; + b07.i[ 6] = ((const int * ALIGNED(64))a03)[ 7]; + b00.i[ 7] = ((const int * ALIGNED(64))a03)[ 8]; + b01.i[ 7] = ((const int * ALIGNED(64))a03)[ 9]; + b02.i[ 7] = ((const int * ALIGNED(64))a03)[10]; + b03.i[ 7] = ((const int * ALIGNED(64))a03)[11]; + b04.i[ 7] = ((const int * ALIGNED(64))a03)[12]; + b05.i[ 7] = ((const int * ALIGNED(64))a03)[13]; + b06.i[ 7] = ((const int * ALIGNED(64))a03)[14]; + b07.i[ 7] = ((const int * ALIGNED(64))a03)[15]; + + b00.i[ 8] = ((const int * ALIGNED(64))a04)[ 0]; + b01.i[ 8] = ((const int * ALIGNED(64))a04)[ 1]; + b02.i[ 8] = ((const int * ALIGNED(64))a04)[ 2]; + b03.i[ 8] = ((const int * ALIGNED(64))a04)[ 3]; + b04.i[ 8] = ((const int * ALIGNED(64))a04)[ 4]; + b05.i[ 8] = ((const int * ALIGNED(64))a04)[ 5]; + b06.i[ 8] = ((const int * ALIGNED(64))a04)[ 6]; + b07.i[ 8] = ((const int * ALIGNED(64))a04)[ 7]; + b00.i[ 9] = ((const int * ALIGNED(64))a04)[ 8]; + b01.i[ 9] = ((const int * ALIGNED(64))a04)[ 9]; + b02.i[ 9] = ((const int * ALIGNED(64))a04)[10]; + b03.i[ 9] = ((const int * ALIGNED(64))a04)[11]; + b04.i[ 9] = ((const int * ALIGNED(64))a04)[12]; + b05.i[ 9] = ((const int * ALIGNED(64))a04)[13]; + b06.i[ 9] = ((const int * ALIGNED(64))a04)[14]; + b07.i[ 9] = ((const int * ALIGNED(64))a04)[15]; + + b00.i[10] = ((const int * ALIGNED(64))a05)[ 0]; + b01.i[10] = ((const int * ALIGNED(64))a05)[ 1]; + b02.i[10] = ((const int * ALIGNED(64))a05)[ 2]; + b03.i[10] = ((const int * ALIGNED(64))a05)[ 3]; + b04.i[10] = ((const int * ALIGNED(64))a05)[ 4]; + b05.i[10] = ((const int * ALIGNED(64))a05)[ 5]; + b06.i[10] = ((const int * ALIGNED(64))a05)[ 6]; + b07.i[10] = ((const int * ALIGNED(64))a05)[ 7]; + b00.i[11] = ((const int * ALIGNED(64))a05)[ 8]; + b01.i[11] = ((const int * ALIGNED(64))a05)[ 9]; + b02.i[11] = ((const int * ALIGNED(64))a05)[10]; + b03.i[11] = ((const int * ALIGNED(64))a05)[11]; + b04.i[11] = ((const int * ALIGNED(64))a05)[12]; + b05.i[11] = ((const int * ALIGNED(64))a05)[13]; + b06.i[11] = ((const int * ALIGNED(64))a05)[14]; + b07.i[11] = 
((const int * ALIGNED(64))a05)[15]; + + b00.i[12] = ((const int * ALIGNED(64))a06)[ 0]; + b01.i[12] = ((const int * ALIGNED(64))a06)[ 1]; + b02.i[12] = ((const int * ALIGNED(64))a06)[ 2]; + b03.i[12] = ((const int * ALIGNED(64))a06)[ 3]; + b04.i[12] = ((const int * ALIGNED(64))a06)[ 4]; + b05.i[12] = ((const int * ALIGNED(64))a06)[ 5]; + b06.i[12] = ((const int * ALIGNED(64))a06)[ 6]; + b07.i[12] = ((const int * ALIGNED(64))a06)[ 7]; + b00.i[13] = ((const int * ALIGNED(64))a06)[ 8]; + b01.i[13] = ((const int * ALIGNED(64))a06)[ 9]; + b02.i[13] = ((const int * ALIGNED(64))a06)[10]; + b03.i[13] = ((const int * ALIGNED(64))a06)[11]; + b04.i[13] = ((const int * ALIGNED(64))a06)[12]; + b05.i[13] = ((const int * ALIGNED(64))a06)[13]; + b06.i[13] = ((const int * ALIGNED(64))a06)[14]; + b07.i[13] = ((const int * ALIGNED(64))a06)[15]; + + b00.i[14] = ((const int * ALIGNED(64))a07)[ 0]; + b01.i[14] = ((const int * ALIGNED(64))a07)[ 1]; + b02.i[14] = ((const int * ALIGNED(64))a07)[ 2]; + b03.i[14] = ((const int * ALIGNED(64))a07)[ 3]; + b04.i[14] = ((const int * ALIGNED(64))a07)[ 4]; + b05.i[14] = ((const int * ALIGNED(64))a07)[ 5]; + b06.i[14] = ((const int * ALIGNED(64))a07)[ 6]; + b07.i[14] = ((const int * ALIGNED(64))a07)[ 7]; + b00.i[15] = ((const int * ALIGNED(64))a07)[ 8]; + b01.i[15] = ((const int * ALIGNED(64))a07)[ 9]; + b02.i[15] = ((const int * ALIGNED(64))a07)[10]; + b03.i[15] = ((const int * ALIGNED(64))a07)[11]; + b04.i[15] = ((const int * ALIGNED(64))a07)[12]; + b05.i[15] = ((const int * ALIGNED(64))a07)[13]; + b06.i[15] = ((const int * ALIGNED(64))a07)[14]; + b07.i[15] = ((const int * ALIGNED(64))a07)[15]; + } + + inline void load_16x16_tr_p( const void * ALIGNED(64) a00, + const void * ALIGNED(64) a01, + const void * ALIGNED(64) a02, + const void * ALIGNED(64) a03, + const void * ALIGNED(64) a04, + const void * ALIGNED(64) a05, + const void * ALIGNED(64) a06, + const void * ALIGNED(64) a07, + const void * ALIGNED(64) a08, + const void * ALIGNED(64) a09, + const void * ALIGNED(64) a10, + const void * ALIGNED(64) a11, + const void * ALIGNED(64) a12, + const void * ALIGNED(64) a13, + const void * ALIGNED(64) a14, + const void * ALIGNED(64) a15, + v16 &b00, v16 &b01, v16 &b02, v16 &b03, + v16 &b04, v16 &b05, v16 &b06, v16 &b07, + v16 &b08, v16 &b09, v16 &b10, v16 &b11, + v16 &b12, v16 &b13, v16 &b14, v16 &b15 ) + { + b00.i[ 0] = ((const int * ALIGNED(64))a00)[ 0]; + b01.i[ 0] = ((const int * ALIGNED(64))a00)[ 1]; + b02.i[ 0] = ((const int * ALIGNED(64))a00)[ 2]; + b03.i[ 0] = ((const int * ALIGNED(64))a00)[ 3]; + b04.i[ 0] = ((const int * ALIGNED(64))a00)[ 4]; + b05.i[ 0] = ((const int * ALIGNED(64))a00)[ 5]; + b06.i[ 0] = ((const int * ALIGNED(64))a00)[ 6]; + b07.i[ 0] = ((const int * ALIGNED(64))a00)[ 7]; + b00.i[ 1] = ((const int * ALIGNED(64))a00)[ 8]; + b01.i[ 1] = ((const int * ALIGNED(64))a00)[ 9]; + b02.i[ 1] = ((const int * ALIGNED(64))a00)[10]; + b03.i[ 1] = ((const int * ALIGNED(64))a00)[11]; + b04.i[ 1] = ((const int * ALIGNED(64))a00)[12]; + b05.i[ 1] = ((const int * ALIGNED(64))a00)[13]; + b06.i[ 1] = ((const int * ALIGNED(64))a00)[14]; + b07.i[ 1] = ((const int * ALIGNED(64))a00)[15]; + + b00.i[ 2] = ((const int * ALIGNED(64))a01)[ 0]; + b01.i[ 2] = ((const int * ALIGNED(64))a01)[ 1]; + b02.i[ 2] = ((const int * ALIGNED(64))a01)[ 2]; + b03.i[ 2] = ((const int * ALIGNED(64))a01)[ 3]; + b04.i[ 2] = ((const int * ALIGNED(64))a01)[ 4]; + b05.i[ 2] = ((const int * ALIGNED(64))a01)[ 5]; + b06.i[ 2] = ((const int * ALIGNED(64))a01)[ 6]; + b07.i[ 2] = ((const int * 
ALIGNED(64))a01)[ 7]; + b00.i[ 3] = ((const int * ALIGNED(64))a01)[ 8]; + b01.i[ 3] = ((const int * ALIGNED(64))a01)[ 9]; + b02.i[ 3] = ((const int * ALIGNED(64))a01)[10]; + b03.i[ 3] = ((const int * ALIGNED(64))a01)[11]; + b04.i[ 3] = ((const int * ALIGNED(64))a01)[12]; + b05.i[ 3] = ((const int * ALIGNED(64))a01)[13]; + b06.i[ 3] = ((const int * ALIGNED(64))a01)[14]; + b07.i[ 3] = ((const int * ALIGNED(64))a01)[15]; + + b00.i[ 4] = ((const int * ALIGNED(64))a02)[ 0]; + b01.i[ 4] = ((const int * ALIGNED(64))a02)[ 1]; + b02.i[ 4] = ((const int * ALIGNED(64))a02)[ 2]; + b03.i[ 4] = ((const int * ALIGNED(64))a02)[ 3]; + b04.i[ 4] = ((const int * ALIGNED(64))a02)[ 4]; + b05.i[ 4] = ((const int * ALIGNED(64))a02)[ 5]; + b06.i[ 4] = ((const int * ALIGNED(64))a02)[ 6]; + b07.i[ 4] = ((const int * ALIGNED(64))a02)[ 7]; + b00.i[ 5] = ((const int * ALIGNED(64))a02)[ 8]; + b01.i[ 5] = ((const int * ALIGNED(64))a02)[ 9]; + b02.i[ 5] = ((const int * ALIGNED(64))a02)[10]; + b03.i[ 5] = ((const int * ALIGNED(64))a02)[11]; + b04.i[ 5] = ((const int * ALIGNED(64))a02)[12]; + b05.i[ 5] = ((const int * ALIGNED(64))a02)[13]; + b06.i[ 5] = ((const int * ALIGNED(64))a02)[14]; + b07.i[ 5] = ((const int * ALIGNED(64))a02)[15]; + + b00.i[ 6] = ((const int * ALIGNED(64))a03)[ 0]; + b01.i[ 6] = ((const int * ALIGNED(64))a03)[ 1]; + b02.i[ 6] = ((const int * ALIGNED(64))a03)[ 2]; + b03.i[ 6] = ((const int * ALIGNED(64))a03)[ 3]; + b04.i[ 6] = ((const int * ALIGNED(64))a03)[ 4]; + b05.i[ 6] = ((const int * ALIGNED(64))a03)[ 5]; + b06.i[ 6] = ((const int * ALIGNED(64))a03)[ 6]; + b07.i[ 6] = ((const int * ALIGNED(64))a03)[ 7]; + b00.i[ 7] = ((const int * ALIGNED(64))a03)[ 8]; + b01.i[ 7] = ((const int * ALIGNED(64))a03)[ 9]; + b02.i[ 7] = ((const int * ALIGNED(64))a03)[10]; + b03.i[ 7] = ((const int * ALIGNED(64))a03)[11]; + b04.i[ 7] = ((const int * ALIGNED(64))a03)[12]; + b05.i[ 7] = ((const int * ALIGNED(64))a03)[13]; + b06.i[ 7] = ((const int * ALIGNED(64))a03)[14]; + b07.i[ 7] = ((const int * ALIGNED(64))a03)[15]; + + b00.i[ 8] = ((const int * ALIGNED(64))a04)[ 0]; + b01.i[ 8] = ((const int * ALIGNED(64))a04)[ 1]; + b02.i[ 8] = ((const int * ALIGNED(64))a04)[ 2]; + b03.i[ 8] = ((const int * ALIGNED(64))a04)[ 3]; + b04.i[ 8] = ((const int * ALIGNED(64))a04)[ 4]; + b05.i[ 8] = ((const int * ALIGNED(64))a04)[ 5]; + b06.i[ 8] = ((const int * ALIGNED(64))a04)[ 6]; + b07.i[ 8] = ((const int * ALIGNED(64))a04)[ 7]; + b00.i[ 9] = ((const int * ALIGNED(64))a04)[ 8]; + b01.i[ 9] = ((const int * ALIGNED(64))a04)[ 9]; + b02.i[ 9] = ((const int * ALIGNED(64))a04)[10]; + b03.i[ 9] = ((const int * ALIGNED(64))a04)[11]; + b04.i[ 9] = ((const int * ALIGNED(64))a04)[12]; + b05.i[ 9] = ((const int * ALIGNED(64))a04)[13]; + b06.i[ 9] = ((const int * ALIGNED(64))a04)[14]; + b07.i[ 9] = ((const int * ALIGNED(64))a04)[15]; + + b00.i[10] = ((const int * ALIGNED(64))a05)[ 0]; + b01.i[10] = ((const int * ALIGNED(64))a05)[ 1]; + b02.i[10] = ((const int * ALIGNED(64))a05)[ 2]; + b03.i[10] = ((const int * ALIGNED(64))a05)[ 3]; + b04.i[10] = ((const int * ALIGNED(64))a05)[ 4]; + b05.i[10] = ((const int * ALIGNED(64))a05)[ 5]; + b06.i[10] = ((const int * ALIGNED(64))a05)[ 6]; + b07.i[10] = ((const int * ALIGNED(64))a05)[ 7]; + b00.i[11] = ((const int * ALIGNED(64))a05)[ 8]; + b01.i[11] = ((const int * ALIGNED(64))a05)[ 9]; + b02.i[11] = ((const int * ALIGNED(64))a05)[10]; + b03.i[11] = ((const int * ALIGNED(64))a05)[11]; + b04.i[11] = ((const int * ALIGNED(64))a05)[12]; + b05.i[11] = ((const int * ALIGNED(64))a05)[13]; + b06.i[11] = ((const 
int * ALIGNED(64))a05)[14]; + b07.i[11] = ((const int * ALIGNED(64))a05)[15]; + + b00.i[12] = ((const int * ALIGNED(64))a06)[ 0]; + b01.i[12] = ((const int * ALIGNED(64))a06)[ 1]; + b02.i[12] = ((const int * ALIGNED(64))a06)[ 2]; + b03.i[12] = ((const int * ALIGNED(64))a06)[ 3]; + b04.i[12] = ((const int * ALIGNED(64))a06)[ 4]; + b05.i[12] = ((const int * ALIGNED(64))a06)[ 5]; + b06.i[12] = ((const int * ALIGNED(64))a06)[ 6]; + b07.i[12] = ((const int * ALIGNED(64))a06)[ 7]; + b00.i[13] = ((const int * ALIGNED(64))a06)[ 8]; + b01.i[13] = ((const int * ALIGNED(64))a06)[ 9]; + b02.i[13] = ((const int * ALIGNED(64))a06)[10]; + b03.i[13] = ((const int * ALIGNED(64))a06)[11]; + b04.i[13] = ((const int * ALIGNED(64))a06)[12]; + b05.i[13] = ((const int * ALIGNED(64))a06)[13]; + b06.i[13] = ((const int * ALIGNED(64))a06)[14]; + b07.i[13] = ((const int * ALIGNED(64))a06)[15]; + + b00.i[14] = ((const int * ALIGNED(64))a07)[ 0]; + b01.i[14] = ((const int * ALIGNED(64))a07)[ 1]; + b02.i[14] = ((const int * ALIGNED(64))a07)[ 2]; + b03.i[14] = ((const int * ALIGNED(64))a07)[ 3]; + b04.i[14] = ((const int * ALIGNED(64))a07)[ 4]; + b05.i[14] = ((const int * ALIGNED(64))a07)[ 5]; + b06.i[14] = ((const int * ALIGNED(64))a07)[ 6]; + b07.i[14] = ((const int * ALIGNED(64))a07)[ 7]; + b00.i[15] = ((const int * ALIGNED(64))a07)[ 8]; + b01.i[15] = ((const int * ALIGNED(64))a07)[ 9]; + b02.i[15] = ((const int * ALIGNED(64))a07)[10]; + b03.i[15] = ((const int * ALIGNED(64))a07)[11]; + b04.i[15] = ((const int * ALIGNED(64))a07)[12]; + b05.i[15] = ((const int * ALIGNED(64))a07)[13]; + b06.i[15] = ((const int * ALIGNED(64))a07)[14]; + b07.i[15] = ((const int * ALIGNED(64))a07)[15]; + + b08.i[ 0] = ((const int * ALIGNED(64))a08)[ 0]; + b09.i[ 0] = ((const int * ALIGNED(64))a08)[ 1]; + b10.i[ 0] = ((const int * ALIGNED(64))a08)[ 2]; + b11.i[ 0] = ((const int * ALIGNED(64))a08)[ 3]; + b12.i[ 0] = ((const int * ALIGNED(64))a08)[ 4]; + b13.i[ 0] = ((const int * ALIGNED(64))a08)[ 5]; + b14.i[ 0] = ((const int * ALIGNED(64))a08)[ 6]; + b15.i[ 0] = ((const int * ALIGNED(64))a08)[ 7]; + b08.i[ 1] = ((const int * ALIGNED(64))a08)[ 8]; + b09.i[ 1] = ((const int * ALIGNED(64))a08)[ 9]; + b10.i[ 1] = ((const int * ALIGNED(64))a08)[10]; + b11.i[ 1] = ((const int * ALIGNED(64))a08)[11]; + b12.i[ 1] = ((const int * ALIGNED(64))a08)[12]; + b13.i[ 1] = ((const int * ALIGNED(64))a08)[13]; + b14.i[ 1] = ((const int * ALIGNED(64))a08)[14]; + b15.i[ 1] = ((const int * ALIGNED(64))a08)[15]; + + b08.i[ 2] = ((const int * ALIGNED(64))a09)[ 0]; + b09.i[ 2] = ((const int * ALIGNED(64))a09)[ 1]; + b10.i[ 2] = ((const int * ALIGNED(64))a09)[ 2]; + b11.i[ 2] = ((const int * ALIGNED(64))a09)[ 3]; + b12.i[ 2] = ((const int * ALIGNED(64))a09)[ 4]; + b13.i[ 2] = ((const int * ALIGNED(64))a09)[ 5]; + b14.i[ 2] = ((const int * ALIGNED(64))a09)[ 6]; + b15.i[ 2] = ((const int * ALIGNED(64))a09)[ 7]; + b08.i[ 3] = ((const int * ALIGNED(64))a09)[ 8]; + b09.i[ 3] = ((const int * ALIGNED(64))a09)[ 9]; + b10.i[ 3] = ((const int * ALIGNED(64))a09)[10]; + b11.i[ 3] = ((const int * ALIGNED(64))a09)[11]; + b12.i[ 3] = ((const int * ALIGNED(64))a09)[12]; + b13.i[ 3] = ((const int * ALIGNED(64))a09)[13]; + b14.i[ 3] = ((const int * ALIGNED(64))a09)[14]; + b15.i[ 3] = ((const int * ALIGNED(64))a09)[15]; + + b08.i[ 4] = ((const int * ALIGNED(64))a10)[ 0]; + b09.i[ 4] = ((const int * ALIGNED(64))a10)[ 1]; + b10.i[ 4] = ((const int * ALIGNED(64))a10)[ 2]; + b11.i[ 4] = ((const int * ALIGNED(64))a10)[ 3]; + b12.i[ 4] = ((const int * ALIGNED(64))a10)[ 4]; + b13.i[ 4] = 
((const int * ALIGNED(64))a10)[ 5]; + b14.i[ 4] = ((const int * ALIGNED(64))a10)[ 6]; + b15.i[ 4] = ((const int * ALIGNED(64))a10)[ 7]; + b08.i[ 5] = ((const int * ALIGNED(64))a10)[ 8]; + b09.i[ 5] = ((const int * ALIGNED(64))a10)[ 9]; + b10.i[ 5] = ((const int * ALIGNED(64))a10)[10]; + b11.i[ 5] = ((const int * ALIGNED(64))a10)[11]; + b12.i[ 5] = ((const int * ALIGNED(64))a10)[12]; + b13.i[ 5] = ((const int * ALIGNED(64))a10)[13]; + b14.i[ 5] = ((const int * ALIGNED(64))a10)[14]; + b15.i[ 5] = ((const int * ALIGNED(64))a10)[15]; + + b08.i[ 6] = ((const int * ALIGNED(64))a11)[ 0]; + b09.i[ 6] = ((const int * ALIGNED(64))a11)[ 1]; + b10.i[ 6] = ((const int * ALIGNED(64))a11)[ 2]; + b11.i[ 6] = ((const int * ALIGNED(64))a11)[ 3]; + b12.i[ 6] = ((const int * ALIGNED(64))a11)[ 4]; + b13.i[ 6] = ((const int * ALIGNED(64))a11)[ 5]; + b14.i[ 6] = ((const int * ALIGNED(64))a11)[ 6]; + b15.i[ 6] = ((const int * ALIGNED(64))a11)[ 7]; + b08.i[ 7] = ((const int * ALIGNED(64))a11)[ 8]; + b09.i[ 7] = ((const int * ALIGNED(64))a11)[ 9]; + b10.i[ 7] = ((const int * ALIGNED(64))a11)[10]; + b11.i[ 7] = ((const int * ALIGNED(64))a11)[11]; + b12.i[ 7] = ((const int * ALIGNED(64))a11)[12]; + b13.i[ 7] = ((const int * ALIGNED(64))a11)[13]; + b14.i[ 7] = ((const int * ALIGNED(64))a11)[14]; + b15.i[ 7] = ((const int * ALIGNED(64))a11)[15]; + + b08.i[ 8] = ((const int * ALIGNED(64))a12)[ 0]; + b09.i[ 8] = ((const int * ALIGNED(64))a12)[ 1]; + b10.i[ 8] = ((const int * ALIGNED(64))a12)[ 2]; + b11.i[ 8] = ((const int * ALIGNED(64))a12)[ 3]; + b12.i[ 8] = ((const int * ALIGNED(64))a12)[ 4]; + b13.i[ 8] = ((const int * ALIGNED(64))a12)[ 5]; + b14.i[ 8] = ((const int * ALIGNED(64))a12)[ 6]; + b15.i[ 8] = ((const int * ALIGNED(64))a12)[ 7]; + b08.i[ 9] = ((const int * ALIGNED(64))a12)[ 8]; + b09.i[ 9] = ((const int * ALIGNED(64))a12)[ 9]; + b10.i[ 9] = ((const int * ALIGNED(64))a12)[10]; + b11.i[ 9] = ((const int * ALIGNED(64))a12)[11]; + b12.i[ 9] = ((const int * ALIGNED(64))a12)[12]; + b13.i[ 9] = ((const int * ALIGNED(64))a12)[13]; + b14.i[ 9] = ((const int * ALIGNED(64))a12)[14]; + b15.i[ 9] = ((const int * ALIGNED(64))a12)[15]; + + b08.i[10] = ((const int * ALIGNED(64))a13)[ 0]; + b09.i[10] = ((const int * ALIGNED(64))a13)[ 1]; + b10.i[10] = ((const int * ALIGNED(64))a13)[ 2]; + b11.i[10] = ((const int * ALIGNED(64))a13)[ 3]; + b12.i[10] = ((const int * ALIGNED(64))a13)[ 4]; + b13.i[10] = ((const int * ALIGNED(64))a13)[ 5]; + b14.i[10] = ((const int * ALIGNED(64))a13)[ 6]; + b15.i[10] = ((const int * ALIGNED(64))a13)[ 7]; + b08.i[11] = ((const int * ALIGNED(64))a13)[ 8]; + b09.i[11] = ((const int * ALIGNED(64))a13)[ 9]; + b10.i[11] = ((const int * ALIGNED(64))a13)[10]; + b11.i[11] = ((const int * ALIGNED(64))a13)[11]; + b12.i[11] = ((const int * ALIGNED(64))a13)[12]; + b13.i[11] = ((const int * ALIGNED(64))a13)[13]; + b14.i[11] = ((const int * ALIGNED(64))a13)[14]; + b15.i[11] = ((const int * ALIGNED(64))a13)[15]; + + b08.i[12] = ((const int * ALIGNED(64))a14)[ 0]; + b09.i[12] = ((const int * ALIGNED(64))a14)[ 1]; + b10.i[12] = ((const int * ALIGNED(64))a14)[ 2]; + b11.i[12] = ((const int * ALIGNED(64))a14)[ 3]; + b12.i[12] = ((const int * ALIGNED(64))a14)[ 4]; + b13.i[12] = ((const int * ALIGNED(64))a14)[ 5]; + b14.i[12] = ((const int * ALIGNED(64))a14)[ 6]; + b15.i[12] = ((const int * ALIGNED(64))a14)[ 7]; + b08.i[13] = ((const int * ALIGNED(64))a14)[ 8]; + b09.i[13] = ((const int * ALIGNED(64))a14)[ 9]; + b10.i[13] = ((const int * ALIGNED(64))a14)[10]; + b11.i[13] = ((const int * ALIGNED(64))a14)[11]; + 
b12.i[13] = ((const int * ALIGNED(64))a14)[12]; + b13.i[13] = ((const int * ALIGNED(64))a14)[13]; + b14.i[13] = ((const int * ALIGNED(64))a14)[14]; + b15.i[13] = ((const int * ALIGNED(64))a14)[15]; + + b08.i[14] = ((const int * ALIGNED(64))a15)[ 0]; + b09.i[14] = ((const int * ALIGNED(64))a15)[ 1]; + b10.i[14] = ((const int * ALIGNED(64))a15)[ 2]; + b11.i[14] = ((const int * ALIGNED(64))a15)[ 3]; + b12.i[14] = ((const int * ALIGNED(64))a15)[ 4]; + b13.i[14] = ((const int * ALIGNED(64))a15)[ 5]; + b14.i[14] = ((const int * ALIGNED(64))a15)[ 6]; + b15.i[14] = ((const int * ALIGNED(64))a15)[ 7]; + b08.i[15] = ((const int * ALIGNED(64))a15)[ 8]; + b09.i[15] = ((const int * ALIGNED(64))a15)[ 9]; + b10.i[15] = ((const int * ALIGNED(64))a15)[10]; + b11.i[15] = ((const int * ALIGNED(64))a15)[11]; + b12.i[15] = ((const int * ALIGNED(64))a15)[12]; + b13.i[15] = ((const int * ALIGNED(64))a15)[13]; + b14.i[15] = ((const int * ALIGNED(64))a15)[14]; + b15.i[15] = ((const int * ALIGNED(64))a15)[15]; + } + + inline void store_16x1_tr( const v16 &a, + void *a00, void *a01, void *a02, void *a03, + void *a04, void *a05, void *a06, void *a07, + void *a08, void *a09, void *a10, void *a11, + void *a12, void *a13, void *a14, void *a15 ) + { + ((int *)a00)[0] = a.i[ 0]; + ((int *)a01)[0] = a.i[ 1]; + ((int *)a02)[0] = a.i[ 2]; + ((int *)a03)[0] = a.i[ 3]; + ((int *)a04)[0] = a.i[ 4]; + ((int *)a05)[0] = a.i[ 5]; + ((int *)a06)[0] = a.i[ 6]; + ((int *)a07)[0] = a.i[ 7]; + ((int *)a08)[0] = a.i[ 8]; + ((int *)a09)[0] = a.i[ 9]; + ((int *)a10)[0] = a.i[10]; + ((int *)a11)[0] = a.i[11]; + ((int *)a12)[0] = a.i[12]; + ((int *)a13)[0] = a.i[13]; + ((int *)a14)[0] = a.i[14]; + ((int *)a15)[0] = a.i[15]; + } + + inline void store_16x2_tr( const v16 &a, const v16 &b, + void * ALIGNED(8) a00, void * ALIGNED(8) a01, + void * ALIGNED(8) a02, void * ALIGNED(8) a03, + void * ALIGNED(8) a04, void * ALIGNED(8) a05, + void * ALIGNED(8) a06, void * ALIGNED(8) a07, + void * ALIGNED(8) a08, void * ALIGNED(8) a09, + void * ALIGNED(8) a10, void * ALIGNED(8) a11, + void * ALIGNED(8) a12, void * ALIGNED(8) a13, + void * ALIGNED(8) a14, void * ALIGNED(8) a15 ) + { + ((int * ALIGNED(8))a00)[0] = a.i[ 0]; + ((int * ALIGNED(8))a00)[1] = b.i[ 0]; + + ((int * ALIGNED(8))a01)[0] = a.i[ 1]; + ((int * ALIGNED(8))a01)[1] = b.i[ 1]; + + ((int * ALIGNED(8))a02)[0] = a.i[ 2]; + ((int * ALIGNED(8))a02)[1] = b.i[ 2]; + + ((int * ALIGNED(8))a03)[0] = a.i[ 3]; + ((int * ALIGNED(8))a03)[1] = b.i[ 3]; + + ((int * ALIGNED(8))a04)[0] = a.i[ 4]; + ((int * ALIGNED(8))a04)[1] = b.i[ 4]; + + ((int * ALIGNED(8))a05)[0] = a.i[ 5]; + ((int * ALIGNED(8))a05)[1] = b.i[ 5]; + + ((int * ALIGNED(8))a06)[0] = a.i[ 6]; + ((int * ALIGNED(8))a06)[1] = b.i[ 6]; + + ((int * ALIGNED(8))a07)[0] = a.i[ 7]; + ((int * ALIGNED(8))a07)[1] = b.i[ 7]; + + ((int * ALIGNED(8))a08)[0] = a.i[ 8]; + ((int * ALIGNED(8))a08)[1] = b.i[ 8]; + + ((int * ALIGNED(8))a09)[0] = a.i[ 9]; + ((int * ALIGNED(8))a09)[1] = b.i[ 9]; + + ((int * ALIGNED(8))a10)[0] = a.i[10]; + ((int * ALIGNED(8))a10)[1] = b.i[10]; + + ((int * ALIGNED(8))a11)[0] = a.i[11]; + ((int * ALIGNED(8))a11)[1] = b.i[11]; + + ((int * ALIGNED(8))a12)[0] = a.i[12]; + ((int * ALIGNED(8))a12)[1] = b.i[12]; + + ((int * ALIGNED(8))a13)[0] = a.i[13]; + ((int * ALIGNED(8))a13)[1] = b.i[13]; + + ((int * ALIGNED(8))a14)[0] = a.i[14]; + ((int * ALIGNED(8))a14)[1] = b.i[14]; + + ((int * ALIGNED(8))a15)[0] = a.i[15]; + ((int * ALIGNED(8))a15)[1] = b.i[15]; + } + + inline void store_16x3_tr( const v16 &a, const v16 &b, const v16 &c, + void * 
ALIGNED(64) a00, void * ALIGNED(64) a01, + void * ALIGNED(64) a02, void * ALIGNED(64) a03, + void * ALIGNED(64) a04, void * ALIGNED(64) a05, + void * ALIGNED(64) a06, void * ALIGNED(64) a07, + void * ALIGNED(64) a08, void * ALIGNED(64) a09, + void * ALIGNED(64) a10, void * ALIGNED(64) a11, + void * ALIGNED(64) a12, void * ALIGNED(64) a13, + void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) + { + ((int * ALIGNED(64))a00)[0] = a.i[ 0]; + ((int * ALIGNED(64))a00)[1] = b.i[ 0]; + ((int * ALIGNED(64))a00)[2] = c.i[ 0]; + + ((int * ALIGNED(64))a01)[0] = a.i[ 1]; + ((int * ALIGNED(64))a01)[1] = b.i[ 1]; + ((int * ALIGNED(64))a01)[2] = c.i[ 1]; + + ((int * ALIGNED(64))a02)[0] = a.i[ 2]; + ((int * ALIGNED(64))a02)[1] = b.i[ 2]; + ((int * ALIGNED(64))a02)[2] = c.i[ 2]; + + ((int * ALIGNED(64))a03)[0] = a.i[ 3]; + ((int * ALIGNED(64))a03)[1] = b.i[ 3]; + ((int * ALIGNED(64))a03)[2] = c.i[ 3]; + + ((int * ALIGNED(64))a04)[0] = a.i[ 4]; + ((int * ALIGNED(64))a04)[1] = b.i[ 4]; + ((int * ALIGNED(64))a04)[2] = c.i[ 4]; + + ((int * ALIGNED(64))a05)[0] = a.i[ 5]; + ((int * ALIGNED(64))a05)[1] = b.i[ 5]; + ((int * ALIGNED(64))a05)[2] = c.i[ 5]; + + ((int * ALIGNED(64))a06)[0] = a.i[ 6]; + ((int * ALIGNED(64))a06)[1] = b.i[ 6]; + ((int * ALIGNED(64))a06)[2] = c.i[ 6]; + + ((int * ALIGNED(64))a07)[0] = a.i[ 7]; + ((int * ALIGNED(64))a07)[1] = b.i[ 7]; + ((int * ALIGNED(64))a07)[2] = c.i[ 7]; + + ((int * ALIGNED(64))a08)[0] = a.i[ 8]; + ((int * ALIGNED(64))a08)[1] = b.i[ 8]; + ((int * ALIGNED(64))a08)[2] = c.i[ 8]; + + ((int * ALIGNED(64))a09)[0] = a.i[ 9]; + ((int * ALIGNED(64))a09)[1] = b.i[ 9]; + ((int * ALIGNED(64))a09)[2] = c.i[ 9]; + + ((int * ALIGNED(64))a10)[0] = a.i[10]; + ((int * ALIGNED(64))a10)[1] = b.i[10]; + ((int * ALIGNED(64))a10)[2] = c.i[10]; + + ((int * ALIGNED(64))a11)[0] = a.i[11]; + ((int * ALIGNED(64))a11)[1] = b.i[11]; + ((int * ALIGNED(64))a11)[2] = c.i[11]; + + ((int * ALIGNED(64))a12)[0] = a.i[12]; + ((int * ALIGNED(64))a12)[1] = b.i[12]; + ((int * ALIGNED(64))a12)[2] = c.i[12]; + + ((int * ALIGNED(64))a13)[0] = a.i[13]; + ((int * ALIGNED(64))a13)[1] = b.i[13]; + ((int * ALIGNED(64))a13)[2] = c.i[13]; + + ((int * ALIGNED(64))a14)[0] = a.i[14]; + ((int * ALIGNED(64))a14)[1] = b.i[14]; + ((int * ALIGNED(64))a14)[2] = c.i[14]; + + ((int * ALIGNED(64))a15)[0] = a.i[15]; + ((int * ALIGNED(64))a15)[1] = b.i[15]; + ((int * ALIGNED(64))a15)[2] = c.i[15]; + } + + inline void store_16x4_tr( const v16 &a, const v16 &b, const v16 &c, const v16 &d, + void * ALIGNED(64) a00, void * ALIGNED(64) a01, + void * ALIGNED(64) a02, void * ALIGNED(64) a03, + void * ALIGNED(64) a04, void * ALIGNED(64) a05, + void * ALIGNED(64) a06, void * ALIGNED(64) a07, + void * ALIGNED(64) a08, void * ALIGNED(64) a09, + void * ALIGNED(64) a10, void * ALIGNED(64) a11, + void * ALIGNED(64) a12, void * ALIGNED(64) a13, + void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) + { + ((int * ALIGNED(64))a00)[0] = a.i[ 0]; + ((int * ALIGNED(64))a00)[1] = b.i[ 0]; + ((int * ALIGNED(64))a00)[2] = c.i[ 0]; + ((int * ALIGNED(64))a00)[3] = d.i[ 0]; + + ((int * ALIGNED(64))a01)[0] = a.i[ 1]; + ((int * ALIGNED(64))a01)[1] = b.i[ 1]; + ((int * ALIGNED(64))a01)[2] = c.i[ 1]; + ((int * ALIGNED(64))a01)[3] = d.i[ 1]; + + ((int * ALIGNED(64))a02)[0] = a.i[ 2]; + ((int * ALIGNED(64))a02)[1] = b.i[ 2]; + ((int * ALIGNED(64))a02)[2] = c.i[ 2]; + ((int * ALIGNED(64))a02)[3] = d.i[ 2]; + + ((int * ALIGNED(64))a03)[0] = a.i[ 3]; + ((int * ALIGNED(64))a03)[1] = b.i[ 3]; + ((int * ALIGNED(64))a03)[2] = c.i[ 3]; + ((int * ALIGNED(64))a03)[3] = d.i[ 3]; + 
+ ((int * ALIGNED(64))a04)[0] = a.i[ 4]; + ((int * ALIGNED(64))a04)[1] = b.i[ 4]; + ((int * ALIGNED(64))a04)[2] = c.i[ 4]; + ((int * ALIGNED(64))a04)[3] = d.i[ 4]; + + ((int * ALIGNED(64))a05)[0] = a.i[ 5]; + ((int * ALIGNED(64))a05)[1] = b.i[ 5]; + ((int * ALIGNED(64))a05)[2] = c.i[ 5]; + ((int * ALIGNED(64))a05)[3] = d.i[ 5]; + + ((int * ALIGNED(64))a06)[0] = a.i[ 6]; + ((int * ALIGNED(64))a06)[1] = b.i[ 6]; + ((int * ALIGNED(64))a06)[2] = c.i[ 6]; + ((int * ALIGNED(64))a06)[3] = d.i[ 6]; + + ((int * ALIGNED(64))a07)[0] = a.i[ 7]; + ((int * ALIGNED(64))a07)[1] = b.i[ 7]; + ((int * ALIGNED(64))a07)[2] = c.i[ 7]; + ((int * ALIGNED(64))a07)[3] = d.i[ 7]; + + ((int * ALIGNED(64))a08)[0] = a.i[ 8]; + ((int * ALIGNED(64))a08)[1] = b.i[ 8]; + ((int * ALIGNED(64))a08)[2] = c.i[ 8]; + ((int * ALIGNED(64))a08)[3] = d.i[ 8]; + + ((int * ALIGNED(64))a09)[0] = a.i[ 9]; + ((int * ALIGNED(64))a09)[1] = b.i[ 9]; + ((int * ALIGNED(64))a09)[2] = c.i[ 9]; + ((int * ALIGNED(64))a09)[3] = d.i[ 9]; + + ((int * ALIGNED(64))a10)[0] = a.i[10]; + ((int * ALIGNED(64))a10)[1] = b.i[10]; + ((int * ALIGNED(64))a10)[2] = c.i[10]; + ((int * ALIGNED(64))a10)[3] = d.i[10]; + + ((int * ALIGNED(64))a11)[0] = a.i[11]; + ((int * ALIGNED(64))a11)[1] = b.i[11]; + ((int * ALIGNED(64))a11)[2] = c.i[11]; + ((int * ALIGNED(64))a11)[3] = d.i[11]; + + ((int * ALIGNED(64))a12)[0] = a.i[12]; + ((int * ALIGNED(64))a12)[1] = b.i[12]; + ((int * ALIGNED(64))a12)[2] = c.i[12]; + ((int * ALIGNED(64))a12)[3] = d.i[12]; + + ((int * ALIGNED(64))a13)[0] = a.i[13]; + ((int * ALIGNED(64))a13)[1] = b.i[13]; + ((int * ALIGNED(64))a13)[2] = c.i[13]; + ((int * ALIGNED(64))a13)[3] = d.i[13]; + + ((int * ALIGNED(64))a14)[0] = a.i[14]; + ((int * ALIGNED(64))a14)[1] = b.i[14]; + ((int * ALIGNED(64))a14)[2] = c.i[14]; + ((int * ALIGNED(64))a14)[3] = d.i[14]; + + ((int * ALIGNED(64))a15)[0] = a.i[15]; + ((int * ALIGNED(64))a15)[1] = b.i[15]; + ((int * ALIGNED(64))a15)[2] = c.i[15]; + ((int * ALIGNED(64))a15)[3] = d.i[15]; + } + + inline void store_16x8_tr( const v16 &a, const v16 &b, const v16 &c, const v16 &d, + const v16 &e, const v16 &f, const v16 &g, const v16 &h, + void * ALIGNED(64) a00, void * ALIGNED(64) a01, + void * ALIGNED(64) a02, void * ALIGNED(64) a03, + void * ALIGNED(64) a04, void * ALIGNED(64) a05, + void * ALIGNED(64) a06, void * ALIGNED(64) a07, + void * ALIGNED(64) a08, void * ALIGNED(64) a09, + void * ALIGNED(64) a10, void * ALIGNED(64) a11, + void * ALIGNED(64) a12, void * ALIGNED(64) a13, + void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) + { + ((int * ALIGNED(64))a00)[0] = a.i[ 0]; + ((int * ALIGNED(64))a00)[1] = b.i[ 0]; + ((int * ALIGNED(64))a00)[2] = c.i[ 0]; + ((int * ALIGNED(64))a00)[3] = d.i[ 0]; + ((int * ALIGNED(64))a00)[4] = e.i[ 0]; + ((int * ALIGNED(64))a00)[5] = f.i[ 0]; + ((int * ALIGNED(64))a00)[6] = g.i[ 0]; + ((int * ALIGNED(64))a00)[7] = h.i[ 0]; + + ((int * ALIGNED(64))a01)[0] = a.i[ 1]; + ((int * ALIGNED(64))a01)[1] = b.i[ 1]; + ((int * ALIGNED(64))a01)[2] = c.i[ 1]; + ((int * ALIGNED(64))a01)[3] = d.i[ 1]; + ((int * ALIGNED(64))a01)[4] = e.i[ 1]; + ((int * ALIGNED(64))a01)[5] = f.i[ 1]; + ((int * ALIGNED(64))a01)[6] = g.i[ 1]; + ((int * ALIGNED(64))a01)[7] = h.i[ 1]; + + ((int * ALIGNED(64))a02)[0] = a.i[ 2]; + ((int * ALIGNED(64))a02)[1] = b.i[ 2]; + ((int * ALIGNED(64))a02)[2] = c.i[ 2]; + ((int * ALIGNED(64))a02)[3] = d.i[ 2]; + ((int * ALIGNED(64))a02)[4] = e.i[ 2]; + ((int * ALIGNED(64))a02)[5] = f.i[ 2]; + ((int * ALIGNED(64))a02)[6] = g.i[ 2]; + ((int * ALIGNED(64))a02)[7] = h.i[ 2]; + + ((int * 
ALIGNED(64))a03)[0] = a.i[ 3]; + ((int * ALIGNED(64))a03)[1] = b.i[ 3]; + ((int * ALIGNED(64))a03)[2] = c.i[ 3]; + ((int * ALIGNED(64))a03)[3] = d.i[ 3]; + ((int * ALIGNED(64))a03)[4] = e.i[ 3]; + ((int * ALIGNED(64))a03)[5] = f.i[ 3]; + ((int * ALIGNED(64))a03)[6] = g.i[ 3]; + ((int * ALIGNED(64))a03)[7] = h.i[ 3]; + + ((int * ALIGNED(64))a04)[0] = a.i[ 4]; + ((int * ALIGNED(64))a04)[1] = b.i[ 4]; + ((int * ALIGNED(64))a04)[2] = c.i[ 4]; + ((int * ALIGNED(64))a04)[3] = d.i[ 4]; + ((int * ALIGNED(64))a04)[4] = e.i[ 4]; + ((int * ALIGNED(64))a04)[5] = f.i[ 4]; + ((int * ALIGNED(64))a04)[6] = g.i[ 4]; + ((int * ALIGNED(64))a04)[7] = h.i[ 4]; + + ((int * ALIGNED(64))a05)[0] = a.i[ 5]; + ((int * ALIGNED(64))a05)[1] = b.i[ 5]; + ((int * ALIGNED(64))a05)[2] = c.i[ 5]; + ((int * ALIGNED(64))a05)[3] = d.i[ 5]; + ((int * ALIGNED(64))a05)[4] = e.i[ 5]; + ((int * ALIGNED(64))a05)[5] = f.i[ 5]; + ((int * ALIGNED(64))a05)[6] = g.i[ 5]; + ((int * ALIGNED(64))a05)[7] = h.i[ 5]; + + ((int * ALIGNED(64))a06)[0] = a.i[ 6]; + ((int * ALIGNED(64))a06)[1] = b.i[ 6]; + ((int * ALIGNED(64))a06)[2] = c.i[ 6]; + ((int * ALIGNED(64))a06)[3] = d.i[ 6]; + ((int * ALIGNED(64))a06)[4] = e.i[ 6]; + ((int * ALIGNED(64))a06)[5] = f.i[ 6]; + ((int * ALIGNED(64))a06)[6] = g.i[ 6]; + ((int * ALIGNED(64))a06)[7] = h.i[ 6]; + + ((int * ALIGNED(64))a07)[0] = a.i[ 7]; + ((int * ALIGNED(64))a07)[1] = b.i[ 7]; + ((int * ALIGNED(64))a07)[2] = c.i[ 7]; + ((int * ALIGNED(64))a07)[3] = d.i[ 7]; + ((int * ALIGNED(64))a07)[4] = e.i[ 7]; + ((int * ALIGNED(64))a07)[5] = f.i[ 7]; + ((int * ALIGNED(64))a07)[6] = g.i[ 7]; + ((int * ALIGNED(64))a07)[7] = h.i[ 7]; + + ((int * ALIGNED(64))a08)[0] = a.i[ 8]; + ((int * ALIGNED(64))a08)[1] = b.i[ 8]; + ((int * ALIGNED(64))a08)[2] = c.i[ 8]; + ((int * ALIGNED(64))a08)[3] = d.i[ 8]; + ((int * ALIGNED(64))a08)[4] = e.i[ 8]; + ((int * ALIGNED(64))a08)[5] = f.i[ 8]; + ((int * ALIGNED(64))a08)[6] = g.i[ 8]; + ((int * ALIGNED(64))a08)[7] = h.i[ 8]; + + ((int * ALIGNED(64))a09)[0] = a.i[ 9]; + ((int * ALIGNED(64))a09)[1] = b.i[ 9]; + ((int * ALIGNED(64))a09)[2] = c.i[ 9]; + ((int * ALIGNED(64))a09)[3] = d.i[ 9]; + ((int * ALIGNED(64))a09)[4] = e.i[ 9]; + ((int * ALIGNED(64))a09)[5] = f.i[ 9]; + ((int * ALIGNED(64))a09)[6] = g.i[ 9]; + ((int * ALIGNED(64))a09)[7] = h.i[ 9]; + + ((int * ALIGNED(64))a10)[0] = a.i[10]; + ((int * ALIGNED(64))a10)[1] = b.i[10]; + ((int * ALIGNED(64))a10)[2] = c.i[10]; + ((int * ALIGNED(64))a10)[3] = d.i[10]; + ((int * ALIGNED(64))a10)[4] = e.i[10]; + ((int * ALIGNED(64))a10)[5] = f.i[10]; + ((int * ALIGNED(64))a10)[6] = g.i[10]; + ((int * ALIGNED(64))a10)[7] = h.i[10]; + + ((int * ALIGNED(64))a11)[0] = a.i[11]; + ((int * ALIGNED(64))a11)[1] = b.i[11]; + ((int * ALIGNED(64))a11)[2] = c.i[11]; + ((int * ALIGNED(64))a11)[3] = d.i[11]; + ((int * ALIGNED(64))a11)[4] = e.i[11]; + ((int * ALIGNED(64))a11)[5] = f.i[11]; + ((int * ALIGNED(64))a11)[6] = g.i[11]; + ((int * ALIGNED(64))a11)[7] = h.i[11]; + + ((int * ALIGNED(64))a12)[0] = a.i[12]; + ((int * ALIGNED(64))a12)[1] = b.i[12]; + ((int * ALIGNED(64))a12)[2] = c.i[12]; + ((int * ALIGNED(64))a12)[3] = d.i[12]; + ((int * ALIGNED(64))a12)[4] = e.i[12]; + ((int * ALIGNED(64))a12)[5] = f.i[12]; + ((int * ALIGNED(64))a12)[6] = g.i[12]; + ((int * ALIGNED(64))a12)[7] = h.i[12]; + + ((int * ALIGNED(64))a13)[0] = a.i[13]; + ((int * ALIGNED(64))a13)[1] = b.i[13]; + ((int * ALIGNED(64))a13)[2] = c.i[13]; + ((int * ALIGNED(64))a13)[3] = d.i[13]; + ((int * ALIGNED(64))a13)[4] = e.i[13]; + ((int * ALIGNED(64))a13)[5] = f.i[13]; + ((int * 
ALIGNED(64))a13)[6] = g.i[13]; + ((int * ALIGNED(64))a13)[7] = h.i[13]; + + ((int * ALIGNED(64))a14)[0] = a.i[14]; + ((int * ALIGNED(64))a14)[1] = b.i[14]; + ((int * ALIGNED(64))a14)[2] = c.i[14]; + ((int * ALIGNED(64))a14)[3] = d.i[14]; + ((int * ALIGNED(64))a14)[4] = e.i[14]; + ((int * ALIGNED(64))a14)[5] = f.i[14]; + ((int * ALIGNED(64))a14)[6] = g.i[14]; + ((int * ALIGNED(64))a14)[7] = h.i[14]; + + ((int * ALIGNED(64))a15)[0] = a.i[15]; + ((int * ALIGNED(64))a15)[1] = b.i[15]; + ((int * ALIGNED(64))a15)[2] = c.i[15]; + ((int * ALIGNED(64))a15)[3] = d.i[15]; + ((int * ALIGNED(64))a15)[4] = e.i[15]; + ((int * ALIGNED(64))a15)[5] = f.i[15]; + ((int * ALIGNED(64))a15)[6] = g.i[15]; + ((int * ALIGNED(64))a15)[7] = h.i[15]; + } + + inline void store_16x16_tr( const v16 &b00, const v16 &b01, const v16 &b02, const v16 &b03, + const v16 &b04, const v16 &b05, const v16 &b06, const v16 &b07, + const v16 &b08, const v16 &b09, const v16 &b10, const v16 &b11, + const v16 &b12, const v16 &b13, const v16 &b14, const v16 &b15, + void * ALIGNED(64) a00, void * ALIGNED(64) a01, + void * ALIGNED(64) a02, void * ALIGNED(64) a03, + void * ALIGNED(64) a04, void * ALIGNED(64) a05, + void * ALIGNED(64) a06, void * ALIGNED(64) a07, + void * ALIGNED(64) a08, void * ALIGNED(64) a09, + void * ALIGNED(64) a10, void * ALIGNED(64) a11, + void * ALIGNED(64) a12, void * ALIGNED(64) a13, + void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) + { + ((int * ALIGNED(64))a00)[ 0] = b00.i[ 0]; + ((int * ALIGNED(64))a00)[ 1] = b01.i[ 0]; + ((int * ALIGNED(64))a00)[ 2] = b02.i[ 0]; + ((int * ALIGNED(64))a00)[ 3] = b03.i[ 0]; + ((int * ALIGNED(64))a00)[ 4] = b04.i[ 0]; + ((int * ALIGNED(64))a00)[ 5] = b05.i[ 0]; + ((int * ALIGNED(64))a00)[ 6] = b06.i[ 0]; + ((int * ALIGNED(64))a00)[ 7] = b07.i[ 0]; + ((int * ALIGNED(64))a00)[ 8] = b08.i[ 0]; + ((int * ALIGNED(64))a00)[ 9] = b09.i[ 0]; + ((int * ALIGNED(64))a00)[10] = b10.i[ 0]; + ((int * ALIGNED(64))a00)[11] = b11.i[ 0]; + ((int * ALIGNED(64))a00)[12] = b12.i[ 0]; + ((int * ALIGNED(64))a00)[13] = b13.i[ 0]; + ((int * ALIGNED(64))a00)[14] = b14.i[ 0]; + ((int * ALIGNED(64))a00)[15] = b15.i[ 0]; + + ((int * ALIGNED(64))a01)[ 0] = b00.i[ 1]; + ((int * ALIGNED(64))a01)[ 1] = b01.i[ 1]; + ((int * ALIGNED(64))a01)[ 2] = b02.i[ 1]; + ((int * ALIGNED(64))a01)[ 3] = b03.i[ 1]; + ((int * ALIGNED(64))a01)[ 4] = b04.i[ 1]; + ((int * ALIGNED(64))a01)[ 5] = b05.i[ 1]; + ((int * ALIGNED(64))a01)[ 6] = b06.i[ 1]; + ((int * ALIGNED(64))a01)[ 7] = b07.i[ 1]; + ((int * ALIGNED(64))a01)[ 8] = b08.i[ 1]; + ((int * ALIGNED(64))a01)[ 9] = b09.i[ 1]; + ((int * ALIGNED(64))a01)[10] = b10.i[ 1]; + ((int * ALIGNED(64))a01)[11] = b11.i[ 1]; + ((int * ALIGNED(64))a01)[12] = b12.i[ 1]; + ((int * ALIGNED(64))a01)[13] = b13.i[ 1]; + ((int * ALIGNED(64))a01)[14] = b14.i[ 1]; + ((int * ALIGNED(64))a01)[15] = b15.i[ 1]; + + ((int * ALIGNED(64))a02)[ 0] = b00.i[ 2]; + ((int * ALIGNED(64))a02)[ 1] = b01.i[ 2]; + ((int * ALIGNED(64))a02)[ 2] = b02.i[ 2]; + ((int * ALIGNED(64))a02)[ 3] = b03.i[ 2]; + ((int * ALIGNED(64))a02)[ 4] = b04.i[ 2]; + ((int * ALIGNED(64))a02)[ 5] = b05.i[ 2]; + ((int * ALIGNED(64))a02)[ 6] = b06.i[ 2]; + ((int * ALIGNED(64))a02)[ 7] = b07.i[ 2]; + ((int * ALIGNED(64))a02)[ 8] = b08.i[ 2]; + ((int * ALIGNED(64))a02)[ 9] = b09.i[ 2]; + ((int * ALIGNED(64))a02)[10] = b10.i[ 2]; + ((int * ALIGNED(64))a02)[11] = b11.i[ 2]; + ((int * ALIGNED(64))a02)[12] = b12.i[ 2]; + ((int * ALIGNED(64))a02)[13] = b13.i[ 2]; + ((int * ALIGNED(64))a02)[14] = b14.i[ 2]; + ((int * ALIGNED(64))a02)[15] = b15.i[ 2]; + + 
((int * ALIGNED(64))a03)[ 0] = b00.i[ 3]; + ((int * ALIGNED(64))a03)[ 1] = b01.i[ 3]; + ((int * ALIGNED(64))a03)[ 2] = b02.i[ 3]; + ((int * ALIGNED(64))a03)[ 3] = b03.i[ 3]; + ((int * ALIGNED(64))a03)[ 4] = b04.i[ 3]; + ((int * ALIGNED(64))a03)[ 5] = b05.i[ 3]; + ((int * ALIGNED(64))a03)[ 6] = b06.i[ 3]; + ((int * ALIGNED(64))a03)[ 7] = b07.i[ 3]; + ((int * ALIGNED(64))a03)[ 8] = b08.i[ 3]; + ((int * ALIGNED(64))a03)[ 9] = b09.i[ 3]; + ((int * ALIGNED(64))a03)[10] = b10.i[ 3]; + ((int * ALIGNED(64))a03)[11] = b11.i[ 3]; + ((int * ALIGNED(64))a03)[12] = b12.i[ 3]; + ((int * ALIGNED(64))a03)[13] = b13.i[ 3]; + ((int * ALIGNED(64))a03)[14] = b14.i[ 3]; + ((int * ALIGNED(64))a03)[15] = b15.i[ 3]; + + ((int * ALIGNED(64))a04)[ 0] = b00.i[ 4]; + ((int * ALIGNED(64))a04)[ 1] = b01.i[ 4]; + ((int * ALIGNED(64))a04)[ 2] = b02.i[ 4]; + ((int * ALIGNED(64))a04)[ 3] = b03.i[ 4]; + ((int * ALIGNED(64))a04)[ 4] = b04.i[ 4]; + ((int * ALIGNED(64))a04)[ 5] = b05.i[ 4]; + ((int * ALIGNED(64))a04)[ 6] = b06.i[ 4]; + ((int * ALIGNED(64))a04)[ 7] = b07.i[ 4]; + ((int * ALIGNED(64))a04)[ 8] = b08.i[ 4]; + ((int * ALIGNED(64))a04)[ 9] = b09.i[ 4]; + ((int * ALIGNED(64))a04)[10] = b10.i[ 4]; + ((int * ALIGNED(64))a04)[11] = b11.i[ 4]; + ((int * ALIGNED(64))a04)[12] = b12.i[ 4]; + ((int * ALIGNED(64))a04)[13] = b13.i[ 4]; + ((int * ALIGNED(64))a04)[14] = b14.i[ 4]; + ((int * ALIGNED(64))a04)[15] = b15.i[ 4]; + + ((int * ALIGNED(64))a05)[ 0] = b00.i[ 5]; + ((int * ALIGNED(64))a05)[ 1] = b01.i[ 5]; + ((int * ALIGNED(64))a05)[ 2] = b02.i[ 5]; + ((int * ALIGNED(64))a05)[ 3] = b03.i[ 5]; + ((int * ALIGNED(64))a05)[ 4] = b04.i[ 5]; + ((int * ALIGNED(64))a05)[ 5] = b05.i[ 5]; + ((int * ALIGNED(64))a05)[ 6] = b06.i[ 5]; + ((int * ALIGNED(64))a05)[ 7] = b07.i[ 5]; + ((int * ALIGNED(64))a05)[ 8] = b08.i[ 5]; + ((int * ALIGNED(64))a05)[ 9] = b09.i[ 5]; + ((int * ALIGNED(64))a05)[10] = b10.i[ 5]; + ((int * ALIGNED(64))a05)[11] = b11.i[ 5]; + ((int * ALIGNED(64))a05)[12] = b12.i[ 5]; + ((int * ALIGNED(64))a05)[13] = b13.i[ 5]; + ((int * ALIGNED(64))a05)[14] = b14.i[ 5]; + ((int * ALIGNED(64))a05)[15] = b15.i[ 5]; + + ((int * ALIGNED(64))a06)[ 0] = b00.i[ 6]; + ((int * ALIGNED(64))a06)[ 1] = b01.i[ 6]; + ((int * ALIGNED(64))a06)[ 2] = b02.i[ 6]; + ((int * ALIGNED(64))a06)[ 3] = b03.i[ 6]; + ((int * ALIGNED(64))a06)[ 4] = b04.i[ 6]; + ((int * ALIGNED(64))a06)[ 5] = b05.i[ 6]; + ((int * ALIGNED(64))a06)[ 6] = b06.i[ 6]; + ((int * ALIGNED(64))a06)[ 7] = b07.i[ 6]; + ((int * ALIGNED(64))a06)[ 8] = b08.i[ 6]; + ((int * ALIGNED(64))a06)[ 9] = b09.i[ 6]; + ((int * ALIGNED(64))a06)[10] = b10.i[ 6]; + ((int * ALIGNED(64))a06)[11] = b11.i[ 6]; + ((int * ALIGNED(64))a06)[12] = b12.i[ 6]; + ((int * ALIGNED(64))a06)[13] = b13.i[ 6]; + ((int * ALIGNED(64))a06)[14] = b14.i[ 6]; + ((int * ALIGNED(64))a06)[15] = b15.i[ 6]; + + ((int * ALIGNED(64))a07)[ 0] = b00.i[ 7]; + ((int * ALIGNED(64))a07)[ 1] = b01.i[ 7]; + ((int * ALIGNED(64))a07)[ 2] = b02.i[ 7]; + ((int * ALIGNED(64))a07)[ 3] = b03.i[ 7]; + ((int * ALIGNED(64))a07)[ 4] = b04.i[ 7]; + ((int * ALIGNED(64))a07)[ 5] = b05.i[ 7]; + ((int * ALIGNED(64))a07)[ 6] = b06.i[ 7]; + ((int * ALIGNED(64))a07)[ 7] = b07.i[ 7]; + ((int * ALIGNED(64))a07)[ 8] = b08.i[ 7]; + ((int * ALIGNED(64))a07)[ 9] = b09.i[ 7]; + ((int * ALIGNED(64))a07)[10] = b10.i[ 7]; + ((int * ALIGNED(64))a07)[11] = b11.i[ 7]; + ((int * ALIGNED(64))a07)[12] = b12.i[ 7]; + ((int * ALIGNED(64))a07)[13] = b13.i[ 7]; + ((int * ALIGNED(64))a07)[14] = b14.i[ 7]; + ((int * ALIGNED(64))a07)[15] = b15.i[ 7]; + + ((int * 
ALIGNED(64))a08)[ 0] = b00.i[ 8]; + ((int * ALIGNED(64))a08)[ 1] = b01.i[ 8]; + ((int * ALIGNED(64))a08)[ 2] = b02.i[ 8]; + ((int * ALIGNED(64))a08)[ 3] = b03.i[ 8]; + ((int * ALIGNED(64))a08)[ 4] = b04.i[ 8]; + ((int * ALIGNED(64))a08)[ 5] = b05.i[ 8]; + ((int * ALIGNED(64))a08)[ 6] = b06.i[ 8]; + ((int * ALIGNED(64))a08)[ 7] = b07.i[ 8]; + ((int * ALIGNED(64))a08)[ 8] = b08.i[ 8]; + ((int * ALIGNED(64))a08)[ 9] = b09.i[ 8]; + ((int * ALIGNED(64))a08)[10] = b10.i[ 8]; + ((int * ALIGNED(64))a08)[11] = b11.i[ 8]; + ((int * ALIGNED(64))a08)[12] = b12.i[ 8]; + ((int * ALIGNED(64))a08)[13] = b13.i[ 8]; + ((int * ALIGNED(64))a08)[14] = b14.i[ 8]; + ((int * ALIGNED(64))a08)[15] = b15.i[ 8]; + + ((int * ALIGNED(64))a09)[ 0] = b00.i[ 9]; + ((int * ALIGNED(64))a09)[ 1] = b01.i[ 9]; + ((int * ALIGNED(64))a09)[ 2] = b02.i[ 9]; + ((int * ALIGNED(64))a09)[ 3] = b03.i[ 9]; + ((int * ALIGNED(64))a09)[ 4] = b04.i[ 9]; + ((int * ALIGNED(64))a09)[ 5] = b05.i[ 9]; + ((int * ALIGNED(64))a09)[ 6] = b06.i[ 9]; + ((int * ALIGNED(64))a09)[ 7] = b07.i[ 9]; + ((int * ALIGNED(64))a09)[ 8] = b08.i[ 9]; + ((int * ALIGNED(64))a09)[ 9] = b09.i[ 9]; + ((int * ALIGNED(64))a09)[10] = b10.i[ 9]; + ((int * ALIGNED(64))a09)[11] = b11.i[ 9]; + ((int * ALIGNED(64))a09)[12] = b12.i[ 9]; + ((int * ALIGNED(64))a09)[13] = b13.i[ 9]; + ((int * ALIGNED(64))a09)[14] = b14.i[ 9]; + ((int * ALIGNED(64))a09)[15] = b15.i[ 9]; + + ((int * ALIGNED(64))a10)[ 0] = b00.i[10]; + ((int * ALIGNED(64))a10)[ 1] = b01.i[10]; + ((int * ALIGNED(64))a10)[ 2] = b02.i[10]; + ((int * ALIGNED(64))a10)[ 3] = b03.i[10]; + ((int * ALIGNED(64))a10)[ 4] = b04.i[10]; + ((int * ALIGNED(64))a10)[ 5] = b05.i[10]; + ((int * ALIGNED(64))a10)[ 6] = b06.i[10]; + ((int * ALIGNED(64))a10)[ 7] = b07.i[10]; + ((int * ALIGNED(64))a10)[ 8] = b08.i[10]; + ((int * ALIGNED(64))a10)[ 9] = b09.i[10]; + ((int * ALIGNED(64))a10)[10] = b10.i[10]; + ((int * ALIGNED(64))a10)[11] = b11.i[10]; + ((int * ALIGNED(64))a10)[12] = b12.i[10]; + ((int * ALIGNED(64))a10)[13] = b13.i[10]; + ((int * ALIGNED(64))a10)[14] = b14.i[10]; + ((int * ALIGNED(64))a10)[15] = b15.i[10]; + + ((int * ALIGNED(64))a11)[ 0] = b00.i[11]; + ((int * ALIGNED(64))a11)[ 1] = b01.i[11]; + ((int * ALIGNED(64))a11)[ 2] = b02.i[11]; + ((int * ALIGNED(64))a11)[ 3] = b03.i[11]; + ((int * ALIGNED(64))a11)[ 4] = b04.i[11]; + ((int * ALIGNED(64))a11)[ 5] = b05.i[11]; + ((int * ALIGNED(64))a11)[ 6] = b06.i[11]; + ((int * ALIGNED(64))a11)[ 7] = b07.i[11]; + ((int * ALIGNED(64))a11)[ 8] = b08.i[11]; + ((int * ALIGNED(64))a11)[ 9] = b09.i[11]; + ((int * ALIGNED(64))a11)[10] = b10.i[11]; + ((int * ALIGNED(64))a11)[11] = b11.i[11]; + ((int * ALIGNED(64))a11)[12] = b12.i[11]; + ((int * ALIGNED(64))a11)[13] = b13.i[11]; + ((int * ALIGNED(64))a11)[14] = b14.i[11]; + ((int * ALIGNED(64))a11)[15] = b15.i[11]; + + ((int * ALIGNED(64))a12)[ 0] = b00.i[12]; + ((int * ALIGNED(64))a12)[ 1] = b01.i[12]; + ((int * ALIGNED(64))a12)[ 2] = b02.i[12]; + ((int * ALIGNED(64))a12)[ 3] = b03.i[12]; + ((int * ALIGNED(64))a12)[ 4] = b04.i[12]; + ((int * ALIGNED(64))a12)[ 5] = b05.i[12]; + ((int * ALIGNED(64))a12)[ 6] = b06.i[12]; + ((int * ALIGNED(64))a12)[ 7] = b07.i[12]; + ((int * ALIGNED(64))a12)[ 8] = b08.i[12]; + ((int * ALIGNED(64))a12)[ 9] = b09.i[12]; + ((int * ALIGNED(64))a12)[10] = b10.i[12]; + ((int * ALIGNED(64))a12)[11] = b11.i[12]; + ((int * ALIGNED(64))a12)[12] = b12.i[12]; + ((int * ALIGNED(64))a12)[13] = b13.i[12]; + ((int * ALIGNED(64))a12)[14] = b14.i[12]; + ((int * ALIGNED(64))a12)[15] = b15.i[12]; + + ((int * ALIGNED(64))a13)[ 0] = 
b00.i[13]; + ((int * ALIGNED(64))a13)[ 1] = b01.i[13]; + ((int * ALIGNED(64))a13)[ 2] = b02.i[13]; + ((int * ALIGNED(64))a13)[ 3] = b03.i[13]; + ((int * ALIGNED(64))a13)[ 4] = b04.i[13]; + ((int * ALIGNED(64))a13)[ 5] = b05.i[13]; + ((int * ALIGNED(64))a13)[ 6] = b06.i[13]; + ((int * ALIGNED(64))a13)[ 7] = b07.i[13]; + ((int * ALIGNED(64))a13)[ 8] = b08.i[13]; + ((int * ALIGNED(64))a13)[ 9] = b09.i[13]; + ((int * ALIGNED(64))a13)[10] = b10.i[13]; + ((int * ALIGNED(64))a13)[11] = b11.i[13]; + ((int * ALIGNED(64))a13)[12] = b12.i[13]; + ((int * ALIGNED(64))a13)[13] = b13.i[13]; + ((int * ALIGNED(64))a13)[14] = b14.i[13]; + ((int * ALIGNED(64))a13)[15] = b15.i[13]; + + ((int * ALIGNED(64))a14)[ 0] = b00.i[14]; + ((int * ALIGNED(64))a14)[ 1] = b01.i[14]; + ((int * ALIGNED(64))a14)[ 2] = b02.i[14]; + ((int * ALIGNED(64))a14)[ 3] = b03.i[14]; + ((int * ALIGNED(64))a14)[ 4] = b04.i[14]; + ((int * ALIGNED(64))a14)[ 5] = b05.i[14]; + ((int * ALIGNED(64))a14)[ 6] = b06.i[14]; + ((int * ALIGNED(64))a14)[ 7] = b07.i[14]; + ((int * ALIGNED(64))a14)[ 8] = b08.i[14]; + ((int * ALIGNED(64))a14)[ 9] = b09.i[14]; + ((int * ALIGNED(64))a14)[10] = b10.i[14]; + ((int * ALIGNED(64))a14)[11] = b11.i[14]; + ((int * ALIGNED(64))a14)[12] = b12.i[14]; + ((int * ALIGNED(64))a14)[13] = b13.i[14]; + ((int * ALIGNED(64))a14)[14] = b14.i[14]; + ((int * ALIGNED(64))a14)[15] = b15.i[14]; + + ((int * ALIGNED(64))a15)[ 0] = b00.i[15]; + ((int * ALIGNED(64))a15)[ 1] = b01.i[15]; + ((int * ALIGNED(64))a15)[ 2] = b02.i[15]; + ((int * ALIGNED(64))a15)[ 3] = b03.i[15]; + ((int * ALIGNED(64))a15)[ 4] = b04.i[15]; + ((int * ALIGNED(64))a15)[ 5] = b05.i[15]; + ((int * ALIGNED(64))a15)[ 6] = b06.i[15]; + ((int * ALIGNED(64))a15)[ 7] = b07.i[15]; + ((int * ALIGNED(64))a15)[ 8] = b08.i[15]; + ((int * ALIGNED(64))a15)[ 9] = b09.i[15]; + ((int * ALIGNED(64))a15)[10] = b10.i[15]; + ((int * ALIGNED(64))a15)[11] = b11.i[15]; + ((int * ALIGNED(64))a15)[12] = b12.i[15]; + ((int * ALIGNED(64))a15)[13] = b13.i[15]; + ((int * ALIGNED(64))a15)[14] = b14.i[15]; + ((int * ALIGNED(64))a15)[15] = b15.i[15]; + } + + inline void store_16x8_tr_p( const v16 &b00, + const v16 &b01, + const v16 &b02, + const v16 &b03, + const v16 &b04, + const v16 &b05, + const v16 &b06, + const v16 &b07, + void * ALIGNED(64) a00, + void * ALIGNED(64) a01, + void * ALIGNED(64) a02, + void * ALIGNED(64) a03, + void * ALIGNED(64) a04, + void * ALIGNED(64) a05, + void * ALIGNED(64) a06, + void * ALIGNED(64) a07 ) + { + ((int * ALIGNED(64))a00)[ 0] = b00.i[ 0]; + ((int * ALIGNED(64))a00)[ 1] = b01.i[ 0]; + ((int * ALIGNED(64))a00)[ 2] = b02.i[ 0]; + ((int * ALIGNED(64))a00)[ 3] = b03.i[ 0]; + ((int * ALIGNED(64))a00)[ 4] = b04.i[ 0]; + ((int * ALIGNED(64))a00)[ 5] = b05.i[ 0]; + ((int * ALIGNED(64))a00)[ 6] = b06.i[ 0]; + ((int * ALIGNED(64))a00)[ 7] = b07.i[ 0]; + ((int * ALIGNED(64))a00)[ 8] = b00.i[ 1]; + ((int * ALIGNED(64))a00)[ 9] = b01.i[ 1]; + ((int * ALIGNED(64))a00)[10] = b02.i[ 1]; + ((int * ALIGNED(64))a00)[11] = b03.i[ 1]; + ((int * ALIGNED(64))a00)[12] = b04.i[ 1]; + ((int * ALIGNED(64))a00)[13] = b05.i[ 1]; + ((int * ALIGNED(64))a00)[14] = b06.i[ 1]; + ((int * ALIGNED(64))a00)[15] = b07.i[ 1]; + + ((int * ALIGNED(64))a01)[ 0] = b00.i[ 2]; + ((int * ALIGNED(64))a01)[ 1] = b01.i[ 2]; + ((int * ALIGNED(64))a01)[ 2] = b02.i[ 2]; + ((int * ALIGNED(64))a01)[ 3] = b03.i[ 2]; + ((int * ALIGNED(64))a01)[ 4] = b04.i[ 2]; + ((int * ALIGNED(64))a01)[ 5] = b05.i[ 2]; + ((int * ALIGNED(64))a01)[ 6] = b06.i[ 2]; + ((int * ALIGNED(64))a01)[ 7] = b07.i[ 2]; + ((int * 
ALIGNED(64))a01)[ 8] = b00.i[ 3]; + ((int * ALIGNED(64))a01)[ 9] = b01.i[ 3]; + ((int * ALIGNED(64))a01)[10] = b02.i[ 3]; + ((int * ALIGNED(64))a01)[11] = b03.i[ 3]; + ((int * ALIGNED(64))a01)[12] = b04.i[ 3]; + ((int * ALIGNED(64))a01)[13] = b05.i[ 3]; + ((int * ALIGNED(64))a01)[14] = b06.i[ 3]; + ((int * ALIGNED(64))a01)[15] = b07.i[ 3]; + + ((int * ALIGNED(64))a02)[ 0] = b00.i[ 4]; + ((int * ALIGNED(64))a02)[ 1] = b01.i[ 4]; + ((int * ALIGNED(64))a02)[ 2] = b02.i[ 4]; + ((int * ALIGNED(64))a02)[ 3] = b03.i[ 4]; + ((int * ALIGNED(64))a02)[ 4] = b04.i[ 4]; + ((int * ALIGNED(64))a02)[ 5] = b05.i[ 4]; + ((int * ALIGNED(64))a02)[ 6] = b06.i[ 4]; + ((int * ALIGNED(64))a02)[ 7] = b07.i[ 4]; + ((int * ALIGNED(64))a02)[ 8] = b00.i[ 5]; + ((int * ALIGNED(64))a02)[ 9] = b01.i[ 5]; + ((int * ALIGNED(64))a02)[10] = b02.i[ 5]; + ((int * ALIGNED(64))a02)[11] = b03.i[ 5]; + ((int * ALIGNED(64))a02)[12] = b04.i[ 5]; + ((int * ALIGNED(64))a02)[13] = b05.i[ 5]; + ((int * ALIGNED(64))a02)[14] = b06.i[ 5]; + ((int * ALIGNED(64))a02)[15] = b07.i[ 5]; + + ((int * ALIGNED(64))a03)[ 0] = b00.i[ 6]; + ((int * ALIGNED(64))a03)[ 1] = b01.i[ 6]; + ((int * ALIGNED(64))a03)[ 2] = b02.i[ 6]; + ((int * ALIGNED(64))a03)[ 3] = b03.i[ 6]; + ((int * ALIGNED(64))a03)[ 4] = b04.i[ 6]; + ((int * ALIGNED(64))a03)[ 5] = b05.i[ 6]; + ((int * ALIGNED(64))a03)[ 6] = b06.i[ 6]; + ((int * ALIGNED(64))a03)[ 7] = b07.i[ 6]; + ((int * ALIGNED(64))a03)[ 8] = b00.i[ 7]; + ((int * ALIGNED(64))a03)[ 9] = b01.i[ 7]; + ((int * ALIGNED(64))a03)[10] = b02.i[ 7]; + ((int * ALIGNED(64))a03)[11] = b03.i[ 7]; + ((int * ALIGNED(64))a03)[12] = b04.i[ 7]; + ((int * ALIGNED(64))a03)[13] = b05.i[ 7]; + ((int * ALIGNED(64))a03)[14] = b06.i[ 7]; + ((int * ALIGNED(64))a03)[15] = b07.i[ 7]; + + ((int * ALIGNED(64))a04)[ 0] = b00.i[ 8]; + ((int * ALIGNED(64))a04)[ 1] = b01.i[ 8]; + ((int * ALIGNED(64))a04)[ 2] = b02.i[ 8]; + ((int * ALIGNED(64))a04)[ 3] = b03.i[ 8]; + ((int * ALIGNED(64))a04)[ 4] = b04.i[ 8]; + ((int * ALIGNED(64))a04)[ 5] = b05.i[ 8]; + ((int * ALIGNED(64))a04)[ 6] = b06.i[ 8]; + ((int * ALIGNED(64))a04)[ 7] = b07.i[ 8]; + ((int * ALIGNED(64))a04)[ 8] = b00.i[ 9]; + ((int * ALIGNED(64))a04)[ 9] = b01.i[ 9]; + ((int * ALIGNED(64))a04)[10] = b02.i[ 9]; + ((int * ALIGNED(64))a04)[11] = b03.i[ 9]; + ((int * ALIGNED(64))a04)[12] = b04.i[ 9]; + ((int * ALIGNED(64))a04)[13] = b05.i[ 9]; + ((int * ALIGNED(64))a04)[14] = b06.i[ 9]; + ((int * ALIGNED(64))a04)[15] = b07.i[ 9]; + + ((int * ALIGNED(64))a05)[ 0] = b00.i[10]; + ((int * ALIGNED(64))a05)[ 1] = b01.i[10]; + ((int * ALIGNED(64))a05)[ 2] = b02.i[10]; + ((int * ALIGNED(64))a05)[ 3] = b03.i[10]; + ((int * ALIGNED(64))a05)[ 4] = b04.i[10]; + ((int * ALIGNED(64))a05)[ 5] = b05.i[10]; + ((int * ALIGNED(64))a05)[ 6] = b06.i[10]; + ((int * ALIGNED(64))a05)[ 7] = b07.i[10]; + ((int * ALIGNED(64))a05)[ 8] = b00.i[11]; + ((int * ALIGNED(64))a05)[ 9] = b01.i[11]; + ((int * ALIGNED(64))a05)[10] = b02.i[11]; + ((int * ALIGNED(64))a05)[11] = b03.i[11]; + ((int * ALIGNED(64))a05)[12] = b04.i[11]; + ((int * ALIGNED(64))a05)[13] = b05.i[11]; + ((int * ALIGNED(64))a05)[14] = b06.i[11]; + ((int * ALIGNED(64))a05)[15] = b07.i[11]; + + ((int * ALIGNED(64))a06)[ 0] = b00.i[12]; + ((int * ALIGNED(64))a06)[ 1] = b01.i[12]; + ((int * ALIGNED(64))a06)[ 2] = b02.i[12]; + ((int * ALIGNED(64))a06)[ 3] = b03.i[12]; + ((int * ALIGNED(64))a06)[ 4] = b04.i[12]; + ((int * ALIGNED(64))a06)[ 5] = b05.i[12]; + ((int * ALIGNED(64))a06)[ 6] = b06.i[12]; + ((int * ALIGNED(64))a06)[ 7] = b07.i[12]; + ((int * ALIGNED(64))a06)[ 8] = 
b00.i[13]; + ((int * ALIGNED(64))a06)[ 9] = b01.i[13]; + ((int * ALIGNED(64))a06)[10] = b02.i[13]; + ((int * ALIGNED(64))a06)[11] = b03.i[13]; + ((int * ALIGNED(64))a06)[12] = b04.i[13]; + ((int * ALIGNED(64))a06)[13] = b05.i[13]; + ((int * ALIGNED(64))a06)[14] = b06.i[13]; + ((int * ALIGNED(64))a06)[15] = b07.i[13]; + + ((int * ALIGNED(64))a07)[ 0] = b00.i[14]; + ((int * ALIGNED(64))a07)[ 1] = b01.i[14]; + ((int * ALIGNED(64))a07)[ 2] = b02.i[14]; + ((int * ALIGNED(64))a07)[ 3] = b03.i[14]; + ((int * ALIGNED(64))a07)[ 4] = b04.i[14]; + ((int * ALIGNED(64))a07)[ 5] = b05.i[14]; + ((int * ALIGNED(64))a07)[ 6] = b06.i[14]; + ((int * ALIGNED(64))a07)[ 7] = b07.i[14]; + ((int * ALIGNED(64))a07)[ 8] = b00.i[15]; + ((int * ALIGNED(64))a07)[ 9] = b01.i[15]; + ((int * ALIGNED(64))a07)[10] = b02.i[15]; + ((int * ALIGNED(64))a07)[11] = b03.i[15]; + ((int * ALIGNED(64))a07)[12] = b04.i[15]; + ((int * ALIGNED(64))a07)[13] = b05.i[15]; + ((int * ALIGNED(64))a07)[14] = b06.i[15]; + ((int * ALIGNED(64))a07)[15] = b07.i[15]; + } + + inline void store_16x16_tr_p( const v16 &b00, const v16 &b01, const v16 &b02, const v16 &b03, + const v16 &b04, const v16 &b05, const v16 &b06, const v16 &b07, + const v16 &b08, const v16 &b09, const v16 &b10, const v16 &b11, + const v16 &b12, const v16 &b13, const v16 &b14, const v16 &b15, + void * ALIGNED(64) a00, void * ALIGNED(64) a01, + void * ALIGNED(64) a02, void * ALIGNED(64) a03, + void * ALIGNED(64) a04, void * ALIGNED(64) a05, + void * ALIGNED(64) a06, void * ALIGNED(64) a07, + void * ALIGNED(64) a08, void * ALIGNED(64) a09, + void * ALIGNED(64) a10, void * ALIGNED(64) a11, + void * ALIGNED(64) a12, void * ALIGNED(64) a13, + void * ALIGNED(64) a14, void * ALIGNED(64) a15 ) + { + ((int * ALIGNED(64))a00)[ 0] = b00.i[ 0]; + ((int * ALIGNED(64))a00)[ 1] = b01.i[ 0]; + ((int * ALIGNED(64))a00)[ 2] = b02.i[ 0]; + ((int * ALIGNED(64))a00)[ 3] = b03.i[ 0]; + ((int * ALIGNED(64))a00)[ 4] = b04.i[ 0]; + ((int * ALIGNED(64))a00)[ 5] = b05.i[ 0]; + ((int * ALIGNED(64))a00)[ 6] = b06.i[ 0]; + ((int * ALIGNED(64))a00)[ 7] = b07.i[ 0]; + ((int * ALIGNED(64))a00)[ 8] = b00.i[ 1]; + ((int * ALIGNED(64))a00)[ 9] = b01.i[ 1]; + ((int * ALIGNED(64))a00)[10] = b02.i[ 1]; + ((int * ALIGNED(64))a00)[11] = b03.i[ 1]; + ((int * ALIGNED(64))a00)[12] = b04.i[ 1]; + ((int * ALIGNED(64))a00)[13] = b05.i[ 1]; + ((int * ALIGNED(64))a00)[14] = b06.i[ 1]; + ((int * ALIGNED(64))a00)[15] = b07.i[ 1]; + + ((int * ALIGNED(64))a01)[ 0] = b00.i[ 2]; + ((int * ALIGNED(64))a01)[ 1] = b01.i[ 2]; + ((int * ALIGNED(64))a01)[ 2] = b02.i[ 2]; + ((int * ALIGNED(64))a01)[ 3] = b03.i[ 2]; + ((int * ALIGNED(64))a01)[ 4] = b04.i[ 2]; + ((int * ALIGNED(64))a01)[ 5] = b05.i[ 2]; + ((int * ALIGNED(64))a01)[ 6] = b06.i[ 2]; + ((int * ALIGNED(64))a01)[ 7] = b07.i[ 2]; + ((int * ALIGNED(64))a01)[ 8] = b00.i[ 3]; + ((int * ALIGNED(64))a01)[ 9] = b01.i[ 3]; + ((int * ALIGNED(64))a01)[10] = b02.i[ 3]; + ((int * ALIGNED(64))a01)[11] = b03.i[ 3]; + ((int * ALIGNED(64))a01)[12] = b04.i[ 3]; + ((int * ALIGNED(64))a01)[13] = b05.i[ 3]; + ((int * ALIGNED(64))a01)[14] = b06.i[ 3]; + ((int * ALIGNED(64))a01)[15] = b07.i[ 3]; + + ((int * ALIGNED(64))a02)[ 0] = b00.i[ 4]; + ((int * ALIGNED(64))a02)[ 1] = b01.i[ 4]; + ((int * ALIGNED(64))a02)[ 2] = b02.i[ 4]; + ((int * ALIGNED(64))a02)[ 3] = b03.i[ 4]; + ((int * ALIGNED(64))a02)[ 4] = b04.i[ 4]; + ((int * ALIGNED(64))a02)[ 5] = b05.i[ 4]; + ((int * ALIGNED(64))a02)[ 6] = b06.i[ 4]; + ((int * ALIGNED(64))a02)[ 7] = b07.i[ 4]; + ((int * ALIGNED(64))a02)[ 8] = b00.i[ 5]; + ((int * 
ALIGNED(64))a02)[ 9] = b01.i[ 5]; + ((int * ALIGNED(64))a02)[10] = b02.i[ 5]; + ((int * ALIGNED(64))a02)[11] = b03.i[ 5]; + ((int * ALIGNED(64))a02)[12] = b04.i[ 5]; + ((int * ALIGNED(64))a02)[13] = b05.i[ 5]; + ((int * ALIGNED(64))a02)[14] = b06.i[ 5]; + ((int * ALIGNED(64))a02)[15] = b07.i[ 5]; + + ((int * ALIGNED(64))a03)[ 0] = b00.i[ 6]; + ((int * ALIGNED(64))a03)[ 1] = b01.i[ 6]; + ((int * ALIGNED(64))a03)[ 2] = b02.i[ 6]; + ((int * ALIGNED(64))a03)[ 3] = b03.i[ 6]; + ((int * ALIGNED(64))a03)[ 4] = b04.i[ 6]; + ((int * ALIGNED(64))a03)[ 5] = b05.i[ 6]; + ((int * ALIGNED(64))a03)[ 6] = b06.i[ 6]; + ((int * ALIGNED(64))a03)[ 7] = b07.i[ 6]; + ((int * ALIGNED(64))a03)[ 8] = b00.i[ 7]; + ((int * ALIGNED(64))a03)[ 9] = b01.i[ 7]; + ((int * ALIGNED(64))a03)[10] = b02.i[ 7]; + ((int * ALIGNED(64))a03)[11] = b03.i[ 7]; + ((int * ALIGNED(64))a03)[12] = b04.i[ 7]; + ((int * ALIGNED(64))a03)[13] = b05.i[ 7]; + ((int * ALIGNED(64))a03)[14] = b06.i[ 7]; + ((int * ALIGNED(64))a03)[15] = b07.i[ 7]; + + ((int * ALIGNED(64))a04)[ 0] = b00.i[ 8]; + ((int * ALIGNED(64))a04)[ 1] = b01.i[ 8]; + ((int * ALIGNED(64))a04)[ 2] = b02.i[ 8]; + ((int * ALIGNED(64))a04)[ 3] = b03.i[ 8]; + ((int * ALIGNED(64))a04)[ 4] = b04.i[ 8]; + ((int * ALIGNED(64))a04)[ 5] = b05.i[ 8]; + ((int * ALIGNED(64))a04)[ 6] = b06.i[ 8]; + ((int * ALIGNED(64))a04)[ 7] = b07.i[ 8]; + ((int * ALIGNED(64))a04)[ 8] = b00.i[ 9]; + ((int * ALIGNED(64))a04)[ 9] = b01.i[ 9]; + ((int * ALIGNED(64))a04)[10] = b02.i[ 9]; + ((int * ALIGNED(64))a04)[11] = b03.i[ 9]; + ((int * ALIGNED(64))a04)[12] = b04.i[ 9]; + ((int * ALIGNED(64))a04)[13] = b05.i[ 9]; + ((int * ALIGNED(64))a04)[14] = b06.i[ 9]; + ((int * ALIGNED(64))a04)[15] = b07.i[ 9]; + + ((int * ALIGNED(64))a05)[ 0] = b00.i[10]; + ((int * ALIGNED(64))a05)[ 1] = b01.i[10]; + ((int * ALIGNED(64))a05)[ 2] = b02.i[10]; + ((int * ALIGNED(64))a05)[ 3] = b03.i[10]; + ((int * ALIGNED(64))a05)[ 4] = b04.i[10]; + ((int * ALIGNED(64))a05)[ 5] = b05.i[10]; + ((int * ALIGNED(64))a05)[ 6] = b06.i[10]; + ((int * ALIGNED(64))a05)[ 7] = b07.i[10]; + ((int * ALIGNED(64))a05)[ 8] = b00.i[11]; + ((int * ALIGNED(64))a05)[ 9] = b01.i[11]; + ((int * ALIGNED(64))a05)[10] = b02.i[11]; + ((int * ALIGNED(64))a05)[11] = b03.i[11]; + ((int * ALIGNED(64))a05)[12] = b04.i[11]; + ((int * ALIGNED(64))a05)[13] = b05.i[11]; + ((int * ALIGNED(64))a05)[14] = b06.i[11]; + ((int * ALIGNED(64))a05)[15] = b07.i[11]; + + ((int * ALIGNED(64))a06)[ 0] = b00.i[12]; + ((int * ALIGNED(64))a06)[ 1] = b01.i[12]; + ((int * ALIGNED(64))a06)[ 2] = b02.i[12]; + ((int * ALIGNED(64))a06)[ 3] = b03.i[12]; + ((int * ALIGNED(64))a06)[ 4] = b04.i[12]; + ((int * ALIGNED(64))a06)[ 5] = b05.i[12]; + ((int * ALIGNED(64))a06)[ 6] = b06.i[12]; + ((int * ALIGNED(64))a06)[ 7] = b07.i[12]; + ((int * ALIGNED(64))a06)[ 8] = b00.i[13]; + ((int * ALIGNED(64))a06)[ 9] = b01.i[13]; + ((int * ALIGNED(64))a06)[10] = b02.i[13]; + ((int * ALIGNED(64))a06)[11] = b03.i[13]; + ((int * ALIGNED(64))a06)[12] = b04.i[13]; + ((int * ALIGNED(64))a06)[13] = b05.i[13]; + ((int * ALIGNED(64))a06)[14] = b06.i[13]; + ((int * ALIGNED(64))a06)[15] = b07.i[13]; + + ((int * ALIGNED(64))a07)[ 0] = b00.i[14]; + ((int * ALIGNED(64))a07)[ 1] = b01.i[14]; + ((int * ALIGNED(64))a07)[ 2] = b02.i[14]; + ((int * ALIGNED(64))a07)[ 3] = b03.i[14]; + ((int * ALIGNED(64))a07)[ 4] = b04.i[14]; + ((int * ALIGNED(64))a07)[ 5] = b05.i[14]; + ((int * ALIGNED(64))a07)[ 6] = b06.i[14]; + ((int * ALIGNED(64))a07)[ 7] = b07.i[14]; + ((int * ALIGNED(64))a07)[ 8] = b00.i[15]; + ((int * ALIGNED(64))a07)[ 9] = 
b01.i[15]; + ((int * ALIGNED(64))a07)[10] = b02.i[15]; + ((int * ALIGNED(64))a07)[11] = b03.i[15]; + ((int * ALIGNED(64))a07)[12] = b04.i[15]; + ((int * ALIGNED(64))a07)[13] = b05.i[15]; + ((int * ALIGNED(64))a07)[14] = b06.i[15]; + ((int * ALIGNED(64))a07)[15] = b07.i[15]; + + ((int * ALIGNED(64))a08)[ 0] = b08.i[ 0]; + ((int * ALIGNED(64))a08)[ 1] = b09.i[ 0]; + ((int * ALIGNED(64))a08)[ 2] = b10.i[ 0]; + ((int * ALIGNED(64))a08)[ 3] = b11.i[ 0]; + ((int * ALIGNED(64))a08)[ 4] = b12.i[ 0]; + ((int * ALIGNED(64))a08)[ 5] = b13.i[ 0]; + ((int * ALIGNED(64))a08)[ 6] = b14.i[ 0]; + ((int * ALIGNED(64))a08)[ 7] = b15.i[ 0]; + ((int * ALIGNED(64))a08)[ 8] = b08.i[ 1]; + ((int * ALIGNED(64))a08)[ 9] = b09.i[ 1]; + ((int * ALIGNED(64))a08)[10] = b10.i[ 1]; + ((int * ALIGNED(64))a08)[11] = b11.i[ 1]; + ((int * ALIGNED(64))a08)[12] = b12.i[ 1]; + ((int * ALIGNED(64))a08)[13] = b13.i[ 1]; + ((int * ALIGNED(64))a08)[14] = b14.i[ 1]; + ((int * ALIGNED(64))a08)[15] = b15.i[ 1]; + + ((int * ALIGNED(64))a09)[ 0] = b08.i[ 2]; + ((int * ALIGNED(64))a09)[ 1] = b09.i[ 2]; + ((int * ALIGNED(64))a09)[ 2] = b10.i[ 2]; + ((int * ALIGNED(64))a09)[ 3] = b11.i[ 2]; + ((int * ALIGNED(64))a09)[ 4] = b12.i[ 2]; + ((int * ALIGNED(64))a09)[ 5] = b13.i[ 2]; + ((int * ALIGNED(64))a09)[ 6] = b14.i[ 2]; + ((int * ALIGNED(64))a09)[ 7] = b15.i[ 2]; + ((int * ALIGNED(64))a09)[ 8] = b08.i[ 3]; + ((int * ALIGNED(64))a09)[ 9] = b09.i[ 3]; + ((int * ALIGNED(64))a09)[10] = b10.i[ 3]; + ((int * ALIGNED(64))a09)[11] = b11.i[ 3]; + ((int * ALIGNED(64))a09)[12] = b12.i[ 3]; + ((int * ALIGNED(64))a09)[13] = b13.i[ 3]; + ((int * ALIGNED(64))a09)[14] = b14.i[ 3]; + ((int * ALIGNED(64))a09)[15] = b15.i[ 3]; + + ((int * ALIGNED(64))a10)[ 0] = b08.i[ 4]; + ((int * ALIGNED(64))a10)[ 1] = b09.i[ 4]; + ((int * ALIGNED(64))a10)[ 2] = b10.i[ 4]; + ((int * ALIGNED(64))a10)[ 3] = b11.i[ 4]; + ((int * ALIGNED(64))a10)[ 4] = b12.i[ 4]; + ((int * ALIGNED(64))a10)[ 5] = b13.i[ 4]; + ((int * ALIGNED(64))a10)[ 6] = b14.i[ 4]; + ((int * ALIGNED(64))a10)[ 7] = b15.i[ 4]; + ((int * ALIGNED(64))a10)[ 8] = b08.i[ 5]; + ((int * ALIGNED(64))a10)[ 9] = b09.i[ 5]; + ((int * ALIGNED(64))a10)[10] = b10.i[ 5]; + ((int * ALIGNED(64))a10)[11] = b11.i[ 5]; + ((int * ALIGNED(64))a10)[12] = b12.i[ 5]; + ((int * ALIGNED(64))a10)[13] = b13.i[ 5]; + ((int * ALIGNED(64))a10)[14] = b14.i[ 5]; + ((int * ALIGNED(64))a10)[15] = b15.i[ 5]; + + ((int * ALIGNED(64))a11)[ 0] = b08.i[ 6]; + ((int * ALIGNED(64))a11)[ 1] = b09.i[ 6]; + ((int * ALIGNED(64))a11)[ 2] = b10.i[ 6]; + ((int * ALIGNED(64))a11)[ 3] = b11.i[ 6]; + ((int * ALIGNED(64))a11)[ 4] = b12.i[ 6]; + ((int * ALIGNED(64))a11)[ 5] = b13.i[ 6]; + ((int * ALIGNED(64))a11)[ 6] = b14.i[ 6]; + ((int * ALIGNED(64))a11)[ 7] = b15.i[ 6]; + ((int * ALIGNED(64))a11)[ 8] = b08.i[ 7]; + ((int * ALIGNED(64))a11)[ 9] = b09.i[ 7]; + ((int * ALIGNED(64))a11)[10] = b10.i[ 7]; + ((int * ALIGNED(64))a11)[11] = b11.i[ 7]; + ((int * ALIGNED(64))a11)[12] = b12.i[ 7]; + ((int * ALIGNED(64))a11)[13] = b13.i[ 7]; + ((int * ALIGNED(64))a11)[14] = b14.i[ 7]; + ((int * ALIGNED(64))a11)[15] = b15.i[ 7]; + + ((int * ALIGNED(64))a12)[ 0] = b08.i[ 8]; + ((int * ALIGNED(64))a12)[ 1] = b09.i[ 8]; + ((int * ALIGNED(64))a12)[ 2] = b10.i[ 8]; + ((int * ALIGNED(64))a12)[ 3] = b11.i[ 8]; + ((int * ALIGNED(64))a12)[ 4] = b12.i[ 8]; + ((int * ALIGNED(64))a12)[ 5] = b13.i[ 8]; + ((int * ALIGNED(64))a12)[ 6] = b14.i[ 8]; + ((int * ALIGNED(64))a12)[ 7] = b15.i[ 8]; + ((int * ALIGNED(64))a12)[ 8] = b08.i[ 9]; + ((int * ALIGNED(64))a12)[ 9] = b09.i[ 9]; + ((int * 
ALIGNED(64))a12)[10] = b10.i[ 9]; + ((int * ALIGNED(64))a12)[11] = b11.i[ 9]; + ((int * ALIGNED(64))a12)[12] = b12.i[ 9]; + ((int * ALIGNED(64))a12)[13] = b13.i[ 9]; + ((int * ALIGNED(64))a12)[14] = b14.i[ 9]; + ((int * ALIGNED(64))a12)[15] = b15.i[ 9]; + + ((int * ALIGNED(64))a13)[ 0] = b08.i[10]; + ((int * ALIGNED(64))a13)[ 1] = b09.i[10]; + ((int * ALIGNED(64))a13)[ 2] = b10.i[10]; + ((int * ALIGNED(64))a13)[ 3] = b11.i[10]; + ((int * ALIGNED(64))a13)[ 4] = b12.i[10]; + ((int * ALIGNED(64))a13)[ 5] = b13.i[10]; + ((int * ALIGNED(64))a13)[ 6] = b14.i[10]; + ((int * ALIGNED(64))a13)[ 7] = b15.i[10]; + ((int * ALIGNED(64))a13)[ 8] = b08.i[11]; + ((int * ALIGNED(64))a13)[ 9] = b09.i[11]; + ((int * ALIGNED(64))a13)[10] = b10.i[11]; + ((int * ALIGNED(64))a13)[11] = b11.i[11]; + ((int * ALIGNED(64))a13)[12] = b12.i[11]; + ((int * ALIGNED(64))a13)[13] = b13.i[11]; + ((int * ALIGNED(64))a13)[14] = b14.i[11]; + ((int * ALIGNED(64))a13)[15] = b15.i[11]; + + ((int * ALIGNED(64))a14)[ 0] = b08.i[12]; + ((int * ALIGNED(64))a14)[ 1] = b09.i[12]; + ((int * ALIGNED(64))a14)[ 2] = b10.i[12]; + ((int * ALIGNED(64))a14)[ 3] = b11.i[12]; + ((int * ALIGNED(64))a14)[ 4] = b12.i[12]; + ((int * ALIGNED(64))a14)[ 5] = b13.i[12]; + ((int * ALIGNED(64))a14)[ 6] = b14.i[12]; + ((int * ALIGNED(64))a14)[ 7] = b15.i[12]; + ((int * ALIGNED(64))a14)[ 8] = b08.i[13]; + ((int * ALIGNED(64))a14)[ 9] = b09.i[13]; + ((int * ALIGNED(64))a14)[10] = b10.i[13]; + ((int * ALIGNED(64))a14)[11] = b11.i[13]; + ((int * ALIGNED(64))a14)[12] = b12.i[13]; + ((int * ALIGNED(64))a14)[13] = b13.i[13]; + ((int * ALIGNED(64))a14)[14] = b14.i[13]; + ((int * ALIGNED(64))a14)[15] = b15.i[13]; + + ((int * ALIGNED(64))a15)[ 0] = b08.i[14]; + ((int * ALIGNED(64))a15)[ 1] = b09.i[14]; + ((int * ALIGNED(64))a15)[ 2] = b10.i[14]; + ((int * ALIGNED(64))a15)[ 3] = b11.i[14]; + ((int * ALIGNED(64))a15)[ 4] = b12.i[14]; + ((int * ALIGNED(64))a15)[ 5] = b13.i[14]; + ((int * ALIGNED(64))a15)[ 6] = b14.i[14]; + ((int * ALIGNED(64))a15)[ 7] = b15.i[14]; + ((int * ALIGNED(64))a15)[ 8] = b08.i[15]; + ((int * ALIGNED(64))a15)[ 9] = b09.i[15]; + ((int * ALIGNED(64))a15)[10] = b10.i[15]; + ((int * ALIGNED(64))a15)[11] = b11.i[15]; + ((int * ALIGNED(64))a15)[12] = b12.i[15]; + ((int * ALIGNED(64))a15)[13] = b13.i[15]; + ((int * ALIGNED(64))a15)[14] = b14.i[15]; + ((int * ALIGNED(64))a15)[15] = b15.i[15]; + } + + ////////////// + // v16int class + + class v16int : public v16 + { + // v16int prefix unary operator friends + + friend inline v16int operator +( const v16int & a ) ALWAYS_INLINE; + friend inline v16int operator -( const v16int & a ) ALWAYS_INLINE; + friend inline v16int operator ~( const v16int & a ) ALWAYS_INLINE; + friend inline v16int operator !( const v16int & a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v16int prefix increment / decrement operator friends + + friend inline v16int operator ++( v16int & a ) ALWAYS_INLINE; + friend inline v16int operator --( v16int & a ) ALWAYS_INLINE; + + // v16int postfix increment / decrement operator friends + + friend inline v16int operator ++( v16int & a, int ) ALWAYS_INLINE; + friend inline v16int operator --( v16int & a, int ) ALWAYS_INLINE; + + // v16int binary operator friends + + friend inline v16int operator +( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator -( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator *( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int 
operator /( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator %( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator ^( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator &( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator |( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator <<( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator >>( const v16int &a, const v16int &b ) ALWAYS_INLINE; + + // v16int logical operator friends + + friend inline v16int operator <( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator >( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator ==( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator !=( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator <=( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator >=( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator &&( const v16int &a, const v16int &b ) ALWAYS_INLINE; + friend inline v16int operator ||( const v16int &a, const v16int &b ) ALWAYS_INLINE; + + // v16int miscellaneous friends + + friend inline v16int abs( const v16int &a ) ALWAYS_INLINE; + friend inline v16 czero( const v16int &c, const v16 &a ) ALWAYS_INLINE; + friend inline v16 notczero( const v16int &c, const v16 &a ) ALWAYS_INLINE; + // FIXME: cswap, notcswap! + friend inline v16 merge( const v16int &c, const v16 &t, const v16 &f ) ALWAYS_INLINE; + + // v16float unary operator friends + + friend inline v16int operator !( const v16float & a ) ALWAYS_INLINE; + + // v16float logical operator friends + + friend inline v16int operator <( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator >( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator ==( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator !=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator <=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator >=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator &&( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator ||( const v16float &a, const v16float &b ) ALWAYS_INLINE; + + // v16float miscellaneous friends + + friend inline v16float clear_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline v16float set_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline v16float toggle_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + + public: + + // v16int constructors / destructors + + v16int() {} // Default constructor + + v16int( const v16int &a ) // Copy constructor + { + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + i[j] = a.i[j]; + } + + v16int( const v16 &a ) // Init from mixed + { + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + i[j] = a.i[j]; + } + + v16int( int a ) // Init from scalar + { + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + i[j] = a; + } + + v16int( int i00, int i01, int i02, int i03, + int i04, int i05, int i06, int i07, + int i08, int i09, int i10, int i11, + int i12, int i13, int i14, int i15 ) // Init from scalars + { + i[ 0] = i00; i[ 1] = i01; i[ 2] = i02; i[ 3] = i03; + i[ 4] = i04; 
i[ 5] = i05; i[ 6] = i06; i[ 7] = i07; + i[ 8] = i08; i[ 9] = i09; i[10] = i10; i[11] = i11; + i[12] = i12; i[13] = i13; i[14] = i14; i[15] = i15; + } + + ~v16int() {} // Destructor + + // v16int assignment operators + +# define ASSIGN(op) \ + inline v16int &operator op( const v16int &b ) \ + { \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 16; j++ ) \ + i[j] op b.i[j]; \ + return *this; \ + } + + ASSIGN( =) + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + ASSIGN(%=) + ASSIGN(^=) + ASSIGN(&=) + ASSIGN(|=) + ASSIGN(<<=) + ASSIGN(>>=) + +# undef ASSIGN + + // v16int member access operator + + inline int &operator []( int n ) + { + return i[n]; + } + + inline int operator ()( int n ) + { + return i[n]; + } + }; + + // v16int prefix unary operators + +# define PREFIX_UNARY(op) \ + inline v16int operator op( const v16int & a ) \ + { \ + v16int b; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 16; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } + + PREFIX_UNARY(+) + PREFIX_UNARY(-) + + inline v16int operator !( const v16int & a ) + { + v16int b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + b.i[j] = - ( !a.i[j] ); + + return b; + } + + PREFIX_UNARY(~) + +# undef PREFIX_UNARY + + // v16int prefix increment / decrement + +# define PREFIX_INCDEC(op) \ + inline v16int operator op( v16int & a ) \ + { \ + v16int b; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 16; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } + + PREFIX_INCDEC(++) + PREFIX_INCDEC(--) + +# undef PREFIX_INCDEC + + // v16int postfix increment / decrement + +# define POSTFIX_INCDEC(op) \ + inline v16int operator op( v16int & a, int ) \ + { \ + v16int b; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 16; j++ ) \ + b.i[j] = ( a.i[j] op ); \ + return b; \ + } + + POSTFIX_INCDEC(++) + POSTFIX_INCDEC(--) + +# undef POSTFIX_INCDEC + + // v16int binary operators + +# define BINARY(op) \ + inline v16int operator op( const v16int &a, const v16int &b ) \ + { \ + v16int c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 16; j++ ) \ + c.i[j] = a.i[j] op b.i[j]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + BINARY(%) + BINARY(^) + BINARY(&) + BINARY(|) + BINARY(<<) + BINARY(>>) + +# undef BINARY + + // v16int logical operators + +# define LOGICAL(op) \ + inline v16int operator op( const v16int &a, const v16int &b ) \ + { \ + v16int c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 16; j++ ) \ + c.i[j] = - ( a.i[j] op b.i[j] ); \ + return c; \ + } + + LOGICAL(<) + LOGICAL(>) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v16int miscellaneous functions + + inline v16int abs( const v16int &a ) + { + v16int b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + b.i[j] = ( a.i[j] >= 0 ) ? 
a.i[j] : -a.i[j]; + + return b; + } + + inline v16 czero( const v16int &c, const v16 &a ) + { + v16 b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + b.i[j] = a.i[j] & ~c.i[j]; + + return b; + } + + inline v16 notczero( const v16int &c, const v16 &a ) + { + v16 b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + b.i[j] = a.i[j] & c.i[j]; + + return b; + } + + inline v16 merge( const v16int &c, const v16 &t, const v16 &f ) + { + v16 m; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); + + return m; + } + + //////////////// + // v16float class + + class v16float : public v16 + { + // v16float prefix unary operator friends + + friend inline v16float operator +( const v16float &a ) ALWAYS_INLINE; + friend inline v16float operator -( const v16float &a ) ALWAYS_INLINE; + friend inline v16float operator ~( const v16float &a ) ALWAYS_INLINE; + friend inline v16int operator !( const v16float &a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v16float prefix increment / decrement operator friends + + friend inline v16float operator ++( v16float &a ) ALWAYS_INLINE; + friend inline v16float operator --( v16float &a ) ALWAYS_INLINE; + + // v16float postfix increment / decrement operator friends + + friend inline v16float operator ++( v16float &a, int ) ALWAYS_INLINE; + friend inline v16float operator --( v16float &a, int ) ALWAYS_INLINE; + + // v16float binary operator friends + + friend inline v16float operator +( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16float operator -( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16float operator *( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16float operator /( const v16float &a, const v16float &b ) ALWAYS_INLINE; + + // v16float logical operator friends + + friend inline v16int operator <( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator >( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator ==( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator !=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator <=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator >=( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator &&( const v16float &a, const v16float &b ) ALWAYS_INLINE; + friend inline v16int operator ||( const v16float &a, const v16float &b ) ALWAYS_INLINE; + + // v16float math library friends + +# define CMATH_FR1(fn) friend inline v16float fn( const v16float &a ) ALWAYS_INLINE +# define CMATH_FR2(fn) friend inline v16float fn( const v16float &a, \ + const v16float &b ) ALWAYS_INLINE + + CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); + CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); + CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); + CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); + CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); + + CMATH_FR2(copysign); + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v16float miscellaneous friends + + friend inline v16float rsqrt_approx( const v16float &a ) ALWAYS_INLINE; + friend inline v16float rsqrt ( const v16float &a ) ALWAYS_INLINE; + friend inline v16float rcp_approx( const v16float &a ) ALWAYS_INLINE; + friend inline v16float rcp ( 
const v16float &a ) ALWAYS_INLINE; + friend inline v16float fma ( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; + friend inline v16float fms ( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; + friend inline v16float fnms( const v16float &a, const v16float &b, const v16float &c ) ALWAYS_INLINE; + friend inline v16float clear_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline v16float set_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline v16float toggle_bits( const v16int &m, const v16float &a ) ALWAYS_INLINE; + friend inline void increment_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; + friend inline void decrement_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; + friend inline void scale_16x1( float * ALIGNED(64) p, const v16float &a ) ALWAYS_INLINE; + + public: + + // v16float constructors / destructors + + v16float() {} // Default constructor + + v16float( const v16float &a ) // Copy constructor + { + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + f[j] = a.f[j]; + } + + v16float( const v16 &a ) // Init from mixed + { + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + f[j] = a.f[j]; + } + + v16float( float a ) // Init from scalar + { + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + f[j] = a; + } + + v16float( float f00, float f01, float f02, float f03, + float f04, float f05, float f06, float f07, + float f08, float f09, float f10, float f11, + float f12, float f13, float f14, float f15 ) // Init from scalars + { + f[ 0] = f00; f[ 1] = f01; f[ 2] = f02; f[ 3] = f03; + f[ 4] = f04; f[ 5] = f05; f[ 6] = f06; f[ 7] = f07; + f[ 8] = f08; f[ 9] = f09; f[10] = f10; f[11] = f11; + f[12] = f12; f[13] = f13; f[14] = f14; f[15] = f15; + } + + ~v16float() {} // Destructor + + // v16float assignment operators + +# define ASSIGN(op) \ + inline v16float &operator op( const v16float &b ) \ + { \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 16; j++ ) \ + f[j] op b.f[j]; \ + return *this; \ + } + + ASSIGN(=) + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + +# undef ASSIGN + + // v16float member access operator + + inline float &operator []( int n ) + { + return f[n]; + } + + inline float operator ()( int n ) + { + return f[n]; + } + }; + + // v16float prefix unary operators + + inline v16float operator +( const v16float &a ) + { + v16float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + b.f[j] = +a.f[j]; + + return b; + } + + inline v16float operator -( const v16float &a ) + { + v16float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + b.f[j] = -a.f[j]; + + return b; + } + + inline v16int operator !( const v16float &a ) + { + v16int b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + b.i[j] = a.i[j] ? 
0 : -1; + + return b; + } + + // v16float prefix increment / decrement operators + + inline v16float operator ++( v16float &a ) + { + v16float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + b.f[j] = ++a.f[j]; + + return b; + } + + inline v16float operator --( v16float &a ) + { + v16float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + b.f[j] = --a.f[j]; + + return b; + } + + // v16float postfix increment / decrement operators + + inline v16float operator ++( v16float &a, int ) + { + v16float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + b.f[j] = a.f[j]++; + + return b; + } + + inline v16float operator --( v16float &a, int ) + { + v16float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + b.f[j] = a.f[j]--; + + return b; + } + + // v16float binary operators + +# define BINARY(op) \ + inline v16float operator op( const v16float &a, const v16float &b ) \ + { \ + v16float c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 16; j++ ) \ + c.f[j] = a.f[j] op b.f[j]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + +# undef BINARY + + // v16float logical operators + +# define LOGICAL(op) \ + inline v16int operator op( const v16float &a, const v16float &b ) \ + { \ + v16int c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 16; j++ ) \ + c.i[j] = -( a.f[j] op b.f[j] ); \ + return c; \ + } + + LOGICAL(< ) + LOGICAL(> ) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v16float math library functions + +# define CMATH_FR1(fn) \ + inline v16float fn( const v16float &a ) \ + { \ + v16float b; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 16; j++ ) \ + b.f[j] = ::fn( a.f[j] ); \ + return b; \ + } + +# define CMATH_FR2(fn) \ + inline v16float fn( const v16float &a, const v16float &b ) \ + { \ + v16float c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 16; j++ ) \ + c.f[j] = ::fn( a.f[j], b.f[j] ); \ + return c; \ + } + + CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) + CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) + CMATH_FR1(fabs) CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) + CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) + CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) + + inline v16float copysign( const v16float &a, const v16float &b ) + { + v16float c; + float t; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + { + t = ::fabs( a.f[j] ); + if( b.f[j] < 0 ) t = -t; + c.f[j] = t; + } + + return c; + } + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v16float miscellaneous functions + + inline v16float rsqrt_approx( const v16float &a ) + { + v16float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + b.f[j] = ::sqrt( 1.0f/a.f[j] ); + + return b; + } + + inline v16float rsqrt( const v16float &a ) + { + v16float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + b.f[j] = ::sqrt( 1.0f/a.f[j] ); + + return b; + } + + inline v16float rcp_approx( const v16float &a ) + { + v16float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + b.f[j] = 1.0f/a.f[j]; + + return b; + } + + inline v16float rcp( const v16float &a ) + { + v16float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + b.f[j] = 1.0f/a.f[j]; + + return b; + } + + inline v16float fma( const v16float &a, const v16float &b, const v16float &c ) + { + v16float d; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + d.f[j] = a.f[j] * b.f[j] + c.f[j]; + + return d; + } + + inline v16float fms( const v16float &a, const v16float &b, const v16float &c ) + { + 
v16float d; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + d.f[j] = a.f[j] * b.f[j] - c.f[j]; + + return d; + } + + inline v16float fnms( const v16float &a, const v16float &b, const v16float &c ) + { + v16float d; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + d.f[j] = c.f[j] - a.f[j] * b.f[j]; + + return d; + } + + inline v16float clear_bits( const v16int &m, const v16float &a ) + { + v16float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + b.i[j] = ( ~m.i[j] ) & a.i[j]; + + return b; + } + + inline v16float set_bits( const v16int &m, const v16float &a ) + { + v16float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + b.i[j] = m.i[j] | a.i[j]; + + return b; + } + + inline v16float toggle_bits( const v16int &m, const v16float &a ) + { + v16float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + b.i[j] = m.i[j] ^ a.i[j]; + + return b; + } + + inline void increment_16x1( float * ALIGNED(64) p, const v16float &a ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + p[j] += a.f[j]; + } + + inline void decrement_16x1( float * ALIGNED(64) p, const v16float &a ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + p[j] -= a.f[j]; + } + + inline void scale_16x1( float * ALIGNED(64) p, const v16float &a ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 16; j++ ) + p[j] *= a.f[j]; + } + +} // namespace v16 + +#endif // _v16_portable_h_ diff --git a/src/util/v4/v4.h b/src/util/v4/v4.h index 54135b7e..3cf5183c 100644 --- a/src/util/v4/v4.h +++ b/src/util/v4/v4.h @@ -10,6 +10,10 @@ # include "v4_portable.h" # elif defined USE_V4_SSE # include "v4_sse.h" +# elif defined USE_V4_AVX +# include "v4_avx.h" +# elif defined USE_V4_AVX2 +# include "v4_avx2.h" # endif #endif #undef IN_v4_h diff --git a/src/util/v4/v4_altivec.h b/src/util/v4/v4_altivec.h index 930521f5..2c52d963 100644 --- a/src/util/v4/v4_altivec.h +++ b/src/util/v4/v4_altivec.h @@ -15,6 +15,12 @@ #include #include +// See if this fixes a problem when compiling with GNU compilers. 
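// [Editor's note, not part of the original patch] A hedged reading of the fix
// below: with GCC's <altivec.h>, the identifiers `vector`, `pixel`, and `bool`
// are typically exposed as macros (AltiVec context keywords), which can collide
// with the C++ `bool` type and other uses of `vector` in this header, so
// undefining them under __GNUC__ is a common workaround. This is an assumption
// about the intent of the change, not a statement from the patch author.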
+#ifdef __GNUC__ +#undef bool +#undef vector +#endif + namespace v4 { class v4; @@ -51,14 +57,51 @@ namespace v4 { friend class v4int; friend class v4float; - + + // ----------------------------------------------------------------------------- + // hacks that need to be resolved more elegantly + +/* friend inline v4 operator *( const v4 &a, const v4 &b ); */ + +/* # define ASSIGN(op,instr) \ */ +/* inline v4 &operator op( const v4 &b ) \ */ +/* { \ */ +/* instr; \ */ +/* return *this; \ */ +/* } */ + +/* ASSIGN(=, v = b.v ); */ + +/* # undef ASSIGN */ + +/* # define BINARY(op,instr) \ */ +/* inline v4 operator op( const v4 &a, const v4 &b ) \ */ +/* { \ */ +/* v4 c; \ */ +/* instr; \ */ +/* return c; \ */ +/* } */ + +/* BINARY(+, c.v = vec_add( a.v, b.v ) ) */ +/* BINARY(-, c.v = vec_sub( a.v, b.v ) ) */ +/* BINARY(*, c.v = vec_mul( a.v, b.v ) ) */ +/* // BINARY(*, c.v = vec_madd( a.v, b.v, _zero ) ) */ + +/* # undef BINARY */ + // end hacks + // ----------------------------------------------------------------------------- + // v4 miscellenous friends friend inline int any( const v4 &a ); friend inline int all( const v4 &a ); - friend inline v4 splat( const v4 &a, int n ); - friend inline v4 shuffle( const v4 &a, - int i0, int i1, int i2, int i3 ); + template + friend inline v4 splat( const v4 &a ); + // friend inline v4 splat( const v4 &a, int n ); + template + friend inline v4 shuffle( const v4 &a ); + // friend inline v4 shuffle( const v4 &a, + // int i0, int i1, int i2, int i3 ); friend inline void swap( v4 &a, v4 &b ); friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ); @@ -120,6 +163,7 @@ namespace v4 { void * ALIGNED(16) a3 ); protected: + public: // wdn _v4_float v; @@ -142,15 +186,16 @@ namespace v4 { inline int all( const v4 &a ) { return vec_all_ne( (_v4_int)a.v, _false ); } - - inline v4 splat( const v4 & a, int n ) { + + template + inline v4 splat( const v4 & a ) { v4 b; b.v = vec_splat( a.v, n ); return b; } - inline v4 shuffle( const v4 & a, - int i0, int i1, int i2, int i3 ) { + template + inline v4 shuffle( const v4 & a ) { _v4_float a_v = a.v; v4 b; b.v = vec_perm( a_v, a_v, _PERM( i0, i1, i2, i3 ) ); @@ -229,6 +274,7 @@ namespace v4 { ((const float *)pd)[0] }; } + #if 0 inline void load_4x2_tr( const void * ALIGNED(8) pa, const void * ALIGNED(8) pb, const void * ALIGNED(8) pc, @@ -243,6 +289,29 @@ namespace v4 { ((const float *)pc)[1], ((const float *)pd)[1] }; } + #endif + + inline void load_4x2_tr( const void * ALIGNED(8) pa, + const void * ALIGNED(8) pb, + const void * ALIGNED(8) pc, + const void * ALIGNED(8) pd, + v4 &a, v4 &b ) + { + _v4_float a0 = vec_ld( 0, (const float *)pa ); // a0 = 0 1 2 3 + _v4_float b0 = vec_ld( 0, (const float *)pb ); // b0 = 4 5 6 7 + _v4_float c1 = vec_ld( 0, (const float *)pc ); // c1 = 8 9 10 11 + _v4_float d1 = vec_ld( 0, (const float *)pd ); // d1 = 12 13 14 15 + + // Step 1: Interleave top and bottom half + + _v4_float a1 = vec_mergeh( a0, c1 ); // a1 = 0 8 1 9 + _v4_float b1 = vec_mergeh( b0, d1 ); // b1 = 4 12 5 13 + + // Step 2: Interleave even and odd rows + + a.v = vec_mergeh( a1, b1 ); // a = 0 4 8 12 + b.v = vec_mergel( a1, b1 ); // b = 1 5 9 13 + } inline void load_4x3_tr( const void * ALIGNED(16) pa, const void * ALIGNED(16) pb, @@ -272,7 +341,8 @@ namespace v4 { const void * ALIGNED(16) pb, const void * ALIGNED(16) pc, const void * ALIGNED(16) pd, - v4 &a, v4 &b, v4 &c, v4 &d ) { + v4 &a, v4 &b, v4 &c, v4 &d ) + { _v4_float a0 = vec_ld( 0, (const float *)pa ); // a0 = 0 1 2 3 _v4_float b0 = vec_ld( 0, (const 
float *)pb ); // b0 = 4 5 6 7 _v4_float c1 = vec_ld( 0, (const float *)pc ); // c1 = 8 9 10 11 @@ -699,6 +769,12 @@ namespace v4 { friend inline v4float operator *( const v4float &a, const v4float &b ); friend inline v4float operator /( const v4float &a, const v4float &b ); + // ------------------------------------------------------------------------- + // begin hacks + // friend inline v4float operator *( const v4float &a, const v4 &b ); + // end hacks + // ------------------------------------------------------------------------- + // v4float logical operator friends friend inline v4int operator <( const v4float &a, const v4float &b ); @@ -777,6 +853,33 @@ namespace v4 { ASSIGN(-=, v = vec_sub(v,b.v) ); ASSIGN(*=, v = vec_madd(v,b.v,_zero) ); + // This does one NR iteration and is supposed to be accurate enough. + inline v4float &operator /=( const v4float &a ) { + _v4_float a_v = a.v, b_v; + + // Compute an estimate of the reciprocal of a (??-bit accurate) + + b_v = vec_re( a_v ); + + // FIXME: CHECK NUMERICS ... HOW MANY N-R STEPS TO USE? APPLE'S + // ALTIVEC WEB PAGE SUGGESTS TWO STEPS AND GIVES THE BELOW + // IMPLEMENTATION FOR THE REFINEMENT. + + // FIXME: IS THIS THE MOST ACCURATE FORM FOR THE REFINEMENT? + // THE SPU IMPLEMENTATION HAS AN ALTERNATE THAT MAY BE MORE + // ACCURATE (OR AT LEAST USES FEWER CONSTANTS). + + b_v = vec_madd( vec_nmsub( b_v, a_v, _one ), b_v, b_v ); + + // Compute n * refined( (1/a)_estimate ) to get result n/a + + v = vec_madd( v, b_v, _zero ); + + return *this; + } + + #if 0 + // This is a more accurate version that does two NR iterations. inline v4float &operator /=( const v4float &a ) { _v4_float a_v = a.v, b_v; @@ -801,6 +904,7 @@ namespace v4 { return *this; } + #endif # undef ASSIGN @@ -873,7 +977,7 @@ namespace v4 { } // v4float binary operators - + # define BINARY(op,instr) \ inline v4float operator op( const v4float &a, const v4float &b ) { \ v4float c; \ @@ -885,6 +989,33 @@ namespace v4 { BINARY(-, c.v = vec_sub( a.v, b.v ) ) BINARY(*, c.v = vec_madd( a.v, b.v, _zero ) ) + inline v4float operator /( const v4float &n, const v4float &a ) { + _v4_float a_v = a.v, b_v; + v4float c; + + // Compute an estimate of the reciprocal of a (??-bit accurate) + + b_v = vec_re( a_v ); + + // FIXME: CHECK NUMERICS ... HOW MANY N-R STEPS TO USE? APPLE'S + // ALTIVEC WEB PAGE SUGGESTS TWO STEPS AND GIVES THE BELOW + // IMPLEMENTATION FOR THE REFINEMENT. + + // FIXME: IS THIS THE MOST ACCURATE FORM FOR THE REFINEMENT? + // THE SPU IMPLEMENTATION HAS AN ALTERNATE THAT MAY BE MORE + // ACCURATE (OR AT LEAST USES FEWER CONSTANTS). + + b_v = vec_madd( vec_nmsub( b_v, a_v, _one ), b_v, b_v ); + + // Compute n * refined( (1/a)_estimate ) to get result n/a + + c.v = vec_madd( n.v, b_v, _zero ); + + return c; + } + + #if 0 + // This is a more accurate version that does two NR iterations. 
inline v4float operator /( const v4float &n, const v4float &a ) { _v4_float a_v = a.v, b_v; v4float c; @@ -910,9 +1041,25 @@ namespace v4 { return c; } + #endif # undef BINARY + // ------------------------------------------------------------------------- + // begin hacks +/* # define BINARY(op,instr) \ */ +/* inline v4float operator op( const v4float &a, const v4 &b ) { \ */ +/* v4float c; \ */ +/* instr; \ */ +/* return c; \ */ +/* } */ + +/* BINARY(*, c.v = vec_madd( a.v, b.v, _zero ) ) */ + +/* # undef BINARY */ + // end hacks + // ------------------------------------------------------------------------- + // v4float logical operators # define LOGICAL(op,instr) \ @@ -944,10 +1091,10 @@ namespace v4 { union { float f[4]; _v4_float v; } t; \ v4float b; \ t.v = a.v; \ - b.v = (_v4_float){ ::fn( t.f[0] ), \ - ::fn( t.f[1] ), \ - ::fn( t.f[2] ), \ - ::fn( t.f[3] ) }; \ + b.v = (_v4_float){ (float) ::fn( t.f[0] ), \ + (float) ::fn( t.f[1] ), \ + (float) ::fn( t.f[2] ), \ + (float) ::fn( t.f[3] ) }; \ return b; \ } @@ -958,10 +1105,10 @@ namespace v4 { v4float c; \ t.v = a.v; \ u.v = b.v; \ - c.v = (_v4_float){ ::fn( t.f[0], u.f[0] ), \ - ::fn( t.f[1], u.f[1] ), \ - ::fn( t.f[2], u.f[2] ), \ - ::fn( t.f[3], u.f[3] ) }; \ + c.v = (_v4_float){ (float) ::fn( t.f[0], u.f[0] ), \ + (float) ::fn( t.f[1], u.f[1] ), \ + (float) ::fn( t.f[2], u.f[2] ), \ + (float) ::fn( t.f[3], u.f[3] ) }; \ return c; \ } @@ -977,6 +1124,31 @@ namespace v4 { return b; } + // This version does one NR iteration and is supposed to be accurate enough. + inline v4float sqrt( const v4float &a ) { + _v4_float a_v = a.v, b_v; + v4float b; + + // Compute an estimate of the rsqrt (??-bit accurate) + + b_v = vec_rsqrte( a_v ); + + // FIXME: CHECK NUMERICS. HOW MANY N-R STEPS NECESSARY? + // APPLE'S ALTIVEC PAGE SUGGESTS TWO. + + b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), + vec_madd( b_v, _half, _zero ), + b_v ); + + // Compute the sqrt(a) via a*refined_rsqrt_estimate(a) ~ sqrt(a) + + b.v = vec_madd( a_v, b_v, _zero ); + + return b; + } + + #if 0 + // This is a more accurate version that does two NR iterations. inline v4float sqrt( const v4float &a ) { _v4_float a_v = a.v, b_v; v4float b; @@ -1001,6 +1173,7 @@ namespace v4 { return b; } + #endif inline v4float copysign( const v4float &a, const v4float &b ) { v4float c; @@ -1018,7 +1191,31 @@ namespace v4 { b.v = vec_rsqrte( a.v ); return b; } - + + // This version does one NR iteration and is supposed to be accurate enough. + inline v4float rsqrt( const v4float &a ) { + _v4_float a_v = a.v, b_v; + v4float b; + + // Compute an estimate of the rsqrt (??-bit accurate) + + b_v = vec_rsqrte( a_v ); + + // FIXME: CHECK NUMERICS. HOW MANY N-R STEPS NECESSARY? + // APPLE'S ALTIVEC PAGE SUGGESTS TWO. + + // b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), + // vec_madd( b_v, _half, _zero ), + // b_v ); + b.v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), + vec_madd( b_v, _half, _zero ), + b_v ); + + return b; + } + + #if 0 + // This is a more accurate version that does two NR iterations. inline v4float rsqrt( const v4float &a ) { _v4_float a_v = a.v, b_v; v4float b; @@ -1039,13 +1236,39 @@ namespace v4 { return b; } + #endif inline v4float rcp_approx( const v4float &a ) { v4float b; b.v = vec_re( a.v ); return b; } - + + // This version does one NR iteration and is supposed to be accurate enough. 
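// [Editor's note, not part of the original patch] A rough sketch of the
// Newton-Raphson refinement used in the one-iteration versions above and below:
// given an estimate x0 ~ 1/a (vec_re / vec_rsqrte are typically accurate to
// roughly 12 bits), one step
//
//   x1 = x0 + x0*(1 - a*x0)   // == vec_madd( vec_nmsub(x0, a, _one), x0, x0 )
//
// roughly doubles the number of correct bits, bringing the result close to the
// full single-precision significand (~24 bits), which is why a single iteration
// is "supposed to be accurate enough". For rsqrt the analogous step is
// x1 = x0 + 0.5*x0*(1 - a*x0*x0). Worked example with assumed values:
// a = 3, x0 = 0.333 gives x1 = 0.333 + 0.333*(1 - 0.999) = 0.333333 ~ 1/3.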
+ inline v4float rcp( const v4float &a ) { + _v4_float a_v = a.v, b_v; + v4float b; + + // Compute an estimate of the reciprocal of a (??-bit accurate) + + b_v = vec_re( a_v ); + + // FIXME: CHECK NUMERICS ... HOW MANY N-R STEPS TO USE? APPLE'S + // ALTIVEC WEB PAGE SUGGESTS TWO STEPS AND GIVES THE BELOW + // IMPLEMENTATION FOR THE REFINEMENT. + + // FIXME: IS THIS THE MOST ACCURATE FORM FOR THE REFINEMENT? + // THE SPU IMPLEMENTATION HAS AN ALTERNATE THAT MAY BE MORE + // ACCURATE (OR AT LEAST USES FEWER CONSTANTS). + + // b_v = vec_madd( vec_nmsub( b_v, a_v, _one ), b_v, b_v ); + b.v = vec_madd( vec_nmsub( b_v, a_v, _one ), b_v, b_v ); + + return b; + } + + #if 0 + // This is a more accurate version that does two NR iterations. inline v4float rcp( const v4float &a ) { _v4_float a_v = a.v, b_v; v4float b; @@ -1067,6 +1290,7 @@ namespace v4 { return b; } + #endif inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) { v4float d; @@ -1076,7 +1300,8 @@ namespace v4 { inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) { v4float d; - d.v = vec_sub( _zero, vec_nmsub( a.v, b.v, c.v ) ); // FIXME: Sigh ... + // d.v = vec_sub( _zero, vec_nmsub( a.v, b.v, c.v ) ); // FIXME: Sigh ... + d.v = vec_msub( a.v, b.v, c.v ) ; return d; } diff --git a/src/util/v4/v4_avx.h b/src/util/v4/v4_avx.h new file mode 100644 index 00000000..f2b47552 --- /dev/null +++ b/src/util/v4/v4_avx.h @@ -0,0 +1,1013 @@ +#ifndef _v4_avx_h_ +#define _v4_avx_h_ + +#ifndef IN_v4_h +#error "Do not include v4_avx.h directly; use v4.h" +#endif + +#define V4_ACCELERATION +#define V4_AVX_ACCELERATION + +#include +#include + +#ifndef ALIGNED +#define ALIGNED(n) +#endif + +// FIXME: IN PORTABLE, ALTIVEC, SPU +// - UPDATE V4INT, V4FLOAT + +// This requires gcc-3.3 and up +// Also, Bug 12902 has not been resolved on gcc-3.x.x. See README.patches for +// details. 
gcc-4.x.x does not seem to have this bug but may suffer from +// other problems (use "-fno-strict-aliasing" on these platforms) + +#define ALWAYS_INLINE __attribute__((always_inline)) + +namespace v4 { + + class v4; + class v4int; + class v4float; + + template struct permute { + constexpr static int value = i0 + i1*4 + i2*16 + i3*64; + }; // permute + +# define PERM(i0,i1,i2,i3) ((i0) + (i1)*4 + (i2)*16 + (i3)*64) + + //////////////// + // v4 base class + + class v4 { + + friend class v4int; + friend class v4float; + + // v4 miscellenous friends + + friend inline int any( const v4 &a ) ALWAYS_INLINE; + friend inline int all( const v4 &a ) ALWAYS_INLINE; + + template + friend inline v4 splat( const v4 &a ) ALWAYS_INLINE; + + template + friend inline v4 shuffle( const v4 &a ) ALWAYS_INLINE; + + friend inline void swap( v4 &a, v4 &b ) ALWAYS_INLINE; + friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) ALWAYS_INLINE; + + // v4int miscellaneous friends + + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + + // v4 memory manipulation friends + + friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; + friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_4x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) ALWAYS_INLINE; + friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + // v4 transposed memory manipulation friends + + friend inline void load_4x1_tr( const void *a0, const void *a1, + const void *a2, const void *a3, + v4 &a ) ALWAYS_INLINE; + friend inline void load_4x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + v4 &a, v4 &b ) ALWAYS_INLINE; + friend inline void load_4x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; + friend inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; + + friend inline void store_4x1_tr( const v4 &a, + void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; + friend inline void store_4x2_tr( const v4 &a, const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) ALWAYS_INLINE; + friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) ALWAYS_INLINE; + friend inline void store_4x4_tr( const v4 &a, const v4 &b, + const v4 &c, const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) ALWAYS_INLINE; + + protected: + + union { + int i[4]; + float f[4]; + __m128 v; + }; + + public: + + v4() {} // Default constructor + v4(const v4 &a) { v=a.v; } // Copy constructor + ~v4() {} // Default destructor + + }; + + // v4 miscellaneous functions + + inline int any( const v4 &a ) { + return a.i[0] || a.i[1] || a.i[2] || a.i[3]; + } + + inline int all( const v4 &a ) 
{ + return a.i[0] && a.i[1] && a.i[2] && a.i[3]; + } + + // Note: n MUST BE AN IMMEDIATE! + template + inline v4 splat(const v4 & a) { + __m128 a_v = a.v; + v4 b; + b.v = _mm_shuffle_ps( a_v, a_v, (n*permute<1,1,1,1>::value)); + return b; + } + + // Note: i0:3 MUST BE IMMEDIATES! */ + template + inline v4 shuffle( const v4 & a ) { + __m128 a_v = a.v; + v4 b; + b.v = _mm_shuffle_ps( a_v, a_v, (permute::value) ); + return b; + } + + inline void swap( v4 &a, v4 &b ) { + __m128 a_v = a.v; a.v = b.v; b.v = a_v; + } + + inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) { + __m128 a0_v = a0.v, a1_v = a1.v, a2_v = a2.v, a3_v = a3.v, t, u; + t = _mm_unpackhi_ps( a0_v, a1_v ); + a0_v = _mm_unpacklo_ps( a0_v, a1_v ); + u = _mm_unpackhi_ps( a2_v, a3_v ); + a2_v = _mm_unpacklo_ps( a2_v, a3_v ); + a1_v = _mm_movehl_ps( a2_v, a0_v ); + a0_v = _mm_movelh_ps( a0_v, a2_v ); + a2_v = _mm_movelh_ps( t, u ); + a3_v = _mm_movehl_ps( u, t ); + a0.v = a0_v; a1.v = a1_v; a2.v = a2_v; a3.v = a3_v; + } + + // v4 memory manipulation functions + + inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) { + a.v = _mm_load_ps((float *)p); + } + + inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) { + _mm_store_ps((float *)p,a.v); + } + + inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) { + _mm_stream_ps((float *)p,a.v); + } + + inline void clear_4x1( void * ALIGNED(16) p ) { + _mm_store_ps( (float *)p, _mm_setzero_ps() ); + } + + inline void copy_4x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) { + _mm_store_ps( (float *)dst, _mm_load_ps( (const float *)src ) ); + } + + /* FIXME: MAKE ROBUST AGAINST ALIASING ISSUES */ + inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) { + __m128 t = _mm_load_ps((float *)a); + _mm_store_ps( (float *)a, _mm_load_ps( (float *)b ) ); + _mm_store_ps( (float *)b, t ); + } + + // v4 transposed memory manipulation functions + + inline void load_4x1_tr( const void *a0, const void *a1, + const void *a2, const void *a3, v4 &a ) { + a.v = _mm_setr_ps( ((const float *)a0)[0], + ((const float *)a1)[0], + ((const float *)a2)[0], + ((const float *)a3)[0] ); + } + + inline void load_4x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + v4 &a, v4 &b ) { + __m128 a_v, b_v, t; + b_v = _mm_setzero_ps(); + t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a0 ), (__m64 *)a1 ); + b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a2 ), (__m64 *)a3 ); + a_v = _mm_shuffle_ps( t, b_v, 0x88 ); + b_v = _mm_shuffle_ps( t, b_v, 0xdd ); + a.v = a_v; b.v = b_v; + } + + inline void load_4x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c ) { + __m128 a_v, b_v, c_v, t, u; + t = _mm_load_ps( (const float *)a0 ); + b_v = _mm_load_ps( (const float *)a1 ); + c_v = _mm_load_ps( (const float *)a2 ); + u = _mm_load_ps( (const float *)a3 ); + a_v = _mm_unpacklo_ps( t, b_v ); + b_v = _mm_unpackhi_ps( t, b_v ); + t = _mm_unpacklo_ps( c_v, u ); + u = _mm_unpackhi_ps( c_v, u ); + c_v = _mm_movelh_ps( b_v, u ); + b_v = _mm_movehl_ps( t, a_v ); + a_v = _mm_movelh_ps( a_v, t ); + a.v = a_v; b.v = b_v; c.v = c_v; + } + + inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c, v4 &d ) { + __m128 a_v, b_v, c_v, d_v, t, u; + a_v = _mm_load_ps( (const float *)a0 ); + b_v = _mm_load_ps( (const float *)a1 ); + 
c_v = _mm_load_ps( (const float *)a2 ); + d_v = _mm_load_ps( (const float *)a3 ); + t = _mm_unpackhi_ps( a_v, b_v ); + a_v = _mm_unpacklo_ps( a_v, b_v ); + u = _mm_unpackhi_ps( c_v, d_v ); + c_v = _mm_unpacklo_ps( c_v, d_v ); + b_v = _mm_movehl_ps( c_v, a_v ); + a_v = _mm_movelh_ps( a_v, c_v ); + c_v = _mm_movelh_ps( t, u ); + d_v = _mm_movehl_ps( u, t ); + a.v = a_v; b.v = b_v; c.v = c_v; d.v = d_v; + } + + inline void store_4x1_tr( const v4 &a, + void *a0, void *a1, void *a2, void *a3 ) { + ((float *)a0)[0] = a.f[0]; + ((float *)a1)[0] = a.f[1]; + ((float *)a2)[0] = a.f[2]; + ((float *)a3)[0] = a.f[3]; + } + + inline void store_4x2_tr( const v4 &a, const v4 &b, + void * ALIGNED(8) a0, void * ALIGNED(8) a1, + void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) { + __m128 a_v = a.v, b_v = b.v, t; + t = _mm_unpacklo_ps(a_v,b_v); // a0 b0 a1 b1 -> t + _mm_storel_pi((__m64 *)a0,t); // a0 b0 -> a0 + _mm_storeh_pi((__m64 *)a1,t); // a1 b1 -> a1 + t = _mm_unpackhi_ps(a_v,b_v); // a2 b2 a3 b3 -> t + _mm_storel_pi((__m64 *)a2,t); // a2 b2 -> a2 + _mm_storeh_pi((__m64 *)a3,t); // a3 b3 -> a3 + } + + inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) { + __m128 a_v = a.v, b_v = b.v, t; + t = _mm_unpacklo_ps(a_v,b_v); // a0 b0 a1 b1 -> t + _mm_storel_pi((__m64 *)a0,t); // a0 b0 -> a0 + _mm_storeh_pi((__m64 *)a1,t); // a1 b1 -> a1 + t = _mm_unpackhi_ps(a_v,b_v); // a2 b2 a3 b3 -> t + _mm_storel_pi((__m64 *)a2,t); // a2 b2 -> a2 + _mm_storeh_pi((__m64 *)a3,t); // a3 b3 -> a3 + ((float *)a0)[2] = c.f[0]; + ((float *)a1)[2] = c.f[1]; + ((float *)a2)[2] = c.f[2]; + ((float *)a3)[2] = c.f[3]; + } + + /* FIXME: IS THIS FASTER THAN THE OLD WAY (HAD MORE STORE INSTR) */ + inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) { + __m128 a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; + t = _mm_unpackhi_ps( a_v, b_v ); + a_v = _mm_unpacklo_ps( a_v, b_v ); + u = _mm_unpackhi_ps( c_v, d_v ); + c_v = _mm_unpacklo_ps( c_v, d_v ); + b_v = _mm_movehl_ps( c_v, a_v ); + a_v = _mm_movelh_ps( a_v, c_v ); + c_v = _mm_movelh_ps( t, u ); + d_v = _mm_movehl_ps( u, t ); + _mm_store_ps( (float *)a0, a_v ); + _mm_store_ps( (float *)a1, b_v ); + _mm_store_ps( (float *)a2, c_v ); + _mm_store_ps( (float *)a3, d_v ); + } + + ////////////// + // v4int class + + class v4int : public v4 { + + // v4int prefix unary operator friends + + friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator -( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator ~( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator !( const v4int & a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v4int prefix increment / decrement operator friends + + friend inline v4int operator ++( v4int & a ) ALWAYS_INLINE; + friend inline v4int operator --( v4int & a ) ALWAYS_INLINE; + + // v4int postfix increment / decrement operator friends + + friend inline v4int operator ++( v4int & a, int ) ALWAYS_INLINE; + friend inline v4int operator --( v4int & a, int ) ALWAYS_INLINE; + + // v4int binary operator friends + + friend inline v4int operator +( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator -( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator *( const v4int &a, const v4int &b ) 
ALWAYS_INLINE; + friend inline v4int operator /( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator %( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ^( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator &( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator |( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator <<( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >>( const v4int &a, const v4int &b ) ALWAYS_INLINE; + + // v4int logical operator friends + + friend inline v4int operator <( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4int &a, const v4int &b ) ALWAYS_INLINE; + + // v4int miscellaneous friends + + friend inline v4int abs( const v4int &a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + // FIXME: cswap, notcswap! + friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) ALWAYS_INLINE; + + // v4float unary operator friends + + friend inline v4int operator !( const v4float & a ) ALWAYS_INLINE; + + // v4float logical operator friends + + friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + + // v4float miscellaneous friends + + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + + public: + + // v4int constructors / destructors + + v4int() {} // Default constructor + v4int( const v4int &a ) { v = a.v; } // Copy constructor + v4int( const v4 &a ) { v = a.v; } // Init from mixed + v4int( int a ) { // Init from scalar + union { int i; float f; } u; + u.i = a; + v = _mm_set1_ps( u.f ); + } + v4int( int i0, int i1, int i2, int i3 ) { // Init from scalars + union { int i; float f; } u0, u1, u2, u3; + u0.i = i0; u1.i = i1; u2.i = i2; u3.i = i3; + v = _mm_setr_ps( u0.f, u1.f, u2.f, u3.f ); + } + ~v4int() {}; // Destructor + + // v4int assignment operators + +# define ASSIGN(op) \ + inline v4int &operator op( const v4int &b ) { \ + i[0] op b.i[0]; \ + i[1] op b.i[1]; \ + i[2] op b.i[2]; \ + i[3] op b.i[3]; \ + return *this; \ + } + + 
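// [Editor's note, not part of the original patch] Observation on the v4int
// assignment operators: only plain =, ^=, &=, and |= below operate directly on
// the __m128 register via _mm_xor_ps / _mm_and_ps / _mm_or_ps, while the
// arithmetic and shift compound assignments (defined through the ASSIGN macro
// above) fall back to per-lane scalar work through the union, presumably
// because plain SSE has no packed integer arithmetic on __m128 (that would
// require SSE2 and __m128i). The stated rationale is an assumption.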
inline v4int &operator =(const v4int &b) { + v = b.v; + return *this; + } + + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + ASSIGN(%=) + + inline v4int &operator ^=(const v4int &b) { + v = _mm_xor_ps( v, b.v ); + return *this; + } + + inline v4int &operator &=(const v4int &b) { + v = _mm_and_ps( v, b.v ); + return *this; + } + + inline v4int &operator |=(const v4int &b) { + v = _mm_or_ps( v, b.v ); + return *this; + } + + ASSIGN(<<=) + ASSIGN(>>=) + +# undef ASSIGN + + // v4int member access operator + + inline int &operator []( int n ) { return i[n]; } + inline int operator ()( int n ) { return i[n]; } + + }; + + // v4int prefix unary operators + +# define PREFIX_UNARY(op) \ + inline v4int operator op( const v4int & a ) { \ + v4int b; \ + b.i[0] = (op a.i[0]); \ + b.i[1] = (op a.i[1]); \ + b.i[2] = (op a.i[2]); \ + b.i[3] = (op a.i[3]); \ + return b; \ + } + + inline v4int operator +( const v4int & a ) { + v4int b; + b.v = a.v; + return b; + } + + PREFIX_UNARY(-) + + inline v4int operator !( const v4int & a ) { + v4int b; + b.i[0] = -(!a.i[0]); + b.i[1] = -(!a.i[1]); + b.i[2] = -(!a.i[2]); + b.i[3] = -(!a.i[3]); + return b; + } + + inline v4int operator ~( const v4int & a ) { + v4int b; + union { int i; float f; } u; + u.i = -1; + b.v = _mm_xor_ps( a.v, _mm_set1_ps( u.f ) ); + return b; + } + +# undef PREFIX_UNARY + + // v4int prefix increment / decrement + +# define PREFIX_INCDEC(op) \ + inline v4int operator op( v4int & a ) { \ + v4int b; \ + b.i[0] = (op a.i[0]); \ + b.i[1] = (op a.i[1]); \ + b.i[2] = (op a.i[2]); \ + b.i[3] = (op a.i[3]); \ + return b; \ + } + + PREFIX_INCDEC(++) + PREFIX_INCDEC(--) + +# undef PREFIX_INCDEC + + // v4int postfix increment / decrement + +# define POSTFIX_INCDEC(op) \ + inline v4int operator op( v4int & a, int ) { \ + v4int b; \ + b.i[0] = (a.i[0] op); \ + b.i[1] = (a.i[1] op); \ + b.i[2] = (a.i[2] op); \ + b.i[3] = (a.i[3] op); \ + return b; \ + } + + POSTFIX_INCDEC(++) + POSTFIX_INCDEC(--) + +# undef POSTFIX_INCDEC + + // v4int binary operators + +# define BINARY(op) \ + inline v4int operator op( const v4int &a, const v4int &b ) { \ + v4int c; \ + c.i[0] = a.i[0] op b.i[0]; \ + c.i[1] = a.i[1] op b.i[1]; \ + c.i[2] = a.i[2] op b.i[2]; \ + c.i[3] = a.i[3] op b.i[3]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + BINARY(%) + + inline v4int operator ^( const v4int &a, const v4int &b ) { + v4int c; + c.v = _mm_xor_ps( a.v, b.v ); + return c; + } + + inline v4int operator &( const v4int &a, const v4int &b ) { + v4int c; + c.v = _mm_and_ps( a.v, b.v ); + return c; + } + + inline v4int operator |( const v4int &a, const v4int &b ) { + v4int c; + c.v = _mm_or_ps( a.v, b.v ); + return c; + } + + BINARY(<<) + BINARY(>>) + +# undef BINARY + + // v4int logical operators + +# define LOGICAL(op) \ + inline v4int operator op( const v4int &a, const v4int &b ) { \ + v4int c; \ + c.i[0] = -(a.i[0] op b.i[0]); \ + c.i[1] = -(a.i[1] op b.i[1]); \ + c.i[2] = -(a.i[2] op b.i[2]); \ + c.i[3] = -(a.i[3] op b.i[3]); \ + return c; \ + } + + LOGICAL(<) + LOGICAL(>) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v4int miscellaneous functions + + inline v4int abs( const v4int &a ) { + v4int b; + b.i[0] = (a.i[0]>=0) ? a.i[0] : -a.i[0]; + b.i[1] = (a.i[1]>=0) ? a.i[1] : -a.i[1]; + b.i[2] = (a.i[2]>=0) ? a.i[2] : -a.i[2]; + b.i[3] = (a.i[3]>=0) ? 
a.i[3] : -a.i[3]; + return b; + } + + inline v4 czero( const v4int &c, const v4 &a ) { + v4 b; + b.v = _mm_andnot_ps(c.v,a.v); + return b; + } + + inline v4 notczero( const v4int &c, const v4 &a ) { + v4 b; + b.v = _mm_and_ps(c.v,a.v); + return b; + } + + inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) { + __m128 c_v = c.v; + v4 tf; + tf.v = _mm_or_ps(_mm_andnot_ps(c_v,f.v),_mm_and_ps(c_v,t.v)); + return tf; + } + + //////////////// + // v4float class + + class v4float : public v4 { + + // v4float prefix unary operator friends + + friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator -( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator ~( const v4float &a ) ALWAYS_INLINE; + friend inline v4int operator !( const v4float &a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v4float prefix increment / decrement operator friends + + friend inline v4float operator ++( v4float &a ) ALWAYS_INLINE; + friend inline v4float operator --( v4float &a ) ALWAYS_INLINE; + + // v4float postfix increment / decrement operator friends + + friend inline v4float operator ++( v4float &a, int ) ALWAYS_INLINE; + friend inline v4float operator --( v4float &a, int ) ALWAYS_INLINE; + + // v4float binary operator friends + + friend inline v4float operator +( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator -( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator *( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator /( const v4float &a, const v4float &b ) ALWAYS_INLINE; + + // v4float logical operator friends + + friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + + // v4float math library friends + +# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE +# define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + const v4float &b ) ALWAYS_INLINE + + CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); + CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); + CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); + CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); + CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); + + CMATH_FR2(copysign); + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v4float miscellaneous friends + + friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rsqrt( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rcp( const v4float &a ) ALWAYS_INLINE; + friend inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) 
ALWAYS_INLINE; + friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + // FIXME: crack + friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; + + public: + + // v4float constructors / destructors + + v4float() {} // Default constructor + v4float( const v4float &a ) { v = a.v; } // Copy constructor + v4float( const v4 &a ) { v = a.v; } // Init from mixed + v4float( float a ) { // Init from scalar + v = _mm_set1_ps( a ); + } + v4float( float f0, float f1, float f2, float f3 ) { // Init from scalars + v = _mm_setr_ps( f0, f1, f2, f3 ); + } + ~v4float() {} // Destructor + + // v4float assignment operators + +# define ASSIGN(op,intrin) \ + inline v4float &operator op(const v4float &b) { \ + v = intrin(v,b.v); \ + return *this; \ + } + + inline v4float &operator =(const v4float &b) { + v = b.v; + return *this; + } + + ASSIGN(+=,_mm_add_ps) + ASSIGN(-=,_mm_sub_ps) + ASSIGN(*=,_mm_mul_ps) + ASSIGN(/=,_mm_div_ps) + +# undef ASSIGN + + // v4float member access operator + + inline float &operator []( int n ) { return f[n]; } + inline float operator ()( int n ) { return f[n]; } + + }; + + // v4float prefix unary operators + + inline v4float operator +( const v4float &a ) { + v4float b; + b.v = a.v; + return b; + } + + inline v4float operator -( const v4float &a ) { + v4float b; + b.v = _mm_sub_ps(_mm_setzero_ps(),a.v); + return b; + } + + inline v4int operator !( const v4float &a ) { + v4int b; + b.v = _mm_cmpeq_ps(_mm_setzero_ps(),a.v); + return b; + } + + // v4float prefix increment / decrement operators + + inline v4float operator ++( v4float &a ) { + v4float b; + __m128 t = _mm_add_ps( a.v, _mm_set1_ps( 1 ) ); + a.v = t; + b.v = t; + return b; + } + + inline v4float operator --( v4float &a ) { + v4float b; + __m128 t = _mm_sub_ps( a.v, _mm_set1_ps( 1 ) ); + a.v = t; + b.v = t; + return b; + } + + // v4float postfix increment / decrement operators + + inline v4float operator ++( v4float &a, int ) { + v4float b; + __m128 a_v = a.v; + a.v = _mm_add_ps( a_v, _mm_set1_ps( 1 ) ); + b.v = a_v; + return b; + } + + inline v4float operator --( v4float &a, int ) { + v4float b; + __m128 a_v = a.v; + a.v = _mm_sub_ps(a_v, _mm_set1_ps( 1 ) ); + b.v = a_v; + return b; + } + + // v4float binary operators + +# define BINARY(op,intrin) \ + inline v4float operator op( const v4float &a, const v4float &b ) { \ + v4float c; \ + c.v = intrin(a.v,b.v); \ + return c; \ + } + + BINARY(+,_mm_add_ps) + BINARY(-,_mm_sub_ps) + BINARY(*,_mm_mul_ps) + BINARY(/,_mm_div_ps) + +# undef BINARY + + // v4float logical operators + +# define LOGICAL(op,intrin) \ + inline v4int operator op( const v4float &a, const v4float &b ) { \ + v4int c; \ + c.v = intrin(a.v,b.v); \ + return c; \ + } + + LOGICAL(<, _mm_cmplt_ps ) + LOGICAL(>, _mm_cmpgt_ps ) + LOGICAL(==,_mm_cmpeq_ps ) + LOGICAL(!=,_mm_cmpneq_ps) + LOGICAL(<=,_mm_cmple_ps ) + LOGICAL(>=,_mm_cmpge_ps ) + + inline v4int operator &&( const v4float &a, const v4float &b ) { + v4int c; + __m128 vzero = 
_mm_setzero_ps(); + c.v = _mm_and_ps(_mm_cmpneq_ps(a.v,vzero),_mm_cmpneq_ps(b.v,vzero)); + return c; + } + + inline v4int operator ||( const v4float &a, const v4float &b ) { + v4int c; + __m128 vzero = _mm_setzero_ps(); + c.v = _mm_or_ps(_mm_cmpneq_ps(a.v,vzero),_mm_cmpneq_ps(b.v,vzero)); + return c; + } + +# undef LOGICAL + + // v4float math library functions + +# define CMATH_FR1(fn) \ + inline v4float fn( const v4float &a ) { \ + v4float b; \ + b.f[0] = ::fn(a.f[0]); \ + b.f[1] = ::fn(a.f[1]); \ + b.f[2] = ::fn(a.f[2]); \ + b.f[3] = ::fn(a.f[3]); \ + return b; \ + } + +# define CMATH_FR2(fn) \ + inline v4float fn( const v4float &a, const v4float &b ) { \ + v4float c; \ + c.f[0] = ::fn(a.f[0],b.f[0]); \ + c.f[1] = ::fn(a.f[1],b.f[1]); \ + c.f[2] = ::fn(a.f[2],b.f[2]); \ + c.f[3] = ::fn(a.f[3],b.f[3]); \ + return c; \ + } + + CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) + CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) + /*CMATH_FR1(fabs)*/ CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) + CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) + /*CMATH_FR1(sqrt)*/ CMATH_FR1(tan) CMATH_FR1(tanh) + + inline v4float fabs( const v4float &a ) { + v4float b; + b.v = _mm_andnot_ps( _mm_set1_ps( -0.f ), a.v ); + return b; + } + + inline v4float sqrt( const v4float &a ) { + v4float b; + b.v = _mm_sqrt_ps(a.v); + return b; + } + + inline v4float copysign( const v4float &a, const v4float &b ) { + v4float c; + __m128 t = _mm_set1_ps( -0.f ); + c.v = _mm_or_ps( _mm_and_ps( t, b.v ), _mm_andnot_ps( t, a.v ) ); + return c; + } + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v4float miscelleanous functions + + inline v4float rsqrt_approx( const v4float &a ) { + v4float b; + b.v = _mm_rsqrt_ps(a.v); + return b; + } + + inline v4float rsqrt( const v4float &a ) { + v4float b; + __m128 a_v = a.v, b_v; + b_v = _mm_rsqrt_ps(a_v); + // Note: It is quicker to just call div_ps and sqrt_ps if more + // refinement desired! 
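The statement that follows is one Newton-Raphson refinement of the roughly 12-bit _mm_rsqrt_ps estimate: with y an initial guess for 1/sqrt(a), the update y + 0.5*(y - a*y*y*y), which equals 0.5*y*(3 - a*y*y), approximately doubles the number of correct bits. A scalar sketch of the same step (the function name is illustrative, not part of this header):

    inline float rsqrt_refine( float a, float y )  // y: coarse estimate of 1/sqrt(a)
    {
      return y + 0.5f*( y - a*y*y*y );             // == 0.5f*y*( 3.0f - a*y*y )
    }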
+ b.v = _mm_add_ps(b_v,_mm_mul_ps(_mm_set1_ps(0.5f), + _mm_sub_ps(b_v,_mm_mul_ps(a_v, + _mm_mul_ps(b_v, + _mm_mul_ps(b_v,b_v)))))); + return b; + } + + inline v4float rcp_approx( const v4float &a ) { + v4float b; + b.v = _mm_rcp_ps(a.v); + return b; + } + + inline v4float rcp( const v4float &a ) { + v4float b; + __m128 a_v = a.v, b_v; + b_v = _mm_rcp_ps(a_v); + b.v = _mm_sub_ps(_mm_add_ps(b_v,b_v),_mm_mul_ps(a_v,_mm_mul_ps(b_v,b_v))); + return b; + } + + inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) { + v4float d; + d.v = _mm_add_ps( _mm_mul_ps( a.v, b.v ), c.v ); + return d; + } + + inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) { + v4float d; + d.v = _mm_sub_ps( _mm_mul_ps( a.v, b.v ), c.v ); + return d; + } + + inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) { + v4float d; + d.v = _mm_sub_ps( c.v, _mm_mul_ps( a.v, b.v ) ); + return d; + } + + inline v4float clear_bits( const v4int &m, const v4float &a ) { + v4float b; + b.v = _mm_andnot_ps( m.v, a.v ); + return b; + } + + inline v4float set_bits( const v4int &m, const v4float &a ) { + v4float b; + b.v = _mm_or_ps( m.v, a.v ); + return b; + } + + inline v4float toggle_bits( const v4int &m, const v4float &a ) { + v4float b; + b.v = _mm_xor_ps( m.v, a.v ); + return b; + } + + inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) { + _mm_store_ps( p, _mm_add_ps( _mm_load_ps( p ), a.v ) ); + } + + inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) { + _mm_store_ps( p, _mm_sub_ps( _mm_load_ps( p ), a.v ) ); + } + + inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) { + _mm_store_ps( p, _mm_mul_ps( _mm_load_ps( p ), a.v ) ); + } + + // Given wl = x y z w, compute: + // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) + // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) + inline void trilinear( v4float &wl, v4float &wh ) { + __m128 l = _mm_set1_ps(1), s = _mm_setr_ps(-0.f,+0.f,-0.f,+0.f); + __m128 z = wl.v, xy; + xy = _mm_add_ps( l, _mm_xor_ps( s, _mm_shuffle_ps( z,z, PERM(0,0,1,1) ) ) ); + z = _mm_add_ps( l, _mm_xor_ps( s, _mm_shuffle_ps( z,z, PERM(2,2,2,2) ) ) ); + xy = _mm_mul_ps( _mm_shuffle_ps( xy,xy, PERM(0,1,0,1) ), + _mm_shuffle_ps( xy,xy, PERM(2,2,3,3) ) ); + wl.v = _mm_mul_ps( xy, _mm_shuffle_ps( z,z, PERM(0,0,0,0) ) ); + wh.v = _mm_mul_ps( xy, _mm_shuffle_ps( z,z, PERM(1,1,1,1) ) ); + } + +# undef PERM + +} // namespace v4 + +#endif // _v4_avx_h_ diff --git a/src/util/v4/v4_avx2.h b/src/util/v4/v4_avx2.h new file mode 100644 index 00000000..abb7814f --- /dev/null +++ b/src/util/v4/v4_avx2.h @@ -0,0 +1,1442 @@ +#ifndef _v4_avx2_h_ +#define _v4_avx2_h_ + +#ifndef IN_v4_h +#error "Do not include v4_avx2.h directly; use v4.h" +#endif + +#include +#include + +#define V4_ACCELERATION +#define V4_AVX2_ACCELERATION + +#ifndef ALIGNED +#define ALIGNED(n) +#endif + +#define ALWAYS_INLINE __attribute__((always_inline)) + +namespace v4 +{ + class v4; + class v4int; + class v4float; + + template + struct permute + { + constexpr static int value = i0 + i1*4 + i2*16 + i3*64; + }; + +# define PERM(i0,i1,i2,i3) ((i0) + (i1)*4 + (i2)*16 + (i3)*64) + + //////////////// + // v4 base class + + class v4 + { + friend class v4int; + friend class v4float; + + // v4 miscellaneous friends + + friend inline int any( const v4 &a ) ALWAYS_INLINE; + friend inline int all( const v4 &a ) ALWAYS_INLINE; + + template + friend inline v4 splat( const v4 &a ) ALWAYS_INLINE; + + template + friend 
inline v4 shuffle( const v4 &a ) ALWAYS_INLINE; + + friend inline void swap( v4 &a, v4 &b ) ALWAYS_INLINE; + friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) ALWAYS_INLINE; + + // v4int miscellaneous friends + + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + + // v4 memory manipulation friends + + friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; + friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_4x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) ALWAYS_INLINE; + friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + // v4 transposed memory manipulation friends + + friend inline void load_4x1_tr( const void *a0, const void *a1, + const void *a2, const void *a3, + v4 &a ) ALWAYS_INLINE; + + friend inline void load_4x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + v4 &a, v4 &b ) ALWAYS_INLINE; + + friend inline void load_4x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; + + friend inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; + + friend inline void store_4x1_tr( const v4 &a, + void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; + + friend inline void store_4x2_tr( const v4 &a, const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) ALWAYS_INLINE; + + friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) ALWAYS_INLINE; + + friend inline void store_4x4_tr( const v4 &a, const v4 &b, + const v4 &c, const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) ALWAYS_INLINE; + + protected: + + union + { + int i[4]; + float f[4]; + __m128 v; + }; + + public: + + v4() {} // Default constructor + + v4( const v4 &a ) // Copy constructor + { + v=a.v; + } + + ~v4() {} // Default destructor + }; + + // v4 miscellaneous functions + + inline int any( const v4 &a ) + { + return a.i[0] || a.i[1] || a.i[2] || a.i[3]; + } + + inline int all( const v4 &a ) + { + return a.i[0] && a.i[1] && a.i[2] && a.i[3]; + } + + template + inline v4 splat( const v4 & a ) + { + __m128 a_v = a.v; + v4 b; + + b.v = _mm_shuffle_ps( a_v, a_v, ( n*permute<1,1,1,1>::value ) ); + + return b; + } + + template + inline v4 shuffle( const v4 & a ) + { + __m128 a_v = a.v; + v4 b; + + b.v = _mm_shuffle_ps( a_v, a_v, ( permute::value ) ); + + return b; + } + + inline void swap( v4 &a, v4 &b ) + { + __m128 a_v = a.v; + + a.v = b.v; + + b.v = a_v; + } + + inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) + { + __m128 a0_v = a0.v, a1_v = a1.v, a2_v = a2.v, a3_v = a3.v, t, u; + + t = _mm_unpackhi_ps( a0_v, a1_v ); + a0_v = _mm_unpacklo_ps( a0_v, a1_v ); + u = _mm_unpackhi_ps( a2_v, a3_v ); + a2_v 
= _mm_unpacklo_ps( a2_v, a3_v ); + + a1_v = _mm_movehl_ps( a2_v, a0_v ); + a0_v = _mm_movelh_ps( a0_v, a2_v ); + a2_v = _mm_movelh_ps( t, u ); + a3_v = _mm_movehl_ps( u, t ); + + a0.v = a0_v; + a1.v = a1_v; + a2.v = a2_v; + a3.v = a3_v; + } + + // v4 memory manipulation functions + + inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) + { + a.v = _mm_load_ps( (float *)p ); + } + + inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) + { + _mm_store_ps( (float *)p, a.v ); + } + + inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) + { + _mm_stream_ps( (float *)p, a.v ); + } + + inline void clear_4x1( void * ALIGNED(16) p ) + { + _mm_store_ps( (float *)p, _mm_setzero_ps() ); + } + + inline void copy_4x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) + { + _mm_store_ps( (float *)dst, _mm_load_ps( (const float *)src ) ); + } + + /* FIXME: MAKE ROBUST AGAINST ALIASING ISSUES */ + inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) + { + __m128 t = _mm_load_ps((float *)a); + + _mm_store_ps( (float *)a, _mm_load_ps( (float *)b ) ); + _mm_store_ps( (float *)b, t ); + } + + // v4 transposed memory manipulation functions + + inline void load_4x1_tr( const void *a0, const void *a1, + const void *a2, const void *a3, + v4 &a ) + { + a.v = _mm_setr_ps( ((const float *)a0)[0], + ((const float *)a1)[0], + ((const float *)a2)[0], + ((const float *)a3)[0] ); + } + + inline void load_4x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + v4 &a, v4 &b ) + { + __m128 a_v, b_v, t; + + b_v = _mm_setzero_ps(); + + t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a0 ), (__m64 *)a1 ); + b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a2 ), (__m64 *)a3 ); + + a_v = _mm_shuffle_ps( t, b_v, 0x88 ); + b_v = _mm_shuffle_ps( t, b_v, 0xdd ); + + a.v = a_v; + b.v = b_v; + } + + inline void load_4x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c ) + { + __m128 a_v, b_v, c_v, t, u; + + t = _mm_load_ps( (const float *)a0 ); + b_v = _mm_load_ps( (const float *)a1 ); + c_v = _mm_load_ps( (const float *)a2 ); + u = _mm_load_ps( (const float *)a3 ); + + a_v = _mm_unpacklo_ps( t, b_v ); + b_v = _mm_unpackhi_ps( t, b_v ); + t = _mm_unpacklo_ps( c_v, u ); + u = _mm_unpackhi_ps( c_v, u ); + + c_v = _mm_movelh_ps( b_v, u ); + b_v = _mm_movehl_ps( t, a_v ); + a_v = _mm_movelh_ps( a_v, t ); + + a.v = a_v; + b.v = b_v; + c.v = c_v; + } + +#if 0 + inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c, v4 &d ) { + __m128 a_v, b_v, c_v, d_v, t, u; + a_v = _mm_load_ps( (const float *)a0 ); + b_v = _mm_load_ps( (const float *)a1 ); + c_v = _mm_load_ps( (const float *)a2 ); + d_v = _mm_load_ps( (const float *)a3 ); + t = _mm_unpackhi_ps( a_v, b_v ); + a_v = _mm_unpacklo_ps( a_v, b_v ); + u = _mm_unpackhi_ps( c_v, d_v ); + c_v = _mm_unpacklo_ps( c_v, d_v ); + b_v = _mm_movehl_ps( c_v, a_v ); + a_v = _mm_movelh_ps( a_v, c_v ); + c_v = _mm_movelh_ps( t, u ); + d_v = _mm_movehl_ps( u, t ); + a.v = a_v; b.v = b_v; c.v = c_v; d.v = d_v; + } +#endif + +#if 0 + inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c, v4 &d ) { + __m128 a_v, b_v, c_v, d_v, t, u; + + a_v = _mm_load_ps( (const float *)a0 ); + 
b_v = _mm_load_ps( (const float *)a1 ); + c_v = _mm_load_ps( (const float *)a2 ); + d_v = _mm_load_ps( (const float *)a3 ); + + t = _mm_unpackhi_ps( a_v, b_v ); + a_v = _mm_unpacklo_ps( a_v, b_v ); + u = _mm_unpackhi_ps( c_v, d_v ); + c_v = _mm_unpacklo_ps( c_v, d_v ); + + b.v = _mm_movehl_ps( c_v, a_v ); + a.v = _mm_movelh_ps( a_v, c_v ); + c.v = _mm_movelh_ps( t, u ); + d.v = _mm_movehl_ps( u, t ); + } +#endif + +#if 0 + inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c, v4 &d ) { + __m128 a_v, b_v, c_v, d_v, t, u; + + a_v = _mm_load_ps( (const float *)a0 ); + b_v = _mm_load_ps( (const float *)a1 ); + c_v = _mm_load_ps( (const float *)a2 ); + d_v = _mm_load_ps( (const float *)a3 ); + + t = _mm_unpackhi_ps( a_v, b_v ); + u = _mm_unpackhi_ps( c_v, d_v ); + a_v = _mm_unpacklo_ps( a_v, b_v ); + c_v = _mm_unpacklo_ps( c_v, d_v ); + + a.v = _mm_movelh_ps( a_v, c_v ); + b.v = _mm_movehl_ps( c_v, a_v ); + d.v = _mm_movehl_ps( u, t ); + c.v = _mm_movelh_ps( t, u ); + } +#endif + + inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c, v4 &d ) + { + __m128 a_v, b_v, c_v, d_v, t, u; + + a_v = _mm_load_ps( (const float *)a0 ); + b_v = _mm_load_ps( (const float *)a1 ); + c_v = _mm_load_ps( (const float *)a2 ); + d_v = _mm_load_ps( (const float *)a3 ); + + t = _mm_unpackhi_ps( a_v, b_v ); + u = _mm_unpackhi_ps( c_v, d_v ); + a_v = _mm_unpacklo_ps( a_v, b_v ); + c_v = _mm_unpacklo_ps( c_v, d_v ); + + a.v = _mm_movelh_ps( a_v, c_v ); + c.v = _mm_movelh_ps( t, u ); + b.v = _mm_movehl_ps( c_v, a_v ); + d.v = _mm_movehl_ps( u, t ); + } + + inline void store_4x1_tr( const v4 &a, + void *a0, void *a1, + void *a2, void *a3 ) + { + ((float *)a0)[0] = a.f[0]; + ((float *)a1)[0] = a.f[1]; + ((float *)a2)[0] = a.f[2]; + ((float *)a3)[0] = a.f[3]; + } + + inline void store_4x2_tr( const v4 &a, const v4 &b, + void * ALIGNED(8) a0, void * ALIGNED(8) a1, + void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) + { + __m128 a_v = a.v, b_v = b.v, t; + + t = _mm_unpacklo_ps( a_v, b_v ); // a0 b0 a1 b1 -> t + + _mm_storel_pi( (__m64 *)a0, t ); // a0 b0 -> a0 + _mm_storeh_pi( (__m64 *)a1, t ); // a1 b1 -> a1 + + t = _mm_unpackhi_ps( a_v, b_v ); // a2 b2 a3 b3 -> t + + _mm_storel_pi( (__m64 *)a2, t ); // a2 b2 -> a2 + _mm_storeh_pi( (__m64 *)a3, t ); // a3 b3 -> a3 + } + + inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + { + __m128 a_v = a.v, b_v = b.v, t; + + t = _mm_unpacklo_ps( a_v, b_v ); // a0 b0 a1 b1 -> t + + _mm_storel_pi( (__m64 *)a0, t ); // a0 b0 -> a0 + _mm_storeh_pi( (__m64 *)a1, t ); // a1 b1 -> a1 + + t = _mm_unpackhi_ps( a_v, b_v ); // a2 b2 a3 b3 -> t + + _mm_storel_pi( (__m64 *)a2, t ); // a2 b2 -> a2 + _mm_storeh_pi( (__m64 *)a3, t ); // a3 b3 -> a3 + + ((float *)a0)[2] = c.f[0]; + ((float *)a1)[2] = c.f[1]; + ((float *)a2)[2] = c.f[2]; + ((float *)a3)[2] = c.f[3]; + } + + // FIXME: IS THIS FASTER THAN THE OLD WAY (HAD MORE STORE INSTR) + inline void store_4x4_tr( const v4 &a, const v4 &b, + const v4 &c, const v4 &d, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + { + __m128 a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; + + t = _mm_unpackhi_ps( a_v, b_v ); + a_v = _mm_unpacklo_ps( a_v, b_v ); + u = _mm_unpackhi_ps( 
c_v, d_v ); + c_v = _mm_unpacklo_ps( c_v, d_v ); + + b_v = _mm_movehl_ps( c_v, a_v ); + a_v = _mm_movelh_ps( a_v, c_v ); + c_v = _mm_movelh_ps( t, u ); + d_v = _mm_movehl_ps( u, t ); + + _mm_store_ps( (float *)a0, a_v ); + _mm_store_ps( (float *)a1, b_v ); + _mm_store_ps( (float *)a2, c_v ); + _mm_store_ps( (float *)a3, d_v ); + } + + ////////////// + // v4int class + + class v4int : public v4 + { + // v4int prefix unary operator friends + + friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator -( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator ~( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator !( const v4int & a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v4int prefix increment / decrement operator friends + + friend inline v4int operator ++( v4int & a ) ALWAYS_INLINE; + friend inline v4int operator --( v4int & a ) ALWAYS_INLINE; + + // v4int postfix increment / decrement operator friends + + friend inline v4int operator ++( v4int & a, int ) ALWAYS_INLINE; + friend inline v4int operator --( v4int & a, int ) ALWAYS_INLINE; + + // v4int binary operator friends + + friend inline v4int operator +( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator -( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator *( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator /( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator %( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ^( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator &( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator |( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator <<( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >>( const v4int &a, const v4int &b ) ALWAYS_INLINE; + + // v4int logical operator friends + + friend inline v4int operator <( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4int &a, const v4int &b ) ALWAYS_INLINE; + + // v4int miscellaneous friends + + friend inline v4int abs( const v4int &a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + // FIXME: cswap, notcswap! 
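Because the v4int and v4float comparison operators yield all-ones or all-zeros per lane, the czero, notczero and merge friends declared here give the usual branch-free lane selection. A minimal usage sketch (values are illustrative):

    v4::v4float a( 1, 2, 3, 4 ), b( 4, 3, 2, 1 );
    v4::v4int   m  = ( a < b );             // all-ones lanes where a[i] < b[i]
    v4::v4float lo = v4::merge( m, a, b );  // takes a where the mask is set: per-lane min
    v4::v4float z  = v4::czero( m, a );     // zeroes the lanes of a where the mask is set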
+ friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) ALWAYS_INLINE; + + // v4float unary operator friends + + friend inline v4int operator !( const v4float & a ) ALWAYS_INLINE; + + // v4float logical operator friends + + friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + + // v4float miscellaneous friends + + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + + public: + + // v4int constructors / destructors + + v4int() {} // Default constructor + + v4int( const v4int &a ) // Copy constructor + { + v = a.v; + } + + v4int( const v4 &a ) // Init from mixed + { + v = a.v; + } + + v4int( int a ) // Init from scalar + { + union + { + int i; + float f; + } u; + + u.i = a; + v = _mm_set1_ps( u.f ); + } + + v4int( int i0, int i1, int i2, int i3 ) // Init from scalars + { + union + { + int i; + float f; + } u0, u1, u2, u3; + + u0.i = i0; + u1.i = i1; + u2.i = i2; + u3.i = i3; + + v = _mm_setr_ps( u0.f, u1.f, u2.f, u3.f ); + } + + ~v4int() {} // Destructor + + // v4int assignment operators + +# define ASSIGN(op) \ + inline v4int &operator op( const v4int &b ) \ + { \ + i[0] op b.i[0]; \ + i[1] op b.i[1]; \ + i[2] op b.i[2]; \ + i[3] op b.i[3]; \ + return *this; \ + } + + inline v4int &operator =( const v4int &b ) + { + v = b.v; + + return *this; + } + + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + ASSIGN(%=) + + inline v4int &operator ^=( const v4int &b ) + { + v = _mm_xor_ps( v, b.v ); + + return *this; + } + + inline v4int &operator &=( const v4int &b ) + { + v = _mm_and_ps( v, b.v ); + + return *this; + } + + inline v4int &operator |=( const v4int &b ) + { + v = _mm_or_ps( v, b.v ); + + return *this; + } + + ASSIGN(<<=) + ASSIGN(>>=) + +# undef ASSIGN + + // v4int member access operator + + inline int &operator []( int n ) + { + return i[n]; + } + + inline int operator ()( int n ) + { + return i[n]; + } + }; + + // v4int prefix unary operators + +# define PREFIX_UNARY(op) \ + inline v4int operator op( const v4int & a ) \ + { \ + v4int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + return b; \ + } + + inline v4int operator +( const v4int & a ) + { + v4int b; + + b.v = a.v; + + return b; + } + + PREFIX_UNARY(-) + + inline v4int operator !( const v4int & a ) + { + v4int b; + + b.i[0] = - ( !a.i[0] ); + b.i[1] = - ( !a.i[1] ); + b.i[2] = - ( !a.i[2] ); + b.i[3] = - ( !a.i[3] ); + + return b; + } + + inline v4int operator ~( const v4int & a ) + { + v4int b; + + union + { + int i; + float f; + } u; + + u.i = -1; + b.v = _mm_xor_ps( a.v, _mm_set1_ps( u.f ) ); + + return b; + } + +# undef PREFIX_UNARY + + // v4int prefix increment / decrement + +# define PREFIX_INCDEC(op) \ + 
inline v4int operator op( v4int & a ) \ + { \ + v4int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + return b; \ + } + + PREFIX_INCDEC(++) + PREFIX_INCDEC(--) + +# undef PREFIX_INCDEC + + // v4int postfix increment / decrement + +# define POSTFIX_INCDEC(op) \ + inline v4int operator op( v4int & a, int ) \ + { \ + v4int b; \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ + return b; \ + } + + POSTFIX_INCDEC(++) + POSTFIX_INCDEC(--) + +# undef POSTFIX_INCDEC + + // v4int binary operators + +# define BINARY(op) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ + v4int c; \ + c.i[0] = a.i[0] op b.i[0]; \ + c.i[1] = a.i[1] op b.i[1]; \ + c.i[2] = a.i[2] op b.i[2]; \ + c.i[3] = a.i[3] op b.i[3]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + BINARY(%) + + inline v4int operator ^( const v4int &a, const v4int &b ) + { + v4int c; + + c.v = _mm_xor_ps( a.v, b.v ); + + return c; + } + + inline v4int operator &( const v4int &a, const v4int &b ) + { + v4int c; + + c.v = _mm_and_ps( a.v, b.v ); + + return c; + } + + inline v4int operator |( const v4int &a, const v4int &b ) + { + v4int c; + + c.v = _mm_or_ps( a.v, b.v ); + + return c; + } + + BINARY(<<) + BINARY(>>) + +# undef BINARY + + // v4int logical operators + +# define LOGICAL(op) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ + v4int c; \ + c.i[0] = - ( a.i[0] op b.i[0] ); \ + c.i[1] = - ( a.i[1] op b.i[1] ); \ + c.i[2] = - ( a.i[2] op b.i[2] ); \ + c.i[3] = - ( a.i[3] op b.i[3] ); \ + return c; \ + } + + LOGICAL(<) + LOGICAL(>) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v4int miscellaneous functions + + inline v4int abs( const v4int &a ) + { + v4int b; + + b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; + b.i[1] = ( a.i[1] >= 0 ) ? a.i[1] : -a.i[1]; + b.i[2] = ( a.i[2] >= 0 ) ? a.i[2] : -a.i[2]; + b.i[3] = ( a.i[3] >= 0 ) ? 
a.i[3] : -a.i[3]; + + return b; + } + + inline v4 czero( const v4int &c, const v4 &a ) + { + v4 b; + + b.v = _mm_andnot_ps( c.v, a.v ); + + return b; + } + + inline v4 notczero( const v4int &c, const v4 &a ) + { + v4 b; + + b.v = _mm_and_ps( c.v, a.v ); + + return b; + } + + inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) + { + __m128 c_v = c.v; + v4 tf; + + tf.v = _mm_or_ps( _mm_andnot_ps( c_v, f.v ), + _mm_and_ps( c_v, t.v ) ); + + return tf; + } + + //////////////// + // v4float class + + class v4float : public v4 + { + // v4float prefix unary operator friends + + friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator -( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator ~( const v4float &a ) ALWAYS_INLINE; + friend inline v4int operator !( const v4float &a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v4float prefix increment / decrement operator friends + + friend inline v4float operator ++( v4float &a ) ALWAYS_INLINE; + friend inline v4float operator --( v4float &a ) ALWAYS_INLINE; + + // v4float postfix increment / decrement operator friends + + friend inline v4float operator ++( v4float &a, int ) ALWAYS_INLINE; + friend inline v4float operator --( v4float &a, int ) ALWAYS_INLINE; + + // v4float binary operator friends + + friend inline v4float operator +( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator -( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator *( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator /( const v4float &a, const v4float &b ) ALWAYS_INLINE; + + // v4float logical operator friends + + friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + + // v4float math library friends + +# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE +# define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + const v4float &b ) ALWAYS_INLINE + + CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); + CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); + CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); + CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); + CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); + + CMATH_FR2(copysign); + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v4float miscellaneous friends + + friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rsqrt ( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rcp ( const v4float &a ) ALWAYS_INLINE; + friend inline v4float fma ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fms ( const v4float &a, const 
v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; + + public: + + // v4float constructors / destructors + + v4float() {} // Default constructor + + v4float( const v4float &a ) // Copy constructor + { + v = a.v; + } + + v4float( const v4 &a ) // Init from mixed + { + v = a.v; + } + + v4float( float a ) // Init from scalar + { + v = _mm_set1_ps( a ); + } + + v4float( float f0, float f1, float f2, float f3 ) // Init from scalars + { + v = _mm_setr_ps( f0, f1, f2, f3 ); + } + + ~v4float() {} // Destructor + + // v4float assignment operators + +# define ASSIGN(op,intrin) \ + inline v4float &operator op( const v4float &b ) \ + { \ + v = intrin( v, b.v ); \ + return *this; \ + } + + inline v4float &operator =( const v4float &b ) + { + v = b.v; + + return *this; + } + + ASSIGN( +=, _mm_add_ps ) + ASSIGN( -=, _mm_sub_ps ) + ASSIGN( *=, _mm_mul_ps ) + ASSIGN( /=, _mm_div_ps ) + +# undef ASSIGN + + // v4float member access operator + + inline float &operator []( int n ) + { + return f[n]; + } + + inline float operator ()( int n ) + { + return f[n]; + } + }; + + // v4float prefix unary operators + + inline v4float operator +( const v4float &a ) + { + v4float b; + + b.v = a.v; + + return b; + } + + inline v4float operator -( const v4float &a ) + { + v4float b; + + b.v = _mm_sub_ps( _mm_setzero_ps(), a.v ); + + return b; + } + + inline v4int operator !( const v4float &a ) + { + v4int b; + + b.v = _mm_cmpeq_ps( _mm_setzero_ps(), a.v ); + + return b; + } + + // v4float prefix increment / decrement operators + + inline v4float operator ++( v4float &a ) + { + v4float b; + + __m128 t = _mm_add_ps( a.v, _mm_set1_ps( 1 ) ); + + a.v = t; + b.v = t; + + return b; + } + + inline v4float operator --( v4float &a ) + { + v4float b; + + __m128 t = _mm_sub_ps( a.v, _mm_set1_ps( 1 ) ); + + a.v = t; + b.v = t; + + return b; + } + + // v4float postfix increment / decrement operators + + inline v4float operator ++( v4float &a, int ) + { + v4float b; + + __m128 a_v = a.v; + + a.v = _mm_add_ps( a_v, _mm_set1_ps( 1 ) ); + b.v = a_v; + + return b; + } + + inline v4float operator --( v4float &a, int ) + { + v4float b; + + __m128 a_v = a.v; + + a.v = _mm_sub_ps( a_v, _mm_set1_ps( 1 ) ); + b.v = a_v; + + return b; + } + + // v4float binary operators + +# define BINARY(op,intrin) \ + inline v4float operator op( const v4float &a, const v4float &b ) \ + { \ + v4float c; \ + c.v = intrin( a.v, b.v ); \ + return c; \ + } + + BINARY( +, _mm_add_ps ) + BINARY( -, _mm_sub_ps ) + BINARY( *, _mm_mul_ps ) + BINARY( /, _mm_div_ps ) + +# undef BINARY + + // v4float logical operators + +# define LOGICAL(op,intrin) \ + inline v4int operator op( const v4float &a, const v4float &b ) \ + { \ + v4int c; \ + c.v = intrin( a.v, b.v ); \ + return c; \ + } + + LOGICAL( <, _mm_cmplt_ps ) + LOGICAL( >, _mm_cmpgt_ps ) + LOGICAL( ==, _mm_cmpeq_ps ) + LOGICAL( !=, 
_mm_cmpneq_ps ) + LOGICAL( <=, _mm_cmple_ps ) + LOGICAL( >=, _mm_cmpge_ps ) + + inline v4int operator &&( const v4float &a, const v4float &b ) + { + v4int c; + + __m128 vzero = _mm_setzero_ps(); + + c.v = _mm_and_ps( _mm_cmpneq_ps( a.v, vzero ), + _mm_cmpneq_ps( b.v, vzero ) ); + + return c; + } + + inline v4int operator ||( const v4float &a, const v4float &b ) + { + v4int c; + + __m128 vzero = _mm_setzero_ps(); + + c.v = _mm_or_ps( _mm_cmpneq_ps( a.v, vzero ), + _mm_cmpneq_ps( b.v, vzero ) ); + + return c; + } + +# undef LOGICAL + + // v4float math library functions + +# define CMATH_FR1(fn) \ + inline v4float fn( const v4float &a ) \ + { \ + v4float b; \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ + return b; \ + } + +# define CMATH_FR2(fn) \ + inline v4float fn( const v4float &a, const v4float &b ) \ + { \ + v4float c; \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ + return c; \ + } + + CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) + CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) + /*CMATH_FR1(fabs)*/ CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) + CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) + /*CMATH_FR1(sqrt)*/ CMATH_FR1(tan) CMATH_FR1(tanh) + + inline v4float fabs( const v4float &a ) + { + v4float b; + + b.v = _mm_andnot_ps( _mm_set1_ps( -0.0f ), a.v ); + + return b; + } + + inline v4float sqrt( const v4float &a ) + { + v4float b; + + b.v = _mm_sqrt_ps( a.v ); + + return b; + } + + inline v4float copysign( const v4float &a, const v4float &b ) + { + v4float c; + + __m128 t = _mm_set1_ps( -0.0f ); + + c.v = _mm_or_ps( _mm_and_ps( t, b.v ), + _mm_andnot_ps( t, a.v ) ); + + return c; + } + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v4float miscelleanous functions + + inline v4float rsqrt_approx( const v4float &a ) + { + v4float b; + + b.v = _mm_rsqrt_ps( a.v ); + + return b; + } + + #if 0 + inline v4float rsqrt( const v4float &a ) + { + v4float b; + + __m128 a_v = a.v, b_v; + + b_v = _mm_rsqrt_ps( a_v ); + + // Note: It is quicker to just call div_ps and sqrt_ps if more + // refinement desired! + b.v = _mm_add_ps( b_v, _mm_mul_ps( _mm_set1_ps( 0.5f ), + _mm_sub_ps( b_v, + _mm_mul_ps( a_v, + _mm_mul_ps( b_v, + _mm_mul_ps( b_v, b_v ) + ) + ) + ) + ) + ); + + return b; + } + #endif + + #if 0 + inline v4float rsqrt( const v4float &a ) + { + v4float b; + + __m128 a_v = a.v, b_v; + + b_v = _mm_rsqrt_ps( a_v ); + + // Note: It is quicker to just call div_ps and sqrt_ps if more + // refinement desired! + + b.v = _mm_fmadd_ps( _mm_set1_ps( 0.5f ), + _mm_fnmadd_ps( a_v, + _mm_mul_ps( b_v, + _mm_mul_ps( b_v, b_v ) ), + b_v ), + b_v ); + + return b; + } + #endif + + inline v4float rsqrt( const v4float &a ) + { + v4float b; + + __m128 b_v; + + b_v = _mm_rsqrt_ps( a.v ); + + // Note: It is quicker to just call div_ps and sqrt_ps if more + // refinement desired! 
+ + b.v = _mm_fmadd_ps( _mm_set1_ps( 0.5f ), + _mm_fnmadd_ps( a.v, + _mm_mul_ps( b_v, + _mm_mul_ps( b_v, b_v ) ), + b_v ), + b_v ); + + return b; + } + + inline v4float rcp_approx( const v4float &a ) + { + v4float b; + + b.v = _mm_rcp_ps( a.v ); + + return b; + } + + #if 0 + inline v4float rcp( const v4float &a ) + { + v4float b; + + __m128 a_v = a.v, b_v; + + b_v = _mm_rcp_ps( a_v ); + + b.v = _mm_sub_ps( _mm_add_ps( b_v, b_v ), + _mm_mul_ps( a_v, + _mm_mul_ps( b_v, b_v ) + ) + ); + + return b; + } + #endif + + #if 0 + inline v4float rcp( const v4float &a ) + { + v4float b; + + __m128 a_v = a.v, b_v; + + b_v = _mm_rcp_ps( a_v ); + + b.v = _mm_fnmadd_ps( a_v, + _mm_mul_ps( b_v, b_v ), + _mm_add_ps( b_v, b_v ) ); + + return b; + } + #endif + + inline v4float rcp( const v4float &a ) + { + v4float b; + + __m128 b_v; + + b_v = _mm_rcp_ps( a.v ); + + b.v = _mm_fnmadd_ps( a.v, + _mm_mul_ps( b_v, b_v ), + _mm_add_ps( b_v, b_v ) ); + + return b; + } + + inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) + { + v4float d; + + d.v = _mm_fmadd_ps( a.v, b.v, c.v ); + + return d; + } + + inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) + { + v4float d; + + d.v = _mm_fmsub_ps( a.v, b.v, c.v ); + + return d; + } + + inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) + { + v4float d; + + d.v = _mm_fnmadd_ps( a.v, b.v, c.v ); + + return d; + } + + inline v4float clear_bits( const v4int &m, const v4float &a ) + { + v4float b; + + b.v = _mm_andnot_ps( m.v, a.v ); + + return b; + } + + inline v4float set_bits( const v4int &m, const v4float &a ) + { + v4float b; + + b.v = _mm_or_ps( m.v, a.v ); + + return b; + } + + inline v4float toggle_bits( const v4int &m, const v4float &a ) + { + v4float b; + + b.v = _mm_xor_ps( m.v, a.v ); + + return b; + } + + inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) + { + _mm_store_ps( p, _mm_add_ps( _mm_load_ps( p ), a.v ) ); + } + + inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) + { + _mm_store_ps( p, _mm_sub_ps( _mm_load_ps( p ), a.v ) ); + } + + inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) + { + _mm_store_ps( p, _mm_mul_ps( _mm_load_ps( p ), a.v ) ); + } + + // Given wl = x y z w, compute: + // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) + // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) + inline void trilinear( v4float &wl, v4float &wh ) + { + __m128 l = _mm_set1_ps( 1.0f ), s = _mm_setr_ps( -0.0f, +0.0f, -0.0f, +0.0f ); + __m128 z = wl.v, xy; + + xy = _mm_add_ps( l, _mm_xor_ps( s, _mm_shuffle_ps( z,z, PERM(0,0,1,1) ) ) ); + + z = _mm_add_ps( l, _mm_xor_ps( s, _mm_shuffle_ps( z,z, PERM(2,2,2,2) ) ) ); + + xy = _mm_mul_ps( _mm_shuffle_ps( xy,xy, PERM(0,1,0,1) ), + _mm_shuffle_ps( xy,xy, PERM(2,2,3,3) ) ); + + wl.v = _mm_mul_ps( xy, _mm_shuffle_ps( z,z, PERM(0,0,0,0) ) ); + + wh.v = _mm_mul_ps( xy, _mm_shuffle_ps( z,z, PERM(1,1,1,1) ) ); + } + +# undef PERM + +} // namespace v4 + +#endif // _v4_avx2_h_ diff --git a/src/util/v4/v4_portable.h b/src/util/v4/v4_portable.h index efbc0dbd..9f199697 100644 --- a/src/util/v4/v4_portable.h +++ b/src/util/v4/v4_portable.h @@ -5,6 +5,8 @@ #error "Do not include v4_portable.h directly; use v4.h" #endif +#include + #define V4_ACCELERATION #define V4_PORTABLE_ACCELERATION @@ -12,180 +14,209 @@ #define ALIGNED(n) #endif -#include - -namespace v4 { +#define ALWAYS_INLINE __attribute__((always_inline)) +namespace v4 +{ class v4; class v4int; class v4float; - + 
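The trilinear() helper that closes the v4_avx and v4_avx2 headers above builds the eight trilinear interpolation weights of a cell from the normalized offsets packed into wl = (x, y, z, w); the w lane is ignored. Written out in scalar form (an illustrative restatement, not part of the headers):

    void trilinear_scalar( float x, float y, float z, float wl[4], float wh[4] )
    {
      wl[0] = (1-x)*(1-y)*(1-z);  wl[1] = (1+x)*(1-y)*(1-z);
      wl[2] = (1-x)*(1+y)*(1-z);  wl[3] = (1+x)*(1+y)*(1-z);
      wh[0] = (1-x)*(1-y)*(1+z);  wh[1] = (1+x)*(1-y)*(1+z);
      wh[2] = (1-x)*(1+y)*(1+z);  wh[3] = (1+x)*(1+y)*(1+z);
    }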
//////////////// // v4 base class - - class v4 { - + + class v4 + { friend class v4int; friend class v4float; - - // v4 miscellenous friends - friend inline int any( const v4 &a ); - friend inline int all( const v4 &a ); + // v4 miscellaneous friends + + friend inline int any( const v4 &a ) ALWAYS_INLINE; + friend inline int all( const v4 &a ) ALWAYS_INLINE; template - friend inline v4 splat( const v4 &a ); + friend inline v4 splat( const v4 &a ) ALWAYS_INLINE; template - friend inline v4 shuffle( const v4 &a ); + friend inline v4 shuffle( const v4 &a ) ALWAYS_INLINE; - friend inline void swap( v4 &a, v4 &b ); - friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ); + friend inline void swap( v4 &a, v4 &b ) ALWAYS_INLINE; + friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4 czero( const v4int &c, const v4 &a ); - friend inline v4 notczero( const v4int &c, const v4 &a ); - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ); + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; // v4 memory manipulation friends - - friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ); - friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ); - friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ); - friend inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ); - friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ); + + friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; + friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_4x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) ALWAYS_INLINE; + friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends - // Note: Half aligned values are permissible in the 4x2_tr variants! 
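The transposed loads and stores declared below gather four structure-of-arrays vectors out of four array-of-structures records and scatter them back, so a kernel can work on four records per pass. A minimal usage sketch of the 4x4 variants (array names are illustrative; these variants require 16-byte alignment):

    alignas(16) float r0[4], r1[4], r2[4], r3[4];   // four records of four floats each
    v4::v4float x, y, z, w;
    v4::load_4x4_tr( r0, r1, r2, r3, x, y, z, w );  // x = { r0[0], r1[0], r2[0], r3[0] }, etc.
    // ... operate on x, y, z, w four records at a time ...
    v4::store_4x4_tr( x, y, z, w, r0, r1, r2, r3 ); // inverse scatter back to the records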
friend inline void load_4x1_tr( const void *a0, const void *a1, const void *a2, const void *a3, - v4 &a ); + v4 &a ) ALWAYS_INLINE; + friend inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, - v4 &a, v4 &b ); + v4 &a, v4 &b ) ALWAYS_INLINE; + friend inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ); + v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; + friend inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ); - + v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; + friend inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, void *a2, void *a3 ); + void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; + friend inline void store_4x2_tr( const v4 &a, const v4 &b, void * ALIGNED(8) a0, void * ALIGNED(8) a1, void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ); + void * ALIGNED(8) a3 ) ALWAYS_INLINE; + friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ); + void * ALIGNED(16) a3 ) ALWAYS_INLINE; + friend inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ); + void * ALIGNED(16) a3 ) ALWAYS_INLINE; protected: - union { + union + { int i[4]; float f[4]; }; - + public: v4() {} // Default constructor - v4(const v4 &a) { // Copy constructor - i[0]=a.i[0]; i[1]=a.i[1]; i[2]=a.i[2]; i[3]=a.i[3]; + + v4( const v4 &a ) // Copy constructor + { + i[0]=a.i[0]; + i[1]=a.i[1]; + i[2]=a.i[2]; + i[3]=a.i[3]; } - ~v4() {} // Default destructor + ~v4() {} // Default destructor }; - + // v4 miscellaneous functions - inline int any( const v4 &a ) { + inline int any( const v4 &a ) + { return a.i[0] || a.i[1] || a.i[2] || a.i[3]; } - - inline int all( const v4 &a ) { + + inline int all( const v4 &a ) + { return a.i[0] && a.i[1] && a.i[2] && a.i[3]; } - + template - inline v4 splat( const v4 & a ) { + inline v4 splat( const v4 & a ) + { v4 b; + b.i[0] = a.i[n]; b.i[1] = a.i[n]; b.i[2] = a.i[n]; b.i[3] = a.i[n]; + return b; } template - inline v4 shuffle( const v4 & a ) { + inline v4 shuffle( const v4 & a ) + { v4 b; + b.i[0] = a.i[i0]; b.i[1] = a.i[i1]; b.i[2] = a.i[i2]; b.i[3] = a.i[i3]; + return b; } # define sw(x,y) x^=y, y^=x, x^=y - inline void swap( v4 &a, v4 &b ) { + inline void swap( v4 &a, v4 &b ) + { sw( a.i[0], b.i[0] ); sw( a.i[1], b.i[1] ); sw( a.i[2], b.i[2] ); sw( a.i[3], b.i[3] ); } - inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) { - /**/ sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); - /**/ sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); - /**/ sw( a2.i[3],a3.i[2] ); + inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) + { + sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); + sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); + sw( a2.i[3],a3.i[2] ); } # undef sw // v4 memory manipulation functions - - inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) { + + inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) + { a.i[0] = ((const int * ALIGNED(16))p)[0]; a.i[1] = ((const int * ALIGNED(16))p)[1]; a.i[2] = ((const int * ALIGNED(16))p)[2]; a.i[3] = ((const int * ALIGNED(16))p)[3]; } - inline void store_4x1( 
const v4 &a, void * ALIGNED(16) p ) { + inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) + { ((int * ALIGNED(16))p)[0] = a.i[0]; ((int * ALIGNED(16))p)[1] = a.i[1]; ((int * ALIGNED(16))p)[2] = a.i[2]; ((int * ALIGNED(16))p)[3] = a.i[3]; } - inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) { + inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) + { ((int * ALIGNED(16))p)[0] = a.i[0]; ((int * ALIGNED(16))p)[1] = a.i[1]; ((int * ALIGNED(16))p)[2] = a.i[2]; ((int * ALIGNED(16))p)[3] = a.i[3]; } - inline void clear_4x1( void * ALIGNED(16) p ) { + inline void clear_4x1( void * ALIGNED(16) p ) + { ((int * ALIGNED(16))p)[0] = 0; ((int * ALIGNED(16))p)[1] = 0; ((int * ALIGNED(16))p)[2] = 0; @@ -194,25 +225,42 @@ namespace v4 { // FIXME: Ordering semantics inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) { + const void * ALIGNED(16) src ) + { ((int * ALIGNED(16))dst)[0] = ((const int * ALIGNED(16))src)[0]; ((int * ALIGNED(16))dst)[1] = ((const int * ALIGNED(16))src)[1]; ((int * ALIGNED(16))dst)[2] = ((const int * ALIGNED(16))src)[2]; ((int * ALIGNED(16))dst)[3] = ((const int * ALIGNED(16))src)[3]; } - inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) { + inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) + { int t; - t = ((int * ALIGNED(16))a)[0]; ((int * ALIGNED(16))a)[0] = ((int * ALIGNED(16))b)[0]; ((int * ALIGNED(16))b)[0] = t; - t = ((int * ALIGNED(16))a)[1]; ((int * ALIGNED(16))a)[1] = ((int * ALIGNED(16))b)[1]; ((int * ALIGNED(16))b)[1] = t; - t = ((int * ALIGNED(16))a)[2]; ((int * ALIGNED(16))a)[2] = ((int * ALIGNED(16))b)[2]; ((int * ALIGNED(16))b)[2] = t; - t = ((int * ALIGNED(16))a)[3]; ((int * ALIGNED(16))a)[3] = ((int * ALIGNED(16))b)[3]; ((int * ALIGNED(16))b)[3] = t; + + t = ((int * ALIGNED(16))a)[0]; + ((int * ALIGNED(16))a)[0] = ((int * ALIGNED(16))b)[0]; + ((int * ALIGNED(16))b)[0] = t; + + t = ((int * ALIGNED(16))a)[1]; + ((int * ALIGNED(16))a)[1] = ((int * ALIGNED(16))b)[1]; + ((int * ALIGNED(16))b)[1] = t; + + t = ((int * ALIGNED(16))a)[2]; + ((int * ALIGNED(16))a)[2] = ((int * ALIGNED(16))b)[2]; + ((int * ALIGNED(16))b)[2] = t; + + t = ((int * ALIGNED(16))a)[3]; + ((int * ALIGNED(16))a)[3] = ((int * ALIGNED(16))b)[3]; + ((int * ALIGNED(16))b)[3] = t; } // v4 transposed memory manipulation functions inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, v4 &a ) { + const void *a2, const void *a3, + v4 &a ) + { a.i[0] = ((const int *)a0)[0]; a.i[1] = ((const int *)a1)[0]; a.i[2] = ((const int *)a2)[0]; @@ -223,28 +271,41 @@ namespace v4 { const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, - v4 &a, v4 &b ) { - a.i[0] = ((const int * ALIGNED(8))a0)[0]; b.i[0] = ((const int * ALIGNED(8))a0)[1]; - a.i[1] = ((const int * ALIGNED(8))a1)[0]; b.i[1] = ((const int * ALIGNED(8))a1)[1]; - a.i[2] = ((const int * ALIGNED(8))a2)[0]; b.i[2] = ((const int * ALIGNED(8))a2)[1]; - a.i[3] = ((const int * ALIGNED(8))a3)[0]; b.i[3] = ((const int * ALIGNED(8))a3)[1]; + v4 &a, v4 &b ) + { + a.i[0] = ((const int * ALIGNED(8))a0)[0]; + b.i[0] = ((const int * ALIGNED(8))a0)[1]; + + a.i[1] = ((const int * ALIGNED(8))a1)[0]; + b.i[1] = ((const int * ALIGNED(8))a1)[1]; + + a.i[2] = ((const int * ALIGNED(8))a2)[0]; + b.i[2] = ((const int * ALIGNED(8))a2)[1]; + + a.i[3] = ((const int * ALIGNED(8))a3)[0]; + b.i[3] = ((const int * ALIGNED(8))a3)[1]; } - + inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * 
ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; b.i[0] = ((const int* ALIGNED(16))a0)[1]; + v4 &a, v4 &b, v4 &c ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; c.i[0] = ((const int * ALIGNED(16))a0)[2]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; b.i[1] = ((const int* ALIGNED(16))a1)[1]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; c.i[1] = ((const int * ALIGNED(16))a1)[2]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; b.i[2] = ((const int* ALIGNED(16))a2)[1]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; c.i[2] = ((const int * ALIGNED(16))a2)[2]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; b.i[3] = ((const int* ALIGNED(16))a3)[1]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; c.i[3] = ((const int * ALIGNED(16))a3)[2]; } @@ -252,22 +313,33 @@ namespace v4 { const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; b.i[0] = ((const int* ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; d.i[0] = ((const int* ALIGNED(16))a0)[3]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; b.i[1] = ((const int* ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; d.i[1] = ((const int* ALIGNED(16))a1)[3]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; b.i[2] = ((const int* ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; d.i[2] = ((const int* ALIGNED(16))a2)[3]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; b.i[3] = ((const int* ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; d.i[3] = ((const int* ALIGNED(16))a3)[3]; + v4 &a, v4 &b, v4 &c, v4 &d ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + d.i[0] = ((const int * ALIGNED(16))a0)[3]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + d.i[1] = ((const int * ALIGNED(16))a1)[3]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + d.i[2] = ((const int * ALIGNED(16))a2)[3]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + d.i[3] = ((const int * ALIGNED(16))a3)[3]; } inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, void *a2, void *a3 ) { + void *a0, void *a1, + void *a2, void *a3 ) + { ((int *)a0)[0] = a.i[0]; ((int *)a1)[0] = a.i[1]; ((int *)a2)[0] = a.i[2]; @@ -276,144 +348,188 @@ namespace v4 { inline void store_4x2_tr( const v4 &a, const v4 &b, void * ALIGNED(8) a0, void * ALIGNED(8) a1, - void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) { - ((int * ALIGNED(8))a0)[0] = a.i[0]; ((int * ALIGNED(8))a0)[1] = b.i[0]; - ((int * ALIGNED(8))a1)[0] = a.i[1]; ((int * ALIGNED(8))a1)[1] = b.i[1]; - ((int * ALIGNED(8))a2)[0] = a.i[2]; ((int * ALIGNED(8))a2)[1] = b.i[2]; - ((int * ALIGNED(8))a3)[0] = a.i[3]; ((int * ALIGNED(8))a3)[1] = b.i[3]; + void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) + { + ((int * ALIGNED(8))a0)[0] = a.i[0]; + ((int * ALIGNED(8))a0)[1] = b.i[0]; + + ((int * ALIGNED(8))a1)[0] = a.i[1]; + ((int * ALIGNED(8))a1)[1] = b.i[1]; + + ((int * ALIGNED(8))a2)[0] = a.i[2]; + 
((int * ALIGNED(8))a2)[1] = b.i[2]; + + ((int * ALIGNED(8))a3)[0] = a.i[3]; + ((int * ALIGNED(8))a3)[1] = b.i[3]; } inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) { - ((int * ALIGNED(16))a0)[0] = a.i[0]; ((int * ALIGNED(16))a0)[1] = b.i[0]; + void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * ALIGNED(16))a0)[1] = b.i[0]; ((int * ALIGNED(16))a0)[2] = c.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; ((int * ALIGNED(16))a1)[1] = b.i[1]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; ((int * ALIGNED(16))a1)[2] = c.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; ((int * ALIGNED(16))a2)[1] = b.i[2]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; ((int * ALIGNED(16))a2)[2] = c.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; ((int * ALIGNED(16))a3)[1] = b.i[3]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; ((int * ALIGNED(16))a3)[2] = c.i[3]; } - + inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) { - ((int * ALIGNED(16))a0)[0] = a.i[0]; ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; ((int * ALIGNED(16))a0)[3] = d.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; ((int * ALIGNED(16))a1)[3] = d.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; ((int * ALIGNED(16))a2)[3] = d.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; ((int * ALIGNED(16))a3)[3] = d.i[3]; + void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * ALIGNED(16))a0)[1] = b.i[0]; + ((int * ALIGNED(16))a0)[2] = c.i[0]; + ((int * ALIGNED(16))a0)[3] = d.i[0]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; + ((int * ALIGNED(16))a1)[2] = c.i[1]; + ((int * ALIGNED(16))a1)[3] = d.i[1]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; + ((int * ALIGNED(16))a2)[2] = c.i[2]; + ((int * ALIGNED(16))a2)[3] = d.i[2]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; + ((int * ALIGNED(16))a3)[2] = c.i[3]; + ((int * ALIGNED(16))a3)[3] = d.i[3]; } ////////////// // v4int class - class v4int : public v4 { - + class v4int : public v4 + { // v4int prefix unary operator friends - friend inline v4int operator +( const v4int & a ); - friend inline v4int operator -( const v4int & a ); - friend inline v4int operator ~( const v4int & a ); - friend inline v4int operator !( const v4int & a ); + friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator -( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator ~( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator !( const v4int & a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4int prefix increment / decrement operator friends - friend inline v4int operator ++( v4int & a ); - friend inline v4int operator --( v4int & a ); + friend inline v4int operator ++( v4int & a ) ALWAYS_INLINE; + friend inline v4int operator --( v4int & a ) ALWAYS_INLINE; // v4int 
postfix increment / decrement operator friends - friend inline v4int operator ++( v4int & a, int ); - friend inline v4int operator --( v4int & a, int ); + friend inline v4int operator ++( v4int & a, int ) ALWAYS_INLINE; + friend inline v4int operator --( v4int & a, int ) ALWAYS_INLINE; // v4int binary operator friends - friend inline v4int operator +( const v4int &a, const v4int &b ); - friend inline v4int operator -( const v4int &a, const v4int &b ); - friend inline v4int operator *( const v4int &a, const v4int &b ); - friend inline v4int operator /( const v4int &a, const v4int &b ); - friend inline v4int operator %( const v4int &a, const v4int &b ); - friend inline v4int operator ^( const v4int &a, const v4int &b ); - friend inline v4int operator &( const v4int &a, const v4int &b ); - friend inline v4int operator |( const v4int &a, const v4int &b ); - friend inline v4int operator <<( const v4int &a, const v4int &b ); - friend inline v4int operator >>( const v4int &a, const v4int &b ); + friend inline v4int operator +( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator -( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator *( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator /( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator %( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ^( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator &( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator |( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator <<( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >>( const v4int &a, const v4int &b ) ALWAYS_INLINE; // v4int logical operator friends - friend inline v4int operator <( const v4int &a, const v4int &b ); - friend inline v4int operator >( const v4int &a, const v4int &b ); - friend inline v4int operator ==( const v4int &a, const v4int &b ); - friend inline v4int operator !=( const v4int &a, const v4int &b ); - friend inline v4int operator <=( const v4int &a, const v4int &b ); - friend inline v4int operator >=( const v4int &a, const v4int &b ); - friend inline v4int operator &&( const v4int &a, const v4int &b ); - friend inline v4int operator ||( const v4int &a, const v4int &b ); + friend inline v4int operator <( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4int &a, const v4int &b ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4int abs( const v4int &a ); - friend inline v4 czero( const v4int &c, const v4 &a ); - friend inline v4 notczero( const v4int &c, const v4 &a ); + friend inline v4int abs( const v4int &a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! 
- friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ); + friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) ALWAYS_INLINE; // v4float unary operator friends - friend inline v4int operator !( const v4float & a ); + friend inline v4int operator !( const v4float & a ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ); - friend inline v4int operator >( const v4float &a, const v4float &b ); - friend inline v4int operator ==( const v4float &a, const v4float &b ); - friend inline v4int operator !=( const v4float &a, const v4float &b ); - friend inline v4int operator <=( const v4float &a, const v4float &b ); - friend inline v4int operator >=( const v4float &a, const v4float &b ); - friend inline v4int operator &&( const v4float &a, const v4float &b ); - friend inline v4int operator ||( const v4float &a, const v4float &b ); + friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; // v4float miscellaneous friends - friend inline v4float clear_bits( const v4int &m, const v4float &a ); - friend inline v4float set_bits( const v4int &m, const v4float &a ); - friend inline v4float toggle_bits( const v4int &m, const v4float &a ); + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; public: // v4int constructors / destructors - + v4int() {} // Default constructor - v4int( const v4int &a ) { // Copy constructor - i[0] = a.i[0]; i[1] = a.i[1]; i[2] = a.i[2]; i[3] = a.i[3]; + + v4int( const v4int &a ) // Copy constructor + { + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; } - v4int( const v4 &a ) { // Init from mixed - i[0] = a.i[0]; i[1] = a.i[1]; i[2] = a.i[2]; i[3] = a.i[3]; + + v4int( const v4 &a ) // Init from mixed + { + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; } - v4int( int a ) { // Init from scalar - i[0] = a; i[1] = a; i[2] = a; i[3] = a; + + v4int( int a ) // Init from scalar + { + i[0] = a; + i[1] = a; + i[2] = a; + i[3] = a; } - v4int( int i0, int i1, int i2, int i3 ) { // Init from scalars - i[0] = i0; i[1] = i1; i[2] = i2; i[3] = i3; + + v4int( int i0, int i1, int i2, int i3 ) // Init from scalars + { + i[0] = i0; + i[1] = i1; + i[2] = i2; + i[3] = i3; } + ~v4int() {} // Destructor - + // v4int assignment operators - + # define ASSIGN(op) \ - inline v4int &operator op( const v4int &b ) { \ + inline v4int &operator op( const v4int &b ) \ + { \ i[0] op b.i[0]; \ i[1] op b.i[1]; \ i[2] op b.i[2]; \ @@ -436,49 +552,60 @@ namespace v4 { # undef ASSIGN // v4int member access operator - - inline int &operator []( int n ) { return i[n]; } - inline int operator ()( int n ) { return i[n]; } + inline int &operator []( int n ) + { + 
return i[n]; + } + + inline int operator ()( int n ) + { + return i[n]; + } }; // v4int prefix unary operators # define PREFIX_UNARY(op) \ - inline v4int operator op( const v4int & a ) { \ + inline v4int operator op( const v4int & a ) \ + { \ v4int b; \ - b.i[0] = (op a.i[0]); \ - b.i[1] = (op a.i[1]); \ - b.i[2] = (op a.i[2]); \ - b.i[3] = (op a.i[3]); \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ return b; \ } PREFIX_UNARY(+) PREFIX_UNARY(-) - inline v4int operator !( const v4int & a ) { + inline v4int operator !( const v4int & a ) + { v4int b; - b.i[0] = -(!a.i[0]); - b.i[1] = -(!a.i[1]); - b.i[2] = -(!a.i[2]); - b.i[3] = -(!a.i[3]); + + b.i[0] = - ( !a.i[0] ); + b.i[1] = - ( !a.i[1] ); + b.i[2] = - ( !a.i[2] ); + b.i[3] = - ( !a.i[3] ); + return b; } PREFIX_UNARY(~) - + # undef PREFIX_UNARY // v4int prefix increment / decrement # define PREFIX_INCDEC(op) \ - inline v4int operator op( v4int & a ) { \ + inline v4int operator op( v4int & a ) \ + { \ v4int b; \ - b.i[0] = (op a.i[0]); \ - b.i[1] = (op a.i[1]); \ - b.i[2] = (op a.i[2]); \ - b.i[3] = (op a.i[3]); \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ return b; \ } @@ -490,12 +617,13 @@ namespace v4 { // v4int postfix increment / decrement # define POSTFIX_INCDEC(op) \ - inline v4int operator op( v4int & a, int ) { \ + inline v4int operator op( v4int & a, int ) \ + { \ v4int b; \ - b.i[0] = (a.i[0] op); \ - b.i[1] = (a.i[1] op); \ - b.i[2] = (a.i[2] op); \ - b.i[3] = (a.i[3] op); \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ return b; \ } @@ -505,9 +633,10 @@ namespace v4 { # undef POSTFIX_INCDEC // v4int binary operators - + # define BINARY(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) { \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ v4int c; \ c.i[0] = a.i[0] op b.i[0]; \ c.i[1] = a.i[1] op b.i[1]; \ @@ -532,7 +661,8 @@ namespace v4 { // v4int logical operators # define LOGICAL(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) { \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ v4int c; \ c.i[0] = -(a.i[0] op b.i[0]); \ c.i[1] = -(a.i[1] op b.i[1]); \ @@ -549,93 +679,105 @@ namespace v4 { LOGICAL(>=) LOGICAL(&&) LOGICAL(||) - + # undef LOGICAL // v4int miscellaneous functions - inline v4int abs( const v4int &a ) { + inline v4int abs( const v4int &a ) + { v4int b; - b.i[0] = (a.i[0]>=0) ? a.i[0] : -a.i[0]; - b.i[1] = (a.i[1]>=0) ? a.i[1] : -a.i[1]; - b.i[2] = (a.i[2]>=0) ? a.i[2] : -a.i[2]; - b.i[3] = (a.i[3]>=0) ? a.i[3] : -a.i[3]; + + b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; + b.i[1] = ( a.i[1] >= 0 ) ? a.i[1] : -a.i[1]; + b.i[2] = ( a.i[2] >= 0 ) ? a.i[2] : -a.i[2]; + b.i[3] = ( a.i[3] >= 0 ) ? 
a.i[3] : -a.i[3]; + return b; } - inline v4 czero( const v4int &c, const v4 &a ) { + inline v4 czero( const v4int &c, const v4 &a ) + { v4 b; + b.i[0] = a.i[0] & ~c.i[0]; b.i[1] = a.i[1] & ~c.i[1]; b.i[2] = a.i[2] & ~c.i[2]; b.i[3] = a.i[3] & ~c.i[3]; + return b; } - inline v4 notczero( const v4int &c, const v4 &a ) { + inline v4 notczero( const v4int &c, const v4 &a ) + { v4 b; + b.i[0] = a.i[0] & c.i[0]; b.i[1] = a.i[1] & c.i[1]; b.i[2] = a.i[2] & c.i[2]; b.i[3] = a.i[3] & c.i[3]; + return b; } - - inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) { + + inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) + { v4 m; - m.i[0] = (f.i[0] & ~c.i[0]) | (t.i[0] & c.i[0] ); - m.i[1] = (f.i[1] & ~c.i[1]) | (t.i[1] & c.i[1] ); - m.i[2] = (f.i[2] & ~c.i[2]) | (t.i[2] & c.i[2] ); - m.i[3] = (f.i[3] & ~c.i[3]) | (t.i[3] & c.i[3] ); + + m.i[0] = ( f.i[0] & ~c.i[0] ) | ( t.i[0] & c.i[0] ); + m.i[1] = ( f.i[1] & ~c.i[1] ) | ( t.i[1] & c.i[1] ); + m.i[2] = ( f.i[2] & ~c.i[2] ) | ( t.i[2] & c.i[2] ); + m.i[3] = ( f.i[3] & ~c.i[3] ) | ( t.i[3] & c.i[3] ); + return m; } //////////////// // v4float class - class v4float : public v4 { - + class v4float : public v4 + { // v4float prefix unary operator friends - friend inline v4float operator +( const v4float &a ); - friend inline v4float operator -( const v4float &a ); - friend inline v4float operator ~( const v4float &a ); - friend inline v4int operator !( const v4float &a ); + friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator -( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator ~( const v4float &a ) ALWAYS_INLINE; + friend inline v4int operator !( const v4float &a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4float prefix increment / decrement operator friends - friend inline v4float operator ++( v4float &a ); - friend inline v4float operator --( v4float &a ); + friend inline v4float operator ++( v4float &a ) ALWAYS_INLINE; + friend inline v4float operator --( v4float &a ) ALWAYS_INLINE; // v4float postfix increment / decrement operator friends - friend inline v4float operator ++( v4float &a, int ); - friend inline v4float operator --( v4float &a, int ); + friend inline v4float operator ++( v4float &a, int ) ALWAYS_INLINE; + friend inline v4float operator --( v4float &a, int ) ALWAYS_INLINE; // v4float binary operator friends - friend inline v4float operator +( const v4float &a, const v4float &b ); - friend inline v4float operator -( const v4float &a, const v4float &b ); - friend inline v4float operator *( const v4float &a, const v4float &b ); - friend inline v4float operator /( const v4float &a, const v4float &b ); + friend inline v4float operator +( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator -( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator *( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator /( const v4float &a, const v4float &b ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ); - friend inline v4int operator >( const v4float &a, const v4float &b ); - friend inline v4int operator ==( const v4float &a, const v4float &b ); - friend inline v4int operator !=( const v4float &a, const v4float &b ); - friend inline v4int operator <=( const v4float &a, const v4float &b ); - friend inline v4int operator >=( const v4float &a, const v4float &b ); - 
friend inline v4int operator &&( const v4float &a, const v4float &b ); - friend inline v4int operator ||( const v4float &a, const v4float &b ); + friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; // v4float math library friends -# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) +# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE # define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) + const v4float &b ) ALWAYS_INLINE CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); @@ -650,45 +792,66 @@ namespace v4 { // v4float miscellaneous friends - friend inline v4float rsqrt_approx( const v4float &a ); - friend inline v4float rsqrt( const v4float &a ); - friend inline v4float rcp_approx( const v4float &a ); - friend inline v4float rcp( const v4float &a ); - friend inline v4float fma( const v4float &a, const v4float &b, const v4float &c ); - friend inline v4float fms( const v4float &a, const v4float &b, const v4float &c ); - friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ); - friend inline v4float clear_bits( const v4int &m, const v4float &a ); - friend inline v4float set_bits( const v4int &m, const v4float &a ); - friend inline v4float toggle_bits( const v4int &m, const v4float &a ); - friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ); - friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ); - friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ); - // FIXME: crack - friend inline void trilinear( v4float & wl, v4float & wh ); - + friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rsqrt ( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rcp ( const v4float &a ) ALWAYS_INLINE; + friend inline v4float fma ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fms ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; + public: // v4float 
constructors / destructors - + v4float() {} // Default constructor - v4float( const v4float &a ) { // Copy constructor - f[0] = a.f[0]; f[1] = a.f[1]; f[2] = a.f[2]; f[3] = a.f[3]; + + v4float( const v4float &a ) // Copy constructor + { + f[0] = a.f[0]; + f[1] = a.f[1]; + f[2] = a.f[2]; + f[3] = a.f[3]; } - v4float( const v4 &a ) { // Init from mixed - f[0] = a.f[0]; f[1] = a.f[1]; f[2] = a.f[2]; f[3] = a.f[3]; + + v4float( const v4 &a ) // Init from mixed + { + f[0] = a.f[0]; + f[1] = a.f[1]; + f[2] = a.f[2]; + f[3] = a.f[3]; } - v4float( float a ) { // Init from scalar - f[0] = a; f[1] = a; f[2] = a; f[3] = a; + + v4float( float a ) // Init from scalar + { + f[0] = a; + f[1] = a; + f[2] = a; + f[3] = a; } - v4float( float f0, float f1, float f2, float f3 ) { // Init from scalars - f[0] = f0; f[1] = f1; f[2] = f2; f[3] = f3; + + v4float( float f0, float f1, float f2, float f3 ) // Init from scalars + { + f[0] = f0; + f[1] = f1; + f[2] = f2; + f[3] = f3; } + ~v4float() {} // Destructor // v4float assignment operators # define ASSIGN(op) \ - inline v4float &operator op( const v4float &b ) { \ + inline v4float &operator op( const v4float &b ) \ + { \ f[0] op b.f[0]; \ f[1] op b.f[1]; \ f[2] op b.f[2]; \ @@ -706,66 +869,112 @@ namespace v4 { // v4float member access operator - inline float &operator []( int n ) { return f[n]; } - inline float operator ()( int n ) { return f[n]; } + inline float &operator []( int n ) + { + return f[n]; + } + inline float operator ()( int n ) + { + return f[n]; + } }; // v4float prefix unary operators - inline v4float operator +( const v4float &a ) { + inline v4float operator +( const v4float &a ) + { v4float b; - b.f[0] = +a.f[0]; b.f[1] = +a.f[1]; b.f[2] = +a.f[2]; b.f[3] = +a.f[3]; + + b.f[0] = +a.f[0]; + b.f[1] = +a.f[1]; + b.f[2] = +a.f[2]; + b.f[3] = +a.f[3]; + return b; } - inline v4float operator -( const v4float &a ) { + inline v4float operator -( const v4float &a ) + { v4float b; - b.f[0] = -a.f[0]; b.f[1] = -a.f[1]; b.f[2] = -a.f[2]; b.f[3] = -a.f[3]; + + b.f[0] = -a.f[0]; + b.f[1] = -a.f[1]; + b.f[2] = -a.f[2]; + b.f[3] = -a.f[3]; + return b; } - inline v4int operator !( const v4float &a ) { + inline v4int operator !( const v4float &a ) + { v4int b; + b.i[0] = a.i[0] ? 0 : -1; b.i[1] = a.i[1] ? 0 : -1; b.i[2] = a.i[2] ? 0 : -1; b.i[3] = a.i[3] ? 
0 : -1; + return b; } // v4float prefix increment / decrement operators - inline v4float operator ++( v4float &a ) { + inline v4float operator ++( v4float &a ) + { v4float b; - b.f[0] = ++a.f[0]; b.f[1] = ++a.f[1]; b.f[2] = ++a.f[2]; b.f[3] = ++a.f[3]; + + b.f[0] = ++a.f[0]; + b.f[1] = ++a.f[1]; + b.f[2] = ++a.f[2]; + b.f[3] = ++a.f[3]; + return b; } - inline v4float operator --( v4float &a ) { + inline v4float operator --( v4float &a ) + { v4float b; - b.f[0] = --a.f[0]; b.f[1] = --a.f[1]; b.f[2] = --a.f[2]; b.f[3] = --a.f[3]; + + b.f[0] = --a.f[0]; + b.f[1] = --a.f[1]; + b.f[2] = --a.f[2]; + b.f[3] = --a.f[3]; + return b; } // v4float postfix increment / decrement operators - inline v4float operator ++( v4float &a, int ) { + inline v4float operator ++( v4float &a, int ) + { v4float b; - b.f[0] = a.f[0]++; b.f[1] = a.f[1]++; b.f[2] = a.f[2]++; b.f[3] = a.f[3]++; + + b.f[0] = a.f[0]++; + b.f[1] = a.f[1]++; + b.f[2] = a.f[2]++; + b.f[3] = a.f[3]++; + return b; } - inline v4float operator --( v4float &a, int ) { + inline v4float operator --( v4float &a, int ) + { v4float b; - b.f[0] = a.f[0]--; b.f[1] = a.f[1]--; b.f[2] = a.f[2]--; b.f[3] = a.f[3]--; + + b.f[0] = a.f[0]--; + b.f[1] = a.f[1]--; + b.f[2] = a.f[2]--; + b.f[3] = a.f[3]--; + return b; } // v4float binary operators - + # define BINARY(op) \ - inline v4float operator op( const v4float &a, const v4float &b ) { \ + inline v4float operator op( const v4float &a, const v4float &b ) \ + { \ v4float c; \ c.f[0] = a.f[0] op b.f[0]; \ c.f[1] = a.f[1] op b.f[1]; \ @@ -784,12 +993,13 @@ namespace v4 { // v4float logical operators # define LOGICAL(op) \ - inline v4int operator op( const v4float &a, const v4float &b ) { \ + inline v4int operator op( const v4float &a, const v4float &b ) \ + { \ v4int c; \ - c.i[0] = -( a.f[0] op b.f[0] ); \ - c.i[1] = -( a.f[1] op b.f[1] ); \ - c.i[2] = -( a.f[2] op b.f[2] ); \ - c.i[3] = -( a.f[3] op b.f[3] ); \ + c.i[0] = - ( a.f[0] op b.f[0] ); \ + c.i[1] = - ( a.f[1] op b.f[1] ); \ + c.i[2] = - ( a.f[2] op b.f[2] ); \ + c.i[3] = - ( a.f[3] op b.f[3] ); \ return c; \ } @@ -807,22 +1017,24 @@ namespace v4 { // v4float math library functions # define CMATH_FR1(fn) \ - inline v4float fn( const v4float &a ) { \ + inline v4float fn( const v4float &a ) \ + { \ v4float b; \ - b.f[0] = ::fn(a.f[0]); \ - b.f[1] = ::fn(a.f[1]); \ - b.f[2] = ::fn(a.f[2]); \ - b.f[3] = ::fn(a.f[3]); \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ return b; \ } # define CMATH_FR2(fn) \ - inline v4float fn( const v4float &a, const v4float &b ) { \ + inline v4float fn( const v4float &a, const v4float &b ) \ + { \ v4float c; \ - c.f[0] = ::fn(a.f[0],b.f[0]); \ - c.f[1] = ::fn(a.f[1],b.f[1]); \ - c.f[2] = ::fn(a.f[2],b.f[2]); \ - c.f[3] = ::fn(a.f[3],b.f[3]); \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ return c; \ } @@ -832,142 +1044,192 @@ namespace v4 { CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) - inline v4float copysign( const v4float &a, const v4float &b ) { + inline v4float copysign( const v4float &a, const v4float &b ) + { v4float c; float t; - t = ::fabs(a.f[0]); if( b.f[0]<0 ) t = -t; c.f[0] = t; - t = ::fabs(a.f[1]); if( b.f[1]<0 ) t = -t; c.f[1] = t; - t = ::fabs(a.f[2]); if( b.f[2]<0 ) t = -t; c.f[2] = t; - t = ::fabs(a.f[3]); if( b.f[3]<0 ) t = -t; c.f[3] = t; + + t = ::fabs( a.f[0] ); + if( 
b.f[0] < 0 ) t = -t; + c.f[0] = t; + + t = ::fabs( a.f[1] ); + if( b.f[1] < 0 ) t = -t; + c.f[1] = t; + + t = ::fabs( a.f[2] ); + if( b.f[2] < 0 ) t = -t; + c.f[2] = t; + + t = ::fabs( a.f[3] ); + if( b.f[3] < 0 ) t = -t; + c.f[3] = t; + return c; } # undef CMATH_FR1 # undef CMATH_FR2 - // v4float miscelleanous functions - - inline v4float rsqrt_approx( const v4float &a ) { + // v4float miscellaneous functions + + inline v4float rsqrt_approx( const v4float &a ) + { v4float b; - b.f[0] = ::sqrt( 1/a.f[0] ); - b.f[1] = ::sqrt( 1/a.f[1] ); - b.f[2] = ::sqrt( 1/a.f[2] ); - b.f[3] = ::sqrt( 1/a.f[3] ); + + b.f[0] = ::sqrt( 1.0f / a.f[0] ); + b.f[1] = ::sqrt( 1.0f / a.f[1] ); + b.f[2] = ::sqrt( 1.0f / a.f[2] ); + b.f[3] = ::sqrt( 1.0f / a.f[3] ); + return b; } - - inline v4float rsqrt( const v4float &a ) { + + inline v4float rsqrt( const v4float &a ) + { v4float b; - b.f[0] = ::sqrt( 1/a.f[0] ); - b.f[1] = ::sqrt( 1/a.f[1] ); - b.f[2] = ::sqrt( 1/a.f[2] ); - b.f[3] = ::sqrt( 1/a.f[3] ); + + b.f[0] = ::sqrt( 1.0f / a.f[0] ); + b.f[1] = ::sqrt( 1.0f / a.f[1] ); + b.f[2] = ::sqrt( 1.0f / a.f[2] ); + b.f[3] = ::sqrt( 1.0f / a.f[3] ); + return b; } - inline v4float rcp_approx( const v4float &a ) { + inline v4float rcp_approx( const v4float &a ) + { v4float b; - b.f[0] = 1/a.f[0]; - b.f[1] = 1/a.f[1]; - b.f[2] = 1/a.f[2]; - b.f[3] = 1/a.f[3]; + + b.f[0] = 1.0f / a.f[0]; + b.f[1] = 1.0f / a.f[1]; + b.f[2] = 1.0f / a.f[2]; + b.f[3] = 1.0f / a.f[3]; + return b; } - - inline v4float rcp( const v4float &a ) { + + inline v4float rcp( const v4float &a ) + { v4float b; - b.f[0] = 1/a.f[0]; - b.f[1] = 1/a.f[1]; - b.f[2] = 1/a.f[2]; - b.f[3] = 1/a.f[3]; + + b.f[0] = 1.0f / a.f[0]; + b.f[1] = 1.0f / a.f[1]; + b.f[2] = 1.0f / a.f[2]; + b.f[3] = 1.0f / a.f[3]; + return b; } - inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) + { v4float d; - d.f[0] = a.f[0]*b.f[0] + c.f[0]; - d.f[1] = a.f[1]*b.f[1] + c.f[1]; - d.f[2] = a.f[2]*b.f[2] + c.f[2]; - d.f[3] = a.f[3]*b.f[3] + c.f[3]; + + d.f[0] = a.f[0] * b.f[0] + c.f[0]; + d.f[1] = a.f[1] * b.f[1] + c.f[1]; + d.f[2] = a.f[2] * b.f[2] + c.f[2]; + d.f[3] = a.f[3] * b.f[3] + c.f[3]; + return d; } - inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) + { v4float d; - d.f[0] = a.f[0]*b.f[0] - c.f[0]; - d.f[1] = a.f[1]*b.f[1] - c.f[1]; - d.f[2] = a.f[2]*b.f[2] - c.f[2]; - d.f[3] = a.f[3]*b.f[3] - c.f[3]; + + d.f[0] = a.f[0] * b.f[0] - c.f[0]; + d.f[1] = a.f[1] * b.f[1] - c.f[1]; + d.f[2] = a.f[2] * b.f[2] - c.f[2]; + d.f[3] = a.f[3] * b.f[3] - c.f[3]; + return d; } - inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) + { v4float d; - d.f[0] = c.f[0] - a.f[0]*b.f[0]; - d.f[1] = c.f[1] - a.f[1]*b.f[1]; - d.f[2] = c.f[2] - a.f[2]*b.f[2]; - d.f[3] = c.f[3] - a.f[3]*b.f[3]; + + d.f[0] = c.f[0] - a.f[0] * b.f[0]; + d.f[1] = c.f[1] - a.f[1] * b.f[1]; + d.f[2] = c.f[2] - a.f[2] * b.f[2]; + d.f[3] = c.f[3] - a.f[3] * b.f[3]; + return d; } - inline v4float clear_bits( const v4int &m, const v4float &a ) { + inline v4float clear_bits( const v4int &m, const v4float &a ) + { v4float b; - b.i[0] = (~m.i[0]) & a.i[0]; - b.i[1] = (~m.i[1]) & a.i[1]; - b.i[2] = (~m.i[2]) & a.i[2]; - b.i[3] = (~m.i[3]) & a.i[3]; + + b.i[0] = ( ~m.i[0] ) & a.i[0]; + b.i[1] = ( ~m.i[1] ) & a.i[1]; + b.i[2] = 
( ~m.i[2] ) & a.i[2]; + b.i[3] = ( ~m.i[3] ) & a.i[3]; + return b; } - inline v4float set_bits( const v4int &m, const v4float &a ) { + inline v4float set_bits( const v4int &m, const v4float &a ) + { v4float b; + b.i[0] = m.i[0] | a.i[0]; b.i[1] = m.i[1] | a.i[1]; b.i[2] = m.i[2] | a.i[2]; b.i[3] = m.i[3] | a.i[3]; + return b; } - inline v4float toggle_bits( const v4int &m, const v4float &a ) { + inline v4float toggle_bits( const v4int &m, const v4float &a ) + { v4float b; + b.i[0] = m.i[0] ^ a.i[0]; b.i[1] = m.i[1] ^ a.i[1]; b.i[2] = m.i[2] ^ a.i[2]; b.i[3] = m.i[3] ^ a.i[3]; + return b; } - - inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) { + + inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) + { p[0] += a.f[0]; p[1] += a.f[1]; p[2] += a.f[2]; p[3] += a.f[3]; } - inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) { + inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) + { p[0] -= a.f[0]; p[1] -= a.f[1]; p[2] -= a.f[2]; p[3] -= a.f[3]; } - inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) { + inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) + { p[0] *= a.f[0]; p[1] *= a.f[1]; p[2] *= a.f[2]; p[3] *= a.f[3]; } - inline void trilinear( v4float & wl, v4float & wh ) { + inline void trilinear( v4float & wl, v4float & wh ) + { float x = wl.f[0], y = wl.f[1], z = wl.f[2]; - wl.f[0] = ((1-x)*(1-y))*(1-z); - wl.f[1] = ((1+x)*(1-y))*(1-z); - wl.f[2] = ((1-x)*(1+y))*(1-z); - wl.f[3] = ((1+x)*(1+y))*(1-z); - wh.f[0] = ((1-x)*(1-y))*(1+z); - wh.f[1] = ((1+x)*(1-y))*(1+z); - wh.f[2] = ((1-x)*(1+y))*(1+z); - wh.f[3] = ((1+x)*(1+y))*(1+z); + + wl.f[0] = ( ( 1.0f - x ) * ( 1.0f - y ) ) * ( 1.0f - z ); + wl.f[1] = ( ( 1.0f + x ) * ( 1.0f - y ) ) * ( 1.0f - z ); + wl.f[2] = ( ( 1.0f - x ) * ( 1.0f + y ) ) * ( 1.0f - z ); + wl.f[3] = ( ( 1.0f + x ) * ( 1.0f + y ) ) * ( 1.0f - z ); + + wh.f[0] = ( ( 1.0f - x ) * ( 1.0f - y ) ) * ( 1.0f + z ); + wh.f[1] = ( ( 1.0f + x ) * ( 1.0f - y ) ) * ( 1.0f + z ); + wh.f[2] = ( ( 1.0f - x ) * ( 1.0f + y ) ) * ( 1.0f + z ); + wh.f[3] = ( ( 1.0f + x ) * ( 1.0f + y ) ) * ( 1.0f + z ); } } // namespace v4 diff --git a/src/util/v4/v4_portable_v0.h b/src/util/v4/v4_portable_v0.h new file mode 100644 index 00000000..6b2555e8 --- /dev/null +++ b/src/util/v4/v4_portable_v0.h @@ -0,0 +1,1237 @@ +#ifndef _v4_portable_h_ +#define _v4_portable_h_ + +#ifndef IN_v4_h +#error "Do not include v4_portable.h directly; use v4.h" +#endif + +#include + +#define V4_ACCELERATION +#define V4_PORTABLE_ACCELERATION + +#ifndef ALIGNED +#define ALIGNED(n) +#endif + +#define ALWAYS_INLINE __attribute__((always_inline)) + +namespace v4 +{ + class v4; + class v4int; + class v4float; + + //////////////// + // v4 base class + + class v4 + { + friend class v4int; + friend class v4float; + + // v4 miscellaneous friends + + friend inline int any( const v4 &a ) ALWAYS_INLINE; + friend inline int all( const v4 &a ) ALWAYS_INLINE; + + template + friend inline v4 splat( const v4 &a ) ALWAYS_INLINE; + + template + friend inline v4 shuffle( const v4 &a ) ALWAYS_INLINE; + + friend inline void swap( v4 &a, v4 &b ) ALWAYS_INLINE; + friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) ALWAYS_INLINE; + + // v4int miscellaneous friends + + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + + // v4 memory manipulation friends + + 
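+    // Note: in this portable implementation the 4x1 loads/stores below
+    // simply copy four ints to or from one 16-byte aligned address
+    // (stream_4x1 is an ordinary store here, with no non-temporal hint),
+    // while the *_tr variants transpose: they move one element per
+    // address across up to four separate pointers (4x1 .. 4x4 gathers
+    // and scatters).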
friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; + friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_4x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) ALWAYS_INLINE; + friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + // v4 transposed memory manipulation friends + + friend inline void load_4x1_tr( const void *a0, const void *a1, + const void *a2, const void *a3, + v4 &a ) ALWAYS_INLINE; + + friend inline void load_4x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + v4 &a, v4 &b ) ALWAYS_INLINE; + + friend inline void load_4x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; + + friend inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; + + friend inline void store_4x1_tr( const v4 &a, + void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; + + friend inline void store_4x2_tr( const v4 &a, const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) ALWAYS_INLINE; + + friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) ALWAYS_INLINE; + + friend inline void store_4x4_tr( const v4 &a, const v4 &b, + const v4 &c, const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) ALWAYS_INLINE; + + protected: + + union + { + int i[4]; + float f[4]; + }; + + public: + + v4() {} // Default constructor + + v4( const v4 &a ) // Copy constructor + { + i[0]=a.i[0]; + i[1]=a.i[1]; + i[2]=a.i[2]; + i[3]=a.i[3]; + } + + ~v4() {} // Default destructor + }; + + // v4 miscellaneous functions + + inline int any( const v4 &a ) + { + return a.i[0] || a.i[1] || a.i[2] || a.i[3]; + } + + inline int all( const v4 &a ) + { + return a.i[0] && a.i[1] && a.i[2] && a.i[3]; + } + + template + inline v4 splat( const v4 & a ) + { + v4 b; + + b.i[0] = a.i[n]; + b.i[1] = a.i[n]; + b.i[2] = a.i[n]; + b.i[3] = a.i[n]; + + return b; + } + + template + inline v4 shuffle( const v4 & a ) + { + v4 b; + + b.i[0] = a.i[i0]; + b.i[1] = a.i[i1]; + b.i[2] = a.i[i2]; + b.i[3] = a.i[i3]; + + return b; + } + +# define sw(x,y) x^=y, y^=x, x^=y + + inline void swap( v4 &a, v4 &b ) + { + sw( a.i[0], b.i[0] ); + sw( a.i[1], b.i[1] ); + sw( a.i[2], b.i[2] ); + sw( a.i[3], b.i[3] ); + } + + inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) + { + sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); + sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); + sw( a2.i[3],a3.i[2] ); + } + +# undef sw + + // v4 memory manipulation functions + + inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) + { + a.i[0] = ((const int * ALIGNED(16))p)[0]; + a.i[1] = ((const int * ALIGNED(16))p)[1]; + a.i[2] = ((const int * ALIGNED(16))p)[2]; + a.i[3] = ((const int * ALIGNED(16))p)[3]; + } + + inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) + { + ((int * ALIGNED(16))p)[0] = a.i[0]; + ((int * 
ALIGNED(16))p)[1] = a.i[1]; + ((int * ALIGNED(16))p)[2] = a.i[2]; + ((int * ALIGNED(16))p)[3] = a.i[3]; + } + + inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) + { + ((int * ALIGNED(16))p)[0] = a.i[0]; + ((int * ALIGNED(16))p)[1] = a.i[1]; + ((int * ALIGNED(16))p)[2] = a.i[2]; + ((int * ALIGNED(16))p)[3] = a.i[3]; + } + + inline void clear_4x1( void * ALIGNED(16) p ) + { + ((int * ALIGNED(16))p)[0] = 0; + ((int * ALIGNED(16))p)[1] = 0; + ((int * ALIGNED(16))p)[2] = 0; + ((int * ALIGNED(16))p)[3] = 0; + } + + // FIXME: Ordering semantics + inline void copy_4x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) + { + ((int * ALIGNED(16))dst)[0] = ((const int * ALIGNED(16))src)[0]; + ((int * ALIGNED(16))dst)[1] = ((const int * ALIGNED(16))src)[1]; + ((int * ALIGNED(16))dst)[2] = ((const int * ALIGNED(16))src)[2]; + ((int * ALIGNED(16))dst)[3] = ((const int * ALIGNED(16))src)[3]; + } + + inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) + { + int t; + + t = ((int * ALIGNED(16))a)[0]; + ((int * ALIGNED(16))a)[0] = ((int * ALIGNED(16))b)[0]; + ((int * ALIGNED(16))b)[0] = t; + + t = ((int * ALIGNED(16))a)[1]; + ((int * ALIGNED(16))a)[1] = ((int * ALIGNED(16))b)[1]; + ((int * ALIGNED(16))b)[1] = t; + + t = ((int * ALIGNED(16))a)[2]; + ((int * ALIGNED(16))a)[2] = ((int * ALIGNED(16))b)[2]; + ((int * ALIGNED(16))b)[2] = t; + + t = ((int * ALIGNED(16))a)[3]; + ((int * ALIGNED(16))a)[3] = ((int * ALIGNED(16))b)[3]; + ((int * ALIGNED(16))b)[3] = t; + } + + // v4 transposed memory manipulation functions + + inline void load_4x1_tr( const void *a0, const void *a1, + const void *a2, const void *a3, + v4 &a ) + { + a.i[0] = ((const int *)a0)[0]; + a.i[1] = ((const int *)a1)[0]; + a.i[2] = ((const int *)a2)[0]; + a.i[3] = ((const int *)a3)[0]; + } + + inline void load_4x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + v4 &a, v4 &b ) + { + a.i[0] = ((const int * ALIGNED(8))a0)[0]; + b.i[0] = ((const int * ALIGNED(8))a0)[1]; + + a.i[1] = ((const int * ALIGNED(8))a1)[0]; + b.i[1] = ((const int * ALIGNED(8))a1)[1]; + + a.i[2] = ((const int * ALIGNED(8))a2)[0]; + b.i[2] = ((const int * ALIGNED(8))a2)[1]; + + a.i[3] = ((const int * ALIGNED(8))a3)[0]; + b.i[3] = ((const int * ALIGNED(8))a3)[1]; + } + + inline void load_4x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + } + + inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c, v4 &d ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + d.i[0] = ((const int * ALIGNED(16))a0)[3]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * 
ALIGNED(16))a1)[2]; + d.i[1] = ((const int * ALIGNED(16))a1)[3]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + d.i[2] = ((const int * ALIGNED(16))a2)[3]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + d.i[3] = ((const int * ALIGNED(16))a3)[3]; + } + + inline void store_4x1_tr( const v4 &a, + void *a0, void *a1, + void *a2, void *a3 ) + { + ((int *)a0)[0] = a.i[0]; + ((int *)a1)[0] = a.i[1]; + ((int *)a2)[0] = a.i[2]; + ((int *)a3)[0] = a.i[3]; + } + + inline void store_4x2_tr( const v4 &a, const v4 &b, + void * ALIGNED(8) a0, void * ALIGNED(8) a1, + void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) + { + ((int * ALIGNED(8))a0)[0] = a.i[0]; + ((int * ALIGNED(8))a0)[1] = b.i[0]; + + ((int * ALIGNED(8))a1)[0] = a.i[1]; + ((int * ALIGNED(8))a1)[1] = b.i[1]; + + ((int * ALIGNED(8))a2)[0] = a.i[2]; + ((int * ALIGNED(8))a2)[1] = b.i[2]; + + ((int * ALIGNED(8))a3)[0] = a.i[3]; + ((int * ALIGNED(8))a3)[1] = b.i[3]; + } + + inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * ALIGNED(16))a0)[1] = b.i[0]; + ((int * ALIGNED(16))a0)[2] = c.i[0]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; + ((int * ALIGNED(16))a1)[2] = c.i[1]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; + ((int * ALIGNED(16))a2)[2] = c.i[2]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; + ((int * ALIGNED(16))a3)[2] = c.i[3]; + } + + inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * ALIGNED(16))a0)[1] = b.i[0]; + ((int * ALIGNED(16))a0)[2] = c.i[0]; + ((int * ALIGNED(16))a0)[3] = d.i[0]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; + ((int * ALIGNED(16))a1)[2] = c.i[1]; + ((int * ALIGNED(16))a1)[3] = d.i[1]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; + ((int * ALIGNED(16))a2)[2] = c.i[2]; + ((int * ALIGNED(16))a2)[3] = d.i[2]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; + ((int * ALIGNED(16))a3)[2] = c.i[3]; + ((int * ALIGNED(16))a3)[3] = d.i[3]; + } + + ////////////// + // v4int class + + class v4int : public v4 + { + // v4int prefix unary operator friends + + friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator -( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator ~( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator !( const v4int & a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v4int prefix increment / decrement operator friends + + friend inline v4int operator ++( v4int & a ) ALWAYS_INLINE; + friend inline v4int operator --( v4int & a ) ALWAYS_INLINE; + + // v4int postfix increment / decrement operator friends + + friend inline v4int operator ++( v4int & a, int ) ALWAYS_INLINE; + friend inline v4int operator --( v4int & a, int ) ALWAYS_INLINE; + + // v4int binary operator friends + + friend inline v4int operator +( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int 
operator -( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator *( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator /( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator %( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ^( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator &( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator |( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator <<( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >>( const v4int &a, const v4int &b ) ALWAYS_INLINE; + + // v4int logical operator friends + + friend inline v4int operator <( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4int &a, const v4int &b ) ALWAYS_INLINE; + + // v4int miscellaneous friends + + friend inline v4int abs( const v4int &a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + // FIXME: cswap, notcswap! + friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) ALWAYS_INLINE; + + // v4float unary operator friends + + friend inline v4int operator !( const v4float & a ) ALWAYS_INLINE; + + // v4float logical operator friends + + friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + + // v4float miscellaneous friends + + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + + public: + + // v4int constructors / destructors + + v4int() {} // Default constructor + + v4int( const v4int &a ) // Copy constructor + { + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; + } + + v4int( const v4 &a ) // Init from mixed + { + i[0] = a.i[0]; + i[1] = a.i[1]; + i[2] = a.i[2]; + i[3] = a.i[3]; + } + + v4int( int a ) // Init from scalar + { + i[0] = a; + i[1] = a; + i[2] = a; + i[3] = a; + } + + v4int( int i0, int i1, int i2, int i3 ) // Init from scalars + { + i[0] = i0; + i[1] = i1; + i[2] = i2; + i[3] = i3; + } + + ~v4int() {} // Destructor + + // v4int assignment operators + +# define 
ASSIGN(op) \ + inline v4int &operator op( const v4int &b ) \ + { \ + i[0] op b.i[0]; \ + i[1] op b.i[1]; \ + i[2] op b.i[2]; \ + i[3] op b.i[3]; \ + return *this; \ + } + + ASSIGN( =) + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + ASSIGN(%=) + ASSIGN(^=) + ASSIGN(&=) + ASSIGN(|=) + ASSIGN(<<=) + ASSIGN(>>=) + +# undef ASSIGN + + // v4int member access operator + + inline int &operator []( int n ) + { + return i[n]; + } + + inline int operator ()( int n ) + { + return i[n]; + } + }; + + // v4int prefix unary operators + +# define PREFIX_UNARY(op) \ + inline v4int operator op( const v4int & a ) \ + { \ + v4int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + return b; \ + } + + PREFIX_UNARY(+) + PREFIX_UNARY(-) + + inline v4int operator !( const v4int & a ) + { + v4int b; + + b.i[0] = - ( !a.i[0] ); + b.i[1] = - ( !a.i[1] ); + b.i[2] = - ( !a.i[2] ); + b.i[3] = - ( !a.i[3] ); + + return b; + } + + PREFIX_UNARY(~) + +# undef PREFIX_UNARY + + // v4int prefix increment / decrement + +# define PREFIX_INCDEC(op) \ + inline v4int operator op( v4int & a ) \ + { \ + v4int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + return b; \ + } + + PREFIX_INCDEC(++) + PREFIX_INCDEC(--) + +# undef PREFIX_INCDEC + + // v4int postfix increment / decrement + +# define POSTFIX_INCDEC(op) \ + inline v4int operator op( v4int & a, int ) \ + { \ + v4int b; \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ + return b; \ + } + + POSTFIX_INCDEC(++) + POSTFIX_INCDEC(--) + +# undef POSTFIX_INCDEC + + // v4int binary operators + +# define BINARY(op) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ + v4int c; \ + c.i[0] = a.i[0] op b.i[0]; \ + c.i[1] = a.i[1] op b.i[1]; \ + c.i[2] = a.i[2] op b.i[2]; \ + c.i[3] = a.i[3] op b.i[3]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + BINARY(%) + BINARY(^) + BINARY(&) + BINARY(|) + BINARY(<<) + BINARY(>>) + +# undef BINARY + + // v4int logical operators + +# define LOGICAL(op) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ + v4int c; \ + c.i[0] = - ( a.i[0] op b.i[0] ); \ + c.i[1] = - ( a.i[1] op b.i[1] ); \ + c.i[2] = - ( a.i[2] op b.i[2] ); \ + c.i[3] = - ( a.i[3] op b.i[3] ); \ + return c; \ + } + + LOGICAL(<) + LOGICAL(>) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v4int miscellaneous functions + + inline v4int abs( const v4int &a ) + { + v4int b; + + b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; + b.i[1] = ( a.i[1] >= 0 ) ? a.i[1] : -a.i[1]; + b.i[2] = ( a.i[2] >= 0 ) ? a.i[2] : -a.i[2]; + b.i[3] = ( a.i[3] >= 0 ) ? 
a.i[3] : -a.i[3]; + + return b; + } + + inline v4 czero( const v4int &c, const v4 &a ) + { + v4 b; + + b.i[0] = a.i[0] & ~c.i[0]; + b.i[1] = a.i[1] & ~c.i[1]; + b.i[2] = a.i[2] & ~c.i[2]; + b.i[3] = a.i[3] & ~c.i[3]; + + return b; + } + + inline v4 notczero( const v4int &c, const v4 &a ) + { + v4 b; + + b.i[0] = a.i[0] & c.i[0]; + b.i[1] = a.i[1] & c.i[1]; + b.i[2] = a.i[2] & c.i[2]; + b.i[3] = a.i[3] & c.i[3]; + + return b; + } + + inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) + { + v4 m; + + m.i[0] = ( f.i[0] & ~c.i[0] ) | ( t.i[0] & c.i[0] ); + m.i[1] = ( f.i[1] & ~c.i[1] ) | ( t.i[1] & c.i[1] ); + m.i[2] = ( f.i[2] & ~c.i[2] ) | ( t.i[2] & c.i[2] ); + m.i[3] = ( f.i[3] & ~c.i[3] ) | ( t.i[3] & c.i[3] ); + + return m; + } + + //////////////// + // v4float class + + class v4float : public v4 + { + // v4float prefix unary operator friends + + friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator -( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator ~( const v4float &a ) ALWAYS_INLINE; + friend inline v4int operator !( const v4float &a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v4float prefix increment / decrement operator friends + + friend inline v4float operator ++( v4float &a ) ALWAYS_INLINE; + friend inline v4float operator --( v4float &a ) ALWAYS_INLINE; + + // v4float postfix increment / decrement operator friends + + friend inline v4float operator ++( v4float &a, int ) ALWAYS_INLINE; + friend inline v4float operator --( v4float &a, int ) ALWAYS_INLINE; + + // v4float binary operator friends + + friend inline v4float operator +( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator -( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator *( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator /( const v4float &a, const v4float &b ) ALWAYS_INLINE; + + // v4float logical operator friends + + friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + + // v4float math library friends + +# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE +# define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + const v4float &b ) ALWAYS_INLINE + + CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); + CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); + CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); + CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); + CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); + + CMATH_FR2(copysign); + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v4float miscellaneous friends + + friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rsqrt ( const v4float &a ) ALWAYS_INLINE; + 
friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rcp ( const v4float &a ) ALWAYS_INLINE; + friend inline v4float fma ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fms ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; + + public: + + // v4float constructors / destructors + + v4float() {} // Default constructor + + v4float( const v4float &a ) // Copy constructor + { + f[0] = a.f[0]; + f[1] = a.f[1]; + f[2] = a.f[2]; + f[3] = a.f[3]; + } + + v4float( const v4 &a ) // Init from mixed + { + f[0] = a.f[0]; + f[1] = a.f[1]; + f[2] = a.f[2]; + f[3] = a.f[3]; + } + + v4float( float a ) // Init from scalar + { + f[0] = a; + f[1] = a; + f[2] = a; + f[3] = a; + } + + v4float( float f0, float f1, float f2, float f3 ) // Init from scalars + { + f[0] = f0; + f[1] = f1; + f[2] = f2; + f[3] = f3; + } + + ~v4float() {} // Destructor + + // v4float assignment operators + +# define ASSIGN(op) \ + inline v4float &operator op( const v4float &b ) \ + { \ + f[0] op b.f[0]; \ + f[1] op b.f[1]; \ + f[2] op b.f[2]; \ + f[3] op b.f[3]; \ + return *this; \ + } + + ASSIGN(=) + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + +# undef ASSIGN + + // v4float member access operator + + inline float &operator []( int n ) + { + return f[n]; + } + + inline float operator ()( int n ) + { + return f[n]; + } + }; + + // v4float prefix unary operators + + inline v4float operator +( const v4float &a ) + { + v4float b; + + b.f[0] = +a.f[0]; + b.f[1] = +a.f[1]; + b.f[2] = +a.f[2]; + b.f[3] = +a.f[3]; + + return b; + } + + inline v4float operator -( const v4float &a ) + { + v4float b; + + b.f[0] = -a.f[0]; + b.f[1] = -a.f[1]; + b.f[2] = -a.f[2]; + b.f[3] = -a.f[3]; + + return b; + } + + inline v4int operator !( const v4float &a ) + { + v4int b; + + b.i[0] = a.i[0] ? 0 : -1; + b.i[1] = a.i[1] ? 0 : -1; + b.i[2] = a.i[2] ? 0 : -1; + b.i[3] = a.i[3] ? 
0 : -1; + + return b; + } + + // v4float prefix increment / decrement operators + + inline v4float operator ++( v4float &a ) + { + v4float b; + + b.f[0] = ++a.f[0]; + b.f[1] = ++a.f[1]; + b.f[2] = ++a.f[2]; + b.f[3] = ++a.f[3]; + + return b; + } + + inline v4float operator --( v4float &a ) + { + v4float b; + + b.f[0] = --a.f[0]; + b.f[1] = --a.f[1]; + b.f[2] = --a.f[2]; + b.f[3] = --a.f[3]; + + return b; + } + + // v4float postfix increment / decrement operators + + inline v4float operator ++( v4float &a, int ) + { + v4float b; + + b.f[0] = a.f[0]++; + b.f[1] = a.f[1]++; + b.f[2] = a.f[2]++; + b.f[3] = a.f[3]++; + + return b; + } + + inline v4float operator --( v4float &a, int ) + { + v4float b; + + b.f[0] = a.f[0]--; + b.f[1] = a.f[1]--; + b.f[2] = a.f[2]--; + b.f[3] = a.f[3]--; + + return b; + } + + // v4float binary operators + +# define BINARY(op) \ + inline v4float operator op( const v4float &a, const v4float &b ) \ + { \ + v4float c; \ + c.f[0] = a.f[0] op b.f[0]; \ + c.f[1] = a.f[1] op b.f[1]; \ + c.f[2] = a.f[2] op b.f[2]; \ + c.f[3] = a.f[3] op b.f[3]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + +# undef BINARY + + // v4float logical operators + +# define LOGICAL(op) \ + inline v4int operator op( const v4float &a, const v4float &b ) \ + { \ + v4int c; \ + c.i[0] = - ( a.f[0] op b.f[0] ); \ + c.i[1] = - ( a.f[1] op b.f[1] ); \ + c.i[2] = - ( a.f[2] op b.f[2] ); \ + c.i[3] = - ( a.f[3] op b.f[3] ); \ + return c; \ + } + + LOGICAL(< ) + LOGICAL(> ) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v4float math library functions + +# define CMATH_FR1(fn) \ + inline v4float fn( const v4float &a ) \ + { \ + v4float b; \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ + return b; \ + } + +# define CMATH_FR2(fn) \ + inline v4float fn( const v4float &a, const v4float &b ) \ + { \ + v4float c; \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ + return c; \ + } + + CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) + CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) + CMATH_FR1(fabs) CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) + CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) + CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) + + inline v4float copysign( const v4float &a, const v4float &b ) + { + v4float c; + float t; + + t = ::fabs( a.f[0] ); + if( b.f[0] < 0 ) t = -t; + c.f[0] = t; + + t = ::fabs( a.f[1] ); + if( b.f[1] < 0 ) t = -t; + c.f[1] = t; + + t = ::fabs( a.f[2] ); + if( b.f[2] < 0 ) t = -t; + c.f[2] = t; + + t = ::fabs( a.f[3] ); + if( b.f[3] < 0 ) t = -t; + c.f[3] = t; + + return c; + } + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v4float miscellaneous functions + + inline v4float rsqrt_approx( const v4float &a ) + { + v4float b; + + b.f[0] = ::sqrt( 1.0f / a.f[0] ); + b.f[1] = ::sqrt( 1.0f / a.f[1] ); + b.f[2] = ::sqrt( 1.0f / a.f[2] ); + b.f[3] = ::sqrt( 1.0f / a.f[3] ); + + return b; + } + + inline v4float rsqrt( const v4float &a ) + { + v4float b; + + b.f[0] = ::sqrt( 1.0f / a.f[0] ); + b.f[1] = ::sqrt( 1.0f / a.f[1] ); + b.f[2] = ::sqrt( 1.0f / a.f[2] ); + b.f[3] = ::sqrt( 1.0f / a.f[3] ); + + return b; + } + + inline v4float rcp_approx( const v4float &a ) + { + v4float b; + + b.f[0] = 1.0f / a.f[0]; + b.f[1] = 1.0f / a.f[1]; + b.f[2] = 1.0f / a.f[2]; + b.f[3] = 1.0f / a.f[3]; + 
+ return b; + } + + inline v4float rcp( const v4float &a ) + { + v4float b; + + b.f[0] = 1.0f / a.f[0]; + b.f[1] = 1.0f / a.f[1]; + b.f[2] = 1.0f / a.f[2]; + b.f[3] = 1.0f / a.f[3]; + + return b; + } + + inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) + { + v4float d; + + d.f[0] = a.f[0] * b.f[0] + c.f[0]; + d.f[1] = a.f[1] * b.f[1] + c.f[1]; + d.f[2] = a.f[2] * b.f[2] + c.f[2]; + d.f[3] = a.f[3] * b.f[3] + c.f[3]; + + return d; + } + + inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) + { + v4float d; + + d.f[0] = a.f[0] * b.f[0] - c.f[0]; + d.f[1] = a.f[1] * b.f[1] - c.f[1]; + d.f[2] = a.f[2] * b.f[2] - c.f[2]; + d.f[3] = a.f[3] * b.f[3] - c.f[3]; + + return d; + } + + inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) + { + v4float d; + + d.f[0] = c.f[0] - a.f[0] * b.f[0]; + d.f[1] = c.f[1] - a.f[1] * b.f[1]; + d.f[2] = c.f[2] - a.f[2] * b.f[2]; + d.f[3] = c.f[3] - a.f[3] * b.f[3]; + + return d; + } + + inline v4float clear_bits( const v4int &m, const v4float &a ) + { + v4float b; + + b.i[0] = ( ~m.i[0] ) & a.i[0]; + b.i[1] = ( ~m.i[1] ) & a.i[1]; + b.i[2] = ( ~m.i[2] ) & a.i[2]; + b.i[3] = ( ~m.i[3] ) & a.i[3]; + + return b; + } + + inline v4float set_bits( const v4int &m, const v4float &a ) + { + v4float b; + + b.i[0] = m.i[0] | a.i[0]; + b.i[1] = m.i[1] | a.i[1]; + b.i[2] = m.i[2] | a.i[2]; + b.i[3] = m.i[3] | a.i[3]; + + return b; + } + + inline v4float toggle_bits( const v4int &m, const v4float &a ) + { + v4float b; + + b.i[0] = m.i[0] ^ a.i[0]; + b.i[1] = m.i[1] ^ a.i[1]; + b.i[2] = m.i[2] ^ a.i[2]; + b.i[3] = m.i[3] ^ a.i[3]; + + return b; + } + + inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) + { + p[0] += a.f[0]; + p[1] += a.f[1]; + p[2] += a.f[2]; + p[3] += a.f[3]; + } + + inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) + { + p[0] -= a.f[0]; + p[1] -= a.f[1]; + p[2] -= a.f[2]; + p[3] -= a.f[3]; + } + + inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) + { + p[0] *= a.f[0]; + p[1] *= a.f[1]; + p[2] *= a.f[2]; + p[3] *= a.f[3]; + } + + inline void trilinear( v4float & wl, v4float & wh ) + { + float x = wl.f[0], y = wl.f[1], z = wl.f[2]; + + wl.f[0] = ( ( 1.0f - x ) * ( 1.0f - y ) ) * ( 1.0f - z ); + wl.f[1] = ( ( 1.0f + x ) * ( 1.0f - y ) ) * ( 1.0f - z ); + wl.f[2] = ( ( 1.0f - x ) * ( 1.0f + y ) ) * ( 1.0f - z ); + wl.f[3] = ( ( 1.0f + x ) * ( 1.0f + y ) ) * ( 1.0f - z ); + + wh.f[0] = ( ( 1.0f - x ) * ( 1.0f - y ) ) * ( 1.0f + z ); + wh.f[1] = ( ( 1.0f + x ) * ( 1.0f - y ) ) * ( 1.0f + z ); + wh.f[2] = ( ( 1.0f - x ) * ( 1.0f + y ) ) * ( 1.0f + z ); + wh.f[3] = ( ( 1.0f + x ) * ( 1.0f + y ) ) * ( 1.0f + z ); + } + +} // namespace v4 + +#endif // _v4_portable_h_ diff --git a/src/util/v4/v4_portable_v1.h b/src/util/v4/v4_portable_v1.h new file mode 100644 index 00000000..4d3c4b20 --- /dev/null +++ b/src/util/v4/v4_portable_v1.h @@ -0,0 +1,1183 @@ +#ifndef _v4_portable_h_ +#define _v4_portable_h_ + +#ifndef IN_v4_h +#error "Do not include v4_portable.h directly; use v4.h" +#endif + +#include + +#define V4_ACCELERATION +#define V4_PORTABLE_ACCELERATION + +#ifndef ALIGNED +#define ALIGNED(n) +#endif + +// This does not work with gcc 5.3.1 and the -fopenmp-simd +// flag. Does not seem to work with -fopenmp either. Not +// sure why. It does work with the Intel compiler. Need +// to try later versions of gcc. 
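+// If later gcc versions do accept the OpenMP form, one possible (untested)
+// approach, not used in this patch, would be to select the pragma per
+// compiler, e.g.:
+//
+//   #if defined( __INTEL_COMPILER )
+//   #define ALWAYS_VECTORIZE _Pragma( "simd" ) _Pragma( "vector aligned" )
+//   #else
+//   #define ALWAYS_VECTORIZE _Pragma( "omp simd" )
+//   #endif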
+// #define ALWAYS_VECTORIZE _Pragma( "omp simd" )
+
+// #define ALWAYS_VECTORIZE _Pragma( "simd" )
+
+#define ALWAYS_VECTORIZE \
+  _Pragma( "simd" ) \
+  _Pragma( "vector aligned" )
+
+#define ALWAYS_INLINE __attribute__((always_inline))
+
+namespace v4
+{
+  class v4;
+  class v4int;
+  class v4float;
+
+  ////////////////
+  // v4 base class
+
+  class v4
+  {
+    friend class v4int;
+    friend class v4float;
+
+    // v4 miscellaneous friends
+
+    friend inline int any( const v4 &a ) ALWAYS_INLINE;
+    friend inline int all( const v4 &a ) ALWAYS_INLINE;
+
+    template <int n>
+    friend inline v4 splat( const v4 &a ) ALWAYS_INLINE;
+
+    template <int i0, int i1, int i2, int i3>
+    friend inline v4 shuffle( const v4 &a ) ALWAYS_INLINE;
+
+    friend inline void swap( v4 &a, v4 &b ) ALWAYS_INLINE;
+    friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) ALWAYS_INLINE;
+
+    // v4int miscellaneous friends
+
+    friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE;
+    friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE;
+    friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE;
+
+    // v4 memory manipulation friends
+
+    friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE;
+    friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE;
+    friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE;
+    friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE;
+    friend inline void copy_4x1( void * ALIGNED(16) dst,
+                                 const void * ALIGNED(16) src ) ALWAYS_INLINE;
+    friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE;
+
+    // v4 transposed memory manipulation friends
+
+    friend inline void load_4x1_tr( const void *a0, const void *a1,
+                                    const void *a2, const void *a3,
+                                    v4 &a ) ALWAYS_INLINE;
+
+    friend inline void load_4x2_tr( const void * ALIGNED(8) a0,
+                                    const void * ALIGNED(8) a1,
+                                    const void * ALIGNED(8) a2,
+                                    const void * ALIGNED(8) a3,
+                                    v4 &a, v4 &b ) ALWAYS_INLINE;
+
+    friend inline void load_4x3_tr( const void * ALIGNED(16) a0,
+                                    const void * ALIGNED(16) a1,
+                                    const void * ALIGNED(16) a2,
+                                    const void * ALIGNED(16) a3,
+                                    v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE;
+
+    friend inline void load_4x4_tr( const void * ALIGNED(16) a0,
+                                    const void * ALIGNED(16) a1,
+                                    const void * ALIGNED(16) a2,
+                                    const void * ALIGNED(16) a3,
+                                    v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE;
+
+    friend inline void store_4x1_tr( const v4 &a,
+                                     void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE;
+
+    friend inline void store_4x2_tr( const v4 &a, const v4 &b,
+                                     void * ALIGNED(8) a0,
+                                     void * ALIGNED(8) a1,
+                                     void * ALIGNED(8) a2,
+                                     void * ALIGNED(8) a3 ) ALWAYS_INLINE;
+
+    friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c,
+                                     void * ALIGNED(16) a0,
+                                     void * ALIGNED(16) a1,
+                                     void * ALIGNED(16) a2,
+                                     void * ALIGNED(16) a3 ) ALWAYS_INLINE;
+
+    friend inline void store_4x4_tr( const v4 &a, const v4 &b,
+                                     const v4 &c, const v4 &d,
+                                     void * ALIGNED(16) a0,
+                                     void * ALIGNED(16) a1,
+                                     void * ALIGNED(16) a2,
+                                     void * ALIGNED(16) a3 ) ALWAYS_INLINE;
+
+  protected:
+
+    union
+    {
+      int i[4];
+      float f[4];
+    };
+
+  public:
+
+    v4() {}            // Default constructor
+
+    v4( const v4 &a )  // Copy constructor
+    {
+      ALWAYS_VECTORIZE
+      for( int j = 0; j < 4; j++ )
+        i[j] = a.i[j];
+    }
+
+    ~v4() {}           // Default destructor
+  };
+
+  // v4 miscellaneous functions
+
+  inline int any( const v4 &a )
+  {
+    return a.i[0] || a.i[1] || a.i[2] || a.i[3];
+  }
+
+  inline int all( const v4 &a )
+  {
+    return a.i[0] && a.i[1] && a.i[2] && a.i[3];
+  }
+
+  template <int n>
+  inline v4 splat( const v4 & a )
+  {
+    v4 b;
+
+    ALWAYS_VECTORIZE
+    for( int j = 0; j < 4; j++ )
+      b.i[j] = a.i[n];
+
+    return b;
+  }
+
+  template <int i0, int i1, int i2, int i3>
+  inline v4 shuffle( const v4 & a )
+  {
+    v4 b;
+
+    b.i[0] = a.i[i0];
+    b.i[1] = a.i[i1];
+    b.i[2] = a.i[i2];
+    b.i[3] = a.i[i3];
+
+    return b;
+  }
+
+# define sw(x,y) x^=y, y^=x, x^=y
+
+  inline void swap( v4 &a, v4 &b )
+  {
+    ALWAYS_VECTORIZE
+    for( int j = 0; j < 4; j++ )
+      sw( a.i[j], b.i[j] );
+  }
+
+  inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 )
+  {
+    sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] );
+    sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] );
+    sw( a2.i[3],a3.i[2] );
+  }
+
+# undef sw
+
+  // v4 memory manipulation functions
+
+  inline void load_4x1( const void * ALIGNED(16) p,
+                        v4 &a )
+  {
+    ALWAYS_VECTORIZE
+    for( int j = 0; j < 4; j++ )
+      a.i[j] = ((const int * ALIGNED(16))p)[j];
+  }
+
+  inline void store_4x1( const v4 &a,
+                         void * ALIGNED(16) p )
+  {
+    ALWAYS_VECTORIZE
+    for( int j = 0; j < 4; j++ )
+      ((int * ALIGNED(16))p)[j] = a.i[j];
+  }
+
+  inline void stream_4x1( const v4 &a,
+                          void * ALIGNED(16) p )
+  {
+    ALWAYS_VECTORIZE
+    for( int j = 0; j < 4; j++ )
+      ((int * ALIGNED(16))p)[j] = a.i[j];
+  }
+
+  inline void clear_4x1( void * ALIGNED(16) p )
+  {
+    ALWAYS_VECTORIZE
+    for( int j = 0; j < 4; j++ )
+      ((int * ALIGNED(16))p)[j] = 0;
+  }
+
+  // FIXME: Ordering semantics
+  inline void copy_4x1( void * ALIGNED(16) dst,
+                        const void * ALIGNED(16) src )
+  {
+    ALWAYS_VECTORIZE
+    for( int j = 0; j < 4; j++ )
+      ((int * ALIGNED(16))dst)[j] = ((const int * ALIGNED(16))src)[j];
+  }
+
+  inline void swap_4x1( void * ALIGNED(16) a,
+                        void * ALIGNED(16) b )
+  {
+    int t;
+
+    ALWAYS_VECTORIZE
+    for( int j = 0; j < 4; j++ )
+    {
+      t = ((int * ALIGNED(16))a)[j];
+      ((int * ALIGNED(16))a)[j] = ((int * ALIGNED(16))b)[j];
+      ((int * ALIGNED(16))b)[j] = t;
+    }
+  }
+
+  // v4 transposed memory manipulation functions
+
+  inline void load_4x1_tr( const void *a0, const void *a1,
+                           const void *a2, const void *a3,
+                           v4 &a )
+  {
+    a.i[0] = ((const int *)a0)[0];
+    a.i[1] = ((const int *)a1)[0];
+    a.i[2] = ((const int *)a2)[0];
+    a.i[3] = ((const int *)a3)[0];
+  }
+
+  inline void load_4x2_tr( const void * ALIGNED(8) a0,
+                           const void * ALIGNED(8) a1,
+                           const void * ALIGNED(8) a2,
+                           const void * ALIGNED(8) a3,
+                           v4 &a, v4 &b )
+  {
+    a.i[0] = ((const int * ALIGNED(8))a0)[0];
+    b.i[0] = ((const int * ALIGNED(8))a0)[1];
+
+    a.i[1] = ((const int * ALIGNED(8))a1)[0];
+    b.i[1] = ((const int * ALIGNED(8))a1)[1];
+
+    a.i[2] = ((const int * ALIGNED(8))a2)[0];
+    b.i[2] = ((const int * ALIGNED(8))a2)[1];
+
+    a.i[3] = ((const int * ALIGNED(8))a3)[0];
+    b.i[3] = ((const int * ALIGNED(8))a3)[1];
+  }
+
+  inline void load_4x3_tr( const void * ALIGNED(16) a0,
+                           const void * ALIGNED(16) a1,
+                           const void * ALIGNED(16) a2,
+                           const void * ALIGNED(16) a3,
+                           v4 &a, v4 &b, v4 &c )
+  {
+    a.i[0] = ((const int * ALIGNED(16))a0)[0];
+    b.i[0] = ((const int * ALIGNED(16))a0)[1];
+    c.i[0] = ((const int * ALIGNED(16))a0)[2];
+
+    a.i[1] = ((const int * ALIGNED(16))a1)[0];
+    b.i[1] = ((const int * ALIGNED(16))a1)[1];
+    c.i[1] = ((const int * ALIGNED(16))a1)[2];
+
+    a.i[2] = ((const int * ALIGNED(16))a2)[0];
+    b.i[2] = ((const int * ALIGNED(16))a2)[1];
+    c.i[2] = ((const int * ALIGNED(16))a2)[2];
+
+    a.i[3] = ((const int * ALIGNED(16))a3)[0];
+    b.i[3] = ((const int * ALIGNED(16))a3)[1];
+    c.i[3] = ((const int * ALIGNED(16))a3)[2];
+  }
+
+  inline void load_4x4_tr( const void * ALIGNED(16) a0,
+                           const void * ALIGNED(16) a1,
+                           const void *
ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c, v4 &d ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + d.i[0] = ((const int * ALIGNED(16))a0)[3]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + d.i[1] = ((const int * ALIGNED(16))a1)[3]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + d.i[2] = ((const int * ALIGNED(16))a2)[3]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + d.i[3] = ((const int * ALIGNED(16))a3)[3]; + } + + inline void store_4x1_tr( const v4 &a, + void *a0, void *a1, + void *a2, void *a3 ) + { + ((int *)a0)[0] = a.i[0]; + ((int *)a1)[0] = a.i[1]; + ((int *)a2)[0] = a.i[2]; + ((int *)a3)[0] = a.i[3]; + } + + inline void store_4x2_tr( const v4 &a, const v4 &b, + void * ALIGNED(8) a0, void * ALIGNED(8) a1, + void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) + { + ((int * ALIGNED(8))a0)[0] = a.i[0]; + ((int * ALIGNED(8))a0)[1] = b.i[0]; + + ((int * ALIGNED(8))a1)[0] = a.i[1]; + ((int * ALIGNED(8))a1)[1] = b.i[1]; + + ((int * ALIGNED(8))a2)[0] = a.i[2]; + ((int * ALIGNED(8))a2)[1] = b.i[2]; + + ((int * ALIGNED(8))a3)[0] = a.i[3]; + ((int * ALIGNED(8))a3)[1] = b.i[3]; + } + + inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * ALIGNED(16))a0)[1] = b.i[0]; + ((int * ALIGNED(16))a0)[2] = c.i[0]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; + ((int * ALIGNED(16))a1)[2] = c.i[1]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; + ((int * ALIGNED(16))a2)[2] = c.i[2]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; + ((int * ALIGNED(16))a3)[2] = c.i[3]; + } + + inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * ALIGNED(16))a0)[1] = b.i[0]; + ((int * ALIGNED(16))a0)[2] = c.i[0]; + ((int * ALIGNED(16))a0)[3] = d.i[0]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; + ((int * ALIGNED(16))a1)[2] = c.i[1]; + ((int * ALIGNED(16))a1)[3] = d.i[1]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; + ((int * ALIGNED(16))a2)[2] = c.i[2]; + ((int * ALIGNED(16))a2)[3] = d.i[2]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; + ((int * ALIGNED(16))a3)[2] = c.i[3]; + ((int * ALIGNED(16))a3)[3] = d.i[3]; + } + + ////////////// + // v4int class + + class v4int : public v4 + { + // v4int prefix unary operator friends + + friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator -( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator ~( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator !( const v4int & a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v4int prefix increment / decrement operator friends + + friend inline v4int operator ++( v4int & a ) ALWAYS_INLINE; + friend inline v4int 
operator --( v4int & a ) ALWAYS_INLINE; + + // v4int postfix increment / decrement operator friends + + friend inline v4int operator ++( v4int & a, int ) ALWAYS_INLINE; + friend inline v4int operator --( v4int & a, int ) ALWAYS_INLINE; + + // v4int binary operator friends + + friend inline v4int operator +( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator -( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator *( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator /( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator %( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ^( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator &( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator |( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator <<( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >>( const v4int &a, const v4int &b ) ALWAYS_INLINE; + + // v4int logical operator friends + + friend inline v4int operator <( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4int &a, const v4int &b ) ALWAYS_INLINE; + + // v4int miscellaneous friends + + friend inline v4int abs( const v4int &a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + // FIXME: cswap, notcswap! 
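+    // Note: c acts as a per-lane mask of all ones (true) or all zeros (false):
+    // czero clears lanes where c is true, notczero clears lanes where c is
+    // false, and merge selects t where c is true and f elsewhere.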
+ friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) ALWAYS_INLINE; + + // v4float unary operator friends + + friend inline v4int operator !( const v4float & a ) ALWAYS_INLINE; + + // v4float logical operator friends + + friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + + // v4float miscellaneous friends + + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + + public: + + // v4int constructors / destructors + + v4int() {} // Default constructor + + v4int( const v4int &a ) // Copy constructor + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + i[j] = a.i[j]; + } + + v4int( const v4 &a ) // Init from mixed + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + i[j] = a.i[j]; + } + + v4int( int a ) // Init from scalar + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + i[j] = a; + } + + v4int( int i0, int i1, int i2, int i3 ) // Init from scalars + { + i[0] = i0; + i[1] = i1; + i[2] = i2; + i[3] = i3; + } + + ~v4int() {} // Destructor + + // v4int assignment operators + +# define ASSIGN(op) \ + inline v4int &operator op( const v4int &b ) \ + { \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + i[j] op b.i[j]; \ + return *this; \ + } + + ASSIGN( =) + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + ASSIGN(%=) + ASSIGN(^=) + ASSIGN(&=) + ASSIGN(|=) + ASSIGN(<<=) + ASSIGN(>>=) + +# undef ASSIGN + + // v4int member access operator + + inline int &operator []( int n ) + { + return i[n]; + } + + inline int operator ()( int n ) + { + return i[n]; + } + }; + + // v4int prefix unary operators + +# define PREFIX_UNARY(op) \ + inline v4int operator op( const v4int & a ) \ + { \ + v4int b; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } + + PREFIX_UNARY(+) + PREFIX_UNARY(-) + + inline v4int operator !( const v4int & a ) + { + v4int b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = - ( !a.i[j] ); + + return b; + } + + PREFIX_UNARY(~) + +# undef PREFIX_UNARY + + // v4int prefix increment / decrement + +# define PREFIX_INCDEC(op) \ + inline v4int operator op( v4int & a ) \ + { \ + v4int b; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } + + PREFIX_INCDEC(++) + PREFIX_INCDEC(--) + +# undef PREFIX_INCDEC + + // v4int postfix increment / decrement + +# define POSTFIX_INCDEC(op) \ + inline v4int operator op( v4int & a, int ) \ + { \ + v4int b; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + b.i[j] = ( a.i[j] op ); \ + return b; \ + } + + POSTFIX_INCDEC(++) + POSTFIX_INCDEC(--) + +# undef POSTFIX_INCDEC + + // v4int binary operators + +# define BINARY(op) \ + inline v4int operator op( const v4int &a, const v4int 
&b ) \ + { \ + v4int c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + c.i[j] = a.i[j] op b.i[j]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + BINARY(%) + BINARY(^) + BINARY(&) + BINARY(|) + BINARY(<<) + BINARY(>>) + +# undef BINARY + + // v4int logical operators + +# define LOGICAL(op) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ + v4int c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + c.i[j] = - ( a.i[j] op b.i[j] ); \ + return c; \ + } + + LOGICAL(<) + LOGICAL(>) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v4int miscellaneous functions + + inline v4int abs( const v4int &a ) + { + v4int b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = ( a.i[j] >= 0 ) ? a.i[j] : -a.i[j]; + + return b; + } + + inline v4 czero( const v4int &c, const v4 &a ) + { + v4 b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = a.i[j] & ~c.i[j]; + + return b; + } + + inline v4 notczero( const v4int &c, const v4 &a ) + { + v4 b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = a.i[j] & c.i[j]; + + return b; + } + + inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) + { + v4 m; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); + + return m; + } + + //////////////// + // v4float class + + class v4float : public v4 + { + // v4float prefix unary operator friends + + friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator -( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator ~( const v4float &a ) ALWAYS_INLINE; + friend inline v4int operator !( const v4float &a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v4float prefix increment / decrement operator friends + + friend inline v4float operator ++( v4float &a ) ALWAYS_INLINE; + friend inline v4float operator --( v4float &a ) ALWAYS_INLINE; + + // v4float postfix increment / decrement operator friends + + friend inline v4float operator ++( v4float &a, int ) ALWAYS_INLINE; + friend inline v4float operator --( v4float &a, int ) ALWAYS_INLINE; + + // v4float binary operator friends + + friend inline v4float operator +( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator -( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator *( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator /( const v4float &a, const v4float &b ) ALWAYS_INLINE; + + // v4float logical operator friends + + friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + + // v4float math library friends + +# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE +# define CMATH_FR2(fn) friend inline v4float fn( 
const v4float &a, \ + const v4float &b ) ALWAYS_INLINE + + CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); + CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); + CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); + CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); + CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); + + CMATH_FR2(copysign); + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v4float miscellaneous friends + + friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rsqrt ( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rcp ( const v4float &a ) ALWAYS_INLINE; + friend inline v4float fma ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fms ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; + + public: + + // v4float constructors / destructors + + v4float() {} // Default constructor + + v4float( const v4float &a ) // Copy constructor + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + f[j] = a.f[j]; + } + + v4float( const v4 &a ) // Init from mixed + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + f[j] = a.f[j]; + } + + v4float( float a ) // Init from scalar + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + f[j] = a; + } + + v4float( float f0, float f1, float f2, float f3 ) // Init from scalars + { + f[0] = f0; + f[1] = f1; + f[2] = f2; + f[3] = f3; + } + + ~v4float() {} // Destructor + + // v4float assignment operators + +# define ASSIGN(op) \ + inline v4float &operator op( const v4float &b ) \ + { \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + f[j] op b.f[j]; \ + return *this; \ + } + + ASSIGN(=) + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + +# undef ASSIGN + + // v4float member access operator + + inline float &operator []( int n ) + { + return f[n]; + } + + inline float operator ()( int n ) + { + return f[n]; + } + }; + + // v4float prefix unary operators + + inline v4float operator +( const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = +a.f[j]; + + return b; + } + + inline v4float operator -( const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = -a.f[j]; + + return b; + } + + inline v4int operator !( const v4float &a ) + { + v4int b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = a.i[j] ? 
0 : -1; + + return b; + } + + // v4float prefix increment / decrement operators + + inline v4float operator ++( v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = ++a.f[j]; + + return b; + } + + inline v4float operator --( v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = --a.f[j]; + + return b; + } + + // v4float postfix increment / decrement operators + + inline v4float operator ++( v4float &a, int ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = a.f[j]++; + + return b; + } + + inline v4float operator --( v4float &a, int ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = a.f[j]--; + + return b; + } + + // v4float binary operators + +# define BINARY(op) \ + inline v4float operator op( const v4float &a, const v4float &b ) \ + { \ + v4float c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + c.f[j] = a.f[j] op b.f[j]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + +# undef BINARY + + // v4float logical operators + +# define LOGICAL(op) \ + inline v4int operator op( const v4float &a, const v4float &b ) \ + { \ + v4int c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + c.i[j] = - ( a.f[j] op b.f[j] ); \ + return c; \ + } + + LOGICAL(< ) + LOGICAL(> ) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v4float math library functions + +# define CMATH_FR1(fn) \ + inline v4float fn( const v4float &a ) \ + { \ + v4float b; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + b.f[j] = ::fn( a.f[j] ); \ + return b; \ + } + +# define CMATH_FR2(fn) \ + inline v4float fn( const v4float &a, const v4float &b ) \ + { \ + v4float c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + c.f[j] = ::fn( a.f[j], b.f[j] ); \ + return c; \ + } + + CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) + CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) + CMATH_FR1(fabs) CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) + CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) + CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) + + inline v4float copysign( const v4float &a, const v4float &b ) + { + v4float c; + float t; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + { + t = ::fabs( a.f[j] ); + if( b.f[j] < 0 ) t = -t; + c.f[j] = t; + } + + return c; + } + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v4float miscellaneous functions + + inline v4float rsqrt_approx( const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = ::sqrt( 1.0f / a.f[j] ); + + return b; + } + + inline v4float rsqrt( const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = ::sqrt( 1.0f / a.f[j] ); + + return b; + } + + inline v4float rcp_approx( const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = 1.0f / a.f[j]; + + return b; + } + + inline v4float rcp( const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = 1.0f / a.f[j]; + + return b; + } + + inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) + { + v4float d; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + d.f[j] = a.f[j] * b.f[j] + c.f[j]; + + return d; + } + + inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) + { + v4float d; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + 
d.f[j] = a.f[j] * b.f[j] - c.f[j]; + + return d; + } + + inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) + { + v4float d; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + d.f[j] = c.f[j] - a.f[j] * b.f[j]; + + return d; + } + + inline v4float clear_bits( const v4int &m, const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = ( ~m.i[j] ) & a.i[j]; + + return b; + } + + inline v4float set_bits( const v4int &m, const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = m.i[j] | a.i[j]; + + return b; + } + + inline v4float toggle_bits( const v4int &m, const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = m.i[j] ^ a.i[j]; + + return b; + } + + inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + p[j] += a.f[j]; + } + + inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + p[j] -= a.f[j]; + } + + inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + p[j] *= a.f[j]; + } + + inline void trilinear( v4float & wl, v4float & wh ) + { + float x = wl.f[0], y = wl.f[1], z = wl.f[2]; + + wl.f[0] = ( ( 1.0f - x ) * ( 1.0f - y ) ) * ( 1.0f - z ); + wl.f[1] = ( ( 1.0f + x ) * ( 1.0f - y ) ) * ( 1.0f - z ); + wl.f[2] = ( ( 1.0f - x ) * ( 1.0f + y ) ) * ( 1.0f - z ); + wl.f[3] = ( ( 1.0f + x ) * ( 1.0f + y ) ) * ( 1.0f - z ); + + wh.f[0] = ( ( 1.0f - x ) * ( 1.0f - y ) ) * ( 1.0f + z ); + wh.f[1] = ( ( 1.0f + x ) * ( 1.0f - y ) ) * ( 1.0f + z ); + wh.f[2] = ( ( 1.0f - x ) * ( 1.0f + y ) ) * ( 1.0f + z ); + wh.f[3] = ( ( 1.0f + x ) * ( 1.0f + y ) ) * ( 1.0f + z ); + } + +} // namespace v4 + +#endif // _v4_portable_h_ diff --git a/src/util/v4/v4_sse.h b/src/util/v4/v4_sse.h index d14d49a0..fe82058f 100644 --- a/src/util/v4/v4_sse.h +++ b/src/util/v4/v4_sse.h @@ -23,6 +23,8 @@ // details. 
gcc-4.x.x does not seem to have this bug but may suffer from // other problems (use "-fno-strict-aliasing" on these platforms) +#define ALWAYS_INLINE __attribute__((always_inline)) + namespace v4 { class v4; @@ -45,73 +47,73 @@ namespace v4 { // v4 miscellenous friends - friend inline int any( const v4 &a ); - friend inline int all( const v4 &a ); + friend inline int any( const v4 &a ) ALWAYS_INLINE; + friend inline int all( const v4 &a ) ALWAYS_INLINE; template - friend inline v4 splat( const v4 &a ); + friend inline v4 splat( const v4 &a ) ALWAYS_INLINE; template - friend inline v4 shuffle( const v4 &a ); + friend inline v4 shuffle( const v4 &a ) ALWAYS_INLINE; - friend inline void swap( v4 &a, v4 &b ); - friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ); + friend inline void swap( v4 &a, v4 &b ) ALWAYS_INLINE; + friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4 czero( const v4int &c, const v4 &a ); - friend inline v4 notczero( const v4int &c, const v4 &a ); - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ); + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; // v4 memory manipulation friends - friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ); - friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ); - friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ); - friend inline void clear_4x1( void * ALIGNED(16) dst ); + friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; + friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; friend inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ); - friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ); + const void * ALIGNED(16) src ) ALWAYS_INLINE; + friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends friend inline void load_4x1_tr( const void *a0, const void *a1, const void *a2, const void *a3, - v4 &a ); + v4 &a ) ALWAYS_INLINE; friend inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, - v4 &a, v4 &b ); + v4 &a, v4 &b ) ALWAYS_INLINE; friend inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ); + v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; friend inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ); + v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; friend inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, void *a2, void *a3 ); + void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; friend inline void store_4x2_tr( const v4 &a, const v4 &b, void * ALIGNED(8) a0, void * ALIGNED(8) a1, void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ); + void * ALIGNED(8) a3 ) ALWAYS_INLINE; friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, void * ALIGNED(16) a0, void * ALIGNED(16) a1, 
void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ); + void * ALIGNED(16) a3 ) ALWAYS_INLINE; friend inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ); + void * ALIGNED(16) a3 ) ALWAYS_INLINE; protected: @@ -331,74 +333,74 @@ namespace v4 { // v4int prefix unary operator friends - friend inline v4int operator +( const v4int & a ); - friend inline v4int operator -( const v4int & a ); - friend inline v4int operator ~( const v4int & a ); - friend inline v4int operator !( const v4int & a ); + friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator -( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator ~( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator !( const v4int & a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4int prefix increment / decrement operator friends - friend inline v4int operator ++( v4int & a ); - friend inline v4int operator --( v4int & a ); + friend inline v4int operator ++( v4int & a ) ALWAYS_INLINE; + friend inline v4int operator --( v4int & a ) ALWAYS_INLINE; // v4int postfix increment / decrement operator friends - friend inline v4int operator ++( v4int & a, int ); - friend inline v4int operator --( v4int & a, int ); + friend inline v4int operator ++( v4int & a, int ) ALWAYS_INLINE; + friend inline v4int operator --( v4int & a, int ) ALWAYS_INLINE; // v4int binary operator friends - friend inline v4int operator +( const v4int &a, const v4int &b ); - friend inline v4int operator -( const v4int &a, const v4int &b ); - friend inline v4int operator *( const v4int &a, const v4int &b ); - friend inline v4int operator /( const v4int &a, const v4int &b ); - friend inline v4int operator %( const v4int &a, const v4int &b ); - friend inline v4int operator ^( const v4int &a, const v4int &b ); - friend inline v4int operator &( const v4int &a, const v4int &b ); - friend inline v4int operator |( const v4int &a, const v4int &b ); - friend inline v4int operator <<( const v4int &a, const v4int &b ); - friend inline v4int operator >>( const v4int &a, const v4int &b ); + friend inline v4int operator +( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator -( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator *( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator /( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator %( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ^( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator &( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator |( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator <<( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >>( const v4int &a, const v4int &b ) ALWAYS_INLINE; // v4int logical operator friends - friend inline v4int operator <( const v4int &a, const v4int &b ); - friend inline v4int operator >( const v4int &a, const v4int &b ); - friend inline v4int operator ==( const v4int &a, const v4int &b ); - friend inline v4int operator !=( const v4int &a, const v4int &b ); - friend inline v4int operator <=( const v4int &a, const v4int &b ); - friend inline v4int operator >=( const v4int &a, const v4int &b ); - friend inline v4int operator &&( const v4int 
&a, const v4int &b ); - friend inline v4int operator ||( const v4int &a, const v4int &b ); + friend inline v4int operator <( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4int &a, const v4int &b ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4int abs( const v4int &a ); - friend inline v4 czero( const v4int &c, const v4 &a ); - friend inline v4 notczero( const v4int &c, const v4 &a ); + friend inline v4int abs( const v4int &a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! - friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ); + friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) ALWAYS_INLINE; // v4float unary operator friends - friend inline v4int operator !( const v4float & a ); + friend inline v4int operator !( const v4float & a ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ); - friend inline v4int operator >( const v4float &a, const v4float &b ); - friend inline v4int operator ==( const v4float &a, const v4float &b ); - friend inline v4int operator !=( const v4float &a, const v4float &b ); - friend inline v4int operator <=( const v4float &a, const v4float &b ); - friend inline v4int operator >=( const v4float &a, const v4float &b ); - friend inline v4int operator &&( const v4float &a, const v4float &b ); - friend inline v4int operator ||( const v4float &a, const v4float &b ); + friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; // v4float miscellaneous friends - friend inline v4float clear_bits( const v4int &m, const v4float &a ); - friend inline v4float set_bits( const v4int &m, const v4float &a ); - friend inline v4float toggle_bits( const v4int &m, const v4float &a ); + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; public: @@ -642,45 +644,45 @@ namespace v4 { // v4float prefix unary operator friends - friend inline v4float operator +( const v4float &a ); - friend inline v4float operator -( const v4float &a ); - friend inline v4float operator ~( const v4float 
&a ); - friend inline v4int operator !( const v4float &a ); + friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator -( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator ~( const v4float &a ) ALWAYS_INLINE; + friend inline v4int operator !( const v4float &a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4float prefix increment / decrement operator friends - friend inline v4float operator ++( v4float &a ); - friend inline v4float operator --( v4float &a ); + friend inline v4float operator ++( v4float &a ) ALWAYS_INLINE; + friend inline v4float operator --( v4float &a ) ALWAYS_INLINE; // v4float postfix increment / decrement operator friends - friend inline v4float operator ++( v4float &a, int ); - friend inline v4float operator --( v4float &a, int ); + friend inline v4float operator ++( v4float &a, int ) ALWAYS_INLINE; + friend inline v4float operator --( v4float &a, int ) ALWAYS_INLINE; // v4float binary operator friends - friend inline v4float operator +( const v4float &a, const v4float &b ); - friend inline v4float operator -( const v4float &a, const v4float &b ); - friend inline v4float operator *( const v4float &a, const v4float &b ); - friend inline v4float operator /( const v4float &a, const v4float &b ); + friend inline v4float operator +( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator -( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator *( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator /( const v4float &a, const v4float &b ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ); - friend inline v4int operator >( const v4float &a, const v4float &b ); - friend inline v4int operator ==( const v4float &a, const v4float &b ); - friend inline v4int operator !=( const v4float &a, const v4float &b ); - friend inline v4int operator <=( const v4float &a, const v4float &b ); - friend inline v4int operator >=( const v4float &a, const v4float &b ); - friend inline v4int operator &&( const v4float &a, const v4float &b ); - friend inline v4int operator ||( const v4float &a, const v4float &b ); + friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; // v4float math library friends -# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) +# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE # define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) + const v4float &b ) ALWAYS_INLINE CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); @@ -695,21 +697,21 @@ namespace v4 { // v4float miscellaneous friends - friend inline v4float 
rsqrt_approx( const v4float &a ); - friend inline v4float rsqrt( const v4float &a ); - friend inline v4float rcp_approx( const v4float &a ); - friend inline v4float rcp( const v4float &a ); - friend inline v4float fma( const v4float &a, const v4float &b, const v4float &c ); - friend inline v4float fms( const v4float &a, const v4float &b, const v4float &c ); - friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ); - friend inline v4float clear_bits( const v4int &m, const v4float &a ); - friend inline v4float set_bits( const v4int &m, const v4float &a ); - friend inline v4float toggle_bits( const v4int &m, const v4float &a ); - friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ); - friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ); - friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ); + friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rsqrt( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rcp( const v4float &a ) ALWAYS_INLINE; + friend inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; // FIXME: crack - friend inline void trilinear( v4float &wl, v4float &wh ); + friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; public: @@ -914,7 +916,8 @@ namespace v4 { b.v = _mm_rsqrt_ps(a.v); return b; } - + + #if 0 inline v4float rsqrt( const v4float &a ) { v4float b; __m128 a_v = a.v, b_v; @@ -927,13 +930,24 @@ namespace v4 { _mm_mul_ps(b_v,b_v)))))); return b; } + #endif + + inline v4float rsqrt( const v4float &a ) { + v4float b; + b.f[0] = ::sqrt( 1/a.f[0] ); + b.f[1] = ::sqrt( 1/a.f[1] ); + b.f[2] = ::sqrt( 1/a.f[2] ); + b.f[3] = ::sqrt( 1/a.f[3] ); + return b; + } inline v4float rcp_approx( const v4float &a ) { v4float b; b.v = _mm_rcp_ps(a.v); return b; } - + + #if 0 inline v4float rcp( const v4float &a ) { v4float b; __m128 a_v = a.v, b_v; @@ -941,6 +955,16 @@ namespace v4 { b.v = _mm_sub_ps(_mm_add_ps(b_v,b_v),_mm_mul_ps(a_v,_mm_mul_ps(b_v,b_v))); return b; } + #endif + + inline v4float rcp( const v4float &a ) { + v4float b; + b.f[0] = 1/a.f[0]; + b.f[1] = 1/a.f[1]; + b.f[2] = 1/a.f[2]; + b.f[3] = 1/a.f[3]; + return b; + } inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) { v4float d; diff --git a/src/util/v8/test/v8.cc b/src/util/v8/test/v8.cc new file mode 100644 index 00000000..bedc26a3 --- /dev/null +++ b/src/util/v8/test/v8.cc @@ -0,0 +1,752 @@ +#include + +#include "../../util.h" +#include "../v8.h" + +#include + +using namespace v8; + +TEST(v8, test_any) +{ + v8int a; + int i; + + for( i=0; i < 256; i++ ) + { + a[0] = i&1, a[1] = i&2, a[2] = i&4, a[3] = i&8; + a[4] = 
i&16, a[5] = i&32, a[6] = i&64, a[7] = i&128; + + ASSERT_FALSE( ( i>0 && !any(a) ) || ( i==0 && any(a) ) ); + } +} + +TEST(v8, test_all) +{ + v8int a; + int i; + for( i=0; i < 256; i++ ) + { + a[0] = i&1, a[1] = i&2, a[2] = i&4, a[3] = i&8; + a[4] = i&16, a[5] = i&32, a[6] = i&64, a[7] = i&128; + + ASSERT_FALSE( ( i < 255 && all(a) ) || ( i == 255 && !all(a) ) ); + } +} + +#if 0 + +TEST(v8, test_splat) +{ + v8int a( 1, 2, 3, 4, 5, 6, 7, 8); + v8int b( 9,10,11,12,13,14,15,16); + v8int c(17,18,19,20,21,22,23,24); + v8int d(25,26,27,28,29,30,31,32); + v8int e(33,34,35,36,37,38,39,40); + v8int f(41,42,43,44,45,46,47,48); + v8int g(49,50,51,52,53,54,55,56); + v8int h(57,58,59,60,61,62,63,64); + v8int i(65,66,67,68,69,70,71,72); + + b = splat<0>(a); + c = splat<1>(a); + d = splat<2>(a); + e = splat<3>(a); + f = splat<4>(a); + g = splat<5>(a); + h = splat<6>(a); + i = splat<7>(a); + + ASSERT_FALSE( any(a!=v8int(1,2,3,4,5,6,7,8)) || + any(b!=v8int(1,1,1,1,1,1,1,1)) || + any(c!=v8int(2,2,2,2,2,2,2,2)) || + any(d!=v8int(3,3,3,3,3,3,3,3)) || + any(e!=v8int(4,4,4,4,4,4,4,4)) || + any(f!=v8int(5,5,5,5,5,5,5,5)) || + any(g!=v8int(6,6,6,6,6,6,6,6)) || + any(h!=v8int(7,7,7,7,7,7,7,7)) || + any(i!=v8int(8,8,8,8,8,8,8,8)) ); +} + +#endif + +TEST(v8, test_shuffle) +{ + v8int a( 0, 1, 2, 3, 4, 5, 6, 7); + v8int b( 9,10,11,12,13,14,15,16); + v8int c(17,18,19,20,21,22,23,24); + v8int d(25,26,27,28,29,30,31,32); + v8int e(33,34,35,36,37,38,39,40); + v8int f(41,42,43,44,45,46,47,48); + v8int g(49,50,51,52,53,54,55,56); + v8int h(57,58,59,60,61,62,63,64); + v8int i(65,66,67,68,69,70,71,72); + + b = shuffle<1,2,3,4,5,6,7,0>(a); + c = shuffle<2,3,4,5,6,7,0,1>(a); + d = shuffle<3,4,5,6,7,0,1,2>(a); + e = shuffle<4,5,6,7,0,1,2,3>(a); + f = shuffle<5,6,7,0,1,2,3,4>(a); + g = shuffle<6,7,0,1,2,3,4,5>(a); + h = shuffle<7,0,1,2,3,4,5,6>(a); + i = shuffle<7,6,5,4,3,2,1,0>(a); + + ASSERT_FALSE( any(a!=v8int(0,1,2,3,4,5,6,7)) || + any(b!=v8int(1,2,3,4,5,6,7,0)) || + any(c!=v8int(2,3,4,5,6,7,0,1)) || + any(d!=v8int(3,4,5,6,7,0,1,2)) || + any(e!=v8int(4,5,6,7,0,1,2,3)) || + any(f!=v8int(5,6,7,0,1,2,3,4)) || + any(g!=v8int(6,7,0,1,2,3,4,5)) || + any(h!=v8int(7,0,1,2,3,4,5,6)) || + any(i!=v8int(7,6,5,4,3,2,1,0)) ); +} + +// #endif + +TEST(v8, test_swap) +{ + v8int a( 1, 2, 3, 4, 5, 6, 7, 8); + v8int b( 9,10,11,12,13,14,15,16); + + swap(a,b); + + ASSERT_FALSE( any( a != v8int( 9,10,11,12,13,14,15,16) ) || + any( b != v8int( 1, 2, 3, 4, 5, 6, 7, 8) ) ); +} + +TEST(v8, test_transpose) +{ + v8int a0( 0, 1, 2, 3, 4, 5, 6, 7); + v8int a1( 8, 9,10,11,12,13,14,15); + v8int a2(16,17,18,19,20,21,22,23); + v8int a3(24,25,26,27,28,29,30,31); + v8int a4(32,33,34,35,36,37,38,39); + v8int a5(40,41,42,43,44,45,46,47); + v8int a6(48,49,50,51,52,53,54,55); + v8int a7(56,57,58,59,60,61,62,63); + + transpose( a0, a1, a2, a3, a4, a5, a6, a7 ); + + ASSERT_FALSE( any( a0 != v8int( 0, 8, 16, 24, 32, 40, 48, 56 ) ) || + any( a1 != v8int( 1, 9, 17, 25, 33, 41, 49, 57 ) ) || + any( a2 != v8int( 2, 10, 18, 26, 34, 42, 50, 58 ) ) || + any( a3 != v8int( 3, 11, 19, 27, 35, 43, 51, 59 ) ) || + any( a4 != v8int( 4, 12, 20, 28, 36, 44, 52, 60 ) ) || + any( a5 != v8int( 5, 13, 21, 29, 37, 45, 53, 61 ) ) || + any( a6 != v8int( 6, 14, 22, 30, 38, 46, 54, 62 ) ) || + any( a7 != v8int( 7, 15, 23, 31, 39, 47, 55, 63 ) ) ); +} + +TEST(v8, test_load_8x1) +{ + DECLARE_ALIGNED_ARRAY( int, 32, mem, 32 ); + + v8int a0(1,0,0,0,0,0,0,0); + v8int a1(0,0,0,0,0,0,0,0); + v8int a2(0,0,0,0,0,0,0,0); + v8int a3(0,0,0,0,0,0,0,0); + + int i; + + for( i=0; i < 32; i++ ) mem[i] = i; + 
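+  // The four loads below should each pick up one aligned group of eight
+  // consecutive ints; the loop after them re-checks that mem itself is intact.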
+ load_8x1( mem, a0 ); + load_8x1( mem+8, a1 ); + load_8x1( mem+16, a2 ); + load_8x1( mem+24, a3 ); + + for( i=0; i < 32; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v8int( 0, 1, 2, 3, 4, 5, 6, 7) ) || + any( a1 != v8int( 8, 9,10,11,12,13,14,15) ) || + any( a2 != v8int(16,17,18,19,20,21,22,23) ) || + any( a3 != v8int(24,25,26,27,28,29,30,31) ) || + i != 32 ); +} + +TEST(v8, test_store_8x1) +{ + DECLARE_ALIGNED_ARRAY( int, 32, mem, 32 ); + + v8int a0( 0, 1, 2, 3, 4, 5, 6, 7); + v8int a1( 8, 9,10,11,12,13,14,15); + v8int a2(16,17,18,19,20,21,22,23); + v8int a3(24,25,26,27,28,29,30,31); + + int i; + + for( i=0; i < 32; i++ ) mem[i] = 0; + + store_8x1( a0, mem ); + store_8x1( a1, mem + 8 ); + store_8x1( a2, mem + 16 ); + store_8x1( a3, mem + 24 ); + + for( i=0; i < 32; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v8int( 0, 1, 2, 3, 4, 5, 6, 7) ) || + any( a1 != v8int( 8, 9,10,11,12,13,14,15) ) || + any( a2 != v8int(16,17,18,19,20,21,22,23) ) || + any( a3 != v8int(24,25,26,27,28,29,30,31) ) || + i != 32 ); +} + +TEST(v8, test_stream_8x1) +{ + DECLARE_ALIGNED_ARRAY( int, 32, mem, 32 ); + + v8int a0( 0, 1, 2, 3, 4, 5, 6, 7); + v8int a1( 8, 9,10,11,12,13,14,15); + v8int a2(16,17,18,19,20,21,22,23); + v8int a3(24,25,26,27,28,29,30,31); + + int i; + + for( i=0; i < 32; i++ ) mem[i] = 0; + + stream_8x1( a0, mem ); + stream_8x1( a1, mem + 8 ); + stream_8x1( a2, mem + 16 ); + stream_8x1( a3, mem + 24 ); + + for( i=0; i < 32; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v8int( 0, 1, 2, 3, 4, 5, 6, 7) ) || + any( a1 != v8int( 8, 9,10,11,12,13,14,15) ) || + any( a2 != v8int(16,17,18,19,20,21,22,23) ) || + any( a3 != v8int(24,25,26,27,28,29,30,31) ) || + i != 32 ); +} + +TEST(v8, test_clear_8x1) +{ + v8float vmem[4]; float * mem = (float *)vmem; + + int i; + + for(i=0; i < 32; i++) mem[i] = i; + + clear_8x1( mem + 16 ); + clear_8x1( mem + 24 ); + + for(i=16; i < 32; i++) mem[i] += i; + + for(i=0; i < 32; i++) if( mem[i] != i ) break; + + ASSERT_FALSE( i != 32 ); +} + +TEST(v8, test_copy_8x1) +{ + DECLARE_ALIGNED_ARRAY( int, 32, mem, 32 ); + + int i; + + for( i=0; i < 16; i++ ) mem[i] = i; + + copy_8x1( mem + 16, mem ); + copy_8x1( mem + 24, mem + 8 ); + + for( i=16; i < 32; i++ ) mem[i] += 16; + + for( i=0; i < 32; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( i != 32 ); +} + +TEST(v8, test_swap_8x1) +{ + DECLARE_ALIGNED_ARRAY( int, 32, mem, 32 ); + + int i; + + for( i=0; i < 16; i++ ) mem[i] = i; + + copy_8x1( mem + 24, mem ); + copy_8x1( mem + 16, mem + 8 ); + + for( i=16; i < 32; i++ ) mem[i] += 16; + + swap_8x1( mem + 16, mem + 24 ); + + for( i=0; i < 32; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( i != 32 ); +} + +TEST(v8, test_load_8x1_tr) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v8int a0, a1, a2, a3, a4, a5, a6, a7; + + int i; + + for( i=0; i < 64; i++ ) mem[i] = i; + + load_8x1_tr( mem, mem+8, mem+16, mem+24, mem+32, mem+40, mem+48, mem+56, a0 ); + load_8x1_tr( mem+1, mem+9, mem+17, mem+25, mem+33, mem+41, mem+49, mem+57, a1 ); + load_8x1_tr( mem+2, mem+10, mem+18, mem+26, mem+34, mem+42, mem+50, mem+58, a2 ); + load_8x1_tr( mem+3, mem+11, mem+19, mem+27, mem+35, mem+43, mem+51, mem+59, a3 ); + load_8x1_tr( mem+4, mem+12, mem+20, mem+28, mem+36, mem+44, mem+52, mem+60, a4 ); + load_8x1_tr( mem+5, mem+13, mem+21, mem+29, mem+37, mem+45, mem+53, mem+61, a5 ); + load_8x1_tr( mem+6, mem+14, mem+22, mem+30, mem+38, mem+46, mem+54, mem+62, a6 ); + load_8x1_tr( mem+7, mem+15, mem+23, mem+31, mem+39, mem+47, mem+55, mem+63, a7 ); + + for( 
i=0; i < 64; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v8int(0, 8,16,24,32,40,48,56) ) || + any( a1 != v8int(1, 9,17,25,33,41,49,57) ) || + any( a2 != v8int(2,10,18,26,34,42,50,58) ) || + any( a3 != v8int(3,11,19,27,35,43,51,59) ) || + any( a4 != v8int(4,12,20,28,36,44,52,60) ) || + any( a5 != v8int(5,13,21,29,37,45,53,61) ) || + any( a6 != v8int(6,14,22,30,38,46,54,62) ) || + any( a7 != v8int(7,15,23,31,39,47,55,63) ) || + i != 64 ); +} + +TEST(v8, test_load_8x2_tr) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v8int a0, a1, a2, a3, a4, a5, a6, a7; + + int i; + + for( i=0; i < 64; i++ ) mem[i] = i; + + load_8x2_tr( mem, mem+8, mem+16, mem+24, mem+32, mem+40, mem+48, mem+56, a0, a1 ); + load_8x2_tr( mem+2, mem+10, mem+18, mem+26, mem+34, mem+42, mem+50, mem+58, a2, a3 ); + load_8x2_tr( mem+4, mem+12, mem+20, mem+28, mem+36, mem+44, mem+52, mem+60, a4, a5 ); + load_8x2_tr( mem+6, mem+14, mem+22, mem+30, mem+38, mem+46, mem+54, mem+62, a6, a7 ); + + for( i=0; i < 64; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v8int(0, 8,16,24,32,40,48,56) ) || + any( a1 != v8int(1, 9,17,25,33,41,49,57) ) || + any( a2 != v8int(2,10,18,26,34,42,50,58) ) || + any( a3 != v8int(3,11,19,27,35,43,51,59) ) || + any( a4 != v8int(4,12,20,28,36,44,52,60) ) || + any( a5 != v8int(5,13,21,29,37,45,53,61) ) || + any( a6 != v8int(6,14,22,30,38,46,54,62) ) || + any( a7 != v8int(7,15,23,31,39,47,55,63) ) || + i != 64 ); +} + +TEST(v8, test_load_8x2_tr_a) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v8int a0, a1, a2, a3, a4, a5, a6, a7; + + int i; + + for( i=0; i < 64; i++ ) mem[i] = i; + + load_8x2_tr( mem, mem+2, mem+4, mem+6, mem+8, mem+10, mem+12, mem+14, a0, a1 ); + load_8x2_tr( mem+16, mem+18, mem+20, mem+22, mem+24, mem+26, mem+28, mem+30, a2, a3 ); + load_8x2_tr( mem+32, mem+34, mem+36, mem+38, mem+40, mem+42, mem+44, mem+46, a4, a5 ); + load_8x2_tr( mem+48, mem+50, mem+52, mem+54, mem+56, mem+58, mem+60, mem+62, a6, a7 ); + + for( i=0; i < 64; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v8int( 0, 2, 4, 6, 8,10,12,14) ) || + any( a1 != v8int( 1, 3, 5, 7, 9,11,13,15) ) || + any( a2 != v8int(16,18,20,22,24,26,28,30) ) || + any( a3 != v8int(17,19,21,23,25,27,29,31) ) || + any( a4 != v8int(32,34,36,38,40,42,44,46) ) || + any( a5 != v8int(33,35,37,39,41,43,45,47) ) || + any( a6 != v8int(48,50,52,54,56,58,60,62) ) || + any( a7 != v8int(49,51,53,55,57,59,61,63) ) || + i != 64 ); +} + +TEST(v8, test_load_8x3_tr) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v8int a0, a1, a2; + + int i; + + for( i=0; i < 64; i++ ) mem[i] = i; + + load_8x3_tr( mem, mem+8, mem+16, mem+24, mem+32, mem+40, mem+48, mem+56, + a0, a1, a2 ); + + for( i=0; i < 64; i++ ) if( mem[i]!=i ) break; + + ASSERT_FALSE( any( a0 != v8int(0, 8,16,24,32,40,48,56) ) || + any( a1 != v8int(1, 9,17,25,33,41,49,57) ) || + any( a2 != v8int(2,10,18,26,34,42,50,58) ) || + i != 64 ); +} + +TEST(v8, test_load_8x4_tr) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v8int a0, a1, a2, a3; + + int i; + + for( i=0; i < 64; i++ ) mem[i] = i; + + load_8x4_tr( mem, mem+8, mem+16, mem+24, mem+32, mem+40, mem+48, mem+56, + a0, a1, a2, a3 ); + + for( i=0; i < 64; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v8int(0, 8,16,24,32,40,48,56) ) || + any( a1 != v8int(1, 9,17,25,33,41,49,57) ) || + any( a2 != v8int(2,10,18,26,34,42,50,58) ) || + any( a3 != v8int(3,11,19,27,35,43,51,59) ) || + i != 64 ); +} + +TEST(v8, test_load_8x4_tr_a) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v8int a0, a1, a2, 
a3; + + int i; + + for( i=0; i < 64; i++ ) mem[i] = i; + + load_8x4_tr( mem, mem+4, mem+8, mem+12, mem+16, mem+20, mem+24, mem+28, + a0, a1, a2, a3 ); + + for( i=0; i < 64; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v8int(0,4, 8,12,16,20,24,28) ) || + any( a1 != v8int(1,5, 9,13,17,21,25,29) ) || + any( a2 != v8int(2,6,10,14,18,22,26,30) ) || + any( a3 != v8int(3,7,11,15,19,23,27,31) ) || + i != 64 ); +} + +TEST(v8, test_load_8x8_tr) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v8int a0, a1, a2, a3, a4, a5, a6, a7; + + int i; + + for( i=0; i < 64; i++ ) mem[i] = i; + + load_8x8_tr( mem, mem+8, mem+16, mem+24, mem+32, mem+40, mem+48, mem+56, + a0, a1, a2, a3, a4, a5, a6, a7 ); + + for( i=0; i < 64; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v8int(0, 8,16,24,32,40,48,56) ) || + any( a1 != v8int(1, 9,17,25,33,41,49,57) ) || + any( a2 != v8int(2,10,18,26,34,42,50,58) ) || + any( a3 != v8int(3,11,19,27,35,43,51,59) ) || + any( a4 != v8int(4,12,20,28,36,44,52,60) ) || + any( a5 != v8int(5,13,21,29,37,45,53,61) ) || + any( a6 != v8int(6,14,22,30,38,46,54,62) ) || + any( a7 != v8int(7,15,23,31,39,47,55,63) ) || + i != 64 ); +} + +TEST(v8, test_store_8x1_tr) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v8int a0(0, 8,16,24,32,40,48,56); + v8int a1(1, 9,17,25,33,41,49,57); + v8int a2(2,10,18,26,34,42,50,58); + v8int a3(3,11,19,27,35,43,51,59); + v8int a4(4,12,20,28,36,44,52,60); + v8int a5(5,13,21,29,37,45,53,61); + v8int a6(6,14,22,30,38,46,54,62); + v8int a7(7,15,23,31,39,47,55,63); + + int i; + + for( i=0; i < 64; i++ ) mem[i] = 0; + + store_8x1_tr( a0, mem, mem+ 8, mem+16, mem+24, mem+32, mem+40, mem+48, mem+56 ); + store_8x1_tr( a1, mem+1, mem+ 9, mem+17, mem+25, mem+33, mem+41, mem+49, mem+57 ); + store_8x1_tr( a2, mem+2, mem+10, mem+18, mem+26, mem+34, mem+42, mem+50, mem+58 ); + store_8x1_tr( a3, mem+3, mem+11, mem+19, mem+27, mem+35, mem+43, mem+51, mem+59 ); + store_8x1_tr( a4, mem+4, mem+12, mem+20, mem+28, mem+36, mem+44, mem+52, mem+60 ); + store_8x1_tr( a5, mem+5, mem+13, mem+21, mem+29, mem+37, mem+45, mem+53, mem+61 ); + store_8x1_tr( a6, mem+6, mem+14, mem+22, mem+30, mem+38, mem+46, mem+54, mem+62 ); + store_8x1_tr( a7, mem+7, mem+15, mem+23, mem+31, mem+39, mem+47, mem+55, mem+63 ); + + for( i=0; i < 64; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v8int(0, 8,16,24,32,40,48,56) ) || + any( a1 != v8int(1, 9,17,25,33,41,49,57) ) || + any( a2 != v8int(2,10,18,26,34,42,50,58) ) || + any( a3 != v8int(3,11,19,27,35,43,51,59) ) || + any( a4 != v8int(4,12,20,28,36,44,52,60) ) || + any( a5 != v8int(5,13,21,29,37,45,53,61) ) || + any( a6 != v8int(6,14,22,30,38,46,54,62) ) || + any( a7 != v8int(7,15,23,31,39,47,55,63) ) || + i != 64 ); +} + +TEST(v8, test_store_8x2_tr) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v8int a0(0, 8,16,24,32,40,48,56); + v8int a1(1, 9,17,25,33,41,49,57); + v8int a2(2,10,18,26,34,42,50,58); + v8int a3(3,11,19,27,35,43,51,59); + v8int a4(4,12,20,28,36,44,52,60); + v8int a5(5,13,21,29,37,45,53,61); + v8int a6(6,14,22,30,38,46,54,62); + v8int a7(7,15,23,31,39,47,55,63); + + int i; + + for( i=0; i < 64; i++ ) mem[i] = 0; + + store_8x2_tr( a0, a1, mem, mem+ 8, mem+16, mem+24, mem+32, mem+40, mem+48, mem+56 ); + store_8x2_tr( a2, a3, mem+2, mem+10, mem+18, mem+26, mem+34, mem+42, mem+50, mem+58 ); + store_8x2_tr( a4, a5, mem+4, mem+12, mem+20, mem+28, mem+36, mem+44, mem+52, mem+60 ); + store_8x2_tr( a6, a7, mem+6, mem+14, mem+22, mem+30, mem+38, mem+46, mem+54, mem+62 ); + + for( i=0; i < 64; i++ ) if( 
mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v8int(0, 8,16,24,32,40,48,56) ) || + any( a1 != v8int(1, 9,17,25,33,41,49,57) ) || + any( a2 != v8int(2,10,18,26,34,42,50,58) ) || + any( a3 != v8int(3,11,19,27,35,43,51,59) ) || + any( a4 != v8int(4,12,20,28,36,44,52,60) ) || + any( a5 != v8int(5,13,21,29,37,45,53,61) ) || + any( a6 != v8int(6,14,22,30,38,46,54,62) ) || + any( a7 != v8int(7,15,23,31,39,47,55,63) ) || + i != 64 ); +} + +TEST(v8, test_store_8x2_tr_a) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v8int a0( 0, 2, 4, 6, 8,10,12,14); + v8int a1( 1, 3, 5, 7, 9,11,13,15); + v8int a2(16,18,20,22,24,26,28,30); + v8int a3(17,19,21,23,25,27,29,31); + v8int a4(32,34,36,38,40,42,44,46); + v8int a5(33,35,37,39,41,43,45,47); + v8int a6(48,50,52,54,56,58,60,62); + v8int a7(49,51,53,55,57,59,61,63); + + int i; + + for( i=0; i < 64; i++ ) mem[i] = 0; + + store_8x2_tr( a0, a1, mem, mem+ 2, mem+ 4, mem+ 6, mem+ 8, mem+10, mem+12, mem+14 ); + store_8x2_tr( a2, a3, mem+16, mem+18, mem+20, mem+22, mem+24, mem+26, mem+28, mem+30 ); + store_8x2_tr( a4, a5, mem+32, mem+34, mem+36, mem+38, mem+40, mem+42, mem+44, mem+46 ); + store_8x2_tr( a6, a7, mem+48, mem+50, mem+52, mem+54, mem+56, mem+58, mem+60, mem+62 ); + + for( i=0; i < 64; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v8int( 0, 2, 4, 6, 8,10,12,14) ) || + any( a1 != v8int( 1, 3, 5, 7, 9,11,13,15) ) || + any( a2 != v8int(16,18,20,22,24,26,28,30) ) || + any( a3 != v8int(17,19,21,23,25,27,29,31) ) || + any( a4 != v8int(32,34,36,38,40,42,44,46) ) || + any( a5 != v8int(33,35,37,39,41,43,45,47) ) || + any( a6 != v8int(48,50,52,54,56,58,60,62) ) || + any( a7 != v8int(49,51,53,55,57,59,61,63) ) || + i != 64 ); +} + +TEST(v8, test_store_8x3_tr) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v8int a0(0, 8,16,24,32,40,48,56); + v8int a1(1, 9,17,25,33,41,49,57); + v8int a2(2,10,18,26,34,42,50,58); + v8int a3(3,11,19,27,35,43,51,59); + v8int a4(4,12,20,28,36,44,52,60); + v8int a5(5,13,21,29,37,45,53,61); + v8int a6(6,14,22,30,38,46,54,62); + v8int a7(7,15,23,31,39,47,55,63); + + int i, j; + + for( i=0; i < 64; i++ ) mem[i] = 0; + + store_8x3_tr( a0, a1, a2, + mem, mem+8, mem+16, mem+24, mem+32, mem+40, mem+48, mem+56 ); + + j = 0; + for( i=0; i < 8; i++ ) + { + if( ( i < 3 && mem[i] != i ) || + ( i >= 3 && mem[i] != 0 ) ) + break; + else + j++; + + if( ( i < 3 && mem[i+ 8] != i+ 8 ) || + ( i >= 3 && mem[i+ 8] != 0 ) ) + break; + else + j++; + + if( ( i < 3 && mem[i+16] != i+16 ) || + ( i >= 3 && mem[i+16] != 0 ) ) + break; + else + j++; + + if( ( i < 3 && mem[i+24] != i+24 ) || + ( i >= 3 && mem[i+24] != 0 ) ) + break; + else + j++; + + if( ( i < 3 && mem[i+32] != i+32 ) || + ( i >= 3 && mem[i+32] != 0 ) ) + break; + else + j++; + + if( ( i < 3 && mem[i+40] != i+40 ) || + ( i >= 3 && mem[i+40] != 0 ) ) + break; + else + j++; + + if( ( i < 3 && mem[i+48] != i+48 ) || + ( i >= 3 && mem[i+48] != 0 ) ) + break; + else + j++; + + if( ( i < 3 && mem[i+56] != i+56 ) || + ( i >= 3 && mem[i+56] != 0 ) ) + break; + else + j++; + } + + ASSERT_FALSE( any( a0 != v8int(0, 8,16,24,32,40,48,56) ) || + any( a1 != v8int(1, 9,17,25,33,41,49,57) ) || + any( a2 != v8int(2,10,18,26,34,42,50,58) ) || + any( a3 != v8int(3,11,19,27,35,43,51,59) ) || + any( a4 != v8int(4,12,20,28,36,44,52,60) ) || + any( a5 != v8int(5,13,21,29,37,45,53,61) ) || + any( a6 != v8int(6,14,22,30,38,46,54,62) ) || + any( a7 != v8int(7,15,23,31,39,47,55,63) ) || + j != 64 ); +} + +TEST(v8, test_store_8x4_tr) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v8int a0(0, 
8,16,24,32,40,48,56); + v8int a1(1, 9,17,25,33,41,49,57); + v8int a2(2,10,18,26,34,42,50,58); + v8int a3(3,11,19,27,35,43,51,59); + v8int a4(4,12,20,28,36,44,52,60); + v8int a5(5,13,21,29,37,45,53,61); + v8int a6(6,14,22,30,38,46,54,62); + v8int a7(7,15,23,31,39,47,55,63); + + int i; + + for( i=0; i < 64; i++ ) mem[i] = 0; + + store_8x4_tr( a0, a1, a2, a3, + mem, mem+ 8, mem+16, mem+24, mem+32, mem+40, mem+48, mem+56 ); + + store_8x4_tr( a4, a5, a6, a7, + mem+4, mem+12, mem+20, mem+28, mem+36, mem+44, mem+52, mem+60 ); + + for( i=0; i < 64; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v8int(0, 8,16,24,32,40,48,56) ) || + any( a1 != v8int(1, 9,17,25,33,41,49,57) ) || + any( a2 != v8int(2,10,18,26,34,42,50,58) ) || + any( a3 != v8int(3,11,19,27,35,43,51,59) ) || + any( a4 != v8int(4,12,20,28,36,44,52,60) ) || + any( a5 != v8int(5,13,21,29,37,45,53,61) ) || + any( a6 != v8int(6,14,22,30,38,46,54,62) ) || + any( a7 != v8int(7,15,23,31,39,47,55,63) ) || + i != 64 ); +} + +TEST(v8, test_store_8x4_tr_a) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v8int a0( 0, 4, 8,12,16,20,24,28); + v8int a1( 1, 5, 9,13,17,21,25,29); + v8int a2( 2, 6,10,14,18,22,26,30); + v8int a3( 3, 7,11,15,19,23,27,31); + v8int a4(32,36,40,44,48,52,56,60); + v8int a5(33,37,41,45,49,53,57,61); + v8int a6(34,38,42,46,50,54,58,62); + v8int a7(35,39,43,47,51,55,59,63); + + int i; + + for( i=0; i < 64; i++ ) mem[i] = 0; + + store_8x4_tr( a0, a1, a2, a3, + mem, mem+ 4, mem+ 8, mem+12, mem+16, mem+20, mem+24, mem+28 ); + + store_8x4_tr( a4, a5, a6, a7, + mem+32, mem+36, mem+40, mem+44, mem+48, mem+52, mem+56, mem+60 ); + + for( i=0; i < 64; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v8int( 0, 4, 8,12,16,20,24,28) ) || + any( a1 != v8int( 1, 5, 9,13,17,21,25,29) ) || + any( a2 != v8int( 2, 6,10,14,18,22,26,30) ) || + any( a3 != v8int( 3, 7,11,15,19,23,27,31) ) || + any( a4 != v8int(32,36,40,44,48,52,56,60) ) || + any( a5 != v8int(33,37,41,45,49,53,57,61) ) || + any( a6 != v8int(34,38,42,46,50,54,58,62) ) || + any( a7 != v8int(35,39,43,47,51,55,59,63) ) || + i != 64 ); +} + +TEST(v8, test_store_8x8_tr) +{ + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + + v8int a0(0, 8,16,24,32,40,48,56); + v8int a1(1, 9,17,25,33,41,49,57); + v8int a2(2,10,18,26,34,42,50,58); + v8int a3(3,11,19,27,35,43,51,59); + v8int a4(4,12,20,28,36,44,52,60); + v8int a5(5,13,21,29,37,45,53,61); + v8int a6(6,14,22,30,38,46,54,62); + v8int a7(7,15,23,31,39,47,55,63); + + int i; + + for( i=0; i < 64; i++ ) mem[i] = 0; + + store_8x8_tr( a0, a1, a2, a3, a4, a5, a6, a7, + mem, mem+ 8, mem+16, mem+24, mem+32, mem+40, mem+48, mem+56 ); + + for( i=0; i < 64; i++ ) if( mem[i] != i ) break; + + ASSERT_FALSE( any( a0 != v8int(0, 8,16,24,32,40,48,56) ) || + any( a1 != v8int(1, 9,17,25,33,41,49,57) ) || + any( a2 != v8int(2,10,18,26,34,42,50,58) ) || + any( a3 != v8int(3,11,19,27,35,43,51,59) ) || + any( a4 != v8int(4,12,20,28,36,44,52,60) ) || + any( a5 != v8int(5,13,21,29,37,45,53,61) ) || + any( a6 != v8int(6,14,22,30,38,46,54,62) ) || + any( a7 != v8int(7,15,23,31,39,47,55,63) ) || + i != 64 ); +} diff --git a/src/util/v8/v8.h b/src/util/v8/v8.h new file mode 100644 index 00000000..3275225b --- /dev/null +++ b/src/util/v8/v8.h @@ -0,0 +1,16 @@ +#ifndef _v8_h_ +#define _v8_h_ +// FIXME: STYLE +#define IN_v8_h +// FIXME: SHOULDN'T THIS INCLUDE UTIL_BASE.H? 
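+// The conditional block below selects one v8 implementation at compile time;
+// the #if/#elif chain gives USE_V8_PORTABLE precedence over USE_V8_AVX2, which
+// in turn takes precedence over USE_V8_AVX, and if none of these macros is
+// defined no implementation is included at all. Illustrative (hypothetical)
+// compile line selecting the AVX2 variant:
+//
+//   c++ -DUSE_V8_AVX2 -mavx2 -c file_using_v8.cc
+//
+// (The file name and exact flags above are placeholders, not part of this patch.)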
+#ifdef __cplusplus +# if defined USE_V8_PORTABLE +# include "v8_portable.h" +# elif defined USE_V8_AVX2 +# include "v8_avx2.h" +# elif defined USE_V8_AVX +# include "v8_avx.h" +# endif +#endif +#undef IN_v8_h +#endif // _v8_h_ diff --git a/src/util/v8/v8_avx.h b/src/util/v8/v8_avx.h new file mode 100644 index 00000000..7a78a228 --- /dev/null +++ b/src/util/v8/v8_avx.h @@ -0,0 +1,1688 @@ +#ifndef _v8_avx_h_ +#define _v8_avx_h_ + +#ifndef IN_v8_h +#error "Do not include v8_avx.h directly; use v8.h" +#endif + +#include <immintrin.h> +#include <math.h> + +#define V8_ACCELERATION +#define V8_AVX_ACCELERATION + +#ifndef ALIGNED +#define ALIGNED(n) +#endif + +#define ALWAYS_INLINE __attribute__((always_inline)) + +// Why does GNU not define this function? +// #ifdef __GNUC__ +#ifndef __INTEL_COMPILER +#define _mm256_set_m128(va, vb) \ + _mm256_insertf128_ps(_mm256_castps128_ps256(vb), va, 1) +#endif + +namespace v8 +{ + class v8; + class v8int; + class v8float; + + //////////////// + // v8 base class + + class v8 + { + friend class v8int; + friend class v8float; + + // v8 miscellaneous friends + + friend inline int any( const v8 &a ) ALWAYS_INLINE; + friend inline int all( const v8 &a ) ALWAYS_INLINE; + + template<int n> + friend inline v8 splat( const v8 &a ) ALWAYS_INLINE; + + template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> + friend inline v8 shuffle( const v8 &a ) ALWAYS_INLINE; + + friend inline void swap( v8 &a, v8 &b ) ALWAYS_INLINE; + friend inline void transpose( v8 &a0, v8 &a1, v8 &a2, v8 &a3, + v8 &a4, v8 &a5, v8 &a6, v8 &a7 ) ALWAYS_INLINE; + + // v8int miscellaneous friends + + friend inline v8 czero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + friend inline v8 notczero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + friend inline v8 merge( const v8int &c, const v8 &a, const v8 &b ) ALWAYS_INLINE; + + // v8 memory manipulation friends + + friend inline void load_8x1( const void * ALIGNED(16) p, v8 &a ) ALWAYS_INLINE; + friend inline void store_8x1( const v8 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void stream_8x1( const v8 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_8x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_8x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) ALWAYS_INLINE; + friend inline void swap_8x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + // v8 transposed memory manipulation friends + // Note: Half aligned values are permissible in the 8x2_tr variants.
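+  //
+  // The _tr (transposed) variants below gather from / scatter to eight separate
+  // pointers rather than one contiguous block. For example, load_8x4_tr( p0, ..., p7,
+  // a, b, c, d ) reads four consecutive values from each pointer and transposes them,
+  // so that a holds element 0 of every pointer, b element 1, and so on (AoS -> SoA).
+  // "Half aligned" means the 8x2_tr variants only require 8-byte alignment (they move
+  // 64 bits per pointer); the 8x3/8x4/8x8 variants take ALIGNED(16) pointers, and the
+  // 8x1_tr variants accept arbitrary pointers. Illustrative use, mirroring the unit
+  // tests in this patch:
+  //
+  //   v8int a0, a1, a2, a3;
+  //   load_8x4_tr( mem, mem+8, mem+16, mem+24, mem+32, mem+40, mem+48, mem+56,
+  //                a0, a1, a2, a3 );  // a0 == { mem[0], mem[8], ..., mem[56] }
+  //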
+ + friend inline void load_8x1_tr( const void *a0, const void *a1, + const void *a2, const void *a3, + const void *a4, const void *a5, + const void *a6, const void *a7, + v8 &a ) ALWAYS_INLINE; + + friend inline void load_8x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + const void * ALIGNED(8) a4, + const void * ALIGNED(8) a5, + const void * ALIGNED(8) a6, + const void * ALIGNED(8) a7, + v8 &a, v8 &b ) ALWAYS_INLINE; + + friend inline void load_8x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c ) ALWAYS_INLINE; + + friend inline void load_8x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d ) ALWAYS_INLINE; + + friend inline void load_8x8_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d, + v8 &e, v8 &f, v8 &g, v8 &h ) ALWAYS_INLINE; + + friend inline void store_8x1_tr( const v8 &a, + void *a0, void *a1, void *a2, void *a3, + void *a4, void *a5, void *a6, void *a7 ) ALWAYS_INLINE; + + friend inline void store_8x2_tr( const v8 &a, const v8 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3, + void * ALIGNED(8) a4, + void * ALIGNED(8) a5, + void * ALIGNED(8) a6, + void * ALIGNED(8) a7 ) ALWAYS_INLINE; + + friend inline void store_8x3_tr( const v8 &a, const v8 &b, const v8 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3, + void * ALIGNED(16) a4, + void * ALIGNED(16) a5, + void * ALIGNED(16) a6, + void * ALIGNED(16) a7 ) ALWAYS_INLINE; + + friend inline void store_8x4_tr( const v8 &a, const v8 &b, + const v8 &c, const v8 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3, + void * ALIGNED(16) a4, + void * ALIGNED(16) a5, + void * ALIGNED(16) a6, + void * ALIGNED(16) a7 ) ALWAYS_INLINE; + + friend inline void store_8x8_tr( const v8 &a, const v8 &b, + const v8 &c, const v8 &d, + const v8 &e, const v8 &f, + const v8 &g, const v8 &h, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3, + void * ALIGNED(16) a4, + void * ALIGNED(16) a5, + void * ALIGNED(16) a6, + void * ALIGNED(16) a7 ) ALWAYS_INLINE; + + protected: + + union + { + int i[8]; + float f[8]; + __m256 v; + }; + + public: + + v8() {} // Default constructor + + v8( const v8 &a ) // Copy constructor + { + v = a.v; + } + + ~v8() {} // Default destructor + }; + + // v8 miscellaneous functions + + inline int any( const v8 &a ) + { + return a.i[0] || a.i[1] || a.i[2] || a.i[3] || + a.i[4] || a.i[5] || a.i[6] || a.i[7]; + } + + inline int all( const v8 &a ) + { + return a.i[0] && a.i[1] && a.i[2] && a.i[3] && + a.i[4] && a.i[5] && a.i[6] && a.i[7]; + } + + template<int n> + inline v8 splat( const v8 & a ) + { + v8 b; + + b.v = _mm256_set1_ps( a.v[n] ); + + return b; + } + + template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> + inline v8 shuffle(
const v8 & a ) + { + v8 b; + + b.i[0] = a.i[i0]; + b.i[1] = a.i[i1]; + b.i[2] = a.i[i2]; + b.i[3] = a.i[i3]; + b.i[4] = a.i[i4]; + b.i[5] = a.i[i5]; + b.i[6] = a.i[i6]; + b.i[7] = a.i[i7]; + + return b; + } + + inline void swap( v8 &a, v8 &b ) + { + __m256 a_v = a.v; + + a.v = b.v; + + b.v = a_v; + } + + inline void transpose( v8 &a0, v8 &a1, v8 &a2, v8 &a3, + v8 &a4, v8 &a5, v8 &a6, v8 &a7 ) + { + __m256 t0, t1, t2, t3, t4, t5, t6, t7; + + __m256 u0, u1, u2, u3, u4, u5, u6, u7; + + t0 = _mm256_unpacklo_ps( a0.v, a1.v ); + t1 = _mm256_unpackhi_ps( a0.v, a1.v ); + t2 = _mm256_unpacklo_ps( a2.v, a3.v ); + t3 = _mm256_unpackhi_ps( a2.v, a3.v ); + t4 = _mm256_unpacklo_ps( a4.v, a5.v ); + t5 = _mm256_unpackhi_ps( a4.v, a5.v ); + t6 = _mm256_unpacklo_ps( a6.v, a7.v ); + t7 = _mm256_unpackhi_ps( a6.v, a7.v ); + + u0 = _mm256_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u1 = _mm256_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + u2 = _mm256_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u3 = _mm256_shuffle_ps( t1, t3, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + u4 = _mm256_shuffle_ps( t4, t6, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u5 = _mm256_shuffle_ps( t4, t6, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + u6 = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u7 = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + + a0.v = _mm256_permute2f128_ps( u0, u4, 0x20 ); + a1.v = _mm256_permute2f128_ps( u1, u5, 0x20 ); + a2.v = _mm256_permute2f128_ps( u2, u6, 0x20 ); + a3.v = _mm256_permute2f128_ps( u3, u7, 0x20 ); + a4.v = _mm256_permute2f128_ps( u0, u4, 0x31 ); + a5.v = _mm256_permute2f128_ps( u1, u5, 0x31 ); + a6.v = _mm256_permute2f128_ps( u2, u6, 0x31 ); + a7.v = _mm256_permute2f128_ps( u3, u7, 0x31 ); + } + + // v8 memory manipulation functions + + inline void load_8x1( const void * ALIGNED(16) p, + v8 &a ) + { + a.i[0] = ((const int * ALIGNED(16))p)[0]; + a.i[1] = ((const int * ALIGNED(16))p)[1]; + a.i[2] = ((const int * ALIGNED(16))p)[2]; + a.i[3] = ((const int * ALIGNED(16))p)[3]; + a.i[4] = ((const int * ALIGNED(16))p)[4]; + a.i[5] = ((const int * ALIGNED(16))p)[5]; + a.i[6] = ((const int * ALIGNED(16))p)[6]; + a.i[7] = ((const int * ALIGNED(16))p)[7]; + } + + inline void store_8x1( const v8 &a, + void * ALIGNED(16) p ) + { + ((int * ALIGNED(16))p)[0] = a.i[0]; + ((int * ALIGNED(16))p)[1] = a.i[1]; + ((int * ALIGNED(16))p)[2] = a.i[2]; + ((int * ALIGNED(16))p)[3] = a.i[3]; + ((int * ALIGNED(16))p)[4] = a.i[4]; + ((int * ALIGNED(16))p)[5] = a.i[5]; + ((int * ALIGNED(16))p)[6] = a.i[6]; + ((int * ALIGNED(16))p)[7] = a.i[7]; + } + + inline void stream_8x1( const v8 &a, + void * ALIGNED(16) p ) + { + ((int * ALIGNED(16))p)[0] = a.i[0]; + ((int * ALIGNED(16))p)[1] = a.i[1]; + ((int * ALIGNED(16))p)[2] = a.i[2]; + ((int * ALIGNED(16))p)[3] = a.i[3]; + ((int * ALIGNED(16))p)[4] = a.i[4]; + ((int * ALIGNED(16))p)[5] = a.i[5]; + ((int * ALIGNED(16))p)[6] = a.i[6]; + ((int * ALIGNED(16))p)[7] = a.i[7]; + } + + inline void clear_8x1( void * ALIGNED(16) p ) + { + ((int * ALIGNED(16))p)[0] = 0; + ((int * ALIGNED(16))p)[1] = 0; + ((int * ALIGNED(16))p)[2] = 0; + ((int * ALIGNED(16))p)[3] = 0; + ((int * ALIGNED(16))p)[4] = 0; + ((int * ALIGNED(16))p)[5] = 0; + ((int * ALIGNED(16))p)[6] = 0; + ((int * ALIGNED(16))p)[7] = 0; + } + + // FIXME: Ordering semantics + inline void copy_8x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) + { + ((int * ALIGNED(16))dst)[0] = ((const int * ALIGNED(16))src)[0]; + ((int * ALIGNED(16))dst)[1] = ((const int * ALIGNED(16))src)[1]; + ((int * ALIGNED(16))dst)[2] = ((const int 
* ALIGNED(16))src)[2]; + ((int * ALIGNED(16))dst)[3] = ((const int * ALIGNED(16))src)[3]; + ((int * ALIGNED(16))dst)[4] = ((const int * ALIGNED(16))src)[4]; + ((int * ALIGNED(16))dst)[5] = ((const int * ALIGNED(16))src)[5]; + ((int * ALIGNED(16))dst)[6] = ((const int * ALIGNED(16))src)[6]; + ((int * ALIGNED(16))dst)[7] = ((const int * ALIGNED(16))src)[7]; + } + + inline void swap_8x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) + { + int t; + + t = ((int * ALIGNED(16))a)[0]; + ((int * ALIGNED(16))a)[0] = ((int * ALIGNED(16))b)[0]; + ((int * ALIGNED(16))b)[0] = t; + + t = ((int * ALIGNED(16))a)[1]; + ((int * ALIGNED(16))a)[1] = ((int * ALIGNED(16))b)[1]; + ((int * ALIGNED(16))b)[1] = t; + + t = ((int * ALIGNED(16))a)[2]; + ((int * ALIGNED(16))a)[2] = ((int * ALIGNED(16))b)[2]; + ((int * ALIGNED(16))b)[2] = t; + + t = ((int * ALIGNED(16))a)[3]; + ((int * ALIGNED(16))a)[3] = ((int * ALIGNED(16))b)[3]; + ((int * ALIGNED(16))b)[3] = t; + + t = ((int * ALIGNED(16))a)[4]; + ((int * ALIGNED(16))a)[4] = ((int * ALIGNED(16))b)[4]; + ((int * ALIGNED(16))b)[4] = t; + + t = ((int * ALIGNED(16))a)[5]; + ((int * ALIGNED(16))a)[5] = ((int * ALIGNED(16))b)[5]; + ((int * ALIGNED(16))b)[5] = t; + + t = ((int * ALIGNED(16))a)[6]; + ((int * ALIGNED(16))a)[6] = ((int * ALIGNED(16))b)[6]; + ((int * ALIGNED(16))b)[6] = t; + + t = ((int * ALIGNED(16))a)[7]; + ((int * ALIGNED(16))a)[7] = ((int * ALIGNED(16))b)[7]; + ((int * ALIGNED(16))b)[7] = t; + } + + // v8 transposed memory manipulation functions + + inline void load_8x1_tr( const void *a0, const void *a1, + const void *a2, const void *a3, + const void *a4, const void *a5, + const void *a6, const void *a7, + v8 &a ) + { + a.i[0] = ((const int *)a0)[0]; + a.i[1] = ((const int *)a1)[0]; + a.i[2] = ((const int *)a2)[0]; + a.i[3] = ((const int *)a3)[0]; + a.i[4] = ((const int *)a4)[0]; + a.i[5] = ((const int *)a5)[0]; + a.i[6] = ((const int *)a6)[0]; + a.i[7] = ((const int *)a7)[0]; + } + + inline void load_8x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + const void * ALIGNED(8) a4, + const void * ALIGNED(8) a5, + const void * ALIGNED(8) a6, + const void * ALIGNED(8) a7, + v8 &a, v8 &b ) + { + __m128 zero; + __m128 t0, t1, t2, t3; + __m256 u0, u1; + + zero = _mm_setzero_ps(); + + t0 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64 *)a0 ), (__m64 *)a1 ); + t1 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64 *)a2 ), (__m64 *)a3 ); + t2 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64 *)a4 ), (__m64 *)a5 ); + t3 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64 *)a6 ), (__m64 *)a7 ); + + u0 = _mm256_set_m128( t2, t0 ); + u1 = _mm256_set_m128( t3, t1 ); + + a.v = _mm256_shuffle_ps( u0, u1, _MM_SHUFFLE( 2, 0, 2, 0 ) ); + b.v = _mm256_shuffle_ps( u0, u1, _MM_SHUFFLE( 3, 1, 3, 1 ) ); + } + + inline void load_8x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + + a.i[3] = ((const int * 
ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + + a.i[4] = ((const int * ALIGNED(16))a4)[0]; + b.i[4] = ((const int * ALIGNED(16))a4)[1]; + c.i[4] = ((const int * ALIGNED(16))a4)[2]; + + a.i[5] = ((const int * ALIGNED(16))a5)[0]; + b.i[5] = ((const int * ALIGNED(16))a5)[1]; + c.i[5] = ((const int * ALIGNED(16))a5)[2]; + + a.i[6] = ((const int * ALIGNED(16))a6)[0]; + b.i[6] = ((const int * ALIGNED(16))a6)[1]; + c.i[6] = ((const int * ALIGNED(16))a6)[2]; + + a.i[7] = ((const int * ALIGNED(16))a7)[0]; + b.i[7] = ((const int * ALIGNED(16))a7)[1]; + c.i[7] = ((const int * ALIGNED(16))a7)[2]; + } + + inline void load_8x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d ) + { + __m256 tmp0, tmp1, tmp2, tmp3; + + a.v = _mm256_set_m128( _mm_load_ps( (const float *)a4 ), + _mm_load_ps( (const float *)a0 ) ); + b.v = _mm256_set_m128( _mm_load_ps( (const float *)a5 ), + _mm_load_ps( (const float *)a1 ) ); + c.v = _mm256_set_m128( _mm_load_ps( (const float *)a6 ), + _mm_load_ps( (const float *)a2 ) ); + d.v = _mm256_set_m128( _mm_load_ps( (const float *)a7 ), + _mm_load_ps( (const float *)a3 ) ); + + tmp0 = _mm256_shuffle_ps( a.v, b.v, 0x44 ); + tmp2 = _mm256_shuffle_ps( a.v, b.v, 0xEE ); + tmp1 = _mm256_shuffle_ps( c.v, d.v, 0x44 ); + tmp3 = _mm256_shuffle_ps( c.v, d.v, 0xEE ); + + a.v = _mm256_shuffle_ps( tmp0, tmp1, 0x88 ); + b.v = _mm256_shuffle_ps( tmp0, tmp1, 0xDD ); + c.v = _mm256_shuffle_ps( tmp2, tmp3, 0x88 ); + d.v = _mm256_shuffle_ps( tmp2, tmp3, 0xDD ); + } + + inline void load_8x8_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d, + v8 &e, v8 &f, v8 &g, v8 &h ) + { + __m256 t0, t1, t2, t3, t4, t5, t6, t7; + + __m256 u0, u1, u2, u3, u4, u5, u6, u7; + + a.v = _mm256_load_ps( (const float *)a0 ); + b.v = _mm256_load_ps( (const float *)a1 ); + c.v = _mm256_load_ps( (const float *)a2 ); + d.v = _mm256_load_ps( (const float *)a3 ); + e.v = _mm256_load_ps( (const float *)a4 ); + f.v = _mm256_load_ps( (const float *)a5 ); + g.v = _mm256_load_ps( (const float *)a6 ); + h.v = _mm256_load_ps( (const float *)a7 ); + + t0 = _mm256_unpacklo_ps( a.v, b.v ); + t1 = _mm256_unpackhi_ps( a.v, b.v ); + t2 = _mm256_unpacklo_ps( c.v, d.v ); + t3 = _mm256_unpackhi_ps( c.v, d.v ); + t4 = _mm256_unpacklo_ps( e.v, f.v ); + t5 = _mm256_unpackhi_ps( e.v, f.v ); + t6 = _mm256_unpacklo_ps( g.v, h.v ); + t7 = _mm256_unpackhi_ps( g.v, h.v ); + + u0 = _mm256_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u1 = _mm256_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + u2 = _mm256_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u3 = _mm256_shuffle_ps( t1, t3, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + u4 = _mm256_shuffle_ps( t4, t6, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u5 = _mm256_shuffle_ps( t4, t6, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + u6 = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u7 = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + + a.v = _mm256_permute2f128_ps( u0, u4, 0x20 ); + b.v = _mm256_permute2f128_ps( u1, u5, 0x20 ); + c.v = _mm256_permute2f128_ps( u2, u6, 0x20 ); + d.v = 
_mm256_permute2f128_ps( u3, u7, 0x20 ); + e.v = _mm256_permute2f128_ps( u0, u4, 0x31 ); + f.v = _mm256_permute2f128_ps( u1, u5, 0x31 ); + g.v = _mm256_permute2f128_ps( u2, u6, 0x31 ); + h.v = _mm256_permute2f128_ps( u3, u7, 0x31 ); + } + + inline void store_8x1_tr( const v8 &a, + void *a0, void *a1, void *a2, void *a3, + void *a4, void *a5, void *a6, void *a7 ) + { + ((int *)a0)[0] = a.i[0]; + ((int *)a1)[0] = a.i[1]; + ((int *)a2)[0] = a.i[2]; + ((int *)a3)[0] = a.i[3]; + ((int *)a4)[0] = a.i[4]; + ((int *)a5)[0] = a.i[5]; + ((int *)a6)[0] = a.i[6]; + ((int *)a7)[0] = a.i[7]; + } + + inline void store_8x2_tr( const v8 &a, const v8 &b, + void * ALIGNED(8) a0, void * ALIGNED(8) a1, + void * ALIGNED(8) a2, void * ALIGNED(8) a3, + void * ALIGNED(8) a4, void * ALIGNED(8) a5, + void * ALIGNED(8) a6, void * ALIGNED(8) a7 ) + { + __m256 u0, u1; + __m128 t0, t1, t2, t3; + + u0 = _mm256_unpacklo_ps( a.v, b.v ); + u1 = _mm256_unpackhi_ps( a.v, b.v ); + + t0 = _mm256_extractf128_ps( u0, 0 ); + t1 = _mm256_extractf128_ps( u1, 0 ); + t2 = _mm256_extractf128_ps( u0, 1 ); + t3 = _mm256_extractf128_ps( u1, 1 ); + + _mm_storel_pi( (__m64 *) a0, t0 ); + _mm_storeh_pi( (__m64 *) a1, t0 ); + + _mm_storel_pi( (__m64 *) a2, t1 ); + _mm_storeh_pi( (__m64 *) a3, t1 ); + + _mm_storel_pi( (__m64 *) a4, t2 ); + _mm_storeh_pi( (__m64 *) a5, t2 ); + + _mm_storel_pi( (__m64 *) a6, t3 ); + _mm_storeh_pi( (__m64 *) a7, t3 ); + } + + inline void store_8x3_tr( const v8 &a, const v8 &b, const v8 &c, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3, + void * ALIGNED(16) a4, void * ALIGNED(16) a5, + void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * ALIGNED(16))a0)[1] = b.i[0]; + ((int * ALIGNED(16))a0)[2] = c.i[0]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; + ((int * ALIGNED(16))a1)[2] = c.i[1]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; + ((int * ALIGNED(16))a2)[2] = c.i[2]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; + ((int * ALIGNED(16))a3)[2] = c.i[3]; + + ((int * ALIGNED(16))a4)[0] = a.i[4]; + ((int * ALIGNED(16))a4)[1] = b.i[4]; + ((int * ALIGNED(16))a4)[2] = c.i[4]; + + ((int * ALIGNED(16))a5)[0] = a.i[5]; + ((int * ALIGNED(16))a5)[1] = b.i[5]; + ((int * ALIGNED(16))a5)[2] = c.i[5]; + + ((int * ALIGNED(16))a6)[0] = a.i[6]; + ((int * ALIGNED(16))a6)[1] = b.i[6]; + ((int * ALIGNED(16))a6)[2] = c.i[6]; + + ((int * ALIGNED(16))a7)[0] = a.i[7]; + ((int * ALIGNED(16))a7)[1] = b.i[7]; + ((int * ALIGNED(16))a7)[2] = c.i[7]; + } + + inline void store_8x4_tr( const v8 &a, const v8 &b, const v8 &c, const v8 &d, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3, + void * ALIGNED(16) a4, void * ALIGNED(16) a5, + void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) + { + __m256 u0, u1, u2, u3; + __m256 t0, t1, t2, t3; + __m128 s0, s1, s2, s3, s4, s5, s6, s7; + + u0 = _mm256_unpacklo_ps( a.v, b.v ); + u1 = _mm256_unpacklo_ps( c.v, d.v ); + u2 = _mm256_unpackhi_ps( a.v, b.v ); + u3 = _mm256_unpackhi_ps( c.v, d.v ); + + t0 = _mm256_shuffle_ps( u0, u1, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + t1 = _mm256_shuffle_ps( u0, u1, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + t2 = _mm256_shuffle_ps( u2, u3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + t3 = _mm256_shuffle_ps( u2, u3, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + + s0 = _mm256_extractf128_ps( t0, 0 ); + s1 = _mm256_extractf128_ps( t1, 0 ); + s2 = _mm256_extractf128_ps( t2, 0 ); + s3 = 
_mm256_extractf128_ps( t3, 0 ); + + s4 = _mm256_extractf128_ps( t0, 1 ); + s5 = _mm256_extractf128_ps( t1, 1 ); + s6 = _mm256_extractf128_ps( t2, 1 ); + s7 = _mm256_extractf128_ps( t3, 1 ); + + _mm_store_ps( (float *) a0, s0 ); + _mm_store_ps( (float *) a1, s1 ); + _mm_store_ps( (float *) a2, s2 ); + _mm_store_ps( (float *) a3, s3 ); + _mm_store_ps( (float *) a4, s4 ); + _mm_store_ps( (float *) a5, s5 ); + _mm_store_ps( (float *) a6, s6 ); + _mm_store_ps( (float *) a7, s7 ); + } + + inline void store_8x8_tr( const v8 &a, const v8 &b, const v8 &c, const v8 &d, + const v8 &e, const v8 &f, const v8 &g, const v8 &h, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3, + void * ALIGNED(16) a4, void * ALIGNED(16) a5, + void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) + { + __m256 t0, t1, t2, t3, t4, t5, t6, t7; + + __m256 u0, u1, u2, u3, u4, u5, u6, u7; + + t0 = _mm256_unpacklo_ps( a.v, b.v ); + t1 = _mm256_unpackhi_ps( a.v, b.v ); + t2 = _mm256_unpacklo_ps( c.v, d.v ); + t3 = _mm256_unpackhi_ps( c.v, d.v ); + t4 = _mm256_unpacklo_ps( e.v, f.v ); + t5 = _mm256_unpackhi_ps( e.v, f.v ); + t6 = _mm256_unpacklo_ps( g.v, h.v ); + t7 = _mm256_unpackhi_ps( g.v, h.v ); + + u0 = _mm256_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u1 = _mm256_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + u2 = _mm256_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u3 = _mm256_shuffle_ps( t1, t3, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + u4 = _mm256_shuffle_ps( t4, t6, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u5 = _mm256_shuffle_ps( t4, t6, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + u6 = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u7 = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + + t0 = _mm256_permute2f128_ps( u0, u4, 0x20 ); + t1 = _mm256_permute2f128_ps( u1, u5, 0x20 ); + t2 = _mm256_permute2f128_ps( u2, u6, 0x20 ); + t3 = _mm256_permute2f128_ps( u3, u7, 0x20 ); + t4 = _mm256_permute2f128_ps( u0, u4, 0x31 ); + t5 = _mm256_permute2f128_ps( u1, u5, 0x31 ); + t6 = _mm256_permute2f128_ps( u2, u6, 0x31 ); + t7 = _mm256_permute2f128_ps( u3, u7, 0x31 ); + + _mm256_store_ps( (float *)a0, t0 ); + _mm256_store_ps( (float *)a1, t1 ); + _mm256_store_ps( (float *)a2, t2 ); + _mm256_store_ps( (float *)a3, t3 ); + _mm256_store_ps( (float *)a4, t4 ); + _mm256_store_ps( (float *)a5, t5 ); + _mm256_store_ps( (float *)a6, t6 ); + _mm256_store_ps( (float *)a7, t7 ); + } + + ////////////// + // v8int class + + class v8int : public v8 + { + // v8int prefix unary operator friends + + friend inline v8int operator +( const v8int & a ) ALWAYS_INLINE; + friend inline v8int operator -( const v8int & a ) ALWAYS_INLINE; + friend inline v8int operator ~( const v8int & a ) ALWAYS_INLINE; + friend inline v8int operator !( const v8int & a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v8int prefix increment / decrement operator friends + + friend inline v8int operator ++( v8int & a ) ALWAYS_INLINE; + friend inline v8int operator --( v8int & a ) ALWAYS_INLINE; + + // v8int postfix increment / decrement operator friends + + friend inline v8int operator ++( v8int & a, int ) ALWAYS_INLINE; + friend inline v8int operator --( v8int & a, int ) ALWAYS_INLINE; + + // v8int binary operator friends + + friend inline v8int operator +( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator -( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator *( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int 
operator /( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator %( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator ^( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator &( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator |( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator <<( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator >>( const v8int &a, const v8int &b ) ALWAYS_INLINE; + + // v8int logical operator friends + + friend inline v8int operator <( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator >( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator ==( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator !=( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator <=( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator >=( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator &&( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator ||( const v8int &a, const v8int &b ) ALWAYS_INLINE; + + // v8int miscellaneous friends + + friend inline v8int abs( const v8int &a ) ALWAYS_INLINE; + friend inline v8 czero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + friend inline v8 notczero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + // FIXME: cswap, notcswap! + friend inline v8 merge( const v8int &c, const v8 &t, const v8 &f ) ALWAYS_INLINE; + + // v8float unary operator friends + + friend inline v8int operator !( const v8float & a ) ALWAYS_INLINE; + + // v8float logical operator friends + + friend inline v8int operator <( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator >( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator ==( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator !=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator <=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator >=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator &&( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator ||( const v8float &a, const v8float &b ) ALWAYS_INLINE; + + // v8float miscellaneous friends + + friend inline v8float clear_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float set_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float toggle_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + + public: + + // v8int constructors / destructors + + v8int() {} // Default constructor + + v8int( const v8int &a ) // Copy constructor + { + v = a.v; + } + + v8int( const v8 &a ) // Init from mixed + { + v = a.v; + } + + v8int( int a ) // Init from scalar + { + union + { + int i; + float f; + } u; + u.i = a; + v = _mm256_set1_ps( u.f ); + } + + v8int( int i0, int i1, int i2, int i3, + int i4, int i5, int i6, int i7 ) // Init from scalars + { + union + { + int i; + float f; + } u0, u1, u2, u3, u4, u5, u6, u7; + + u0.i = i0; u1.i = i1; u2.i = i2; u3.i = i3; + u4.i = i4; u5.i = i5; u6.i = i6; u7.i = i7; + + v = _mm256_setr_ps( u0.f, u1.f, u2.f, u3.f, + u4.f, u5.f, u6.f, u7.f ); + } + + ~v8int() {} // Destructor + + // v8int assignment operators + +# define ASSIGN(op) \ + inline v8int 
&operator op( const v8int &b ) \ + { \ + i[0] op b.i[0]; \ + i[1] op b.i[1]; \ + i[2] op b.i[2]; \ + i[3] op b.i[3]; \ + i[4] op b.i[4]; \ + i[5] op b.i[5]; \ + i[6] op b.i[6]; \ + i[7] op b.i[7]; \ + return *this; \ + } + + inline v8int &operator =( const v8int &b ) + { + v = b.v; + return *this; + } + + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + ASSIGN(%=) + + inline v8int &operator ^=( const v8int &b ) + { + v = _mm256_xor_ps( v, b.v ); + return *this; + } + + inline v8int &operator &=( const v8int &b ) + { + v = _mm256_and_ps( v, b.v ); + return *this; + } + + inline v8int &operator |=( const v8int &b ) + { + v = _mm256_or_ps( v, b.v ); + return *this; + } + + ASSIGN(<<=) + ASSIGN(>>=) + +# undef ASSIGN + + // v8int member access operator + + inline int &operator []( int n ) + { + return i[n]; + } + + inline int operator ()( int n ) + { + return i[n]; + } + }; + + // v8int prefix unary operators + +# define PREFIX_UNARY(op) \ + inline v8int operator op( const v8int & a ) \ + { \ + v8int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + b.i[4] = ( op a.i[4] ); \ + b.i[5] = ( op a.i[5] ); \ + b.i[6] = ( op a.i[6] ); \ + b.i[7] = ( op a.i[7] ); \ + return b; \ + } + + inline v8int operator +( const v8int & a ) + { + v8int b; + + b.v = a.v; + + return b; + } + + PREFIX_UNARY(-) + + inline v8int operator !( const v8int & a ) + { + v8int b; + + b.i[0] = - ( !a.i[0] ); + b.i[1] = - ( !a.i[1] ); + b.i[2] = - ( !a.i[2] ); + b.i[3] = - ( !a.i[3] ); + b.i[4] = - ( !a.i[4] ); + b.i[5] = - ( !a.i[5] ); + b.i[6] = - ( !a.i[6] ); + b.i[7] = - ( !a.i[7] ); + + return b; + } + + inline v8int operator ~( const v8int & a ) + { + v8int b; + + union + { + int i; + float f; + } u; + + u.i = -1; + + b.v = _mm256_xor_ps( a.v, _mm256_set1_ps( u.f ) ); + + return b; + } + +# undef PREFIX_UNARY + + // v8int prefix increment / decrement + +# define PREFIX_INCDEC(op) \ + inline v8int operator op( v8int & a ) \ + { \ + v8int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + b.i[4] = ( op a.i[4] ); \ + b.i[5] = ( op a.i[5] ); \ + b.i[6] = ( op a.i[6] ); \ + b.i[7] = ( op a.i[7] ); \ + return b; \ + } + + PREFIX_INCDEC(++) + PREFIX_INCDEC(--) + +# undef PREFIX_INCDEC + + // v8int postfix increment / decrement + +# define POSTFIX_INCDEC(op) \ + inline v8int operator op( v8int & a, int ) \ + { \ + v8int b; \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ + b.i[4] = ( a.i[4] op ); \ + b.i[5] = ( a.i[5] op ); \ + b.i[6] = ( a.i[6] op ); \ + b.i[7] = ( a.i[7] op ); \ + return b; \ + } + + POSTFIX_INCDEC(++) + POSTFIX_INCDEC(--) + +# undef POSTFIX_INCDEC + + // v8int binary operators + +# define BINARY(op) \ + inline v8int operator op( const v8int &a, const v8int &b ) \ + { \ + v8int c; \ + c.i[0] = a.i[0] op b.i[0]; \ + c.i[1] = a.i[1] op b.i[1]; \ + c.i[2] = a.i[2] op b.i[2]; \ + c.i[3] = a.i[3] op b.i[3]; \ + c.i[4] = a.i[4] op b.i[4]; \ + c.i[5] = a.i[5] op b.i[5]; \ + c.i[6] = a.i[6] op b.i[6]; \ + c.i[7] = a.i[7] op b.i[7]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + BINARY(%) + + inline v8int operator ^( const v8int &a, const v8int &b ) + { + v8int c; + + c.v = _mm256_xor_ps( a.v, b.v ); + + return c; + } + + inline v8int operator &( const v8int &a, const v8int &b ) + { + v8int c; + + c.v = _mm256_and_ps( a.v, b.v ); + + return c; + } + + inline v8int operator |( const v8int &a, 
const v8int &b ) + { + v8int c; + + c.v = _mm256_or_ps( a.v, b.v ); + + return c; + } + + BINARY(<<) + BINARY(>>) + +# undef BINARY + + // v8int logical operators + +# define LOGICAL(op) \ + inline v8int operator op( const v8int &a, const v8int &b ) \ + { \ + v8int c; \ + c.i[0] = - ( a.i[0] op b.i[0] ); \ + c.i[1] = - ( a.i[1] op b.i[1] ); \ + c.i[2] = - ( a.i[2] op b.i[2] ); \ + c.i[3] = - ( a.i[3] op b.i[3] ); \ + c.i[4] = - ( a.i[4] op b.i[4] ); \ + c.i[5] = - ( a.i[5] op b.i[5] ); \ + c.i[6] = - ( a.i[6] op b.i[6] ); \ + c.i[7] = - ( a.i[7] op b.i[7] ); \ + return c; \ + } + + LOGICAL(<) + LOGICAL(>) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v8int miscellaneous functions + + inline v8int abs( const v8int &a ) + { + v8int b; + + b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; + b.i[1] = ( a.i[1] >= 0 ) ? a.i[1] : -a.i[1]; + b.i[2] = ( a.i[2] >= 0 ) ? a.i[2] : -a.i[2]; + b.i[3] = ( a.i[3] >= 0 ) ? a.i[3] : -a.i[3]; + b.i[4] = ( a.i[4] >= 0 ) ? a.i[4] : -a.i[4]; + b.i[5] = ( a.i[5] >= 0 ) ? a.i[5] : -a.i[5]; + b.i[6] = ( a.i[6] >= 0 ) ? a.i[6] : -a.i[6]; + b.i[7] = ( a.i[7] >= 0 ) ? a.i[7] : -a.i[7]; + + return b; + } + + inline v8 czero( const v8int &c, const v8 &a ) + { + v8 b; + + b.v = _mm256_andnot_ps( c.v, a.v ); + + return b; + } + + inline v8 notczero( const v8int &c, const v8 &a ) + { + v8 b; + + b.v = _mm256_and_ps( c.v, a.v ); + + return b; + } + + inline v8 merge( const v8int &c, const v8 &t, const v8 &f ) + { + __m256 c_v = c.v; + + v8 tf; + + tf.v = _mm256_or_ps( _mm256_andnot_ps( c_v, f.v ), + _mm256_and_ps( c_v, t.v ) ); + + return tf; + } + + //////////////// + // v8float class + + class v8float : public v8 + { + // v8float prefix unary operator friends + + friend inline v8float operator +( const v8float &a ) ALWAYS_INLINE; + friend inline v8float operator -( const v8float &a ) ALWAYS_INLINE; + friend inline v8float operator ~( const v8float &a ) ALWAYS_INLINE; + friend inline v8int operator !( const v8float &a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v8float prefix increment / decrement operator friends + + friend inline v8float operator ++( v8float &a ) ALWAYS_INLINE; + friend inline v8float operator --( v8float &a ) ALWAYS_INLINE; + + // v8float postfix increment / decrement operator friends + + friend inline v8float operator ++( v8float &a, int ) ALWAYS_INLINE; + friend inline v8float operator --( v8float &a, int ) ALWAYS_INLINE; + + // v8float binary operator friends + + friend inline v8float operator +( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8float operator -( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8float operator *( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8float operator /( const v8float &a, const v8float &b ) ALWAYS_INLINE; + + // v8float logical operator friends + + friend inline v8int operator <( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator >( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator ==( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator !=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator <=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator >=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator &&( const v8float &a, const v8float &b ) 
ALWAYS_INLINE; + friend inline v8int operator ||( const v8float &a, const v8float &b ) ALWAYS_INLINE; + + // v8float math library friends + +# define CMATH_FR1(fn) friend inline v8float fn( const v8float &a ) ALWAYS_INLINE +# define CMATH_FR2(fn) friend inline v8float fn( const v8float &a, \ + const v8float &b ) ALWAYS_INLINE + + CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); + CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); + CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); + CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); + CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); + + CMATH_FR2(copysign); + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v8float miscellaneous friends + + friend inline v8float rsqrt_approx( const v8float &a ) ALWAYS_INLINE; + friend inline v8float rsqrt ( const v8float &a ) ALWAYS_INLINE; + friend inline v8float rcp_approx( const v8float &a ) ALWAYS_INLINE; + friend inline v8float rcp ( const v8float &a ) ALWAYS_INLINE; + friend inline v8float fma ( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; + friend inline v8float fms ( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; + friend inline v8float fnms( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; + friend inline v8float clear_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float set_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float toggle_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline void increment_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; + friend inline void decrement_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; + friend inline void scale_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; + + public: + + // v8float constructors / destructors + + v8float() {} // Default constructor + + v8float( const v8float &a ) // Copy constructor + { + v = a.v; + } + + v8float( const v8 &a ) // Init from mixed + { + v = a.v; + } + + v8float( float a ) // Init from scalar + { + v = _mm256_set1_ps( a ); + } + + v8float( float f0, float f1, float f2, float f3, + float f4, float f5, float f6, float f7 ) // Init from scalars + { + v = _mm256_setr_ps( f0, f1, f2, f3, f4, f5, f6, f7 ); + } + + ~v8float() {} // Destructor + + // v8float assignment operators + +# define ASSIGN(op,intrin) \ + inline v8float &operator op( const v8float &b ) \ + { \ + v = intrin(v,b.v); \ + return *this; \ + } + + inline v8float &operator =( const v8float &b ) + { + v = b.v; + return *this; + } + + ASSIGN(+=,_mm256_add_ps) + ASSIGN(-=,_mm256_sub_ps) + ASSIGN(*=,_mm256_mul_ps) + ASSIGN(/=,_mm256_div_ps) + +# undef ASSIGN + + // v8float member access operator + + inline float &operator []( int n ) + { + return f[n]; + } + + inline float operator ()( int n ) + { + return f[n]; + } + }; + + // v8float prefix unary operators + + inline v8float operator +( const v8float &a ) + { + v8float b; + + b.v = a.v; + + return b; + } + + inline v8float operator -( const v8float &a ) + { + v8float b; + + b.v = _mm256_sub_ps( _mm256_setzero_ps(), a.v ); + + return b; + } + + inline v8int operator !( const v8float &a ) + { + v8int b; + + b.v = _mm256_cmp_ps( _mm256_setzero_ps(), a.v, _CMP_EQ_OS ); + + return b; + } + + // v8float prefix increment / decrement operators + + inline v8float operator ++( v8float &a ) + { + v8float b; + __m256 t = _mm256_add_ps( a.v, _mm256_set1_ps( 1 ) ); + + a.v = t; + b.v = 
t; + + return b; + } + + inline v8float operator --( v8float &a ) + { + v8float b; + __m256 t = _mm256_sub_ps( a.v, _mm256_set1_ps( 1 ) ); + + a.v = t; + b.v = t; + + return b; + } + + // v8float postfix increment / decrement operators + + inline v8float operator ++( v8float &a, int ) + { + v8float b; + __m256 a_v = a.v; + + a.v = _mm256_add_ps( a_v, _mm256_set1_ps( 1 ) ); + b.v = a_v; + + return b; + } + + inline v8float operator --( v8float &a, int ) + { + v8float b; + __m256 a_v = a.v; + + a.v = _mm256_sub_ps(a_v, _mm256_set1_ps( 1 ) ); + b.v = a_v; + + return b; + } + + // v8float binary operators + +# define BINARY(op,intrin) \ + inline v8float operator op( const v8float &a, const v8float &b ) \ + { \ + v8float c; \ + c.v = intrin( a.v, b.v ); \ + return c; \ + } + + BINARY( +, _mm256_add_ps ) + BINARY( -, _mm256_sub_ps ) + BINARY( *, _mm256_mul_ps ) + BINARY( /, _mm256_div_ps ) + +# undef BINARY + + // v8float logical operators + +# define LOGICAL(op,intrin,flag) \ + inline v8int operator op( const v8float &a, const v8float &b ) \ + { \ + v8int c; \ + c.v = intrin( a.v, b.v, flag ); \ + return c; \ + } + + LOGICAL( <, _mm256_cmp_ps, _CMP_LT_OS ) + LOGICAL( >, _mm256_cmp_ps, _CMP_GT_OS ) + LOGICAL( ==, _mm256_cmp_ps, _CMP_EQ_OS ) + LOGICAL( !=, _mm256_cmp_ps, _CMP_NEQ_OS ) + LOGICAL( <=, _mm256_cmp_ps, _CMP_LE_OS ) + LOGICAL( >=, _mm256_cmp_ps, _CMP_GE_OS ) + + inline v8int operator &&( const v8float &a, const v8float &b ) + { + v8int c; + __m256 vzero = _mm256_setzero_ps(); + + c.v = _mm256_and_ps( _mm256_cmp_ps( a.v, vzero, _CMP_NEQ_OS ), + _mm256_cmp_ps( b.v, vzero, _CMP_NEQ_OS ) ); + + return c; + } + + inline v8int operator ||( const v8float &a, const v8float &b ) + { + v8int c; + __m256 vzero = _mm256_setzero_ps(); + + c.v = _mm256_or_ps( _mm256_cmp_ps( a.v, vzero, _CMP_NEQ_OS ), + _mm256_cmp_ps( b.v, vzero, _CMP_NEQ_OS ) ); + + return c; + } + +# undef LOGICAL + + // v8float math library functions + +# define CMATH_FR1(fn) \ + inline v8float fn( const v8float &a ) \ + { \ + v8float b; \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ + b.f[4] = ::fn( a.f[4] ); \ + b.f[5] = ::fn( a.f[5] ); \ + b.f[6] = ::fn( a.f[6] ); \ + b.f[7] = ::fn( a.f[7] ); \ + return b; \ + } + +# define CMATH_FR2(fn) \ + inline v8float fn( const v8float &a, const v8float &b ) \ + { \ + v8float c; \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ + c.f[4] = ::fn( a.f[4], b.f[4] ); \ + c.f[5] = ::fn( a.f[5], b.f[5] ); \ + c.f[6] = ::fn( a.f[6], b.f[6] ); \ + c.f[7] = ::fn( a.f[7], b.f[7] ); \ + return c; \ + } + + CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) + CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) + /*CMATH_FR1(fabs)*/ CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) + CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) + /*CMATH_FR1(sqrt)*/ CMATH_FR1(tan) CMATH_FR1(tanh) + + inline v8float fabs( const v8float &a ) + { + v8float b; + + b.v = _mm256_andnot_ps( _mm256_set1_ps( -0.f ), a.v ); + + return b; + } + + inline v8float sqrt( const v8float &a ) + { + v8float b; + + b.v = _mm256_sqrt_ps( a.v ); + + return b; + } + + inline v8float copysign( const v8float &a, const v8float &b ) + { + v8float c; + __m256 t = _mm256_set1_ps( -0.f ); + + c.v = _mm256_or_ps( _mm256_and_ps( t, b.v ), + _mm256_andnot_ps( t, a.v ) ); + + return c; + } + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v8float 
miscellaneous functions + + inline v8float rsqrt_approx( const v8float &a ) + { + v8float b; + + b.v = _mm256_rsqrt_ps(a.v); + + return b; + } + + #if 0 + inline v8float rsqrt( const v8float &a ) + { + v8float b; + + b.f[0] = ::sqrt( 1.0f / a.f[0] ); + b.f[1] = ::sqrt( 1.0f / a.f[1] ); + b.f[2] = ::sqrt( 1.0f / a.f[2] ); + b.f[3] = ::sqrt( 1.0f / a.f[3] ); + b.f[4] = ::sqrt( 1.0f / a.f[4] ); + b.f[5] = ::sqrt( 1.0f / a.f[5] ); + b.f[6] = ::sqrt( 1.0f / a.f[6] ); + b.f[7] = ::sqrt( 1.0f / a.f[7] ); + + return b; + } + #endif + + inline v8float rsqrt( const v8float &a ) + { + v8float b; + __m256 a_v = a.v, b_v; + + b_v = _mm256_rsqrt_ps(a_v); + + // Note: It is quicker to just call div_ps and sqrt_ps if more + // refinement desired! + b.v = _mm256_add_ps( b_v, _mm256_mul_ps( _mm256_set1_ps( 0.5f ), + _mm256_sub_ps( b_v, + _mm256_mul_ps( a_v, + _mm256_mul_ps( b_v, + _mm256_mul_ps( b_v, b_v ) ) ) ) ) ); + + return b; + } + + #if 0 + inline v8float rsqrt( const v8float &a ) + { + v8float b; + + b.v = _mm256_div_ps( _mm256_set1_ps( 1.0f ), _mm256_sqrt_ps( a.v ) ); + + return b; + } + #endif + + #if 0 + inline v8float rsqrt( const v8float &a ) + { + v8float b; + + for( int j = 0; j < 8; j++ ) + b.f[j] = ::sqrt( 1.0f / a.f[j] ); + + return b; + } + #endif + + inline v8float rcp_approx( const v8float &a ) + { + v8float b; + + b.v = _mm256_rcp_ps( a.v ); + + return b; + } + + #if 0 + inline v8float rcp( const v8float &a ) + { + v8float b; + + b.f[0] = 1.0f / a.f[0]; + b.f[1] = 1.0f / a.f[1]; + b.f[2] = 1.0f / a.f[2]; + b.f[3] = 1.0f / a.f[3]; + b.f[4] = 1.0f / a.f[4]; + b.f[5] = 1.0f / a.f[5]; + b.f[6] = 1.0f / a.f[6]; + b.f[7] = 1.0f / a.f[7]; + + return b; + } + #endif + + inline v8float rcp( const v8float &a ) + { + v8float b; + __m256 a_v = a.v, b_v; + + b_v = _mm256_rcp_ps( a_v ); + b.v = _mm256_sub_ps( _mm256_add_ps( b_v, b_v ), + _mm256_mul_ps( a_v, _mm256_mul_ps( b_v, b_v ) ) ); + + return b; + } + + #if 0 + inline v8float rcp( const v8float &a ) + { + v8float b; + + b.v = _mm256_div_ps( _mm256_set1_ps( 1.0f ), a.v ); + + return b; + } + #endif + + inline v8float fma( const v8float &a, const v8float &b, const v8float &c ) + { + v8float d; + + d.v = _mm256_add_ps( _mm256_mul_ps( a.v, b.v ), c.v ); + + // d.v = _mm256_fmadd_ps( a.v, b.v, c.v ); + + return d; + } + + inline v8float fms( const v8float &a, const v8float &b, const v8float &c ) + { + v8float d; + + d.v = _mm256_sub_ps( _mm256_mul_ps( a.v, b.v ), c.v ); + + // d.v = _mm256_fmsub_ps( a.v, b.v, c.v ); + + return d; + } + + inline v8float fnms( const v8float &a, const v8float &b, const v8float &c ) + { + v8float d; + + d.v = _mm256_sub_ps( c.v, _mm256_mul_ps( a.v, b.v ) ); + + // d.v = _mm256_fnmadd_ps( a.v, b.v, c.v ); + + return d; + } + + inline v8float clear_bits( const v8int &m, const v8float &a ) + { + v8float b; + + b.v = _mm256_andnot_ps( m.v, a.v ); + + return b; + } + + inline v8float set_bits( const v8int &m, const v8float &a ) + { + v8float b; + + b.v = _mm256_or_ps( m.v, a.v ); + + return b; + } + + inline v8float toggle_bits( const v8int &m, const v8float &a ) + { + v8float b; + + b.v = _mm256_xor_ps( m.v, a.v ); + + return b; + } + + inline void increment_8x1( float * ALIGNED(16) p, const v8float &a ) + { + _mm256_store_ps( p, _mm256_add_ps( _mm256_load_ps( p ), a.v ) ); + } + + inline void decrement_8x1( float * ALIGNED(16) p, const v8float &a ) + { + _mm256_store_ps( p, _mm256_sub_ps( _mm256_load_ps( p ), a.v ) ); + } + + inline void scale_8x1( float * ALIGNED(16) p, const v8float &a ) + { + _mm256_store_ps( p, 
_mm256_mul_ps( _mm256_load_ps( p ), a.v ) ); + } + +} // namespace v8 + +#endif // _v8_avx_h_ diff --git a/src/util/v8/v8_avx2.h b/src/util/v8/v8_avx2.h new file mode 100644 index 00000000..08ba6843 --- /dev/null +++ b/src/util/v8/v8_avx2.h @@ -0,0 +1,1878 @@ +#ifndef _v8_avx2_h_ +#define _v8_avx2_h_ + +#ifndef IN_v8_h +#error "Do not include v8_avx2.h directly; use v8.h" +#endif + +#include +#include + +#define V8_ACCELERATION +#define V8_AVX2_ACCELERATION + +#ifndef ALIGNED +#define ALIGNED(n) +#endif + +#define ALWAYS_INLINE __attribute__((always_inline)) + +// Why does GNU not define this function? +// #ifdef __GNUC__ +#ifndef __INTEL_COMPILER +#define _mm256_set_m128(va, vb) \ + _mm256_insertf128_ps(_mm256_castps128_ps256(vb), va, 1) +#endif + +namespace v8 +{ + class v8; + class v8int; + class v8float; + + //////////////// + // v8 base class + + class v8 + { + friend class v8int; + friend class v8float; + + // v8 miscellaneous friends + + friend inline int any( const v8 &a ) ALWAYS_INLINE; + friend inline int all( const v8 &a ) ALWAYS_INLINE; + + template + friend inline v8 splat( const v8 &a ) ALWAYS_INLINE; + + template + friend inline v8 shuffle( const v8 &a ) ALWAYS_INLINE; + + friend inline void swap( v8 &a, v8 &b ) ALWAYS_INLINE; + friend inline void transpose( v8 &a0, v8 &a1, v8 &a2, v8 &a3, + v8 &a4, v8 &a5, v8 &a6, v8 &a7 ) ALWAYS_INLINE; + + // v8int miscellaneous friends + + friend inline v8 czero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + friend inline v8 notczero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + friend inline v8 merge( const v8int &c, const v8 &a, const v8 &b ) ALWAYS_INLINE; + + // v8 memory manipulation friends + + friend inline void load_8x1( const void * ALIGNED(16) p, v8 &a ) ALWAYS_INLINE; + friend inline void store_8x1( const v8 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void stream_8x1( const v8 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_8x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_8x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) ALWAYS_INLINE; + friend inline void swap_8x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + // v8 transposed memory manipulation friends + // Note: Half aligned values are permissible in the 8x2_tr variants. 
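+ // The _tr (transposed) loads and stores declared below convert between an
+ // array-of-structures layout in memory and a structure-of-arrays layout in
+ // registers: load_8xN_tr reads N consecutive values from each of the eight
+ // pointers a0..a7 and places the values read from pointer ak into lane k of
+ // the N result vectors; the store_8xN_tr variants scatter them back.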
+ + friend inline void load_8x1_tr( const void *a0, const void *a1, + const void *a2, const void *a3, + const void *a4, const void *a5, + const void *a6, const void *a7, + v8 &a ) ALWAYS_INLINE; + + friend inline void load_8x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + const void * ALIGNED(8) a4, + const void * ALIGNED(8) a5, + const void * ALIGNED(8) a6, + const void * ALIGNED(8) a7, + v8 &a, v8 &b ) ALWAYS_INLINE; + + friend inline void load_8x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c ) ALWAYS_INLINE; + + friend inline void load_8x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d ) ALWAYS_INLINE; + + friend inline void load_8x8_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d, + v8 &e, v8 &f, v8 &g, v8 &h ) ALWAYS_INLINE; + + friend inline void store_8x1_tr( const v8 &a, + void *a0, void *a1, void *a2, void *a3, + void *a4, void *a5, void *a6, void *a7 ) ALWAYS_INLINE; + + friend inline void store_8x2_tr( const v8 &a, const v8 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3, + void * ALIGNED(8) a4, + void * ALIGNED(8) a5, + void * ALIGNED(8) a6, + void * ALIGNED(8) a7 ) ALWAYS_INLINE; + + friend inline void store_8x3_tr( const v8 &a, const v8 &b, const v8 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3, + void * ALIGNED(16) a4, + void * ALIGNED(16) a5, + void * ALIGNED(16) a6, + void * ALIGNED(16) a7 ) ALWAYS_INLINE; + + friend inline void store_8x4_tr( const v8 &a, const v8 &b, + const v8 &c, const v8 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3, + void * ALIGNED(16) a4, + void * ALIGNED(16) a5, + void * ALIGNED(16) a6, + void * ALIGNED(16) a7 ) ALWAYS_INLINE; + + friend inline void store_8x8_tr( const v8 &a, const v8 &b, + const v8 &c, const v8 &d, + const v8 &e, const v8 &f, + const v8 &g, const v8 &h, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3, + void * ALIGNED(16) a4, + void * ALIGNED(16) a5, + void * ALIGNED(16) a6, + void * ALIGNED(16) a7 ) ALWAYS_INLINE; + + protected: + + union + { + int i[8]; + float f[8]; + __m256 v; + }; + + public: + + v8() {} // Default constructor + + v8( const v8 &a ) // Copy constructor + { + v = a.v; + } + + ~v8() {} // Default destructor + }; + + // v8 miscellaneous functions + + inline int any( const v8 &a ) + { + return a.i[0] || a.i[1] || a.i[2] || a.i[3] || + a.i[4] || a.i[5] || a.i[6] || a.i[7]; + } + + inline int all( const v8 &a ) + { + return a.i[0] && a.i[1] && a.i[2] && a.i[3] && + a.i[4] && a.i[5] && a.i[6] && a.i[7]; + } + + template + inline v8 splat( const v8 & a ) + { + v8 b; + + b.v = _mm256_set1_ps( a.v[n] ); + + return b; + } + + template + inline v8 shuffle( 
const v8 & a ) + { + v8 b; + + b.i[0] = a.i[i0]; + b.i[1] = a.i[i1]; + b.i[2] = a.i[i2]; + b.i[3] = a.i[i3]; + b.i[4] = a.i[i4]; + b.i[5] = a.i[i5]; + b.i[6] = a.i[i6]; + b.i[7] = a.i[i7]; + + return b; + } + + inline void swap( v8 &a, v8 &b ) + { + __m256 a_v = a.v; + + a.v = b.v; + + b.v = a_v; + } + + inline void transpose( v8 &a0, v8 &a1, v8 &a2, v8 &a3, + v8 &a4, v8 &a5, v8 &a6, v8 &a7 ) + { + __m256 t0, t1, t2, t3, t4, t5, t6, t7; + + __m256 u0, u1, u2, u3, u4, u5, u6, u7; + + t0 = _mm256_unpacklo_ps( a0.v, a1.v ); + t1 = _mm256_unpackhi_ps( a0.v, a1.v ); + t2 = _mm256_unpacklo_ps( a2.v, a3.v ); + t3 = _mm256_unpackhi_ps( a2.v, a3.v ); + t4 = _mm256_unpacklo_ps( a4.v, a5.v ); + t5 = _mm256_unpackhi_ps( a4.v, a5.v ); + t6 = _mm256_unpacklo_ps( a6.v, a7.v ); + t7 = _mm256_unpackhi_ps( a6.v, a7.v ); + + u0 = _mm256_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u1 = _mm256_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + u2 = _mm256_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u3 = _mm256_shuffle_ps( t1, t3, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + u4 = _mm256_shuffle_ps( t4, t6, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u5 = _mm256_shuffle_ps( t4, t6, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + u6 = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u7 = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + + a0.v = _mm256_permute2f128_ps( u0, u4, 0x20 ); + a1.v = _mm256_permute2f128_ps( u1, u5, 0x20 ); + a2.v = _mm256_permute2f128_ps( u2, u6, 0x20 ); + a3.v = _mm256_permute2f128_ps( u3, u7, 0x20 ); + a4.v = _mm256_permute2f128_ps( u0, u4, 0x31 ); + a5.v = _mm256_permute2f128_ps( u1, u5, 0x31 ); + a6.v = _mm256_permute2f128_ps( u2, u6, 0x31 ); + a7.v = _mm256_permute2f128_ps( u3, u7, 0x31 ); + } + + // v8 memory manipulation functions + + inline void load_8x1( const void * ALIGNED(16) p, + v8 &a ) + { + a.i[0] = ((const int * ALIGNED(16))p)[0]; + a.i[1] = ((const int * ALIGNED(16))p)[1]; + a.i[2] = ((const int * ALIGNED(16))p)[2]; + a.i[3] = ((const int * ALIGNED(16))p)[3]; + a.i[4] = ((const int * ALIGNED(16))p)[4]; + a.i[5] = ((const int * ALIGNED(16))p)[5]; + a.i[6] = ((const int * ALIGNED(16))p)[6]; + a.i[7] = ((const int * ALIGNED(16))p)[7]; + } + + inline void store_8x1( const v8 &a, + void * ALIGNED(16) p ) + { + ((int * ALIGNED(16))p)[0] = a.i[0]; + ((int * ALIGNED(16))p)[1] = a.i[1]; + ((int * ALIGNED(16))p)[2] = a.i[2]; + ((int * ALIGNED(16))p)[3] = a.i[3]; + ((int * ALIGNED(16))p)[4] = a.i[4]; + ((int * ALIGNED(16))p)[5] = a.i[5]; + ((int * ALIGNED(16))p)[6] = a.i[6]; + ((int * ALIGNED(16))p)[7] = a.i[7]; + } + + inline void stream_8x1( const v8 &a, + void * ALIGNED(16) p ) + { + ((int * ALIGNED(16))p)[0] = a.i[0]; + ((int * ALIGNED(16))p)[1] = a.i[1]; + ((int * ALIGNED(16))p)[2] = a.i[2]; + ((int * ALIGNED(16))p)[3] = a.i[3]; + ((int * ALIGNED(16))p)[4] = a.i[4]; + ((int * ALIGNED(16))p)[5] = a.i[5]; + ((int * ALIGNED(16))p)[6] = a.i[6]; + ((int * ALIGNED(16))p)[7] = a.i[7]; + } + + inline void clear_8x1( void * ALIGNED(16) p ) + { + ((int * ALIGNED(16))p)[0] = 0; + ((int * ALIGNED(16))p)[1] = 0; + ((int * ALIGNED(16))p)[2] = 0; + ((int * ALIGNED(16))p)[3] = 0; + ((int * ALIGNED(16))p)[4] = 0; + ((int * ALIGNED(16))p)[5] = 0; + ((int * ALIGNED(16))p)[6] = 0; + ((int * ALIGNED(16))p)[7] = 0; + } + + // FIXME: Ordering semantics + inline void copy_8x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) + { + ((int * ALIGNED(16))dst)[0] = ((const int * ALIGNED(16))src)[0]; + ((int * ALIGNED(16))dst)[1] = ((const int * ALIGNED(16))src)[1]; + ((int * ALIGNED(16))dst)[2] = ((const int 
* ALIGNED(16))src)[2]; + ((int * ALIGNED(16))dst)[3] = ((const int * ALIGNED(16))src)[3]; + ((int * ALIGNED(16))dst)[4] = ((const int * ALIGNED(16))src)[4]; + ((int * ALIGNED(16))dst)[5] = ((const int * ALIGNED(16))src)[5]; + ((int * ALIGNED(16))dst)[6] = ((const int * ALIGNED(16))src)[6]; + ((int * ALIGNED(16))dst)[7] = ((const int * ALIGNED(16))src)[7]; + } + + inline void swap_8x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) + { + int t; + + t = ((int * ALIGNED(16))a)[0]; + ((int * ALIGNED(16))a)[0] = ((int * ALIGNED(16))b)[0]; + ((int * ALIGNED(16))b)[0] = t; + + t = ((int * ALIGNED(16))a)[1]; + ((int * ALIGNED(16))a)[1] = ((int * ALIGNED(16))b)[1]; + ((int * ALIGNED(16))b)[1] = t; + + t = ((int * ALIGNED(16))a)[2]; + ((int * ALIGNED(16))a)[2] = ((int * ALIGNED(16))b)[2]; + ((int * ALIGNED(16))b)[2] = t; + + t = ((int * ALIGNED(16))a)[3]; + ((int * ALIGNED(16))a)[3] = ((int * ALIGNED(16))b)[3]; + ((int * ALIGNED(16))b)[3] = t; + + t = ((int * ALIGNED(16))a)[4]; + ((int * ALIGNED(16))a)[4] = ((int * ALIGNED(16))b)[4]; + ((int * ALIGNED(16))b)[4] = t; + + t = ((int * ALIGNED(16))a)[5]; + ((int * ALIGNED(16))a)[5] = ((int * ALIGNED(16))b)[5]; + ((int * ALIGNED(16))b)[5] = t; + + t = ((int * ALIGNED(16))a)[6]; + ((int * ALIGNED(16))a)[6] = ((int * ALIGNED(16))b)[6]; + ((int * ALIGNED(16))b)[6] = t; + + t = ((int * ALIGNED(16))a)[7]; + ((int * ALIGNED(16))a)[7] = ((int * ALIGNED(16))b)[7]; + ((int * ALIGNED(16))b)[7] = t; + } + + // v8 transposed memory manipulation functions + + inline void load_8x1_tr( const void *a0, const void *a1, + const void *a2, const void *a3, + const void *a4, const void *a5, + const void *a6, const void *a7, + v8 &a ) + { + a.i[0] = ((const int *)a0)[0]; + a.i[1] = ((const int *)a1)[0]; + a.i[2] = ((const int *)a2)[0]; + a.i[3] = ((const int *)a3)[0]; + a.i[4] = ((const int *)a4)[0]; + a.i[5] = ((const int *)a5)[0]; + a.i[6] = ((const int *)a6)[0]; + a.i[7] = ((const int *)a7)[0]; + } + + inline void load_8x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + const void * ALIGNED(8) a4, + const void * ALIGNED(8) a5, + const void * ALIGNED(8) a6, + const void * ALIGNED(8) a7, + v8 &a, v8 &b ) + { + __m128 zero; + __m128 t0, t1, t2, t3; + __m256 u0, u1; + + zero = _mm_setzero_ps(); + + t0 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64 *)a0 ), (__m64 *)a1 ); + t1 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64 *)a2 ), (__m64 *)a3 ); + t2 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64 *)a4 ), (__m64 *)a5 ); + t3 = _mm_loadh_pi( _mm_loadl_pi( zero, (__m64 *)a6 ), (__m64 *)a7 ); + + u0 = _mm256_set_m128( t2, t0 ); + u1 = _mm256_set_m128( t3, t1 ); + + a.v = _mm256_shuffle_ps( u0, u1, _MM_SHUFFLE( 2, 0, 2, 0 ) ); + b.v = _mm256_shuffle_ps( u0, u1, _MM_SHUFFLE( 3, 1, 3, 1 ) ); + } + + #if 0 + // This is an alternate AVX-2 implementation. 
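+ // In the disabled variant below, each of the eight pointers is read with a
+ // full 8-wide _mm256_load_ps (which assumes 32-byte alignment), and the two
+ // outputs are assembled with unpacklo / shuffle / permute2f128 rather than
+ // the 128-bit loadl_pi / loadh_pi gathers used by the enabled version above.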
+ inline void load_8x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + const void * ALIGNED(8) a4, + const void * ALIGNED(8) a5, + const void * ALIGNED(8) a6, + const void * ALIGNED(8) a7, + v8 &b0, v8 &b1 ) + { + __m256 t0, t1, t2, t3, t4, t5, t6, t7; + __m256 u0, u2, u4, u6; + + t0 = _mm256_load_ps( (const float *)a0 ); + t1 = _mm256_load_ps( (const float *)a1 ); + t2 = _mm256_load_ps( (const float *)a2 ); + t3 = _mm256_load_ps( (const float *)a3 ); + t4 = _mm256_load_ps( (const float *)a4 ); + t5 = _mm256_load_ps( (const float *)a5 ); + t6 = _mm256_load_ps( (const float *)a6 ); + t7 = _mm256_load_ps( (const float *)a7 ); + + u0 = _mm256_unpacklo_ps( t0, t1 ); + u2 = _mm256_unpacklo_ps( t2, t3 ); + u4 = _mm256_unpacklo_ps( t4, t5 ); + u6 = _mm256_unpacklo_ps( t6, t7 ); + + t0 = _mm256_shuffle_ps( u0, u2, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + t1 = _mm256_shuffle_ps( u0, u2, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + t4 = _mm256_shuffle_ps( u4, u6, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + t5 = _mm256_shuffle_ps( u4, u6, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + + b0.v = _mm256_permute2f128_ps( t0, t4, 0x20 ); + b1.v = _mm256_permute2f128_ps( t1, t5, 0x20 ); + } + #endif + + inline void load_8x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + + a.i[4] = ((const int * ALIGNED(16))a4)[0]; + b.i[4] = ((const int * ALIGNED(16))a4)[1]; + c.i[4] = ((const int * ALIGNED(16))a4)[2]; + + a.i[5] = ((const int * ALIGNED(16))a5)[0]; + b.i[5] = ((const int * ALIGNED(16))a5)[1]; + c.i[5] = ((const int * ALIGNED(16))a5)[2]; + + a.i[6] = ((const int * ALIGNED(16))a6)[0]; + b.i[6] = ((const int * ALIGNED(16))a6)[1]; + c.i[6] = ((const int * ALIGNED(16))a6)[2]; + + a.i[7] = ((const int * ALIGNED(16))a7)[0]; + b.i[7] = ((const int * ALIGNED(16))a7)[1]; + c.i[7] = ((const int * ALIGNED(16))a7)[2]; + } + + inline void load_8x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d ) + { + __m256 tmp0, tmp1, tmp2, tmp3; + + a.v = _mm256_set_m128( _mm_load_ps( (const float *)a4 ), + _mm_load_ps( (const float *)a0 ) ); + b.v = _mm256_set_m128( _mm_load_ps( (const float *)a5 ), + _mm_load_ps( (const float *)a1 ) ); + c.v = _mm256_set_m128( _mm_load_ps( (const float *)a6 ), + _mm_load_ps( (const float *)a2 ) ); + d.v = _mm256_set_m128( _mm_load_ps( (const float *)a7 ), + _mm_load_ps( (const float *)a3 ) ); + + tmp0 = _mm256_shuffle_ps( a.v, b.v, 0x44 ); + tmp2 = _mm256_shuffle_ps( a.v, b.v, 0xEE ); + tmp1 = _mm256_shuffle_ps( c.v, d.v, 0x44 ); + tmp3 = _mm256_shuffle_ps( c.v, d.v, 0xEE ); + + a.v = 
_mm256_shuffle_ps( tmp0, tmp1, 0x88 ); + b.v = _mm256_shuffle_ps( tmp0, tmp1, 0xDD ); + c.v = _mm256_shuffle_ps( tmp2, tmp3, 0x88 ); + d.v = _mm256_shuffle_ps( tmp2, tmp3, 0xDD ); + } + + // This is a cleaner reference AVX-2 implementation. + inline void load_8x8_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &b0, v8 &b1, v8 &b2, v8 &b3, + v8 &b4, v8 &b5, v8 &b6, v8 &b7 ) + { + __m256 t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = _mm256_load_ps( (const float *)a0 ); + t1 = _mm256_load_ps( (const float *)a1 ); + t2 = _mm256_load_ps( (const float *)a2 ); + t3 = _mm256_load_ps( (const float *)a3 ); + t4 = _mm256_load_ps( (const float *)a4 ); + t5 = _mm256_load_ps( (const float *)a5 ); + t6 = _mm256_load_ps( (const float *)a6 ); + t7 = _mm256_load_ps( (const float *)a7 ); + + b0.v = _mm256_unpacklo_ps( t0, t1 ); + b1.v = _mm256_unpackhi_ps( t0, t1 ); + b2.v = _mm256_unpacklo_ps( t2, t3 ); + b3.v = _mm256_unpackhi_ps( t2, t3 ); + b4.v = _mm256_unpacklo_ps( t4, t5 ); + b5.v = _mm256_unpackhi_ps( t4, t5 ); + b6.v = _mm256_unpacklo_ps( t6, t7 ); + b7.v = _mm256_unpackhi_ps( t6, t7 ); + + t0 = _mm256_shuffle_ps( b0.v, b2.v, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + t1 = _mm256_shuffle_ps( b0.v, b2.v, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + t2 = _mm256_shuffle_ps( b1.v, b3.v, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + t3 = _mm256_shuffle_ps( b1.v, b3.v, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + t4 = _mm256_shuffle_ps( b4.v, b6.v, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + t5 = _mm256_shuffle_ps( b4.v, b6.v, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + t6 = _mm256_shuffle_ps( b5.v, b7.v, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + t7 = _mm256_shuffle_ps( b5.v, b7.v, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + + b0.v = _mm256_permute2f128_ps( t0, t4, 0x20 ); + b1.v = _mm256_permute2f128_ps( t1, t5, 0x20 ); + b2.v = _mm256_permute2f128_ps( t2, t6, 0x20 ); + b3.v = _mm256_permute2f128_ps( t3, t7, 0x20 ); + b4.v = _mm256_permute2f128_ps( t0, t4, 0x31 ); + b5.v = _mm256_permute2f128_ps( t1, t5, 0x31 ); + b6.v = _mm256_permute2f128_ps( t2, t6, 0x31 ); + b7.v = _mm256_permute2f128_ps( t3, t7, 0x31 ); + } + + #if 0 + // This is the reference AVX-2 implementation. 
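+ // The disabled version below performs the same unpack / shuffle /
+ // permute2f128 transpose as the enabled function above, just loading
+ // directly into the output vectors and using a separate set of temporaries;
+ // it appears to be retained only for comparison.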
+ inline void load_8x8_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d, + v8 &e, v8 &f, v8 &g, v8 &h ) + { + __m256 t0, t1, t2, t3, t4, t5, t6, t7; + + __m256 u0, u1, u2, u3, u4, u5, u6, u7; + + a.v = _mm256_load_ps( (const float *)a0 ); + b.v = _mm256_load_ps( (const float *)a1 ); + c.v = _mm256_load_ps( (const float *)a2 ); + d.v = _mm256_load_ps( (const float *)a3 ); + e.v = _mm256_load_ps( (const float *)a4 ); + f.v = _mm256_load_ps( (const float *)a5 ); + g.v = _mm256_load_ps( (const float *)a6 ); + h.v = _mm256_load_ps( (const float *)a7 ); + + t0 = _mm256_unpacklo_ps( a.v, b.v ); + t1 = _mm256_unpackhi_ps( a.v, b.v ); + t2 = _mm256_unpacklo_ps( c.v, d.v ); + t3 = _mm256_unpackhi_ps( c.v, d.v ); + t4 = _mm256_unpacklo_ps( e.v, f.v ); + t5 = _mm256_unpackhi_ps( e.v, f.v ); + t6 = _mm256_unpacklo_ps( g.v, h.v ); + t7 = _mm256_unpackhi_ps( g.v, h.v ); + + u0 = _mm256_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u1 = _mm256_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + u2 = _mm256_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u3 = _mm256_shuffle_ps( t1, t3, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + u4 = _mm256_shuffle_ps( t4, t6, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u5 = _mm256_shuffle_ps( t4, t6, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + u6 = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u7 = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + + a.v = _mm256_permute2f128_ps( u0, u4, 0x20 ); + b.v = _mm256_permute2f128_ps( u1, u5, 0x20 ); + c.v = _mm256_permute2f128_ps( u2, u6, 0x20 ); + d.v = _mm256_permute2f128_ps( u3, u7, 0x20 ); + e.v = _mm256_permute2f128_ps( u0, u4, 0x31 ); + f.v = _mm256_permute2f128_ps( u1, u5, 0x31 ); + g.v = _mm256_permute2f128_ps( u2, u6, 0x31 ); + h.v = _mm256_permute2f128_ps( u3, u7, 0x31 ); + } + #endif + + #if 0 + // Replace _mm256_load_ps with _mm256_insertf128_ps. 
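+ // The variant below builds each 256-bit register from two 128-bit halves
+ // (_mm_load_ps + _mm256_insertf128_ps), pairing row k with row k+4 up front
+ // so that the final cross-lane _mm256_permute2f128_ps pass becomes
+ // unnecessary.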
+ inline void load_8x8_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d, + v8 &e, v8 &f, v8 &g, v8 &h ) + { + __m256 t0, t1, t2, t3, t4, t5, t6, t7; + + a.v = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm_load_ps( (const float *)a0 ) ), _mm_load_ps( (const float *)a4 ), 1 ); + b.v = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm_load_ps( (const float *)a1 ) ), _mm_load_ps( (const float *)a5 ), 1 ); + c.v = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm_load_ps( (const float *)a2 ) ), _mm_load_ps( (const float *)a6 ), 1 ); + d.v = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm_load_ps( (const float *)a3 ) ), _mm_load_ps( (const float *)a7 ), 1 ); + e.v = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm_load_ps( (const float *)a0 + 4 ) ), _mm_load_ps( (const float *)a4 + 4 ), 1 ); + f.v = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm_load_ps( (const float *)a1 + 4 ) ), _mm_load_ps( (const float *)a5 + 4 ), 1 ); + g.v = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm_load_ps( (const float *)a2 + 4 ) ), _mm_load_ps( (const float *)a6 + 4 ), 1 ); + h.v = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm_load_ps( (const float *)a3 + 4 ) ), _mm_load_ps( (const float *)a7 + 4 ), 1 ); + + t0 = _mm256_unpacklo_ps( a.v, b.v ); + t1 = _mm256_unpackhi_ps( a.v, b.v ); + t2 = _mm256_unpacklo_ps( c.v, d.v ); + t3 = _mm256_unpackhi_ps( c.v, d.v ); + t4 = _mm256_unpacklo_ps( e.v, f.v ); + t5 = _mm256_unpackhi_ps( e.v, f.v ); + t6 = _mm256_unpacklo_ps( g.v, h.v ); + t7 = _mm256_unpackhi_ps( g.v, h.v ); + + a.v = _mm256_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + b.v = _mm256_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + c.v = _mm256_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + d.v = _mm256_shuffle_ps( t1, t3, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + e.v = _mm256_shuffle_ps( t4, t6, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + f.v = _mm256_shuffle_ps( t4, t6, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + g.v = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + h.v = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + } + #endif + + #if 0 + // Replace _mm256_load_ps with _mm256_insertf128_ps. Replace two calls to + // _mm256_shuffle_ps with one call to _mm256_shuffle_ps and two calls to + // _mm256_blend_ps. 
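+ // Here _mm256_shuffle_ps with immediate 0x4E picks the upper 64-bit half of
+ // its first operand and the lower half of its second within each 128-bit
+ // lane; the 0xCC / 0x33 blends then merge that with the originals, giving
+ // the same pair of rows as the two _mm256_shuffle_ps calls per pair in the
+ // enabled version, with blends that may issue on more execution ports.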
+ inline void load_8x8_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d, + v8 &e, v8 &f, v8 &g, v8 &h ) + { + __m256 t0, t1, t2, t3, t4, t5, t6, t7; + + __m256 v0; + + a.v = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm_load_ps( (const float *)a0 ) ), _mm_load_ps( (const float *)a4 ), 1 ); + b.v = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm_load_ps( (const float *)a1 ) ), _mm_load_ps( (const float *)a5 ), 1 ); + c.v = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm_load_ps( (const float *)a2 ) ), _mm_load_ps( (const float *)a6 ), 1 ); + d.v = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm_load_ps( (const float *)a3 ) ), _mm_load_ps( (const float *)a7 ), 1 ); + e.v = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm_load_ps( (const float *)a0 + 4 ) ), _mm_load_ps( (const float *)a4 + 4 ), 1 ); + f.v = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm_load_ps( (const float *)a1 + 4 ) ), _mm_load_ps( (const float *)a5 + 4 ), 1 ); + g.v = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm_load_ps( (const float *)a2 + 4 ) ), _mm_load_ps( (const float *)a6 + 4 ), 1 ); + h.v = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm_load_ps( (const float *)a3 + 4 ) ), _mm_load_ps( (const float *)a7 + 4 ), 1 ); + + t0 = _mm256_unpacklo_ps( a.v, b.v ); + t1 = _mm256_unpackhi_ps( a.v, b.v ); + t2 = _mm256_unpacklo_ps( c.v, d.v ); + t3 = _mm256_unpackhi_ps( c.v, d.v ); + t4 = _mm256_unpacklo_ps( e.v, f.v ); + t5 = _mm256_unpackhi_ps( e.v, f.v ); + t6 = _mm256_unpacklo_ps( g.v, h.v ); + t7 = _mm256_unpackhi_ps( g.v, h.v ); + + v0 = _mm256_shuffle_ps( t0, t2, 0x4E ); + + a.v = _mm256_blend_ps( t0, v0, 0xCC ); + b.v = _mm256_blend_ps( t2, v0, 0x33 ); + + v0 = _mm256_shuffle_ps( t1, t3, 0x4E ); + + c.v = _mm256_blend_ps( t1, v0, 0xCC ); + d.v = _mm256_blend_ps( t3, v0, 0x33 ); + + v0 = _mm256_shuffle_ps( t4, t6, 0x4E ); + + e.v = _mm256_blend_ps( t4, v0, 0xCC ); + f.v = _mm256_blend_ps( t6, v0, 0x33 ); + + v0 = _mm256_shuffle_ps( t5, t7, 0x4E ); + + g.v = _mm256_blend_ps( t5, v0, 0xCC ); + h.v = _mm256_blend_ps( t7, v0, 0x33 ); + } + #endif + + inline void store_8x1_tr( const v8 &a, + void *a0, void *a1, void *a2, void *a3, + void *a4, void *a5, void *a6, void *a7 ) + { + ((int *)a0)[0] = a.i[0]; + ((int *)a1)[0] = a.i[1]; + ((int *)a2)[0] = a.i[2]; + ((int *)a3)[0] = a.i[3]; + ((int *)a4)[0] = a.i[4]; + ((int *)a5)[0] = a.i[5]; + ((int *)a6)[0] = a.i[6]; + ((int *)a7)[0] = a.i[7]; + } + + inline void store_8x2_tr( const v8 &a, const v8 &b, + void * ALIGNED(8) a0, void * ALIGNED(8) a1, + void * ALIGNED(8) a2, void * ALIGNED(8) a3, + void * ALIGNED(8) a4, void * ALIGNED(8) a5, + void * ALIGNED(8) a6, void * ALIGNED(8) a7 ) + { + __m256 u0, u1; + __m128 t0, t1, t2, t3; + + u0 = _mm256_unpacklo_ps( a.v, b.v ); + u1 = _mm256_unpackhi_ps( a.v, b.v ); + + t0 = _mm256_extractf128_ps( u0, 0 ); + t1 = _mm256_extractf128_ps( u1, 0 ); + t2 = _mm256_extractf128_ps( u0, 1 ); + t3 = _mm256_extractf128_ps( u1, 1 ); + + _mm_storel_pi( (__m64 *) a0, t0 ); + _mm_storeh_pi( (__m64 *) a1, t0 ); + + _mm_storel_pi( (__m64 *) a2, t1 ); + _mm_storeh_pi( (__m64 *) a3, t1 ); + + _mm_storel_pi( (__m64 *) a4, t2 ); + _mm_storeh_pi( (__m64 *) a5, t2 ); + + _mm_storel_pi( (__m64 *) a6, t3 ); + _mm_storeh_pi( (__m64 *) a7, t3 ); + } + + inline void store_8x3_tr( const v8 &a, const v8 &b, const v8 &c, + 
void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3, + void * ALIGNED(16) a4, void * ALIGNED(16) a5, + void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * ALIGNED(16))a0)[1] = b.i[0]; + ((int * ALIGNED(16))a0)[2] = c.i[0]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; + ((int * ALIGNED(16))a1)[2] = c.i[1]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; + ((int * ALIGNED(16))a2)[2] = c.i[2]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; + ((int * ALIGNED(16))a3)[2] = c.i[3]; + + ((int * ALIGNED(16))a4)[0] = a.i[4]; + ((int * ALIGNED(16))a4)[1] = b.i[4]; + ((int * ALIGNED(16))a4)[2] = c.i[4]; + + ((int * ALIGNED(16))a5)[0] = a.i[5]; + ((int * ALIGNED(16))a5)[1] = b.i[5]; + ((int * ALIGNED(16))a5)[2] = c.i[5]; + + ((int * ALIGNED(16))a6)[0] = a.i[6]; + ((int * ALIGNED(16))a6)[1] = b.i[6]; + ((int * ALIGNED(16))a6)[2] = c.i[6]; + + ((int * ALIGNED(16))a7)[0] = a.i[7]; + ((int * ALIGNED(16))a7)[1] = b.i[7]; + ((int * ALIGNED(16))a7)[2] = c.i[7]; + } + + inline void store_8x4_tr( const v8 &a, const v8 &b, const v8 &c, const v8 &d, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3, + void * ALIGNED(16) a4, void * ALIGNED(16) a5, + void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) + { + __m256 u0, u1, u2, u3; + __m256 t0, t1, t2, t3; + __m128 s0, s1, s2, s3, s4, s5, s6, s7; + + u0 = _mm256_unpacklo_ps( a.v, b.v ); + u1 = _mm256_unpacklo_ps( c.v, d.v ); + u2 = _mm256_unpackhi_ps( a.v, b.v ); + u3 = _mm256_unpackhi_ps( c.v, d.v ); + + t0 = _mm256_shuffle_ps( u0, u1, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + t1 = _mm256_shuffle_ps( u0, u1, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + t2 = _mm256_shuffle_ps( u2, u3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + t3 = _mm256_shuffle_ps( u2, u3, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + + s0 = _mm256_extractf128_ps( t0, 0 ); + s1 = _mm256_extractf128_ps( t1, 0 ); + s2 = _mm256_extractf128_ps( t2, 0 ); + s3 = _mm256_extractf128_ps( t3, 0 ); + + s4 = _mm256_extractf128_ps( t0, 1 ); + s5 = _mm256_extractf128_ps( t1, 1 ); + s6 = _mm256_extractf128_ps( t2, 1 ); + s7 = _mm256_extractf128_ps( t3, 1 ); + + _mm_store_ps( (float *) a0, s0 ); + _mm_store_ps( (float *) a1, s1 ); + _mm_store_ps( (float *) a2, s2 ); + _mm_store_ps( (float *) a3, s3 ); + _mm_store_ps( (float *) a4, s4 ); + _mm_store_ps( (float *) a5, s5 ); + _mm_store_ps( (float *) a6, s6 ); + _mm_store_ps( (float *) a7, s7 ); + } + + inline void store_8x8_tr( const v8 &a, const v8 &b, const v8 &c, const v8 &d, + const v8 &e, const v8 &f, const v8 &g, const v8 &h, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3, + void * ALIGNED(16) a4, void * ALIGNED(16) a5, + void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) + { + __m256 t0, t1, t2, t3, t4, t5, t6, t7; + + __m256 u0, u1, u2, u3, u4, u5, u6, u7; + + t0 = _mm256_unpacklo_ps( a.v, b.v ); + t1 = _mm256_unpackhi_ps( a.v, b.v ); + t2 = _mm256_unpacklo_ps( c.v, d.v ); + t3 = _mm256_unpackhi_ps( c.v, d.v ); + t4 = _mm256_unpacklo_ps( e.v, f.v ); + t5 = _mm256_unpackhi_ps( e.v, f.v ); + t6 = _mm256_unpacklo_ps( g.v, h.v ); + t7 = _mm256_unpackhi_ps( g.v, h.v ); + + u0 = _mm256_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u1 = _mm256_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + u2 = _mm256_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u3 = _mm256_shuffle_ps( t1, t3, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + u4 = _mm256_shuffle_ps( 
t4, t6, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u5 = _mm256_shuffle_ps( t4, t6, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + u6 = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + u7 = _mm256_shuffle_ps( t5, t7, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + + t0 = _mm256_permute2f128_ps( u0, u4, 0x20 ); + t1 = _mm256_permute2f128_ps( u1, u5, 0x20 ); + t2 = _mm256_permute2f128_ps( u2, u6, 0x20 ); + t3 = _mm256_permute2f128_ps( u3, u7, 0x20 ); + t4 = _mm256_permute2f128_ps( u0, u4, 0x31 ); + t5 = _mm256_permute2f128_ps( u1, u5, 0x31 ); + t6 = _mm256_permute2f128_ps( u2, u6, 0x31 ); + t7 = _mm256_permute2f128_ps( u3, u7, 0x31 ); + + _mm256_store_ps( (float *)a0, t0 ); + _mm256_store_ps( (float *)a1, t1 ); + _mm256_store_ps( (float *)a2, t2 ); + _mm256_store_ps( (float *)a3, t3 ); + _mm256_store_ps( (float *)a4, t4 ); + _mm256_store_ps( (float *)a5, t5 ); + _mm256_store_ps( (float *)a6, t6 ); + _mm256_store_ps( (float *)a7, t7 ); + } + + ////////////// + // v8int class + + class v8int : public v8 + { + // v8int prefix unary operator friends + + friend inline v8int operator +( const v8int & a ) ALWAYS_INLINE; + friend inline v8int operator -( const v8int & a ) ALWAYS_INLINE; + friend inline v8int operator ~( const v8int & a ) ALWAYS_INLINE; + friend inline v8int operator !( const v8int & a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v8int prefix increment / decrement operator friends + + friend inline v8int operator ++( v8int & a ) ALWAYS_INLINE; + friend inline v8int operator --( v8int & a ) ALWAYS_INLINE; + + // v8int postfix increment / decrement operator friends + + friend inline v8int operator ++( v8int & a, int ) ALWAYS_INLINE; + friend inline v8int operator --( v8int & a, int ) ALWAYS_INLINE; + + // v8int binary operator friends + + friend inline v8int operator +( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator -( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator *( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator /( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator %( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator ^( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator &( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator |( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator <<( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator >>( const v8int &a, const v8int &b ) ALWAYS_INLINE; + + // v8int logical operator friends + + friend inline v8int operator <( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator >( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator ==( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator !=( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator <=( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator >=( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator &&( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator ||( const v8int &a, const v8int &b ) ALWAYS_INLINE; + + // v8int miscellaneous friends + + friend inline v8int abs( const v8int &a ) ALWAYS_INLINE; + friend inline v8 czero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + friend inline v8 notczero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + 
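+ // czero( c, a ) returns a with the lanes selected by mask c forced to zero,
+ // notczero( c, a ) keeps only the lanes selected by c, and merge( c, t, f )
+ // picks t in the lanes where c is set and f elsewhere (the masks are the
+ // all-ones / all-zeros lanes produced by the v8int comparison operators).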
// FIXME: cswap, notcswap! + friend inline v8 merge( const v8int &c, const v8 &t, const v8 &f ) ALWAYS_INLINE; + + // v8float unary operator friends + + friend inline v8int operator !( const v8float & a ) ALWAYS_INLINE; + + // v8float logical operator friends + + friend inline v8int operator <( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator >( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator ==( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator !=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator <=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator >=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator &&( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator ||( const v8float &a, const v8float &b ) ALWAYS_INLINE; + + // v8float miscellaneous friends + + friend inline v8float clear_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float set_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float toggle_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + + public: + + // v8int constructors / destructors + + v8int() {} // Default constructor + + v8int( const v8int &a ) // Copy constructor + { + v = a.v; + } + + v8int( const v8 &a ) // Init from mixed + { + v = a.v; + } + + v8int( int a ) // Init from scalar + { + union + { + int i; + float f; + } u; + u.i = a; + v = _mm256_set1_ps( u.f ); + } + + v8int( int i0, int i1, int i2, int i3, + int i4, int i5, int i6, int i7 ) // Init from scalars + { + union + { + int i; + float f; + } u0, u1, u2, u3, u4, u5, u6, u7; + + u0.i = i0; u1.i = i1; u2.i = i2; u3.i = i3; + u4.i = i4; u5.i = i5; u6.i = i6; u7.i = i7; + + v = _mm256_setr_ps( u0.f, u1.f, u2.f, u3.f, + u4.f, u5.f, u6.f, u7.f ); + } + + ~v8int() {} // Destructor + + // v8int assignment operators + +# define ASSIGN(op) \ + inline v8int &operator op( const v8int &b ) \ + { \ + i[0] op b.i[0]; \ + i[1] op b.i[1]; \ + i[2] op b.i[2]; \ + i[3] op b.i[3]; \ + i[4] op b.i[4]; \ + i[5] op b.i[5]; \ + i[6] op b.i[6]; \ + i[7] op b.i[7]; \ + return *this; \ + } + + inline v8int &operator =( const v8int &b ) + { + v = b.v; + return *this; + } + + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + ASSIGN(%=) + + inline v8int &operator ^=( const v8int &b ) + { + v = _mm256_xor_ps( v, b.v ); + return *this; + } + + inline v8int &operator &=( const v8int &b ) + { + v = _mm256_and_ps( v, b.v ); + return *this; + } + + inline v8int &operator |=( const v8int &b ) + { + v = _mm256_or_ps( v, b.v ); + return *this; + } + + ASSIGN(<<=) + ASSIGN(>>=) + +# undef ASSIGN + + // v8int member access operator + + inline int &operator []( int n ) + { + return i[n]; + } + + inline int operator ()( int n ) + { + return i[n]; + } + }; + + // v8int prefix unary operators + +# define PREFIX_UNARY(op) \ + inline v8int operator op( const v8int & a ) \ + { \ + v8int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + b.i[4] = ( op a.i[4] ); \ + b.i[5] = ( op a.i[5] ); \ + b.i[6] = ( op a.i[6] ); \ + b.i[7] = ( op a.i[7] ); \ + return b; \ + } + + inline v8int operator +( const v8int & a ) + { + v8int b; + + b.v = a.v; + + return b; + } + + PREFIX_UNARY(-) + + inline v8int operator !( const v8int & a ) + { + v8int b; + + b.i[0] = - ( !a.i[0] ); + b.i[1] = - ( !a.i[1] ); + b.i[2] = - ( 
!a.i[2] ); + b.i[3] = - ( !a.i[3] ); + b.i[4] = - ( !a.i[4] ); + b.i[5] = - ( !a.i[5] ); + b.i[6] = - ( !a.i[6] ); + b.i[7] = - ( !a.i[7] ); + + return b; + } + + inline v8int operator ~( const v8int & a ) + { + v8int b; + + union + { + int i; + float f; + } u; + + u.i = -1; + + b.v = _mm256_xor_ps( a.v, _mm256_set1_ps( u.f ) ); + + return b; + } + +# undef PREFIX_UNARY + + // v8int prefix increment / decrement + +# define PREFIX_INCDEC(op) \ + inline v8int operator op( v8int & a ) \ + { \ + v8int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + b.i[4] = ( op a.i[4] ); \ + b.i[5] = ( op a.i[5] ); \ + b.i[6] = ( op a.i[6] ); \ + b.i[7] = ( op a.i[7] ); \ + return b; \ + } + + PREFIX_INCDEC(++) + PREFIX_INCDEC(--) + +# undef PREFIX_INCDEC + + // v8int postfix increment / decrement + +# define POSTFIX_INCDEC(op) \ + inline v8int operator op( v8int & a, int ) \ + { \ + v8int b; \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ + b.i[4] = ( a.i[4] op ); \ + b.i[5] = ( a.i[5] op ); \ + b.i[6] = ( a.i[6] op ); \ + b.i[7] = ( a.i[7] op ); \ + return b; \ + } + + POSTFIX_INCDEC(++) + POSTFIX_INCDEC(--) + +# undef POSTFIX_INCDEC + + // v8int binary operators + +# define BINARY(op) \ + inline v8int operator op( const v8int &a, const v8int &b ) \ + { \ + v8int c; \ + c.i[0] = a.i[0] op b.i[0]; \ + c.i[1] = a.i[1] op b.i[1]; \ + c.i[2] = a.i[2] op b.i[2]; \ + c.i[3] = a.i[3] op b.i[3]; \ + c.i[4] = a.i[4] op b.i[4]; \ + c.i[5] = a.i[5] op b.i[5]; \ + c.i[6] = a.i[6] op b.i[6]; \ + c.i[7] = a.i[7] op b.i[7]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + BINARY(%) + + inline v8int operator ^( const v8int &a, const v8int &b ) + { + v8int c; + + c.v = _mm256_xor_ps( a.v, b.v ); + + return c; + } + + inline v8int operator &( const v8int &a, const v8int &b ) + { + v8int c; + + c.v = _mm256_and_ps( a.v, b.v ); + + return c; + } + + inline v8int operator |( const v8int &a, const v8int &b ) + { + v8int c; + + c.v = _mm256_or_ps( a.v, b.v ); + + return c; + } + + BINARY(<<) + BINARY(>>) + +# undef BINARY + + // v8int logical operators + +# define LOGICAL(op) \ + inline v8int operator op( const v8int &a, const v8int &b ) \ + { \ + v8int c; \ + c.i[0] = - ( a.i[0] op b.i[0] ); \ + c.i[1] = - ( a.i[1] op b.i[1] ); \ + c.i[2] = - ( a.i[2] op b.i[2] ); \ + c.i[3] = - ( a.i[3] op b.i[3] ); \ + c.i[4] = - ( a.i[4] op b.i[4] ); \ + c.i[5] = - ( a.i[5] op b.i[5] ); \ + c.i[6] = - ( a.i[6] op b.i[6] ); \ + c.i[7] = - ( a.i[7] op b.i[7] ); \ + return c; \ + } + + LOGICAL(<) + LOGICAL(>) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v8int miscellaneous functions + + inline v8int abs( const v8int &a ) + { + v8int b; + + b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; + b.i[1] = ( a.i[1] >= 0 ) ? a.i[1] : -a.i[1]; + b.i[2] = ( a.i[2] >= 0 ) ? a.i[2] : -a.i[2]; + b.i[3] = ( a.i[3] >= 0 ) ? a.i[3] : -a.i[3]; + b.i[4] = ( a.i[4] >= 0 ) ? a.i[4] : -a.i[4]; + b.i[5] = ( a.i[5] >= 0 ) ? a.i[5] : -a.i[5]; + b.i[6] = ( a.i[6] >= 0 ) ? a.i[6] : -a.i[6]; + b.i[7] = ( a.i[7] >= 0 ) ? 
a.i[7] : -a.i[7]; + + return b; + } + + inline v8 czero( const v8int &c, const v8 &a ) + { + v8 b; + + b.v = _mm256_andnot_ps( c.v, a.v ); + + return b; + } + + inline v8 notczero( const v8int &c, const v8 &a ) + { + v8 b; + + b.v = _mm256_and_ps( c.v, a.v ); + + return b; + } + + inline v8 merge( const v8int &c, const v8 &t, const v8 &f ) + { + __m256 c_v = c.v; + + v8 tf; + + tf.v = _mm256_or_ps( _mm256_andnot_ps( c_v, f.v ), + _mm256_and_ps( c_v, t.v ) ); + + return tf; + } + + //////////////// + // v8float class + + class v8float : public v8 + { + // v8float prefix unary operator friends + + friend inline v8float operator +( const v8float &a ) ALWAYS_INLINE; + friend inline v8float operator -( const v8float &a ) ALWAYS_INLINE; + friend inline v8float operator ~( const v8float &a ) ALWAYS_INLINE; + friend inline v8int operator !( const v8float &a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v8float prefix increment / decrement operator friends + + friend inline v8float operator ++( v8float &a ) ALWAYS_INLINE; + friend inline v8float operator --( v8float &a ) ALWAYS_INLINE; + + // v8float postfix increment / decrement operator friends + + friend inline v8float operator ++( v8float &a, int ) ALWAYS_INLINE; + friend inline v8float operator --( v8float &a, int ) ALWAYS_INLINE; + + // v8float binary operator friends + + friend inline v8float operator +( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8float operator -( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8float operator *( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8float operator /( const v8float &a, const v8float &b ) ALWAYS_INLINE; + + // v8float logical operator friends + + friend inline v8int operator <( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator >( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator ==( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator !=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator <=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator >=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator &&( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator ||( const v8float &a, const v8float &b ) ALWAYS_INLINE; + + // v8float math library friends + +# define CMATH_FR1(fn) friend inline v8float fn( const v8float &a ) ALWAYS_INLINE +# define CMATH_FR2(fn) friend inline v8float fn( const v8float &a, \ + const v8float &b ) ALWAYS_INLINE + + CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); + CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); + CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); + CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); + CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); + + CMATH_FR2(copysign); + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v8float miscellaneous friends + + friend inline v8float rsqrt_approx( const v8float &a ) ALWAYS_INLINE; + friend inline v8float rsqrt ( const v8float &a ) ALWAYS_INLINE; + friend inline v8float rcp_approx( const v8float &a ) ALWAYS_INLINE; + friend inline v8float rcp ( const v8float &a ) ALWAYS_INLINE; + friend inline v8float fma ( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; + friend inline v8float fms ( const 
v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; + friend inline v8float fnms( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; + friend inline v8float clear_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float set_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float toggle_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline void increment_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; + friend inline void decrement_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; + friend inline void scale_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; + + public: + + // v8float constructors / destructors + + v8float() {} // Default constructor + + v8float( const v8float &a ) // Copy constructor + { + v = a.v; + } + + v8float( const v8 &a ) // Init from mixed + { + v = a.v; + } + + v8float( float a ) // Init from scalar + { + v = _mm256_set1_ps( a ); + } + + v8float( float f0, float f1, float f2, float f3, + float f4, float f5, float f6, float f7 ) // Init from scalars + { + v = _mm256_setr_ps( f0, f1, f2, f3, f4, f5, f6, f7 ); + } + + ~v8float() {} // Destructor + + // v8float assignment operators + +# define ASSIGN(op,intrin) \ + inline v8float &operator op( const v8float &b ) \ + { \ + v = intrin(v,b.v); \ + return *this; \ + } + + inline v8float &operator =( const v8float &b ) + { + v = b.v; + return *this; + } + + ASSIGN(+=,_mm256_add_ps) + ASSIGN(-=,_mm256_sub_ps) + ASSIGN(*=,_mm256_mul_ps) + ASSIGN(/=,_mm256_div_ps) + +# undef ASSIGN + + // v8float member access operator + + inline float &operator []( int n ) + { + return f[n]; + } + + inline float operator ()( int n ) + { + return f[n]; + } + }; + + // v8float prefix unary operators + + inline v8float operator +( const v8float &a ) + { + v8float b; + + b.v = a.v; + + return b; + } + + inline v8float operator -( const v8float &a ) + { + v8float b; + + b.v = _mm256_sub_ps( _mm256_setzero_ps(), a.v ); + + return b; + } + + inline v8int operator !( const v8float &a ) + { + v8int b; + + b.v = _mm256_cmp_ps( _mm256_setzero_ps(), a.v, _CMP_EQ_OS ); + + return b; + } + + // v8float prefix increment / decrement operators + + inline v8float operator ++( v8float &a ) + { + v8float b; + __m256 t = _mm256_add_ps( a.v, _mm256_set1_ps( 1.0f ) ); + + a.v = t; + b.v = t; + + return b; + } + + inline v8float operator --( v8float &a ) + { + v8float b; + __m256 t = _mm256_sub_ps( a.v, _mm256_set1_ps( 1.0f ) ); + + a.v = t; + b.v = t; + + return b; + } + + // v8float postfix increment / decrement operators + + inline v8float operator ++( v8float &a, int ) + { + v8float b; + __m256 a_v = a.v; + + a.v = _mm256_add_ps( a_v, _mm256_set1_ps( 1.0f ) ); + b.v = a_v; + + return b; + } + + inline v8float operator --( v8float &a, int ) + { + v8float b; + __m256 a_v = a.v; + + a.v = _mm256_sub_ps(a_v, _mm256_set1_ps( 1.0f ) ); + b.v = a_v; + + return b; + } + + // v8float binary operators + +# define BINARY(op,intrin) \ + inline v8float operator op( const v8float &a, const v8float &b ) \ + { \ + v8float c; \ + c.v = intrin( a.v, b.v ); \ + return c; \ + } + + BINARY( +, _mm256_add_ps ) + BINARY( -, _mm256_sub_ps ) + BINARY( *, _mm256_mul_ps ) + BINARY( /, _mm256_div_ps ) + +# undef BINARY + + // v8float logical operators + +# define LOGICAL(op,intrin,flag) \ + inline v8int operator op( const v8float &a, const v8float &b ) \ + { \ + v8int c; \ + c.v = intrin( a.v, b.v, flag ); \ + return c; \ + } + + LOGICAL( <, _mm256_cmp_ps, 
_CMP_LT_OS ) + LOGICAL( >, _mm256_cmp_ps, _CMP_GT_OS ) + LOGICAL( ==, _mm256_cmp_ps, _CMP_EQ_OS ) + LOGICAL( !=, _mm256_cmp_ps, _CMP_NEQ_OS ) + LOGICAL( <=, _mm256_cmp_ps, _CMP_LE_OS ) + LOGICAL( >=, _mm256_cmp_ps, _CMP_GE_OS ) + + inline v8int operator &&( const v8float &a, const v8float &b ) + { + v8int c; + __m256 vzero = _mm256_setzero_ps(); + + c.v = _mm256_and_ps( _mm256_cmp_ps( a.v, vzero, _CMP_NEQ_OS ), + _mm256_cmp_ps( b.v, vzero, _CMP_NEQ_OS ) ); + + return c; + } + + inline v8int operator ||( const v8float &a, const v8float &b ) + { + v8int c; + __m256 vzero = _mm256_setzero_ps(); + + c.v = _mm256_or_ps( _mm256_cmp_ps( a.v, vzero, _CMP_NEQ_OS ), + _mm256_cmp_ps( b.v, vzero, _CMP_NEQ_OS ) ); + + return c; + } + +# undef LOGICAL + + // v8float math library functions + +# define CMATH_FR1(fn) \ + inline v8float fn( const v8float &a ) \ + { \ + v8float b; \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ + b.f[4] = ::fn( a.f[4] ); \ + b.f[5] = ::fn( a.f[5] ); \ + b.f[6] = ::fn( a.f[6] ); \ + b.f[7] = ::fn( a.f[7] ); \ + return b; \ + } + +# define CMATH_FR2(fn) \ + inline v8float fn( const v8float &a, const v8float &b ) \ + { \ + v8float c; \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ + c.f[4] = ::fn( a.f[4], b.f[4] ); \ + c.f[5] = ::fn( a.f[5], b.f[5] ); \ + c.f[6] = ::fn( a.f[6], b.f[6] ); \ + c.f[7] = ::fn( a.f[7], b.f[7] ); \ + return c; \ + } + + CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) + CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) + /*CMATH_FR1(fabs)*/ CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) + CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) + /*CMATH_FR1(sqrt)*/ CMATH_FR1(tan) CMATH_FR1(tanh) + + inline v8float fabs( const v8float &a ) + { + v8float b; + + b.v = _mm256_andnot_ps( _mm256_set1_ps( -0.0f ), a.v ); + + return b; + } + + inline v8float sqrt( const v8float &a ) + { + v8float b; + + b.v = _mm256_sqrt_ps( a.v ); + + return b; + } + + inline v8float copysign( const v8float &a, const v8float &b ) + { + v8float c; + __m256 t = _mm256_set1_ps( -0.0f ); + + c.v = _mm256_or_ps( _mm256_and_ps( t, b.v ), + _mm256_andnot_ps( t, a.v ) ); + + return c; + } + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v8float miscellaneous functions + + inline v8float rsqrt_approx( const v8float &a ) + { + v8float b; + + b.v = _mm256_rsqrt_ps(a.v); + + return b; + } + + #if 0 + inline v8float rsqrt( const v8float &a ) + { + v8float b; + + b.f[0] = ::sqrt( 1.0f / a.f[0] ); + b.f[1] = ::sqrt( 1.0f / a.f[1] ); + b.f[2] = ::sqrt( 1.0f / a.f[2] ); + b.f[3] = ::sqrt( 1.0f / a.f[3] ); + b.f[4] = ::sqrt( 1.0f / a.f[4] ); + b.f[5] = ::sqrt( 1.0f / a.f[5] ); + b.f[6] = ::sqrt( 1.0f / a.f[6] ); + b.f[7] = ::sqrt( 1.0f / a.f[7] ); + + return b; + } + #endif + + inline v8float rsqrt( const v8float &a ) + { + v8float b; + __m256 a_v = a.v, b_v; + + b_v = _mm256_rsqrt_ps(a_v); + + // Note: It is quicker to just call div_ps and sqrt_ps if more + // refinement desired! 
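+ // One Newton-Raphson step for y ~ 1/sqrt(a):
+ //   y' = y*(3 - a*y*y)/2 = y + 0.5f*(y - a*y*y*y),
+ // which is exactly the expression evaluated below with y = b_v.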
+ b.v = _mm256_add_ps( b_v, _mm256_mul_ps( _mm256_set1_ps( 0.5f ), + _mm256_sub_ps( b_v, + _mm256_mul_ps( a_v, + _mm256_mul_ps( b_v, + _mm256_mul_ps( b_v, b_v ) ) ) ) ) ); + + return b; + } + + #if 0 + inline v8float rsqrt( const v8float &a ) + { + v8float b; + + b.v = _mm256_div_ps( _mm256_set1_ps( 1.0f ), _mm256_sqrt_ps( a.v ) ); + + return b; + } + #endif + + #if 0 + inline v8float rsqrt( const v8float &a ) + { + v8float b; + + for( int j = 0; j < 8; j++ ) + b.f[j] = ::sqrt( 1.0f / a.f[j] ); + + return b; + } + #endif + + inline v8float rcp_approx( const v8float &a ) + { + v8float b; + + b.v = _mm256_rcp_ps( a.v ); + + return b; + } + + #if 0 + inline v8float rcp( const v8float &a ) + { + v8float b; + + b.f[0] = 1.0f / a.f[0]; + b.f[1] = 1.0f / a.f[1]; + b.f[2] = 1.0f / a.f[2]; + b.f[3] = 1.0f / a.f[3]; + b.f[4] = 1.0f / a.f[4]; + b.f[5] = 1.0f / a.f[5]; + b.f[6] = 1.0f / a.f[6]; + b.f[7] = 1.0f / a.f[7]; + + return b; + } + #endif + + inline v8float rcp( const v8float &a ) + { + v8float b; + __m256 a_v = a.v, b_v; + + b_v = _mm256_rcp_ps( a_v ); + b.v = _mm256_sub_ps( _mm256_add_ps( b_v, b_v ), + _mm256_mul_ps( a_v, _mm256_mul_ps( b_v, b_v ) ) ); + + return b; + } + + #if 0 + inline v8float rcp( const v8float &a ) + { + v8float b; + + b.v = _mm256_div_ps( _mm256_set1_ps( 1.0f ), a.v ); + + return b; + } + #endif + + inline v8float fma( const v8float &a, const v8float &b, const v8float &c ) + { + v8float d; + + d.v = _mm256_fmadd_ps( a.v, b.v, c.v ); + + return d; + } + + inline v8float fms( const v8float &a, const v8float &b, const v8float &c ) + { + v8float d; + + d.v = _mm256_fmsub_ps( a.v, b.v, c.v ); + + return d; + } + + inline v8float fnms( const v8float &a, const v8float &b, const v8float &c ) + { + v8float d; + + d.v = _mm256_fnmadd_ps( a.v, b.v, c.v ); + + return d; + } + + inline v8float clear_bits( const v8int &m, const v8float &a ) + { + v8float b; + + b.v = _mm256_andnot_ps( m.v, a.v ); + + return b; + } + + inline v8float set_bits( const v8int &m, const v8float &a ) + { + v8float b; + + b.v = _mm256_or_ps( m.v, a.v ); + + return b; + } + + inline v8float toggle_bits( const v8int &m, const v8float &a ) + { + v8float b; + + b.v = _mm256_xor_ps( m.v, a.v ); + + return b; + } + + inline void increment_8x1( float * ALIGNED(16) p, const v8float &a ) + { + _mm256_store_ps( p, _mm256_add_ps( _mm256_load_ps( p ), a.v ) ); + } + + inline void decrement_8x1( float * ALIGNED(16) p, const v8float &a ) + { + _mm256_store_ps( p, _mm256_sub_ps( _mm256_load_ps( p ), a.v ) ); + } + + inline void scale_8x1( float * ALIGNED(16) p, const v8float &a ) + { + _mm256_store_ps( p, _mm256_mul_ps( _mm256_load_ps( p ), a.v ) ); + } + +} // namespace v8 + +#endif // _v8_avx2_h_ diff --git a/src/util/v8/v8_portable.h b/src/util/v8/v8_portable.h new file mode 100644 index 00000000..b8d6b0c8 --- /dev/null +++ b/src/util/v8/v8_portable.h @@ -0,0 +1,1785 @@ +#ifndef _v8_portable_h_ +#define _v8_portable_h_ + +#ifndef IN_v8_h +#error "Do not include v8_portable.h directly; use v8.h" +#endif + +#include + +#define V8_ACCELERATION +#define V8_PORTABLE_ACCELERATION + +#ifndef ALIGNED +#define ALIGNED(n) +#endif + +#define ALWAYS_INLINE __attribute__((always_inline)) + +namespace v8 +{ + class v8; + class v8int; + class v8float; + + //////////////// + // v8 base class + + class v8 + { + friend class v8int; + friend class v8float; + + // v8 miscellaneous friends + + friend inline int any( const v8 &a ) ALWAYS_INLINE; + friend inline int all( const v8 &a ) ALWAYS_INLINE; + + template + friend inline v8 splat( 
const v8 &a ) ALWAYS_INLINE; + + template + friend inline v8 shuffle( const v8 &a ) ALWAYS_INLINE; + + friend inline void swap( v8 &a, v8 &b ) ALWAYS_INLINE; + friend inline void transpose( v8 &a0, v8 &a1, v8 &a2, v8 &a3, + v8 &a4, v8 &a5, v8 &a6, v8 &a7 ) ALWAYS_INLINE; + + // v8int miscellaneous friends + + friend inline v8 czero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + friend inline v8 notczero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + friend inline v8 merge( const v8int &c, const v8 &a, const v8 &b ) ALWAYS_INLINE; + + // v8 memory manipulation friends + + friend inline void load_8x1( const void * ALIGNED(16) p, v8 &a ) ALWAYS_INLINE; + friend inline void store_8x1( const v8 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void stream_8x1( const v8 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_8x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_8x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) ALWAYS_INLINE; + friend inline void swap_8x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + // v8 transposed memory manipulation friends + // Note: Half aligned values are permissible in the 8x2_tr variants. + + friend inline void load_8x1_tr( const void *a0, const void *a1, + const void *a2, const void *a3, + const void *a4, const void *a5, + const void *a6, const void *a7, + v8 &a ) ALWAYS_INLINE; + + friend inline void load_8x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + const void * ALIGNED(8) a4, + const void * ALIGNED(8) a5, + const void * ALIGNED(8) a6, + const void * ALIGNED(8) a7, + v8 &a, v8 &b ) ALWAYS_INLINE; + + friend inline void load_8x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c ) ALWAYS_INLINE; + + friend inline void load_8x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d ) ALWAYS_INLINE; + + friend inline void load_8x8_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d, + v8 &e, v8 &f, v8 &g, v8 &h ) ALWAYS_INLINE; + + friend inline void store_8x1_tr( const v8 &a, + void *a0, void *a1, void *a2, void *a3, + void *a4, void *a5, void *a6, void *a7 ) ALWAYS_INLINE; + + friend inline void store_8x2_tr( const v8 &a, const v8 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3, + void * ALIGNED(8) a4, + void * ALIGNED(8) a5, + void * ALIGNED(8) a6, + void * ALIGNED(8) a7 ) ALWAYS_INLINE; + + friend inline void store_8x3_tr( const v8 &a, const v8 &b, const v8 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3, + void * ALIGNED(16) a4, + void * ALIGNED(16) a5, + void * ALIGNED(16) a6, + void * ALIGNED(16) a7 ) ALWAYS_INLINE; + + friend inline void store_8x4_tr( const v8 &a, const v8 &b, + const v8 &c, const v8 &d, + void * ALIGNED(16) a0, + 
void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3, + void * ALIGNED(16) a4, + void * ALIGNED(16) a5, + void * ALIGNED(16) a6, + void * ALIGNED(16) a7 ) ALWAYS_INLINE; + + friend inline void store_8x8_tr( const v8 &a, const v8 &b, + const v8 &c, const v8 &d, + const v8 &e, const v8 &f, + const v8 &g, const v8 &h, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3, + void * ALIGNED(16) a4, + void * ALIGNED(16) a5, + void * ALIGNED(16) a6, + void * ALIGNED(16) a7 ) ALWAYS_INLINE; + + protected: + + union + { + int i[8]; + float f[8]; + }; + + public: + + v8() {} // Default constructor + + v8( const v8 &a ) // Copy constructor + { + i[0]=a.i[0]; i[1]=a.i[1]; i[2]=a.i[2]; i[3]=a.i[3]; + i[4]=a.i[4]; i[5]=a.i[5]; i[6]=a.i[6]; i[7]=a.i[7]; + } + + ~v8() {} // Default destructor + }; + + // v8 miscellaneous functions + + inline int any( const v8 &a ) + { + return a.i[0] || a.i[1] || a.i[2] || a.i[3] || + a.i[4] || a.i[5] || a.i[6] || a.i[7]; + } + + inline int all( const v8 &a ) + { + return a.i[0] && a.i[1] && a.i[2] && a.i[3] && + a.i[4] && a.i[5] && a.i[6] && a.i[7]; + } + + template + inline v8 splat( const v8 & a ) + { + v8 b; + + b.i[0] = a.i[n]; + b.i[1] = a.i[n]; + b.i[2] = a.i[n]; + b.i[3] = a.i[n]; + b.i[4] = a.i[n]; + b.i[5] = a.i[n]; + b.i[6] = a.i[n]; + b.i[7] = a.i[n]; + + return b; + } + + template + inline v8 shuffle( const v8 & a ) + { + v8 b; + + b.i[0] = a.i[i0]; + b.i[1] = a.i[i1]; + b.i[2] = a.i[i2]; + b.i[3] = a.i[i3]; + b.i[4] = a.i[i4]; + b.i[5] = a.i[i5]; + b.i[6] = a.i[i6]; + b.i[7] = a.i[i7]; + + return b; + } + +# define sw(x,y) x^=y, y^=x, x^=y + + inline void swap( v8 &a, v8 &b ) + { + sw( a.i[0], b.i[0] ); + sw( a.i[1], b.i[1] ); + sw( a.i[2], b.i[2] ); + sw( a.i[3], b.i[3] ); + sw( a.i[4], b.i[4] ); + sw( a.i[5], b.i[5] ); + sw( a.i[6], b.i[6] ); + sw( a.i[7], b.i[7] ); + } + + inline void transpose( v8 &a0, v8 &a1, v8 &a2, v8 &a3, + v8 &a4, v8 &a5, v8 &a6, v8 &a7 ) + { + sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); sw( a0.i[4],a4.i[0] ); sw( a0.i[5],a5.i[0] ); sw( a0.i[6],a6.i[0] ); sw( a0.i[7],a7.i[0] ); + sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); sw( a1.i[4],a4.i[1] ); sw( a1.i[5],a5.i[1] ); sw( a1.i[6],a6.i[1] ); sw( a1.i[7],a7.i[1] ); + sw( a2.i[3],a3.i[2] ); sw( a2.i[4],a4.i[2] ); sw( a2.i[5],a5.i[2] ); sw( a2.i[6],a6.i[2] ); sw( a2.i[7],a7.i[2] ); + sw( a3.i[4],a4.i[3] ); sw( a3.i[5],a5.i[3] ); sw( a3.i[6],a6.i[3] ); sw( a3.i[7],a7.i[3] ); + sw( a4.i[5],a5.i[4] ); sw( a4.i[6],a6.i[4] ); sw( a4.i[7],a7.i[4] ); + sw( a5.i[6],a6.i[5] ); sw( a5.i[7],a7.i[5] ); + sw( a6.i[7],a7.i[6] ); + } + +# undef sw + + // v8 memory manipulation functions + + inline void load_8x1( const void * ALIGNED(16) p, + v8 &a ) + { + a.i[0] = ((const int * ALIGNED(16))p)[0]; + a.i[1] = ((const int * ALIGNED(16))p)[1]; + a.i[2] = ((const int * ALIGNED(16))p)[2]; + a.i[3] = ((const int * ALIGNED(16))p)[3]; + a.i[4] = ((const int * ALIGNED(16))p)[4]; + a.i[5] = ((const int * ALIGNED(16))p)[5]; + a.i[6] = ((const int * ALIGNED(16))p)[6]; + a.i[7] = ((const int * ALIGNED(16))p)[7]; + } + + inline void store_8x1( const v8 &a, + void * ALIGNED(16) p ) + { + ((int * ALIGNED(16))p)[0] = a.i[0]; + ((int * ALIGNED(16))p)[1] = a.i[1]; + ((int * ALIGNED(16))p)[2] = a.i[2]; + ((int * ALIGNED(16))p)[3] = a.i[3]; + ((int * ALIGNED(16))p)[4] = a.i[4]; + ((int * ALIGNED(16))p)[5] = a.i[5]; + ((int * ALIGNED(16))p)[6] = a.i[6]; + ((int * ALIGNED(16))p)[7] = a.i[7]; + } + + inline void stream_8x1( const v8 &a, 
+ void * ALIGNED(16) p ) + { + ((int * ALIGNED(16))p)[0] = a.i[0]; + ((int * ALIGNED(16))p)[1] = a.i[1]; + ((int * ALIGNED(16))p)[2] = a.i[2]; + ((int * ALIGNED(16))p)[3] = a.i[3]; + ((int * ALIGNED(16))p)[4] = a.i[4]; + ((int * ALIGNED(16))p)[5] = a.i[5]; + ((int * ALIGNED(16))p)[6] = a.i[6]; + ((int * ALIGNED(16))p)[7] = a.i[7]; + } + + inline void clear_8x1( void * ALIGNED(16) p ) + { + ((int * ALIGNED(16))p)[0] = 0; + ((int * ALIGNED(16))p)[1] = 0; + ((int * ALIGNED(16))p)[2] = 0; + ((int * ALIGNED(16))p)[3] = 0; + ((int * ALIGNED(16))p)[4] = 0; + ((int * ALIGNED(16))p)[5] = 0; + ((int * ALIGNED(16))p)[6] = 0; + ((int * ALIGNED(16))p)[7] = 0; + } + + // FIXME: Ordering semantics + inline void copy_8x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) + { + ((int * ALIGNED(16))dst)[0] = ((const int * ALIGNED(16))src)[0]; + ((int * ALIGNED(16))dst)[1] = ((const int * ALIGNED(16))src)[1]; + ((int * ALIGNED(16))dst)[2] = ((const int * ALIGNED(16))src)[2]; + ((int * ALIGNED(16))dst)[3] = ((const int * ALIGNED(16))src)[3]; + ((int * ALIGNED(16))dst)[4] = ((const int * ALIGNED(16))src)[4]; + ((int * ALIGNED(16))dst)[5] = ((const int * ALIGNED(16))src)[5]; + ((int * ALIGNED(16))dst)[6] = ((const int * ALIGNED(16))src)[6]; + ((int * ALIGNED(16))dst)[7] = ((const int * ALIGNED(16))src)[7]; + } + + inline void swap_8x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) + { + int t; + + t = ((int * ALIGNED(16))a)[0]; + ((int * ALIGNED(16))a)[0] = ((int * ALIGNED(16))b)[0]; + ((int * ALIGNED(16))b)[0] = t; + + t = ((int * ALIGNED(16))a)[1]; + ((int * ALIGNED(16))a)[1] = ((int * ALIGNED(16))b)[1]; + ((int * ALIGNED(16))b)[1] = t; + + t = ((int * ALIGNED(16))a)[2]; + ((int * ALIGNED(16))a)[2] = ((int * ALIGNED(16))b)[2]; + ((int * ALIGNED(16))b)[2] = t; + + t = ((int * ALIGNED(16))a)[3]; + ((int * ALIGNED(16))a)[3] = ((int * ALIGNED(16))b)[3]; + ((int * ALIGNED(16))b)[3] = t; + + t = ((int * ALIGNED(16))a)[4]; + ((int * ALIGNED(16))a)[4] = ((int * ALIGNED(16))b)[4]; + ((int * ALIGNED(16))b)[4] = t; + + t = ((int * ALIGNED(16))a)[5]; + ((int * ALIGNED(16))a)[5] = ((int * ALIGNED(16))b)[5]; + ((int * ALIGNED(16))b)[5] = t; + + t = ((int * ALIGNED(16))a)[6]; + ((int * ALIGNED(16))a)[6] = ((int * ALIGNED(16))b)[6]; + ((int * ALIGNED(16))b)[6] = t; + + t = ((int * ALIGNED(16))a)[7]; + ((int * ALIGNED(16))a)[7] = ((int * ALIGNED(16))b)[7]; + ((int * ALIGNED(16))b)[7] = t; + } + + // v8 transposed memory manipulation functions + + inline void load_8x1_tr( const void *a0, const void *a1, + const void *a2, const void *a3, + const void *a4, const void *a5, + const void *a6, const void *a7, + v8 &a ) + { + a.i[0] = ((const int *)a0)[0]; + a.i[1] = ((const int *)a1)[0]; + a.i[2] = ((const int *)a2)[0]; + a.i[3] = ((const int *)a3)[0]; + a.i[4] = ((const int *)a4)[0]; + a.i[5] = ((const int *)a5)[0]; + a.i[6] = ((const int *)a6)[0]; + a.i[7] = ((const int *)a7)[0]; + } + + inline void load_8x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + const void * ALIGNED(8) a4, + const void * ALIGNED(8) a5, + const void * ALIGNED(8) a6, + const void * ALIGNED(8) a7, + v8 &a, v8 &b ) + { + a.i[0] = ((const int * ALIGNED(8))a0)[0]; + b.i[0] = ((const int * ALIGNED(8))a0)[1]; + + a.i[1] = ((const int * ALIGNED(8))a1)[0]; + b.i[1] = ((const int * ALIGNED(8))a1)[1]; + + a.i[2] = ((const int * ALIGNED(8))a2)[0]; + b.i[2] = ((const int * ALIGNED(8))a2)[1]; + + a.i[3] = ((const int * ALIGNED(8))a3)[0]; + b.i[3] = ((const int * ALIGNED(8))a3)[1]; + + 
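+    // Same element-0 / element-1 gather for the remaining sources a4..a7.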
a.i[4] = ((const int * ALIGNED(8))a4)[0]; + b.i[4] = ((const int * ALIGNED(8))a4)[1]; + + a.i[5] = ((const int * ALIGNED(8))a5)[0]; + b.i[5] = ((const int * ALIGNED(8))a5)[1]; + + a.i[6] = ((const int * ALIGNED(8))a6)[0]; + b.i[6] = ((const int * ALIGNED(8))a6)[1]; + + a.i[7] = ((const int * ALIGNED(8))a7)[0]; + b.i[7] = ((const int * ALIGNED(8))a7)[1]; + } + + inline void load_8x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + + a.i[4] = ((const int * ALIGNED(16))a4)[0]; + b.i[4] = ((const int * ALIGNED(16))a4)[1]; + c.i[4] = ((const int * ALIGNED(16))a4)[2]; + + a.i[5] = ((const int * ALIGNED(16))a5)[0]; + b.i[5] = ((const int * ALIGNED(16))a5)[1]; + c.i[5] = ((const int * ALIGNED(16))a5)[2]; + + a.i[6] = ((const int * ALIGNED(16))a6)[0]; + b.i[6] = ((const int * ALIGNED(16))a6)[1]; + c.i[6] = ((const int * ALIGNED(16))a6)[2]; + + a.i[7] = ((const int * ALIGNED(16))a7)[0]; + b.i[7] = ((const int * ALIGNED(16))a7)[1]; + c.i[7] = ((const int * ALIGNED(16))a7)[2]; + } + + inline void load_8x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + d.i[0] = ((const int * ALIGNED(16))a0)[3]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + d.i[1] = ((const int * ALIGNED(16))a1)[3]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + d.i[2] = ((const int * ALIGNED(16))a2)[3]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + d.i[3] = ((const int * ALIGNED(16))a3)[3]; + + a.i[4] = ((const int * ALIGNED(16))a4)[0]; + b.i[4] = ((const int * ALIGNED(16))a4)[1]; + c.i[4] = ((const int * ALIGNED(16))a4)[2]; + d.i[4] = ((const int * ALIGNED(16))a4)[3]; + + a.i[5] = ((const int * ALIGNED(16))a5)[0]; + b.i[5] = ((const int * ALIGNED(16))a5)[1]; + c.i[5] = ((const int * ALIGNED(16))a5)[2]; + d.i[5] = ((const int * ALIGNED(16))a5)[3]; + + a.i[6] = ((const int * ALIGNED(16))a6)[0]; + b.i[6] = ((const int * ALIGNED(16))a6)[1]; + c.i[6] = ((const int * ALIGNED(16))a6)[2]; + d.i[6] = ((const int * ALIGNED(16))a6)[3]; + + a.i[7] = ((const int * ALIGNED(16))a7)[0]; + b.i[7] = ((const int * ALIGNED(16))a7)[1]; + c.i[7] = ((const int * ALIGNED(16))a7)[2]; + d.i[7] = ((const int * ALIGNED(16))a7)[3]; + } + + inline void 
load_8x8_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d, + v8 &e, v8 &f, v8 &g, v8 &h ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + d.i[0] = ((const int * ALIGNED(16))a0)[3]; + e.i[0] = ((const int * ALIGNED(16))a0)[4]; + f.i[0] = ((const int * ALIGNED(16))a0)[5]; + g.i[0] = ((const int * ALIGNED(16))a0)[6]; + h.i[0] = ((const int * ALIGNED(16))a0)[7]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + d.i[1] = ((const int * ALIGNED(16))a1)[3]; + e.i[1] = ((const int * ALIGNED(16))a1)[4]; + f.i[1] = ((const int * ALIGNED(16))a1)[5]; + g.i[1] = ((const int * ALIGNED(16))a1)[6]; + h.i[1] = ((const int * ALIGNED(16))a1)[7]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + d.i[2] = ((const int * ALIGNED(16))a2)[3]; + e.i[2] = ((const int * ALIGNED(16))a2)[4]; + f.i[2] = ((const int * ALIGNED(16))a2)[5]; + g.i[2] = ((const int * ALIGNED(16))a2)[6]; + h.i[2] = ((const int * ALIGNED(16))a2)[7]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + d.i[3] = ((const int * ALIGNED(16))a3)[3]; + e.i[3] = ((const int * ALIGNED(16))a3)[4]; + f.i[3] = ((const int * ALIGNED(16))a3)[5]; + g.i[3] = ((const int * ALIGNED(16))a3)[6]; + h.i[3] = ((const int * ALIGNED(16))a3)[7]; + + a.i[4] = ((const int * ALIGNED(16))a4)[0]; + b.i[4] = ((const int * ALIGNED(16))a4)[1]; + c.i[4] = ((const int * ALIGNED(16))a4)[2]; + d.i[4] = ((const int * ALIGNED(16))a4)[3]; + e.i[4] = ((const int * ALIGNED(16))a4)[4]; + f.i[4] = ((const int * ALIGNED(16))a4)[5]; + g.i[4] = ((const int * ALIGNED(16))a4)[6]; + h.i[4] = ((const int * ALIGNED(16))a4)[7]; + + a.i[5] = ((const int * ALIGNED(16))a5)[0]; + b.i[5] = ((const int * ALIGNED(16))a5)[1]; + c.i[5] = ((const int * ALIGNED(16))a5)[2]; + d.i[5] = ((const int * ALIGNED(16))a5)[3]; + e.i[5] = ((const int * ALIGNED(16))a5)[4]; + f.i[5] = ((const int * ALIGNED(16))a5)[5]; + g.i[5] = ((const int * ALIGNED(16))a5)[6]; + h.i[5] = ((const int * ALIGNED(16))a5)[7]; + + a.i[6] = ((const int * ALIGNED(16))a6)[0]; + b.i[6] = ((const int * ALIGNED(16))a6)[1]; + c.i[6] = ((const int * ALIGNED(16))a6)[2]; + d.i[6] = ((const int * ALIGNED(16))a6)[3]; + e.i[6] = ((const int * ALIGNED(16))a6)[4]; + f.i[6] = ((const int * ALIGNED(16))a6)[5]; + g.i[6] = ((const int * ALIGNED(16))a6)[6]; + h.i[6] = ((const int * ALIGNED(16))a6)[7]; + + a.i[7] = ((const int * ALIGNED(16))a7)[0]; + b.i[7] = ((const int * ALIGNED(16))a7)[1]; + c.i[7] = ((const int * ALIGNED(16))a7)[2]; + d.i[7] = ((const int * ALIGNED(16))a7)[3]; + e.i[7] = ((const int * ALIGNED(16))a7)[4]; + f.i[7] = ((const int * ALIGNED(16))a7)[5]; + g.i[7] = ((const int * ALIGNED(16))a7)[6]; + h.i[7] = ((const int * ALIGNED(16))a7)[7]; + } + + inline void store_8x1_tr( const v8 &a, + void *a0, void *a1, void *a2, void *a3, + void *a4, void *a5, void *a6, void *a7 ) + { + ((int *)a0)[0] = a.i[0]; + ((int *)a1)[0] = a.i[1]; + ((int *)a2)[0] = a.i[2]; + ((int *)a3)[0] = a.i[3]; + ((int *)a4)[0] = a.i[4]; + ((int *)a5)[0] = a.i[5]; + ((int *)a6)[0] = a.i[6]; + ((int 
*)a7)[0] = a.i[7]; + } + + inline void store_8x2_tr( const v8 &a, const v8 &b, + void * ALIGNED(8) a0, void * ALIGNED(8) a1, + void * ALIGNED(8) a2, void * ALIGNED(8) a3, + void * ALIGNED(8) a4, void * ALIGNED(8) a5, + void * ALIGNED(8) a6, void * ALIGNED(8) a7 ) + { + ((int * ALIGNED(8))a0)[0] = a.i[0]; + ((int * ALIGNED(8))a0)[1] = b.i[0]; + + ((int * ALIGNED(8))a1)[0] = a.i[1]; + ((int * ALIGNED(8))a1)[1] = b.i[1]; + + ((int * ALIGNED(8))a2)[0] = a.i[2]; + ((int * ALIGNED(8))a2)[1] = b.i[2]; + + ((int * ALIGNED(8))a3)[0] = a.i[3]; + ((int * ALIGNED(8))a3)[1] = b.i[3]; + + ((int * ALIGNED(8))a4)[0] = a.i[4]; + ((int * ALIGNED(8))a4)[1] = b.i[4]; + + ((int * ALIGNED(8))a5)[0] = a.i[5]; + ((int * ALIGNED(8))a5)[1] = b.i[5]; + + ((int * ALIGNED(8))a6)[0] = a.i[6]; + ((int * ALIGNED(8))a6)[1] = b.i[6]; + + ((int * ALIGNED(8))a7)[0] = a.i[7]; + ((int * ALIGNED(8))a7)[1] = b.i[7]; + } + + inline void store_8x3_tr( const v8 &a, const v8 &b, const v8 &c, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3, + void * ALIGNED(16) a4, void * ALIGNED(16) a5, + void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * ALIGNED(16))a0)[1] = b.i[0]; + ((int * ALIGNED(16))a0)[2] = c.i[0]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; + ((int * ALIGNED(16))a1)[2] = c.i[1]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; + ((int * ALIGNED(16))a2)[2] = c.i[2]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; + ((int * ALIGNED(16))a3)[2] = c.i[3]; + + ((int * ALIGNED(16))a4)[0] = a.i[4]; + ((int * ALIGNED(16))a4)[1] = b.i[4]; + ((int * ALIGNED(16))a4)[2] = c.i[4]; + + ((int * ALIGNED(16))a5)[0] = a.i[5]; + ((int * ALIGNED(16))a5)[1] = b.i[5]; + ((int * ALIGNED(16))a5)[2] = c.i[5]; + + ((int * ALIGNED(16))a6)[0] = a.i[6]; + ((int * ALIGNED(16))a6)[1] = b.i[6]; + ((int * ALIGNED(16))a6)[2] = c.i[6]; + + ((int * ALIGNED(16))a7)[0] = a.i[7]; + ((int * ALIGNED(16))a7)[1] = b.i[7]; + ((int * ALIGNED(16))a7)[2] = c.i[7]; + } + + inline void store_8x4_tr( const v8 &a, const v8 &b, const v8 &c, const v8 &d, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3, + void * ALIGNED(16) a4, void * ALIGNED(16) a5, + void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * ALIGNED(16))a0)[1] = b.i[0]; + ((int * ALIGNED(16))a0)[2] = c.i[0]; + ((int * ALIGNED(16))a0)[3] = d.i[0]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; + ((int * ALIGNED(16))a1)[2] = c.i[1]; + ((int * ALIGNED(16))a1)[3] = d.i[1]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; + ((int * ALIGNED(16))a2)[2] = c.i[2]; + ((int * ALIGNED(16))a2)[3] = d.i[2]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; + ((int * ALIGNED(16))a3)[2] = c.i[3]; + ((int * ALIGNED(16))a3)[3] = d.i[3]; + + ((int * ALIGNED(16))a4)[0] = a.i[4]; + ((int * ALIGNED(16))a4)[1] = b.i[4]; + ((int * ALIGNED(16))a4)[2] = c.i[4]; + ((int * ALIGNED(16))a4)[3] = d.i[4]; + + ((int * ALIGNED(16))a5)[0] = a.i[5]; + ((int * ALIGNED(16))a5)[1] = b.i[5]; + ((int * ALIGNED(16))a5)[2] = c.i[5]; + ((int * ALIGNED(16))a5)[3] = d.i[5]; + + ((int * ALIGNED(16))a6)[0] = a.i[6]; + ((int * ALIGNED(16))a6)[1] = b.i[6]; + ((int * ALIGNED(16))a6)[2] = c.i[6]; + ((int * ALIGNED(16))a6)[3] = d.i[6]; + + ((int * ALIGNED(16))a7)[0] = a.i[7]; + ((int * 
ALIGNED(16))a7)[1] = b.i[7]; + ((int * ALIGNED(16))a7)[2] = c.i[7]; + ((int * ALIGNED(16))a7)[3] = d.i[7]; + } + + inline void store_8x8_tr( const v8 &a, const v8 &b, const v8 &c, const v8 &d, + const v8 &e, const v8 &f, const v8 &g, const v8 &h, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3, + void * ALIGNED(16) a4, void * ALIGNED(16) a5, + void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * ALIGNED(16))a0)[1] = b.i[0]; + ((int * ALIGNED(16))a0)[2] = c.i[0]; + ((int * ALIGNED(16))a0)[3] = d.i[0]; + ((int * ALIGNED(16))a0)[4] = e.i[0]; + ((int * ALIGNED(16))a0)[5] = f.i[0]; + ((int * ALIGNED(16))a0)[6] = g.i[0]; + ((int * ALIGNED(16))a0)[7] = h.i[0]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; + ((int * ALIGNED(16))a1)[2] = c.i[1]; + ((int * ALIGNED(16))a1)[3] = d.i[1]; + ((int * ALIGNED(16))a1)[4] = e.i[1]; + ((int * ALIGNED(16))a1)[5] = f.i[1]; + ((int * ALIGNED(16))a1)[6] = g.i[1]; + ((int * ALIGNED(16))a1)[7] = h.i[1]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; + ((int * ALIGNED(16))a2)[2] = c.i[2]; + ((int * ALIGNED(16))a2)[3] = d.i[2]; + ((int * ALIGNED(16))a2)[4] = e.i[2]; + ((int * ALIGNED(16))a2)[5] = f.i[2]; + ((int * ALIGNED(16))a2)[6] = g.i[2]; + ((int * ALIGNED(16))a2)[7] = h.i[2]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; + ((int * ALIGNED(16))a3)[2] = c.i[3]; + ((int * ALIGNED(16))a3)[3] = d.i[3]; + ((int * ALIGNED(16))a3)[4] = e.i[3]; + ((int * ALIGNED(16))a3)[5] = f.i[3]; + ((int * ALIGNED(16))a3)[6] = g.i[3]; + ((int * ALIGNED(16))a3)[7] = h.i[3]; + + ((int * ALIGNED(16))a4)[0] = a.i[4]; + ((int * ALIGNED(16))a4)[1] = b.i[4]; + ((int * ALIGNED(16))a4)[2] = c.i[4]; + ((int * ALIGNED(16))a4)[3] = d.i[4]; + ((int * ALIGNED(16))a4)[4] = e.i[4]; + ((int * ALIGNED(16))a4)[5] = f.i[4]; + ((int * ALIGNED(16))a4)[6] = g.i[4]; + ((int * ALIGNED(16))a4)[7] = h.i[4]; + + ((int * ALIGNED(16))a5)[0] = a.i[5]; + ((int * ALIGNED(16))a5)[1] = b.i[5]; + ((int * ALIGNED(16))a5)[2] = c.i[5]; + ((int * ALIGNED(16))a5)[3] = d.i[5]; + ((int * ALIGNED(16))a5)[4] = e.i[5]; + ((int * ALIGNED(16))a5)[5] = f.i[5]; + ((int * ALIGNED(16))a5)[6] = g.i[5]; + ((int * ALIGNED(16))a5)[7] = h.i[5]; + + ((int * ALIGNED(16))a6)[0] = a.i[6]; + ((int * ALIGNED(16))a6)[1] = b.i[6]; + ((int * ALIGNED(16))a6)[2] = c.i[6]; + ((int * ALIGNED(16))a6)[3] = d.i[6]; + ((int * ALIGNED(16))a6)[4] = e.i[6]; + ((int * ALIGNED(16))a6)[5] = f.i[6]; + ((int * ALIGNED(16))a6)[6] = g.i[6]; + ((int * ALIGNED(16))a6)[7] = h.i[6]; + + ((int * ALIGNED(16))a7)[0] = a.i[7]; + ((int * ALIGNED(16))a7)[1] = b.i[7]; + ((int * ALIGNED(16))a7)[2] = c.i[7]; + ((int * ALIGNED(16))a7)[3] = d.i[7]; + ((int * ALIGNED(16))a7)[4] = e.i[7]; + ((int * ALIGNED(16))a7)[5] = f.i[7]; + ((int * ALIGNED(16))a7)[6] = g.i[7]; + ((int * ALIGNED(16))a7)[7] = h.i[7]; + } + + ////////////// + // v8int class + + class v8int : public v8 + { + // v8int prefix unary operator friends + + friend inline v8int operator +( const v8int & a ) ALWAYS_INLINE; + friend inline v8int operator -( const v8int & a ) ALWAYS_INLINE; + friend inline v8int operator ~( const v8int & a ) ALWAYS_INLINE; + friend inline v8int operator !( const v8int & a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v8int prefix increment / decrement operator friends + + friend inline v8int operator ++( v8int & a ) ALWAYS_INLINE; + friend inline 
v8int operator --( v8int & a ) ALWAYS_INLINE; + + // v8int postfix increment / decrement operator friends + + friend inline v8int operator ++( v8int & a, int ) ALWAYS_INLINE; + friend inline v8int operator --( v8int & a, int ) ALWAYS_INLINE; + + // v8int binary operator friends + + friend inline v8int operator +( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator -( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator *( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator /( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator %( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator ^( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator &( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator |( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator <<( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator >>( const v8int &a, const v8int &b ) ALWAYS_INLINE; + + // v8int logical operator friends + + friend inline v8int operator <( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator >( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator ==( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator !=( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator <=( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator >=( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator &&( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator ||( const v8int &a, const v8int &b ) ALWAYS_INLINE; + + // v8int miscellaneous friends + + friend inline v8int abs( const v8int &a ) ALWAYS_INLINE; + friend inline v8 czero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + friend inline v8 notczero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + // FIXME: cswap, notcswap! 
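+    // The comparison and logical operators in this header return per-lane
+    // masks ( 0 for false, -1 i.e. all bits set for true ); czero, notczero
+    // and merge consume such masks for branchless lane selection, with
+    // merge( c, t, f ) = ( t & c ) | ( f & ~c ) lane by lane.
+    //
+    // Usage sketch (illustrative only, not part of this header):
+    //
+    //   v8float x, y;                    // assume these are filled in
+    //   v8int   c  = ( x < y );          // -1 where x < y, 0 elsewhere
+    //   v8float lo = merge( c, x, y );   // per-lane min( x, y )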
+ friend inline v8 merge( const v8int &c, const v8 &t, const v8 &f ) ALWAYS_INLINE; + + // v8float unary operator friends + + friend inline v8int operator !( const v8float & a ) ALWAYS_INLINE; + + // v8float logical operator friends + + friend inline v8int operator <( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator >( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator ==( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator !=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator <=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator >=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator &&( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator ||( const v8float &a, const v8float &b ) ALWAYS_INLINE; + + // v8float miscellaneous friends + + friend inline v8float clear_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float set_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float toggle_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + + public: + + // v8int constructors / destructors + + v8int() {} // Default constructor + + v8int( const v8int &a ) // Copy constructor + { + i[0] = a.i[0]; i[1] = a.i[1]; i[2] = a.i[2]; i[3] = a.i[3]; + i[4] = a.i[4]; i[5] = a.i[5]; i[6] = a.i[6]; i[7] = a.i[7]; + } + + v8int( const v8 &a ) // Init from mixed + { + i[0] = a.i[0]; i[1] = a.i[1]; i[2] = a.i[2]; i[3] = a.i[3]; + i[4] = a.i[4]; i[5] = a.i[5]; i[6] = a.i[6]; i[7] = a.i[7]; + } + + v8int( int a ) // Init from scalar + { + i[0] = a; i[1] = a; i[2] = a; i[3] = a; + i[4] = a; i[5] = a; i[6] = a; i[7] = a; + } + + v8int( int i0, int i1, int i2, int i3, + int i4, int i5, int i6, int i7 ) // Init from scalars + { + i[0] = i0; i[1] = i1; i[2] = i2; i[3] = i3; + i[4] = i4; i[5] = i5; i[6] = i6; i[7] = i7; + } + + ~v8int() {} // Destructor + + // v8int assignment operators + +# define ASSIGN(op) \ + inline v8int &operator op( const v8int &b ) \ + { \ + i[0] op b.i[0]; \ + i[1] op b.i[1]; \ + i[2] op b.i[2]; \ + i[3] op b.i[3]; \ + i[4] op b.i[4]; \ + i[5] op b.i[5]; \ + i[6] op b.i[6]; \ + i[7] op b.i[7]; \ + return *this; \ + } + + ASSIGN( =) + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + ASSIGN(%=) + ASSIGN(^=) + ASSIGN(&=) + ASSIGN(|=) + ASSIGN(<<=) + ASSIGN(>>=) + +# undef ASSIGN + + // v8int member access operator + + inline int &operator []( int n ) + { + return i[n]; + } + + inline int operator ()( int n ) + { + return i[n]; + } + }; + + // v8int prefix unary operators + +# define PREFIX_UNARY(op) \ + inline v8int operator op( const v8int & a ) \ + { \ + v8int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + b.i[4] = ( op a.i[4] ); \ + b.i[5] = ( op a.i[5] ); \ + b.i[6] = ( op a.i[6] ); \ + b.i[7] = ( op a.i[7] ); \ + return b; \ + } + + PREFIX_UNARY(+) + PREFIX_UNARY(-) + + inline v8int operator !( const v8int & a ) + { + v8int b; + + b.i[0] = - ( !a.i[0] ); + b.i[1] = - ( !a.i[1] ); + b.i[2] = - ( !a.i[2] ); + b.i[3] = - ( !a.i[3] ); + b.i[4] = - ( !a.i[4] ); + b.i[5] = - ( !a.i[5] ); + b.i[6] = - ( !a.i[6] ); + b.i[7] = - ( !a.i[7] ); + + return b; + } + + PREFIX_UNARY(~) + +# undef PREFIX_UNARY + + // v8int prefix increment / decrement + +# define PREFIX_INCDEC(op) \ + inline v8int operator op( v8int & a ) \ + { \ + v8int b; \ + b.i[0] = ( op 
a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + b.i[4] = ( op a.i[4] ); \ + b.i[5] = ( op a.i[5] ); \ + b.i[6] = ( op a.i[6] ); \ + b.i[7] = ( op a.i[7] ); \ + return b; \ + } + + PREFIX_INCDEC(++) + PREFIX_INCDEC(--) + +# undef PREFIX_INCDEC + + // v8int postfix increment / decrement + +# define POSTFIX_INCDEC(op) \ + inline v8int operator op( v8int & a, int ) \ + { \ + v8int b; \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ + b.i[4] = ( a.i[4] op ); \ + b.i[5] = ( a.i[5] op ); \ + b.i[6] = ( a.i[6] op ); \ + b.i[7] = ( a.i[7] op ); \ + return b; \ + } + + POSTFIX_INCDEC(++) + POSTFIX_INCDEC(--) + +# undef POSTFIX_INCDEC + + // v8int binary operators + +# define BINARY(op) \ + inline v8int operator op( const v8int &a, const v8int &b ) \ + { \ + v8int c; \ + c.i[0] = a.i[0] op b.i[0]; \ + c.i[1] = a.i[1] op b.i[1]; \ + c.i[2] = a.i[2] op b.i[2]; \ + c.i[3] = a.i[3] op b.i[3]; \ + c.i[4] = a.i[4] op b.i[4]; \ + c.i[5] = a.i[5] op b.i[5]; \ + c.i[6] = a.i[6] op b.i[6]; \ + c.i[7] = a.i[7] op b.i[7]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + BINARY(%) + BINARY(^) + BINARY(&) + BINARY(|) + BINARY(<<) + BINARY(>>) + +# undef BINARY + + // v8int logical operators + +# define LOGICAL(op) \ + inline v8int operator op( const v8int &a, const v8int &b ) \ + { \ + v8int c; \ + c.i[0] = - ( a.i[0] op b.i[0] ); \ + c.i[1] = - ( a.i[1] op b.i[1] ); \ + c.i[2] = - ( a.i[2] op b.i[2] ); \ + c.i[3] = - ( a.i[3] op b.i[3] ); \ + c.i[4] = - ( a.i[4] op b.i[4] ); \ + c.i[5] = - ( a.i[5] op b.i[5] ); \ + c.i[6] = - ( a.i[6] op b.i[6] ); \ + c.i[7] = - ( a.i[7] op b.i[7] ); \ + return c; \ + } + + LOGICAL(<) + LOGICAL(>) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v8int miscellaneous functions + + inline v8int abs( const v8int &a ) + { + v8int b; + + b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; + b.i[1] = ( a.i[1] >= 0 ) ? a.i[1] : -a.i[1]; + b.i[2] = ( a.i[2] >= 0 ) ? a.i[2] : -a.i[2]; + b.i[3] = ( a.i[3] >= 0 ) ? a.i[3] : -a.i[3]; + b.i[4] = ( a.i[4] >= 0 ) ? a.i[4] : -a.i[4]; + b.i[5] = ( a.i[5] >= 0 ) ? a.i[5] : -a.i[5]; + b.i[6] = ( a.i[6] >= 0 ) ? a.i[6] : -a.i[6]; + b.i[7] = ( a.i[7] >= 0 ) ? 
a.i[7] : -a.i[7]; + + return b; + } + + inline v8 czero( const v8int &c, const v8 &a ) + { + v8 b; + + b.i[0] = a.i[0] & ~c.i[0]; + b.i[1] = a.i[1] & ~c.i[1]; + b.i[2] = a.i[2] & ~c.i[2]; + b.i[3] = a.i[3] & ~c.i[3]; + b.i[4] = a.i[4] & ~c.i[4]; + b.i[5] = a.i[5] & ~c.i[5]; + b.i[6] = a.i[6] & ~c.i[6]; + b.i[7] = a.i[7] & ~c.i[7]; + + return b; + } + + inline v8 notczero( const v8int &c, const v8 &a ) + { + v8 b; + + b.i[0] = a.i[0] & c.i[0]; + b.i[1] = a.i[1] & c.i[1]; + b.i[2] = a.i[2] & c.i[2]; + b.i[3] = a.i[3] & c.i[3]; + b.i[4] = a.i[4] & c.i[4]; + b.i[5] = a.i[5] & c.i[5]; + b.i[6] = a.i[6] & c.i[6]; + b.i[7] = a.i[7] & c.i[7]; + + return b; + } + + inline v8 merge( const v8int &c, const v8 &t, const v8 &f ) + { + v8 m; + + m.i[0] = ( f.i[0] & ~c.i[0] ) | ( t.i[0] & c.i[0] ); + m.i[1] = ( f.i[1] & ~c.i[1] ) | ( t.i[1] & c.i[1] ); + m.i[2] = ( f.i[2] & ~c.i[2] ) | ( t.i[2] & c.i[2] ); + m.i[3] = ( f.i[3] & ~c.i[3] ) | ( t.i[3] & c.i[3] ); + m.i[4] = ( f.i[4] & ~c.i[4] ) | ( t.i[4] & c.i[4] ); + m.i[5] = ( f.i[5] & ~c.i[5] ) | ( t.i[5] & c.i[5] ); + m.i[6] = ( f.i[6] & ~c.i[6] ) | ( t.i[6] & c.i[6] ); + m.i[7] = ( f.i[7] & ~c.i[7] ) | ( t.i[7] & c.i[7] ); + + return m; + } + + //////////////// + // v8float class + + class v8float : public v8 + { + // v8float prefix unary operator friends + + friend inline v8float operator +( const v8float &a ) ALWAYS_INLINE; + friend inline v8float operator -( const v8float &a ) ALWAYS_INLINE; + friend inline v8float operator ~( const v8float &a ) ALWAYS_INLINE; + friend inline v8int operator !( const v8float &a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v8float prefix increment / decrement operator friends + + friend inline v8float operator ++( v8float &a ) ALWAYS_INLINE; + friend inline v8float operator --( v8float &a ) ALWAYS_INLINE; + + // v8float postfix increment / decrement operator friends + + friend inline v8float operator ++( v8float &a, int ) ALWAYS_INLINE; + friend inline v8float operator --( v8float &a, int ) ALWAYS_INLINE; + + // v8float binary operator friends + + friend inline v8float operator +( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8float operator -( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8float operator *( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8float operator /( const v8float &a, const v8float &b ) ALWAYS_INLINE; + + // v8float logical operator friends + + friend inline v8int operator <( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator >( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator ==( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator !=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator <=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator >=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator &&( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator ||( const v8float &a, const v8float &b ) ALWAYS_INLINE; + + // v8float math library friends + +# define CMATH_FR1(fn) friend inline v8float fn( const v8float &a ) ALWAYS_INLINE +# define CMATH_FR2(fn) friend inline v8float fn( const v8float &a, \ + const v8float &b ) ALWAYS_INLINE + + CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); + CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); 
CMATH_FR1(exp); + CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); + CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); + CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); + + CMATH_FR2(copysign); + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v8float miscellaneous friends + + friend inline v8float rsqrt_approx( const v8float &a ) ALWAYS_INLINE; + friend inline v8float rsqrt ( const v8float &a ) ALWAYS_INLINE; + friend inline v8float rcp_approx( const v8float &a ) ALWAYS_INLINE; + friend inline v8float rcp ( const v8float &a ) ALWAYS_INLINE; + friend inline v8float fma ( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; + friend inline v8float fms ( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; + friend inline v8float fnms( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; + friend inline v8float clear_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float set_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float toggle_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline void increment_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; + friend inline void decrement_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; + friend inline void scale_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; + + public: + + // v8float constructors / destructors + + v8float() {} // Default constructor + + v8float( const v8float &a ) // Copy constructor + { + f[0] = a.f[0]; + f[1] = a.f[1]; + f[2] = a.f[2]; + f[3] = a.f[3]; + f[4] = a.f[4]; + f[5] = a.f[5]; + f[6] = a.f[6]; + f[7] = a.f[7]; + } + + v8float( const v8 &a ) // Init from mixed + { + f[0] = a.f[0]; + f[1] = a.f[1]; + f[2] = a.f[2]; + f[3] = a.f[3]; + f[4] = a.f[4]; + f[5] = a.f[5]; + f[6] = a.f[6]; + f[7] = a.f[7]; + } + + v8float( float a ) // Init from scalar + { + f[0] = a; + f[1] = a; + f[2] = a; + f[3] = a; + f[4] = a; + f[5] = a; + f[6] = a; + f[7] = a; + } + + v8float( float f0, float f1, float f2, float f3, + float f4, float f5, float f6, float f7 ) // Init from scalars + { + f[0] = f0; + f[1] = f1; + f[2] = f2; + f[3] = f3; + f[4] = f4; + f[5] = f5; + f[6] = f6; + f[7] = f7; + } + + ~v8float() {} // Destructor + + // v8float assignment operators + +# define ASSIGN(op) \ + inline v8float &operator op( const v8float &b ) \ + { \ + f[0] op b.f[0]; \ + f[1] op b.f[1]; \ + f[2] op b.f[2]; \ + f[3] op b.f[3]; \ + f[4] op b.f[4]; \ + f[5] op b.f[5]; \ + f[6] op b.f[6]; \ + f[7] op b.f[7]; \ + return *this; \ + } + + ASSIGN(=) + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + +# undef ASSIGN + + // v8float member access operator + + inline float &operator []( int n ) + { + return f[n]; + } + + inline float operator ()( int n ) + { + return f[n]; + } + }; + + // v8float prefix unary operators + + inline v8float operator +( const v8float &a ) + { + v8float b; + + b.f[0] = +a.f[0]; + b.f[1] = +a.f[1]; + b.f[2] = +a.f[2]; + b.f[3] = +a.f[3]; + b.f[4] = +a.f[4]; + b.f[5] = +a.f[5]; + b.f[6] = +a.f[6]; + b.f[7] = +a.f[7]; + + return b; + } + + inline v8float operator -( const v8float &a ) + { + v8float b; + + b.f[0] = -a.f[0]; + b.f[1] = -a.f[1]; + b.f[2] = -a.f[2]; + b.f[3] = -a.f[3]; + b.f[4] = -a.f[4]; + b.f[5] = -a.f[5]; + b.f[6] = -a.f[6]; + b.f[7] = -a.f[7]; + + return b; + } + + inline v8int operator !( const v8float &a ) + { + v8int b; + + b.i[0] = a.i[0] ? 0 : -1; + b.i[1] = a.i[1] ? 0 : -1; + b.i[2] = a.i[2] ? 0 : -1; + b.i[3] = a.i[3] ? 
0 : -1; + b.i[4] = a.i[4] ? 0 : -1; + b.i[5] = a.i[5] ? 0 : -1; + b.i[6] = a.i[6] ? 0 : -1; + b.i[7] = a.i[7] ? 0 : -1; + + return b; + } + + // v8float prefix increment / decrement operators + + inline v8float operator ++( v8float &a ) + { + v8float b; + + b.f[0] = ++a.f[0]; + b.f[1] = ++a.f[1]; + b.f[2] = ++a.f[2]; + b.f[3] = ++a.f[3]; + b.f[4] = ++a.f[4]; + b.f[5] = ++a.f[5]; + b.f[6] = ++a.f[6]; + b.f[7] = ++a.f[7]; + + return b; + } + + inline v8float operator --( v8float &a ) + { + v8float b; + + b.f[0] = --a.f[0]; + b.f[1] = --a.f[1]; + b.f[2] = --a.f[2]; + b.f[3] = --a.f[3]; + b.f[4] = --a.f[4]; + b.f[5] = --a.f[5]; + b.f[6] = --a.f[6]; + b.f[7] = --a.f[7]; + + return b; + } + + // v8float postfix increment / decrement operators + + inline v8float operator ++( v8float &a, int ) + { + v8float b; + + b.f[0] = a.f[0]++; + b.f[1] = a.f[1]++; + b.f[2] = a.f[2]++; + b.f[3] = a.f[3]++; + b.f[4] = a.f[4]++; + b.f[5] = a.f[5]++; + b.f[6] = a.f[6]++; + b.f[7] = a.f[7]++; + + return b; + } + + inline v8float operator --( v8float &a, int ) + { + v8float b; + + b.f[0] = a.f[0]--; + b.f[1] = a.f[1]--; + b.f[2] = a.f[2]--; + b.f[3] = a.f[3]--; + b.f[4] = a.f[4]--; + b.f[5] = a.f[5]--; + b.f[6] = a.f[6]--; + b.f[7] = a.f[7]--; + + return b; + } + + // v8float binary operators + +# define BINARY(op) \ + inline v8float operator op( const v8float &a, const v8float &b ) \ + { \ + v8float c; \ + c.f[0] = a.f[0] op b.f[0]; \ + c.f[1] = a.f[1] op b.f[1]; \ + c.f[2] = a.f[2] op b.f[2]; \ + c.f[3] = a.f[3] op b.f[3]; \ + c.f[4] = a.f[4] op b.f[4]; \ + c.f[5] = a.f[5] op b.f[5]; \ + c.f[6] = a.f[6] op b.f[6]; \ + c.f[7] = a.f[7] op b.f[7]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + +# undef BINARY + + // v8float logical operators + +# define LOGICAL(op) \ + inline v8int operator op( const v8float &a, const v8float &b ) \ + { \ + v8int c; \ + c.i[0] = -( a.f[0] op b.f[0] ); \ + c.i[1] = -( a.f[1] op b.f[1] ); \ + c.i[2] = -( a.f[2] op b.f[2] ); \ + c.i[3] = -( a.f[3] op b.f[3] ); \ + c.i[4] = -( a.f[4] op b.f[4] ); \ + c.i[5] = -( a.f[5] op b.f[5] ); \ + c.i[6] = -( a.f[6] op b.f[6] ); \ + c.i[7] = -( a.f[7] op b.f[7] ); \ + return c; \ + } + + LOGICAL(< ) + LOGICAL(> ) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v8float math library functions + +# define CMATH_FR1(fn) \ + inline v8float fn( const v8float &a ) \ + { \ + v8float b; \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ + b.f[4] = ::fn( a.f[4] ); \ + b.f[5] = ::fn( a.f[5] ); \ + b.f[6] = ::fn( a.f[6] ); \ + b.f[7] = ::fn( a.f[7] ); \ + return b; \ + } + +# define CMATH_FR2(fn) \ + inline v8float fn( const v8float &a, const v8float &b ) \ + { \ + v8float c; \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ + c.f[4] = ::fn( a.f[4], b.f[4] ); \ + c.f[5] = ::fn( a.f[5], b.f[5] ); \ + c.f[6] = ::fn( a.f[6], b.f[6] ); \ + c.f[7] = ::fn( a.f[7], b.f[7] ); \ + return c; \ + } + + CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) + CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) + CMATH_FR1(fabs) CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) + CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) + CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) + + inline v8float copysign( const v8float &a, const v8float &b ) + { + v8float c; + float t; + + t = ::fabs( a.f[0] 
); + if( b.f[0] < 0 ) t = -t; + c.f[0] = t; + + t = ::fabs( a.f[1] ); + if( b.f[1] < 0 ) t = -t; + c.f[1] = t; + + t = ::fabs( a.f[2] ); + if( b.f[2] < 0 ) t = -t; + c.f[2] = t; + + t = ::fabs( a.f[3] ); + if( b.f[3] < 0 ) t = -t; + c.f[3] = t; + + t = ::fabs( a.f[4] ); + if( b.f[4] < 0 ) t = -t; + c.f[4] = t; + + t = ::fabs( a.f[5] ); + if( b.f[5] < 0 ) t = -t; + c.f[5] = t; + + t = ::fabs( a.f[6] ); + if( b.f[6] < 0 ) t = -t; + c.f[6] = t; + + t = ::fabs( a.f[7] ); + if( b.f[7] < 0 ) t = -t; + c.f[7] = t; + + return c; + } + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v8float miscellaneous functions + + inline v8float rsqrt_approx( const v8float &a ) + { + v8float b; + + b.f[0] = ::sqrt( 1.0f / a.f[0] ); + b.f[1] = ::sqrt( 1.0f / a.f[1] ); + b.f[2] = ::sqrt( 1.0f / a.f[2] ); + b.f[3] = ::sqrt( 1.0f / a.f[3] ); + b.f[4] = ::sqrt( 1.0f / a.f[4] ); + b.f[5] = ::sqrt( 1.0f / a.f[5] ); + b.f[6] = ::sqrt( 1.0f / a.f[6] ); + b.f[7] = ::sqrt( 1.0f / a.f[7] ); + + return b; + } + + inline v8float rsqrt( const v8float &a ) + { + v8float b; + + b.f[0] = ::sqrt( 1.0f / a.f[0] ); + b.f[1] = ::sqrt( 1.0f / a.f[1] ); + b.f[2] = ::sqrt( 1.0f / a.f[2] ); + b.f[3] = ::sqrt( 1.0f / a.f[3] ); + b.f[4] = ::sqrt( 1.0f / a.f[4] ); + b.f[5] = ::sqrt( 1.0f / a.f[5] ); + b.f[6] = ::sqrt( 1.0f / a.f[6] ); + b.f[7] = ::sqrt( 1.0f / a.f[7] ); + + return b; + } + + inline v8float rcp_approx( const v8float &a ) + { + v8float b; + + b.f[0] = 1.0f / a.f[0]; + b.f[1] = 1.0f / a.f[1]; + b.f[2] = 1.0f / a.f[2]; + b.f[3] = 1.0f / a.f[3]; + b.f[4] = 1.0f / a.f[4]; + b.f[5] = 1.0f / a.f[5]; + b.f[6] = 1.0f / a.f[6]; + b.f[7] = 1.0f / a.f[7]; + + return b; + } + + inline v8float rcp( const v8float &a ) + { + v8float b; + + b.f[0] = 1.0f / a.f[0]; + b.f[1] = 1.0f / a.f[1]; + b.f[2] = 1.0f / a.f[2]; + b.f[3] = 1.0f / a.f[3]; + b.f[4] = 1.0f / a.f[4]; + b.f[5] = 1.0f / a.f[5]; + b.f[6] = 1.0f / a.f[6]; + b.f[7] = 1.0f / a.f[7]; + + return b; + } + + inline v8float fma( const v8float &a, const v8float &b, const v8float &c ) + { + v8float d; + + d.f[0] = a.f[0] * b.f[0] + c.f[0]; + d.f[1] = a.f[1] * b.f[1] + c.f[1]; + d.f[2] = a.f[2] * b.f[2] + c.f[2]; + d.f[3] = a.f[3] * b.f[3] + c.f[3]; + d.f[4] = a.f[4] * b.f[4] + c.f[4]; + d.f[5] = a.f[5] * b.f[5] + c.f[5]; + d.f[6] = a.f[6] * b.f[6] + c.f[6]; + d.f[7] = a.f[7] * b.f[7] + c.f[7]; + + return d; + } + + inline v8float fms( const v8float &a, const v8float &b, const v8float &c ) + { + v8float d; + + d.f[0] = a.f[0] * b.f[0] - c.f[0]; + d.f[1] = a.f[1] * b.f[1] - c.f[1]; + d.f[2] = a.f[2] * b.f[2] - c.f[2]; + d.f[3] = a.f[3] * b.f[3] - c.f[3]; + d.f[4] = a.f[4] * b.f[4] - c.f[4]; + d.f[5] = a.f[5] * b.f[5] - c.f[5]; + d.f[6] = a.f[6] * b.f[6] - c.f[6]; + d.f[7] = a.f[7] * b.f[7] - c.f[7]; + + return d; + } + + inline v8float fnms( const v8float &a, const v8float &b, const v8float &c ) + { + v8float d; + + d.f[0] = c.f[0] - a.f[0] * b.f[0]; + d.f[1] = c.f[1] - a.f[1] * b.f[1]; + d.f[2] = c.f[2] - a.f[2] * b.f[2]; + d.f[3] = c.f[3] - a.f[3] * b.f[3]; + d.f[4] = c.f[4] - a.f[4] * b.f[4]; + d.f[5] = c.f[5] - a.f[5] * b.f[5]; + d.f[6] = c.f[6] - a.f[6] * b.f[6]; + d.f[7] = c.f[7] - a.f[7] * b.f[7]; + + return d; + } + + inline v8float clear_bits( const v8int &m, const v8float &a ) + { + v8float b; + + b.i[0] = ( ~m.i[0] ) & a.i[0]; + b.i[1] = ( ~m.i[1] ) & a.i[1]; + b.i[2] = ( ~m.i[2] ) & a.i[2]; + b.i[3] = ( ~m.i[3] ) & a.i[3]; + b.i[4] = ( ~m.i[4] ) & a.i[4]; + b.i[5] = ( ~m.i[5] ) & a.i[5]; + b.i[6] = ( ~m.i[6] ) & a.i[6]; + b.i[7] = ( ~m.i[7] ) & a.i[7]; + + return b; + } + + inline 
v8float set_bits( const v8int &m, const v8float &a ) + { + v8float b; + + b.i[0] = m.i[0] | a.i[0]; + b.i[1] = m.i[1] | a.i[1]; + b.i[2] = m.i[2] | a.i[2]; + b.i[3] = m.i[3] | a.i[3]; + b.i[4] = m.i[4] | a.i[4]; + b.i[5] = m.i[5] | a.i[5]; + b.i[6] = m.i[6] | a.i[6]; + b.i[7] = m.i[7] | a.i[7]; + + return b; + } + + inline v8float toggle_bits( const v8int &m, const v8float &a ) + { + v8float b; + + b.i[0] = m.i[0] ^ a.i[0]; + b.i[1] = m.i[1] ^ a.i[1]; + b.i[2] = m.i[2] ^ a.i[2]; + b.i[3] = m.i[3] ^ a.i[3]; + b.i[4] = m.i[4] ^ a.i[4]; + b.i[5] = m.i[5] ^ a.i[5]; + b.i[6] = m.i[6] ^ a.i[6]; + b.i[7] = m.i[7] ^ a.i[7]; + + return b; + } + + inline void increment_8x1( float * ALIGNED(16) p, const v8float &a ) + { + p[0] += a.f[0]; + p[1] += a.f[1]; + p[2] += a.f[2]; + p[3] += a.f[3]; + p[4] += a.f[4]; + p[5] += a.f[5]; + p[6] += a.f[6]; + p[7] += a.f[7]; + } + + inline void decrement_8x1( float * ALIGNED(16) p, const v8float &a ) + { + p[0] -= a.f[0]; + p[1] -= a.f[1]; + p[2] -= a.f[2]; + p[3] -= a.f[3]; + p[4] -= a.f[4]; + p[5] -= a.f[5]; + p[6] -= a.f[6]; + p[7] -= a.f[7]; + } + + inline void scale_8x1( float * ALIGNED(16) p, const v8float &a ) + { + p[0] *= a.f[0]; + p[1] *= a.f[1]; + p[2] *= a.f[2]; + p[3] *= a.f[3]; + p[4] *= a.f[4]; + p[5] *= a.f[5]; + p[6] *= a.f[6]; + p[7] *= a.f[7]; + } + +} // namespace v8 + +#endif // _v8_portable_h_ diff --git a/src/util/v8/v8_portable_v0.h b/src/util/v8/v8_portable_v0.h new file mode 100644 index 00000000..b8d6b0c8 --- /dev/null +++ b/src/util/v8/v8_portable_v0.h @@ -0,0 +1,1785 @@ +#ifndef _v8_portable_h_ +#define _v8_portable_h_ + +#ifndef IN_v8_h +#error "Do not include v8_portable.h directly; use v8.h" +#endif + +#include + +#define V8_ACCELERATION +#define V8_PORTABLE_ACCELERATION + +#ifndef ALIGNED +#define ALIGNED(n) +#endif + +#define ALWAYS_INLINE __attribute__((always_inline)) + +namespace v8 +{ + class v8; + class v8int; + class v8float; + + //////////////// + // v8 base class + + class v8 + { + friend class v8int; + friend class v8float; + + // v8 miscellaneous friends + + friend inline int any( const v8 &a ) ALWAYS_INLINE; + friend inline int all( const v8 &a ) ALWAYS_INLINE; + + template + friend inline v8 splat( const v8 &a ) ALWAYS_INLINE; + + template + friend inline v8 shuffle( const v8 &a ) ALWAYS_INLINE; + + friend inline void swap( v8 &a, v8 &b ) ALWAYS_INLINE; + friend inline void transpose( v8 &a0, v8 &a1, v8 &a2, v8 &a3, + v8 &a4, v8 &a5, v8 &a6, v8 &a7 ) ALWAYS_INLINE; + + // v8int miscellaneous friends + + friend inline v8 czero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + friend inline v8 notczero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + friend inline v8 merge( const v8int &c, const v8 &a, const v8 &b ) ALWAYS_INLINE; + + // v8 memory manipulation friends + + friend inline void load_8x1( const void * ALIGNED(16) p, v8 &a ) ALWAYS_INLINE; + friend inline void store_8x1( const v8 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void stream_8x1( const v8 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_8x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_8x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) ALWAYS_INLINE; + friend inline void swap_8x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + // v8 transposed memory manipulation friends + // Note: Half aligned values are permissible in the 8x2_tr variants. 
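+    // The _tr (transposed) variants below gather / scatter the leading N
+    // values at eight separate addresses into / from N vectors, i.e. an
+    // AoS <-> SoA transpose.  For example (illustrative only):
+    //
+    //   load_8x2_tr( p0, ..., p7, a, b );  // a.i[k] = ((int*)pk)[0],
+    //                                      // b.i[k] = ((int*)pk)[1], k = 0..7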
+ + friend inline void load_8x1_tr( const void *a0, const void *a1, + const void *a2, const void *a3, + const void *a4, const void *a5, + const void *a6, const void *a7, + v8 &a ) ALWAYS_INLINE; + + friend inline void load_8x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + const void * ALIGNED(8) a4, + const void * ALIGNED(8) a5, + const void * ALIGNED(8) a6, + const void * ALIGNED(8) a7, + v8 &a, v8 &b ) ALWAYS_INLINE; + + friend inline void load_8x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c ) ALWAYS_INLINE; + + friend inline void load_8x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d ) ALWAYS_INLINE; + + friend inline void load_8x8_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d, + v8 &e, v8 &f, v8 &g, v8 &h ) ALWAYS_INLINE; + + friend inline void store_8x1_tr( const v8 &a, + void *a0, void *a1, void *a2, void *a3, + void *a4, void *a5, void *a6, void *a7 ) ALWAYS_INLINE; + + friend inline void store_8x2_tr( const v8 &a, const v8 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3, + void * ALIGNED(8) a4, + void * ALIGNED(8) a5, + void * ALIGNED(8) a6, + void * ALIGNED(8) a7 ) ALWAYS_INLINE; + + friend inline void store_8x3_tr( const v8 &a, const v8 &b, const v8 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3, + void * ALIGNED(16) a4, + void * ALIGNED(16) a5, + void * ALIGNED(16) a6, + void * ALIGNED(16) a7 ) ALWAYS_INLINE; + + friend inline void store_8x4_tr( const v8 &a, const v8 &b, + const v8 &c, const v8 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3, + void * ALIGNED(16) a4, + void * ALIGNED(16) a5, + void * ALIGNED(16) a6, + void * ALIGNED(16) a7 ) ALWAYS_INLINE; + + friend inline void store_8x8_tr( const v8 &a, const v8 &b, + const v8 &c, const v8 &d, + const v8 &e, const v8 &f, + const v8 &g, const v8 &h, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3, + void * ALIGNED(16) a4, + void * ALIGNED(16) a5, + void * ALIGNED(16) a6, + void * ALIGNED(16) a7 ) ALWAYS_INLINE; + + protected: + + union + { + int i[8]; + float f[8]; + }; + + public: + + v8() {} // Default constructor + + v8( const v8 &a ) // Copy constructor + { + i[0]=a.i[0]; i[1]=a.i[1]; i[2]=a.i[2]; i[3]=a.i[3]; + i[4]=a.i[4]; i[5]=a.i[5]; i[6]=a.i[6]; i[7]=a.i[7]; + } + + ~v8() {} // Default destructor + }; + + // v8 miscellaneous functions + + inline int any( const v8 &a ) + { + return a.i[0] || a.i[1] || a.i[2] || a.i[3] || + a.i[4] || a.i[5] || a.i[6] || a.i[7]; + } + + inline int all( const v8 &a ) + { + return a.i[0] && a.i[1] && a.i[2] && a.i[3] && + a.i[4] && a.i[5] && a.i[6] && a.i[7]; + } + + template + inline v8 splat( const v8 & a ) + { + v8 b; + + 
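+    // Broadcast element n of a into every lane of b.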
b.i[0] = a.i[n]; + b.i[1] = a.i[n]; + b.i[2] = a.i[n]; + b.i[3] = a.i[n]; + b.i[4] = a.i[n]; + b.i[5] = a.i[n]; + b.i[6] = a.i[n]; + b.i[7] = a.i[n]; + + return b; + } + + template + inline v8 shuffle( const v8 & a ) + { + v8 b; + + b.i[0] = a.i[i0]; + b.i[1] = a.i[i1]; + b.i[2] = a.i[i2]; + b.i[3] = a.i[i3]; + b.i[4] = a.i[i4]; + b.i[5] = a.i[i5]; + b.i[6] = a.i[i6]; + b.i[7] = a.i[i7]; + + return b; + } + +# define sw(x,y) x^=y, y^=x, x^=y + + inline void swap( v8 &a, v8 &b ) + { + sw( a.i[0], b.i[0] ); + sw( a.i[1], b.i[1] ); + sw( a.i[2], b.i[2] ); + sw( a.i[3], b.i[3] ); + sw( a.i[4], b.i[4] ); + sw( a.i[5], b.i[5] ); + sw( a.i[6], b.i[6] ); + sw( a.i[7], b.i[7] ); + } + + inline void transpose( v8 &a0, v8 &a1, v8 &a2, v8 &a3, + v8 &a4, v8 &a5, v8 &a6, v8 &a7 ) + { + sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); sw( a0.i[4],a4.i[0] ); sw( a0.i[5],a5.i[0] ); sw( a0.i[6],a6.i[0] ); sw( a0.i[7],a7.i[0] ); + sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); sw( a1.i[4],a4.i[1] ); sw( a1.i[5],a5.i[1] ); sw( a1.i[6],a6.i[1] ); sw( a1.i[7],a7.i[1] ); + sw( a2.i[3],a3.i[2] ); sw( a2.i[4],a4.i[2] ); sw( a2.i[5],a5.i[2] ); sw( a2.i[6],a6.i[2] ); sw( a2.i[7],a7.i[2] ); + sw( a3.i[4],a4.i[3] ); sw( a3.i[5],a5.i[3] ); sw( a3.i[6],a6.i[3] ); sw( a3.i[7],a7.i[3] ); + sw( a4.i[5],a5.i[4] ); sw( a4.i[6],a6.i[4] ); sw( a4.i[7],a7.i[4] ); + sw( a5.i[6],a6.i[5] ); sw( a5.i[7],a7.i[5] ); + sw( a6.i[7],a7.i[6] ); + } + +# undef sw + + // v8 memory manipulation functions + + inline void load_8x1( const void * ALIGNED(16) p, + v8 &a ) + { + a.i[0] = ((const int * ALIGNED(16))p)[0]; + a.i[1] = ((const int * ALIGNED(16))p)[1]; + a.i[2] = ((const int * ALIGNED(16))p)[2]; + a.i[3] = ((const int * ALIGNED(16))p)[3]; + a.i[4] = ((const int * ALIGNED(16))p)[4]; + a.i[5] = ((const int * ALIGNED(16))p)[5]; + a.i[6] = ((const int * ALIGNED(16))p)[6]; + a.i[7] = ((const int * ALIGNED(16))p)[7]; + } + + inline void store_8x1( const v8 &a, + void * ALIGNED(16) p ) + { + ((int * ALIGNED(16))p)[0] = a.i[0]; + ((int * ALIGNED(16))p)[1] = a.i[1]; + ((int * ALIGNED(16))p)[2] = a.i[2]; + ((int * ALIGNED(16))p)[3] = a.i[3]; + ((int * ALIGNED(16))p)[4] = a.i[4]; + ((int * ALIGNED(16))p)[5] = a.i[5]; + ((int * ALIGNED(16))p)[6] = a.i[6]; + ((int * ALIGNED(16))p)[7] = a.i[7]; + } + + inline void stream_8x1( const v8 &a, + void * ALIGNED(16) p ) + { + ((int * ALIGNED(16))p)[0] = a.i[0]; + ((int * ALIGNED(16))p)[1] = a.i[1]; + ((int * ALIGNED(16))p)[2] = a.i[2]; + ((int * ALIGNED(16))p)[3] = a.i[3]; + ((int * ALIGNED(16))p)[4] = a.i[4]; + ((int * ALIGNED(16))p)[5] = a.i[5]; + ((int * ALIGNED(16))p)[6] = a.i[6]; + ((int * ALIGNED(16))p)[7] = a.i[7]; + } + + inline void clear_8x1( void * ALIGNED(16) p ) + { + ((int * ALIGNED(16))p)[0] = 0; + ((int * ALIGNED(16))p)[1] = 0; + ((int * ALIGNED(16))p)[2] = 0; + ((int * ALIGNED(16))p)[3] = 0; + ((int * ALIGNED(16))p)[4] = 0; + ((int * ALIGNED(16))p)[5] = 0; + ((int * ALIGNED(16))p)[6] = 0; + ((int * ALIGNED(16))p)[7] = 0; + } + + // FIXME: Ordering semantics + inline void copy_8x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) + { + ((int * ALIGNED(16))dst)[0] = ((const int * ALIGNED(16))src)[0]; + ((int * ALIGNED(16))dst)[1] = ((const int * ALIGNED(16))src)[1]; + ((int * ALIGNED(16))dst)[2] = ((const int * ALIGNED(16))src)[2]; + ((int * ALIGNED(16))dst)[3] = ((const int * ALIGNED(16))src)[3]; + ((int * ALIGNED(16))dst)[4] = ((const int * ALIGNED(16))src)[4]; + ((int * ALIGNED(16))dst)[5] = ((const int * ALIGNED(16))src)[5]; + ((int * ALIGNED(16))dst)[6] = 
((const int * ALIGNED(16))src)[6]; + ((int * ALIGNED(16))dst)[7] = ((const int * ALIGNED(16))src)[7]; + } + + inline void swap_8x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) + { + int t; + + t = ((int * ALIGNED(16))a)[0]; + ((int * ALIGNED(16))a)[0] = ((int * ALIGNED(16))b)[0]; + ((int * ALIGNED(16))b)[0] = t; + + t = ((int * ALIGNED(16))a)[1]; + ((int * ALIGNED(16))a)[1] = ((int * ALIGNED(16))b)[1]; + ((int * ALIGNED(16))b)[1] = t; + + t = ((int * ALIGNED(16))a)[2]; + ((int * ALIGNED(16))a)[2] = ((int * ALIGNED(16))b)[2]; + ((int * ALIGNED(16))b)[2] = t; + + t = ((int * ALIGNED(16))a)[3]; + ((int * ALIGNED(16))a)[3] = ((int * ALIGNED(16))b)[3]; + ((int * ALIGNED(16))b)[3] = t; + + t = ((int * ALIGNED(16))a)[4]; + ((int * ALIGNED(16))a)[4] = ((int * ALIGNED(16))b)[4]; + ((int * ALIGNED(16))b)[4] = t; + + t = ((int * ALIGNED(16))a)[5]; + ((int * ALIGNED(16))a)[5] = ((int * ALIGNED(16))b)[5]; + ((int * ALIGNED(16))b)[5] = t; + + t = ((int * ALIGNED(16))a)[6]; + ((int * ALIGNED(16))a)[6] = ((int * ALIGNED(16))b)[6]; + ((int * ALIGNED(16))b)[6] = t; + + t = ((int * ALIGNED(16))a)[7]; + ((int * ALIGNED(16))a)[7] = ((int * ALIGNED(16))b)[7]; + ((int * ALIGNED(16))b)[7] = t; + } + + // v8 transposed memory manipulation functions + + inline void load_8x1_tr( const void *a0, const void *a1, + const void *a2, const void *a3, + const void *a4, const void *a5, + const void *a6, const void *a7, + v8 &a ) + { + a.i[0] = ((const int *)a0)[0]; + a.i[1] = ((const int *)a1)[0]; + a.i[2] = ((const int *)a2)[0]; + a.i[3] = ((const int *)a3)[0]; + a.i[4] = ((const int *)a4)[0]; + a.i[5] = ((const int *)a5)[0]; + a.i[6] = ((const int *)a6)[0]; + a.i[7] = ((const int *)a7)[0]; + } + + inline void load_8x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + const void * ALIGNED(8) a4, + const void * ALIGNED(8) a5, + const void * ALIGNED(8) a6, + const void * ALIGNED(8) a7, + v8 &a, v8 &b ) + { + a.i[0] = ((const int * ALIGNED(8))a0)[0]; + b.i[0] = ((const int * ALIGNED(8))a0)[1]; + + a.i[1] = ((const int * ALIGNED(8))a1)[0]; + b.i[1] = ((const int * ALIGNED(8))a1)[1]; + + a.i[2] = ((const int * ALIGNED(8))a2)[0]; + b.i[2] = ((const int * ALIGNED(8))a2)[1]; + + a.i[3] = ((const int * ALIGNED(8))a3)[0]; + b.i[3] = ((const int * ALIGNED(8))a3)[1]; + + a.i[4] = ((const int * ALIGNED(8))a4)[0]; + b.i[4] = ((const int * ALIGNED(8))a4)[1]; + + a.i[5] = ((const int * ALIGNED(8))a5)[0]; + b.i[5] = ((const int * ALIGNED(8))a5)[1]; + + a.i[6] = ((const int * ALIGNED(8))a6)[0]; + b.i[6] = ((const int * ALIGNED(8))a6)[1]; + + a.i[7] = ((const int * ALIGNED(8))a7)[0]; + b.i[7] = ((const int * ALIGNED(8))a7)[1]; + } + + inline void load_8x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + + 
a.i[4] = ((const int * ALIGNED(16))a4)[0]; + b.i[4] = ((const int * ALIGNED(16))a4)[1]; + c.i[4] = ((const int * ALIGNED(16))a4)[2]; + + a.i[5] = ((const int * ALIGNED(16))a5)[0]; + b.i[5] = ((const int * ALIGNED(16))a5)[1]; + c.i[5] = ((const int * ALIGNED(16))a5)[2]; + + a.i[6] = ((const int * ALIGNED(16))a6)[0]; + b.i[6] = ((const int * ALIGNED(16))a6)[1]; + c.i[6] = ((const int * ALIGNED(16))a6)[2]; + + a.i[7] = ((const int * ALIGNED(16))a7)[0]; + b.i[7] = ((const int * ALIGNED(16))a7)[1]; + c.i[7] = ((const int * ALIGNED(16))a7)[2]; + } + + inline void load_8x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + d.i[0] = ((const int * ALIGNED(16))a0)[3]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + d.i[1] = ((const int * ALIGNED(16))a1)[3]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + d.i[2] = ((const int * ALIGNED(16))a2)[3]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + d.i[3] = ((const int * ALIGNED(16))a3)[3]; + + a.i[4] = ((const int * ALIGNED(16))a4)[0]; + b.i[4] = ((const int * ALIGNED(16))a4)[1]; + c.i[4] = ((const int * ALIGNED(16))a4)[2]; + d.i[4] = ((const int * ALIGNED(16))a4)[3]; + + a.i[5] = ((const int * ALIGNED(16))a5)[0]; + b.i[5] = ((const int * ALIGNED(16))a5)[1]; + c.i[5] = ((const int * ALIGNED(16))a5)[2]; + d.i[5] = ((const int * ALIGNED(16))a5)[3]; + + a.i[6] = ((const int * ALIGNED(16))a6)[0]; + b.i[6] = ((const int * ALIGNED(16))a6)[1]; + c.i[6] = ((const int * ALIGNED(16))a6)[2]; + d.i[6] = ((const int * ALIGNED(16))a6)[3]; + + a.i[7] = ((const int * ALIGNED(16))a7)[0]; + b.i[7] = ((const int * ALIGNED(16))a7)[1]; + c.i[7] = ((const int * ALIGNED(16))a7)[2]; + d.i[7] = ((const int * ALIGNED(16))a7)[3]; + } + + inline void load_8x8_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d, + v8 &e, v8 &f, v8 &g, v8 &h ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + d.i[0] = ((const int * ALIGNED(16))a0)[3]; + e.i[0] = ((const int * ALIGNED(16))a0)[4]; + f.i[0] = ((const int * ALIGNED(16))a0)[5]; + g.i[0] = ((const int * ALIGNED(16))a0)[6]; + h.i[0] = ((const int * ALIGNED(16))a0)[7]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + d.i[1] = ((const int * ALIGNED(16))a1)[3]; + e.i[1] = ((const int * ALIGNED(16))a1)[4]; + f.i[1] = ((const int * ALIGNED(16))a1)[5]; + g.i[1] = ((const int * ALIGNED(16))a1)[6]; + h.i[1] = ((const int * ALIGNED(16))a1)[7]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + d.i[2] = ((const int * 
ALIGNED(16))a2)[3]; + e.i[2] = ((const int * ALIGNED(16))a2)[4]; + f.i[2] = ((const int * ALIGNED(16))a2)[5]; + g.i[2] = ((const int * ALIGNED(16))a2)[6]; + h.i[2] = ((const int * ALIGNED(16))a2)[7]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + d.i[3] = ((const int * ALIGNED(16))a3)[3]; + e.i[3] = ((const int * ALIGNED(16))a3)[4]; + f.i[3] = ((const int * ALIGNED(16))a3)[5]; + g.i[3] = ((const int * ALIGNED(16))a3)[6]; + h.i[3] = ((const int * ALIGNED(16))a3)[7]; + + a.i[4] = ((const int * ALIGNED(16))a4)[0]; + b.i[4] = ((const int * ALIGNED(16))a4)[1]; + c.i[4] = ((const int * ALIGNED(16))a4)[2]; + d.i[4] = ((const int * ALIGNED(16))a4)[3]; + e.i[4] = ((const int * ALIGNED(16))a4)[4]; + f.i[4] = ((const int * ALIGNED(16))a4)[5]; + g.i[4] = ((const int * ALIGNED(16))a4)[6]; + h.i[4] = ((const int * ALIGNED(16))a4)[7]; + + a.i[5] = ((const int * ALIGNED(16))a5)[0]; + b.i[5] = ((const int * ALIGNED(16))a5)[1]; + c.i[5] = ((const int * ALIGNED(16))a5)[2]; + d.i[5] = ((const int * ALIGNED(16))a5)[3]; + e.i[5] = ((const int * ALIGNED(16))a5)[4]; + f.i[5] = ((const int * ALIGNED(16))a5)[5]; + g.i[5] = ((const int * ALIGNED(16))a5)[6]; + h.i[5] = ((const int * ALIGNED(16))a5)[7]; + + a.i[6] = ((const int * ALIGNED(16))a6)[0]; + b.i[6] = ((const int * ALIGNED(16))a6)[1]; + c.i[6] = ((const int * ALIGNED(16))a6)[2]; + d.i[6] = ((const int * ALIGNED(16))a6)[3]; + e.i[6] = ((const int * ALIGNED(16))a6)[4]; + f.i[6] = ((const int * ALIGNED(16))a6)[5]; + g.i[6] = ((const int * ALIGNED(16))a6)[6]; + h.i[6] = ((const int * ALIGNED(16))a6)[7]; + + a.i[7] = ((const int * ALIGNED(16))a7)[0]; + b.i[7] = ((const int * ALIGNED(16))a7)[1]; + c.i[7] = ((const int * ALIGNED(16))a7)[2]; + d.i[7] = ((const int * ALIGNED(16))a7)[3]; + e.i[7] = ((const int * ALIGNED(16))a7)[4]; + f.i[7] = ((const int * ALIGNED(16))a7)[5]; + g.i[7] = ((const int * ALIGNED(16))a7)[6]; + h.i[7] = ((const int * ALIGNED(16))a7)[7]; + } + + inline void store_8x1_tr( const v8 &a, + void *a0, void *a1, void *a2, void *a3, + void *a4, void *a5, void *a6, void *a7 ) + { + ((int *)a0)[0] = a.i[0]; + ((int *)a1)[0] = a.i[1]; + ((int *)a2)[0] = a.i[2]; + ((int *)a3)[0] = a.i[3]; + ((int *)a4)[0] = a.i[4]; + ((int *)a5)[0] = a.i[5]; + ((int *)a6)[0] = a.i[6]; + ((int *)a7)[0] = a.i[7]; + } + + inline void store_8x2_tr( const v8 &a, const v8 &b, + void * ALIGNED(8) a0, void * ALIGNED(8) a1, + void * ALIGNED(8) a2, void * ALIGNED(8) a3, + void * ALIGNED(8) a4, void * ALIGNED(8) a5, + void * ALIGNED(8) a6, void * ALIGNED(8) a7 ) + { + ((int * ALIGNED(8))a0)[0] = a.i[0]; + ((int * ALIGNED(8))a0)[1] = b.i[0]; + + ((int * ALIGNED(8))a1)[0] = a.i[1]; + ((int * ALIGNED(8))a1)[1] = b.i[1]; + + ((int * ALIGNED(8))a2)[0] = a.i[2]; + ((int * ALIGNED(8))a2)[1] = b.i[2]; + + ((int * ALIGNED(8))a3)[0] = a.i[3]; + ((int * ALIGNED(8))a3)[1] = b.i[3]; + + ((int * ALIGNED(8))a4)[0] = a.i[4]; + ((int * ALIGNED(8))a4)[1] = b.i[4]; + + ((int * ALIGNED(8))a5)[0] = a.i[5]; + ((int * ALIGNED(8))a5)[1] = b.i[5]; + + ((int * ALIGNED(8))a6)[0] = a.i[6]; + ((int * ALIGNED(8))a6)[1] = b.i[6]; + + ((int * ALIGNED(8))a7)[0] = a.i[7]; + ((int * ALIGNED(8))a7)[1] = b.i[7]; + } + + inline void store_8x3_tr( const v8 &a, const v8 &b, const v8 &c, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3, + void * ALIGNED(16) a4, void * ALIGNED(16) a5, + void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * 
ALIGNED(16))a0)[1] = b.i[0]; + ((int * ALIGNED(16))a0)[2] = c.i[0]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; + ((int * ALIGNED(16))a1)[2] = c.i[1]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; + ((int * ALIGNED(16))a2)[2] = c.i[2]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; + ((int * ALIGNED(16))a3)[2] = c.i[3]; + + ((int * ALIGNED(16))a4)[0] = a.i[4]; + ((int * ALIGNED(16))a4)[1] = b.i[4]; + ((int * ALIGNED(16))a4)[2] = c.i[4]; + + ((int * ALIGNED(16))a5)[0] = a.i[5]; + ((int * ALIGNED(16))a5)[1] = b.i[5]; + ((int * ALIGNED(16))a5)[2] = c.i[5]; + + ((int * ALIGNED(16))a6)[0] = a.i[6]; + ((int * ALIGNED(16))a6)[1] = b.i[6]; + ((int * ALIGNED(16))a6)[2] = c.i[6]; + + ((int * ALIGNED(16))a7)[0] = a.i[7]; + ((int * ALIGNED(16))a7)[1] = b.i[7]; + ((int * ALIGNED(16))a7)[2] = c.i[7]; + } + + inline void store_8x4_tr( const v8 &a, const v8 &b, const v8 &c, const v8 &d, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3, + void * ALIGNED(16) a4, void * ALIGNED(16) a5, + void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * ALIGNED(16))a0)[1] = b.i[0]; + ((int * ALIGNED(16))a0)[2] = c.i[0]; + ((int * ALIGNED(16))a0)[3] = d.i[0]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; + ((int * ALIGNED(16))a1)[2] = c.i[1]; + ((int * ALIGNED(16))a1)[3] = d.i[1]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; + ((int * ALIGNED(16))a2)[2] = c.i[2]; + ((int * ALIGNED(16))a2)[3] = d.i[2]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; + ((int * ALIGNED(16))a3)[2] = c.i[3]; + ((int * ALIGNED(16))a3)[3] = d.i[3]; + + ((int * ALIGNED(16))a4)[0] = a.i[4]; + ((int * ALIGNED(16))a4)[1] = b.i[4]; + ((int * ALIGNED(16))a4)[2] = c.i[4]; + ((int * ALIGNED(16))a4)[3] = d.i[4]; + + ((int * ALIGNED(16))a5)[0] = a.i[5]; + ((int * ALIGNED(16))a5)[1] = b.i[5]; + ((int * ALIGNED(16))a5)[2] = c.i[5]; + ((int * ALIGNED(16))a5)[3] = d.i[5]; + + ((int * ALIGNED(16))a6)[0] = a.i[6]; + ((int * ALIGNED(16))a6)[1] = b.i[6]; + ((int * ALIGNED(16))a6)[2] = c.i[6]; + ((int * ALIGNED(16))a6)[3] = d.i[6]; + + ((int * ALIGNED(16))a7)[0] = a.i[7]; + ((int * ALIGNED(16))a7)[1] = b.i[7]; + ((int * ALIGNED(16))a7)[2] = c.i[7]; + ((int * ALIGNED(16))a7)[3] = d.i[7]; + } + + inline void store_8x8_tr( const v8 &a, const v8 &b, const v8 &c, const v8 &d, + const v8 &e, const v8 &f, const v8 &g, const v8 &h, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3, + void * ALIGNED(16) a4, void * ALIGNED(16) a5, + void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * ALIGNED(16))a0)[1] = b.i[0]; + ((int * ALIGNED(16))a0)[2] = c.i[0]; + ((int * ALIGNED(16))a0)[3] = d.i[0]; + ((int * ALIGNED(16))a0)[4] = e.i[0]; + ((int * ALIGNED(16))a0)[5] = f.i[0]; + ((int * ALIGNED(16))a0)[6] = g.i[0]; + ((int * ALIGNED(16))a0)[7] = h.i[0]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; + ((int * ALIGNED(16))a1)[2] = c.i[1]; + ((int * ALIGNED(16))a1)[3] = d.i[1]; + ((int * ALIGNED(16))a1)[4] = e.i[1]; + ((int * ALIGNED(16))a1)[5] = f.i[1]; + ((int * ALIGNED(16))a1)[6] = g.i[1]; + ((int * ALIGNED(16))a1)[7] = h.i[1]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; + ((int * ALIGNED(16))a2)[2] = c.i[2]; + ((int * 
ALIGNED(16))a2)[3] = d.i[2]; + ((int * ALIGNED(16))a2)[4] = e.i[2]; + ((int * ALIGNED(16))a2)[5] = f.i[2]; + ((int * ALIGNED(16))a2)[6] = g.i[2]; + ((int * ALIGNED(16))a2)[7] = h.i[2]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; + ((int * ALIGNED(16))a3)[2] = c.i[3]; + ((int * ALIGNED(16))a3)[3] = d.i[3]; + ((int * ALIGNED(16))a3)[4] = e.i[3]; + ((int * ALIGNED(16))a3)[5] = f.i[3]; + ((int * ALIGNED(16))a3)[6] = g.i[3]; + ((int * ALIGNED(16))a3)[7] = h.i[3]; + + ((int * ALIGNED(16))a4)[0] = a.i[4]; + ((int * ALIGNED(16))a4)[1] = b.i[4]; + ((int * ALIGNED(16))a4)[2] = c.i[4]; + ((int * ALIGNED(16))a4)[3] = d.i[4]; + ((int * ALIGNED(16))a4)[4] = e.i[4]; + ((int * ALIGNED(16))a4)[5] = f.i[4]; + ((int * ALIGNED(16))a4)[6] = g.i[4]; + ((int * ALIGNED(16))a4)[7] = h.i[4]; + + ((int * ALIGNED(16))a5)[0] = a.i[5]; + ((int * ALIGNED(16))a5)[1] = b.i[5]; + ((int * ALIGNED(16))a5)[2] = c.i[5]; + ((int * ALIGNED(16))a5)[3] = d.i[5]; + ((int * ALIGNED(16))a5)[4] = e.i[5]; + ((int * ALIGNED(16))a5)[5] = f.i[5]; + ((int * ALIGNED(16))a5)[6] = g.i[5]; + ((int * ALIGNED(16))a5)[7] = h.i[5]; + + ((int * ALIGNED(16))a6)[0] = a.i[6]; + ((int * ALIGNED(16))a6)[1] = b.i[6]; + ((int * ALIGNED(16))a6)[2] = c.i[6]; + ((int * ALIGNED(16))a6)[3] = d.i[6]; + ((int * ALIGNED(16))a6)[4] = e.i[6]; + ((int * ALIGNED(16))a6)[5] = f.i[6]; + ((int * ALIGNED(16))a6)[6] = g.i[6]; + ((int * ALIGNED(16))a6)[7] = h.i[6]; + + ((int * ALIGNED(16))a7)[0] = a.i[7]; + ((int * ALIGNED(16))a7)[1] = b.i[7]; + ((int * ALIGNED(16))a7)[2] = c.i[7]; + ((int * ALIGNED(16))a7)[3] = d.i[7]; + ((int * ALIGNED(16))a7)[4] = e.i[7]; + ((int * ALIGNED(16))a7)[5] = f.i[7]; + ((int * ALIGNED(16))a7)[6] = g.i[7]; + ((int * ALIGNED(16))a7)[7] = h.i[7]; + } + + ////////////// + // v8int class + + class v8int : public v8 + { + // v8int prefix unary operator friends + + friend inline v8int operator +( const v8int & a ) ALWAYS_INLINE; + friend inline v8int operator -( const v8int & a ) ALWAYS_INLINE; + friend inline v8int operator ~( const v8int & a ) ALWAYS_INLINE; + friend inline v8int operator !( const v8int & a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v8int prefix increment / decrement operator friends + + friend inline v8int operator ++( v8int & a ) ALWAYS_INLINE; + friend inline v8int operator --( v8int & a ) ALWAYS_INLINE; + + // v8int postfix increment / decrement operator friends + + friend inline v8int operator ++( v8int & a, int ) ALWAYS_INLINE; + friend inline v8int operator --( v8int & a, int ) ALWAYS_INLINE; + + // v8int binary operator friends + + friend inline v8int operator +( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator -( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator *( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator /( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator %( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator ^( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator &( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator |( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator <<( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator >>( const v8int &a, const v8int &b ) ALWAYS_INLINE; + + // v8int logical operator friends + + friend inline v8int operator <( const v8int &a, const 
v8int &b ) ALWAYS_INLINE; + friend inline v8int operator >( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator ==( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator !=( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator <=( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator >=( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator &&( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator ||( const v8int &a, const v8int &b ) ALWAYS_INLINE; + + // v8int miscellaneous friends + + friend inline v8int abs( const v8int &a ) ALWAYS_INLINE; + friend inline v8 czero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + friend inline v8 notczero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + // FIXME: cswap, notcswap! + friend inline v8 merge( const v8int &c, const v8 &t, const v8 &f ) ALWAYS_INLINE; + + // v8float unary operator friends + + friend inline v8int operator !( const v8float & a ) ALWAYS_INLINE; + + // v8float logical operator friends + + friend inline v8int operator <( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator >( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator ==( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator !=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator <=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator >=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator &&( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator ||( const v8float &a, const v8float &b ) ALWAYS_INLINE; + + // v8float miscellaneous friends + + friend inline v8float clear_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float set_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float toggle_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + + public: + + // v8int constructors / destructors + + v8int() {} // Default constructor + + v8int( const v8int &a ) // Copy constructor + { + i[0] = a.i[0]; i[1] = a.i[1]; i[2] = a.i[2]; i[3] = a.i[3]; + i[4] = a.i[4]; i[5] = a.i[5]; i[6] = a.i[6]; i[7] = a.i[7]; + } + + v8int( const v8 &a ) // Init from mixed + { + i[0] = a.i[0]; i[1] = a.i[1]; i[2] = a.i[2]; i[3] = a.i[3]; + i[4] = a.i[4]; i[5] = a.i[5]; i[6] = a.i[6]; i[7] = a.i[7]; + } + + v8int( int a ) // Init from scalar + { + i[0] = a; i[1] = a; i[2] = a; i[3] = a; + i[4] = a; i[5] = a; i[6] = a; i[7] = a; + } + + v8int( int i0, int i1, int i2, int i3, + int i4, int i5, int i6, int i7 ) // Init from scalars + { + i[0] = i0; i[1] = i1; i[2] = i2; i[3] = i3; + i[4] = i4; i[5] = i5; i[6] = i6; i[7] = i7; + } + + ~v8int() {} // Destructor + + // v8int assignment operators + +# define ASSIGN(op) \ + inline v8int &operator op( const v8int &b ) \ + { \ + i[0] op b.i[0]; \ + i[1] op b.i[1]; \ + i[2] op b.i[2]; \ + i[3] op b.i[3]; \ + i[4] op b.i[4]; \ + i[5] op b.i[5]; \ + i[6] op b.i[6]; \ + i[7] op b.i[7]; \ + return *this; \ + } + + ASSIGN( =) + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + ASSIGN(%=) + ASSIGN(^=) + ASSIGN(&=) + ASSIGN(|=) + ASSIGN(<<=) + ASSIGN(>>=) + +# undef ASSIGN + + // v8int member access operator + + inline int &operator []( int n ) + { + return i[n]; + } + + inline int operator ()( int n ) + { + return i[n]; + } + }; + + // v8int prefix unary 
operators + +# define PREFIX_UNARY(op) \ + inline v8int operator op( const v8int & a ) \ + { \ + v8int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + b.i[4] = ( op a.i[4] ); \ + b.i[5] = ( op a.i[5] ); \ + b.i[6] = ( op a.i[6] ); \ + b.i[7] = ( op a.i[7] ); \ + return b; \ + } + + PREFIX_UNARY(+) + PREFIX_UNARY(-) + + inline v8int operator !( const v8int & a ) + { + v8int b; + + b.i[0] = - ( !a.i[0] ); + b.i[1] = - ( !a.i[1] ); + b.i[2] = - ( !a.i[2] ); + b.i[3] = - ( !a.i[3] ); + b.i[4] = - ( !a.i[4] ); + b.i[5] = - ( !a.i[5] ); + b.i[6] = - ( !a.i[6] ); + b.i[7] = - ( !a.i[7] ); + + return b; + } + + PREFIX_UNARY(~) + +# undef PREFIX_UNARY + + // v8int prefix increment / decrement + +# define PREFIX_INCDEC(op) \ + inline v8int operator op( v8int & a ) \ + { \ + v8int b; \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ + b.i[4] = ( op a.i[4] ); \ + b.i[5] = ( op a.i[5] ); \ + b.i[6] = ( op a.i[6] ); \ + b.i[7] = ( op a.i[7] ); \ + return b; \ + } + + PREFIX_INCDEC(++) + PREFIX_INCDEC(--) + +# undef PREFIX_INCDEC + + // v8int postfix increment / decrement + +# define POSTFIX_INCDEC(op) \ + inline v8int operator op( v8int & a, int ) \ + { \ + v8int b; \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ + b.i[4] = ( a.i[4] op ); \ + b.i[5] = ( a.i[5] op ); \ + b.i[6] = ( a.i[6] op ); \ + b.i[7] = ( a.i[7] op ); \ + return b; \ + } + + POSTFIX_INCDEC(++) + POSTFIX_INCDEC(--) + +# undef POSTFIX_INCDEC + + // v8int binary operators + +# define BINARY(op) \ + inline v8int operator op( const v8int &a, const v8int &b ) \ + { \ + v8int c; \ + c.i[0] = a.i[0] op b.i[0]; \ + c.i[1] = a.i[1] op b.i[1]; \ + c.i[2] = a.i[2] op b.i[2]; \ + c.i[3] = a.i[3] op b.i[3]; \ + c.i[4] = a.i[4] op b.i[4]; \ + c.i[5] = a.i[5] op b.i[5]; \ + c.i[6] = a.i[6] op b.i[6]; \ + c.i[7] = a.i[7] op b.i[7]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + BINARY(%) + BINARY(^) + BINARY(&) + BINARY(|) + BINARY(<<) + BINARY(>>) + +# undef BINARY + + // v8int logical operators + +# define LOGICAL(op) \ + inline v8int operator op( const v8int &a, const v8int &b ) \ + { \ + v8int c; \ + c.i[0] = - ( a.i[0] op b.i[0] ); \ + c.i[1] = - ( a.i[1] op b.i[1] ); \ + c.i[2] = - ( a.i[2] op b.i[2] ); \ + c.i[3] = - ( a.i[3] op b.i[3] ); \ + c.i[4] = - ( a.i[4] op b.i[4] ); \ + c.i[5] = - ( a.i[5] op b.i[5] ); \ + c.i[6] = - ( a.i[6] op b.i[6] ); \ + c.i[7] = - ( a.i[7] op b.i[7] ); \ + return c; \ + } + + LOGICAL(<) + LOGICAL(>) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v8int miscellaneous functions + + inline v8int abs( const v8int &a ) + { + v8int b; + + b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; + b.i[1] = ( a.i[1] >= 0 ) ? a.i[1] : -a.i[1]; + b.i[2] = ( a.i[2] >= 0 ) ? a.i[2] : -a.i[2]; + b.i[3] = ( a.i[3] >= 0 ) ? a.i[3] : -a.i[3]; + b.i[4] = ( a.i[4] >= 0 ) ? a.i[4] : -a.i[4]; + b.i[5] = ( a.i[5] >= 0 ) ? a.i[5] : -a.i[5]; + b.i[6] = ( a.i[6] >= 0 ) ? a.i[6] : -a.i[6]; + b.i[7] = ( a.i[7] >= 0 ) ? 
a.i[7] : -a.i[7]; + + return b; + } + + inline v8 czero( const v8int &c, const v8 &a ) + { + v8 b; + + b.i[0] = a.i[0] & ~c.i[0]; + b.i[1] = a.i[1] & ~c.i[1]; + b.i[2] = a.i[2] & ~c.i[2]; + b.i[3] = a.i[3] & ~c.i[3]; + b.i[4] = a.i[4] & ~c.i[4]; + b.i[5] = a.i[5] & ~c.i[5]; + b.i[6] = a.i[6] & ~c.i[6]; + b.i[7] = a.i[7] & ~c.i[7]; + + return b; + } + + inline v8 notczero( const v8int &c, const v8 &a ) + { + v8 b; + + b.i[0] = a.i[0] & c.i[0]; + b.i[1] = a.i[1] & c.i[1]; + b.i[2] = a.i[2] & c.i[2]; + b.i[3] = a.i[3] & c.i[3]; + b.i[4] = a.i[4] & c.i[4]; + b.i[5] = a.i[5] & c.i[5]; + b.i[6] = a.i[6] & c.i[6]; + b.i[7] = a.i[7] & c.i[7]; + + return b; + } + + inline v8 merge( const v8int &c, const v8 &t, const v8 &f ) + { + v8 m; + + m.i[0] = ( f.i[0] & ~c.i[0] ) | ( t.i[0] & c.i[0] ); + m.i[1] = ( f.i[1] & ~c.i[1] ) | ( t.i[1] & c.i[1] ); + m.i[2] = ( f.i[2] & ~c.i[2] ) | ( t.i[2] & c.i[2] ); + m.i[3] = ( f.i[3] & ~c.i[3] ) | ( t.i[3] & c.i[3] ); + m.i[4] = ( f.i[4] & ~c.i[4] ) | ( t.i[4] & c.i[4] ); + m.i[5] = ( f.i[5] & ~c.i[5] ) | ( t.i[5] & c.i[5] ); + m.i[6] = ( f.i[6] & ~c.i[6] ) | ( t.i[6] & c.i[6] ); + m.i[7] = ( f.i[7] & ~c.i[7] ) | ( t.i[7] & c.i[7] ); + + return m; + } + + //////////////// + // v8float class + + class v8float : public v8 + { + // v8float prefix unary operator friends + + friend inline v8float operator +( const v8float &a ) ALWAYS_INLINE; + friend inline v8float operator -( const v8float &a ) ALWAYS_INLINE; + friend inline v8float operator ~( const v8float &a ) ALWAYS_INLINE; + friend inline v8int operator !( const v8float &a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v8float prefix increment / decrement operator friends + + friend inline v8float operator ++( v8float &a ) ALWAYS_INLINE; + friend inline v8float operator --( v8float &a ) ALWAYS_INLINE; + + // v8float postfix increment / decrement operator friends + + friend inline v8float operator ++( v8float &a, int ) ALWAYS_INLINE; + friend inline v8float operator --( v8float &a, int ) ALWAYS_INLINE; + + // v8float binary operator friends + + friend inline v8float operator +( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8float operator -( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8float operator *( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8float operator /( const v8float &a, const v8float &b ) ALWAYS_INLINE; + + // v8float logical operator friends + + friend inline v8int operator <( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator >( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator ==( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator !=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator <=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator >=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator &&( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator ||( const v8float &a, const v8float &b ) ALWAYS_INLINE; + + // v8float math library friends + +# define CMATH_FR1(fn) friend inline v8float fn( const v8float &a ) ALWAYS_INLINE +# define CMATH_FR2(fn) friend inline v8float fn( const v8float &a, \ + const v8float &b ) ALWAYS_INLINE + + CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); + CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); 
CMATH_FR1(exp); + CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); + CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); + CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); + + CMATH_FR2(copysign); + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v8float miscellaneous friends + + friend inline v8float rsqrt_approx( const v8float &a ) ALWAYS_INLINE; + friend inline v8float rsqrt ( const v8float &a ) ALWAYS_INLINE; + friend inline v8float rcp_approx( const v8float &a ) ALWAYS_INLINE; + friend inline v8float rcp ( const v8float &a ) ALWAYS_INLINE; + friend inline v8float fma ( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; + friend inline v8float fms ( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; + friend inline v8float fnms( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; + friend inline v8float clear_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float set_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float toggle_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline void increment_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; + friend inline void decrement_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; + friend inline void scale_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; + + public: + + // v8float constructors / destructors + + v8float() {} // Default constructor + + v8float( const v8float &a ) // Copy constructor + { + f[0] = a.f[0]; + f[1] = a.f[1]; + f[2] = a.f[2]; + f[3] = a.f[3]; + f[4] = a.f[4]; + f[5] = a.f[5]; + f[6] = a.f[6]; + f[7] = a.f[7]; + } + + v8float( const v8 &a ) // Init from mixed + { + f[0] = a.f[0]; + f[1] = a.f[1]; + f[2] = a.f[2]; + f[3] = a.f[3]; + f[4] = a.f[4]; + f[5] = a.f[5]; + f[6] = a.f[6]; + f[7] = a.f[7]; + } + + v8float( float a ) // Init from scalar + { + f[0] = a; + f[1] = a; + f[2] = a; + f[3] = a; + f[4] = a; + f[5] = a; + f[6] = a; + f[7] = a; + } + + v8float( float f0, float f1, float f2, float f3, + float f4, float f5, float f6, float f7 ) // Init from scalars + { + f[0] = f0; + f[1] = f1; + f[2] = f2; + f[3] = f3; + f[4] = f4; + f[5] = f5; + f[6] = f6; + f[7] = f7; + } + + ~v8float() {} // Destructor + + // v8float assignment operators + +# define ASSIGN(op) \ + inline v8float &operator op( const v8float &b ) \ + { \ + f[0] op b.f[0]; \ + f[1] op b.f[1]; \ + f[2] op b.f[2]; \ + f[3] op b.f[3]; \ + f[4] op b.f[4]; \ + f[5] op b.f[5]; \ + f[6] op b.f[6]; \ + f[7] op b.f[7]; \ + return *this; \ + } + + ASSIGN(=) + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + +# undef ASSIGN + + // v8float member access operator + + inline float &operator []( int n ) + { + return f[n]; + } + + inline float operator ()( int n ) + { + return f[n]; + } + }; + + // v8float prefix unary operators + + inline v8float operator +( const v8float &a ) + { + v8float b; + + b.f[0] = +a.f[0]; + b.f[1] = +a.f[1]; + b.f[2] = +a.f[2]; + b.f[3] = +a.f[3]; + b.f[4] = +a.f[4]; + b.f[5] = +a.f[5]; + b.f[6] = +a.f[6]; + b.f[7] = +a.f[7]; + + return b; + } + + inline v8float operator -( const v8float &a ) + { + v8float b; + + b.f[0] = -a.f[0]; + b.f[1] = -a.f[1]; + b.f[2] = -a.f[2]; + b.f[3] = -a.f[3]; + b.f[4] = -a.f[4]; + b.f[5] = -a.f[5]; + b.f[6] = -a.f[6]; + b.f[7] = -a.f[7]; + + return b; + } + + inline v8int operator !( const v8float &a ) + { + v8int b; + + b.i[0] = a.i[0] ? 0 : -1; + b.i[1] = a.i[1] ? 0 : -1; + b.i[2] = a.i[2] ? 0 : -1; + b.i[3] = a.i[3] ? 
0 : -1; + b.i[4] = a.i[4] ? 0 : -1; + b.i[5] = a.i[5] ? 0 : -1; + b.i[6] = a.i[6] ? 0 : -1; + b.i[7] = a.i[7] ? 0 : -1; + + return b; + } + + // v8float prefix increment / decrement operators + + inline v8float operator ++( v8float &a ) + { + v8float b; + + b.f[0] = ++a.f[0]; + b.f[1] = ++a.f[1]; + b.f[2] = ++a.f[2]; + b.f[3] = ++a.f[3]; + b.f[4] = ++a.f[4]; + b.f[5] = ++a.f[5]; + b.f[6] = ++a.f[6]; + b.f[7] = ++a.f[7]; + + return b; + } + + inline v8float operator --( v8float &a ) + { + v8float b; + + b.f[0] = --a.f[0]; + b.f[1] = --a.f[1]; + b.f[2] = --a.f[2]; + b.f[3] = --a.f[3]; + b.f[4] = --a.f[4]; + b.f[5] = --a.f[5]; + b.f[6] = --a.f[6]; + b.f[7] = --a.f[7]; + + return b; + } + + // v8float postfix increment / decrement operators + + inline v8float operator ++( v8float &a, int ) + { + v8float b; + + b.f[0] = a.f[0]++; + b.f[1] = a.f[1]++; + b.f[2] = a.f[2]++; + b.f[3] = a.f[3]++; + b.f[4] = a.f[4]++; + b.f[5] = a.f[5]++; + b.f[6] = a.f[6]++; + b.f[7] = a.f[7]++; + + return b; + } + + inline v8float operator --( v8float &a, int ) + { + v8float b; + + b.f[0] = a.f[0]--; + b.f[1] = a.f[1]--; + b.f[2] = a.f[2]--; + b.f[3] = a.f[3]--; + b.f[4] = a.f[4]--; + b.f[5] = a.f[5]--; + b.f[6] = a.f[6]--; + b.f[7] = a.f[7]--; + + return b; + } + + // v8float binary operators + +# define BINARY(op) \ + inline v8float operator op( const v8float &a, const v8float &b ) \ + { \ + v8float c; \ + c.f[0] = a.f[0] op b.f[0]; \ + c.f[1] = a.f[1] op b.f[1]; \ + c.f[2] = a.f[2] op b.f[2]; \ + c.f[3] = a.f[3] op b.f[3]; \ + c.f[4] = a.f[4] op b.f[4]; \ + c.f[5] = a.f[5] op b.f[5]; \ + c.f[6] = a.f[6] op b.f[6]; \ + c.f[7] = a.f[7] op b.f[7]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + +# undef BINARY + + // v8float logical operators + +# define LOGICAL(op) \ + inline v8int operator op( const v8float &a, const v8float &b ) \ + { \ + v8int c; \ + c.i[0] = -( a.f[0] op b.f[0] ); \ + c.i[1] = -( a.f[1] op b.f[1] ); \ + c.i[2] = -( a.f[2] op b.f[2] ); \ + c.i[3] = -( a.f[3] op b.f[3] ); \ + c.i[4] = -( a.f[4] op b.f[4] ); \ + c.i[5] = -( a.f[5] op b.f[5] ); \ + c.i[6] = -( a.f[6] op b.f[6] ); \ + c.i[7] = -( a.f[7] op b.f[7] ); \ + return c; \ + } + + LOGICAL(< ) + LOGICAL(> ) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v8float math library functions + +# define CMATH_FR1(fn) \ + inline v8float fn( const v8float &a ) \ + { \ + v8float b; \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ + b.f[4] = ::fn( a.f[4] ); \ + b.f[5] = ::fn( a.f[5] ); \ + b.f[6] = ::fn( a.f[6] ); \ + b.f[7] = ::fn( a.f[7] ); \ + return b; \ + } + +# define CMATH_FR2(fn) \ + inline v8float fn( const v8float &a, const v8float &b ) \ + { \ + v8float c; \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ + c.f[4] = ::fn( a.f[4], b.f[4] ); \ + c.f[5] = ::fn( a.f[5], b.f[5] ); \ + c.f[6] = ::fn( a.f[6], b.f[6] ); \ + c.f[7] = ::fn( a.f[7], b.f[7] ); \ + return c; \ + } + + CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) + CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) + CMATH_FR1(fabs) CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) + CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) + CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) + + inline v8float copysign( const v8float &a, const v8float &b ) + { + v8float c; + float t; + + t = ::fabs( a.f[0] 
); + if( b.f[0] < 0 ) t = -t; + c.f[0] = t; + + t = ::fabs( a.f[1] ); + if( b.f[1] < 0 ) t = -t; + c.f[1] = t; + + t = ::fabs( a.f[2] ); + if( b.f[2] < 0 ) t = -t; + c.f[2] = t; + + t = ::fabs( a.f[3] ); + if( b.f[3] < 0 ) t = -t; + c.f[3] = t; + + t = ::fabs( a.f[4] ); + if( b.f[4] < 0 ) t = -t; + c.f[4] = t; + + t = ::fabs( a.f[5] ); + if( b.f[5] < 0 ) t = -t; + c.f[5] = t; + + t = ::fabs( a.f[6] ); + if( b.f[6] < 0 ) t = -t; + c.f[6] = t; + + t = ::fabs( a.f[7] ); + if( b.f[7] < 0 ) t = -t; + c.f[7] = t; + + return c; + } + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v8float miscellaneous functions + + inline v8float rsqrt_approx( const v8float &a ) + { + v8float b; + + b.f[0] = ::sqrt( 1.0f / a.f[0] ); + b.f[1] = ::sqrt( 1.0f / a.f[1] ); + b.f[2] = ::sqrt( 1.0f / a.f[2] ); + b.f[3] = ::sqrt( 1.0f / a.f[3] ); + b.f[4] = ::sqrt( 1.0f / a.f[4] ); + b.f[5] = ::sqrt( 1.0f / a.f[5] ); + b.f[6] = ::sqrt( 1.0f / a.f[6] ); + b.f[7] = ::sqrt( 1.0f / a.f[7] ); + + return b; + } + + inline v8float rsqrt( const v8float &a ) + { + v8float b; + + b.f[0] = ::sqrt( 1.0f / a.f[0] ); + b.f[1] = ::sqrt( 1.0f / a.f[1] ); + b.f[2] = ::sqrt( 1.0f / a.f[2] ); + b.f[3] = ::sqrt( 1.0f / a.f[3] ); + b.f[4] = ::sqrt( 1.0f / a.f[4] ); + b.f[5] = ::sqrt( 1.0f / a.f[5] ); + b.f[6] = ::sqrt( 1.0f / a.f[6] ); + b.f[7] = ::sqrt( 1.0f / a.f[7] ); + + return b; + } + + inline v8float rcp_approx( const v8float &a ) + { + v8float b; + + b.f[0] = 1.0f / a.f[0]; + b.f[1] = 1.0f / a.f[1]; + b.f[2] = 1.0f / a.f[2]; + b.f[3] = 1.0f / a.f[3]; + b.f[4] = 1.0f / a.f[4]; + b.f[5] = 1.0f / a.f[5]; + b.f[6] = 1.0f / a.f[6]; + b.f[7] = 1.0f / a.f[7]; + + return b; + } + + inline v8float rcp( const v8float &a ) + { + v8float b; + + b.f[0] = 1.0f / a.f[0]; + b.f[1] = 1.0f / a.f[1]; + b.f[2] = 1.0f / a.f[2]; + b.f[3] = 1.0f / a.f[3]; + b.f[4] = 1.0f / a.f[4]; + b.f[5] = 1.0f / a.f[5]; + b.f[6] = 1.0f / a.f[6]; + b.f[7] = 1.0f / a.f[7]; + + return b; + } + + inline v8float fma( const v8float &a, const v8float &b, const v8float &c ) + { + v8float d; + + d.f[0] = a.f[0] * b.f[0] + c.f[0]; + d.f[1] = a.f[1] * b.f[1] + c.f[1]; + d.f[2] = a.f[2] * b.f[2] + c.f[2]; + d.f[3] = a.f[3] * b.f[3] + c.f[3]; + d.f[4] = a.f[4] * b.f[4] + c.f[4]; + d.f[5] = a.f[5] * b.f[5] + c.f[5]; + d.f[6] = a.f[6] * b.f[6] + c.f[6]; + d.f[7] = a.f[7] * b.f[7] + c.f[7]; + + return d; + } + + inline v8float fms( const v8float &a, const v8float &b, const v8float &c ) + { + v8float d; + + d.f[0] = a.f[0] * b.f[0] - c.f[0]; + d.f[1] = a.f[1] * b.f[1] - c.f[1]; + d.f[2] = a.f[2] * b.f[2] - c.f[2]; + d.f[3] = a.f[3] * b.f[3] - c.f[3]; + d.f[4] = a.f[4] * b.f[4] - c.f[4]; + d.f[5] = a.f[5] * b.f[5] - c.f[5]; + d.f[6] = a.f[6] * b.f[6] - c.f[6]; + d.f[7] = a.f[7] * b.f[7] - c.f[7]; + + return d; + } + + inline v8float fnms( const v8float &a, const v8float &b, const v8float &c ) + { + v8float d; + + d.f[0] = c.f[0] - a.f[0] * b.f[0]; + d.f[1] = c.f[1] - a.f[1] * b.f[1]; + d.f[2] = c.f[2] - a.f[2] * b.f[2]; + d.f[3] = c.f[3] - a.f[3] * b.f[3]; + d.f[4] = c.f[4] - a.f[4] * b.f[4]; + d.f[5] = c.f[5] - a.f[5] * b.f[5]; + d.f[6] = c.f[6] - a.f[6] * b.f[6]; + d.f[7] = c.f[7] - a.f[7] * b.f[7]; + + return d; + } + + inline v8float clear_bits( const v8int &m, const v8float &a ) + { + v8float b; + + b.i[0] = ( ~m.i[0] ) & a.i[0]; + b.i[1] = ( ~m.i[1] ) & a.i[1]; + b.i[2] = ( ~m.i[2] ) & a.i[2]; + b.i[3] = ( ~m.i[3] ) & a.i[3]; + b.i[4] = ( ~m.i[4] ) & a.i[4]; + b.i[5] = ( ~m.i[5] ) & a.i[5]; + b.i[6] = ( ~m.i[6] ) & a.i[6]; + b.i[7] = ( ~m.i[7] ) & a.i[7]; + + return b; + } + + inline 
v8float set_bits( const v8int &m, const v8float &a ) + { + v8float b; + + b.i[0] = m.i[0] | a.i[0]; + b.i[1] = m.i[1] | a.i[1]; + b.i[2] = m.i[2] | a.i[2]; + b.i[3] = m.i[3] | a.i[3]; + b.i[4] = m.i[4] | a.i[4]; + b.i[5] = m.i[5] | a.i[5]; + b.i[6] = m.i[6] | a.i[6]; + b.i[7] = m.i[7] | a.i[7]; + + return b; + } + + inline v8float toggle_bits( const v8int &m, const v8float &a ) + { + v8float b; + + b.i[0] = m.i[0] ^ a.i[0]; + b.i[1] = m.i[1] ^ a.i[1]; + b.i[2] = m.i[2] ^ a.i[2]; + b.i[3] = m.i[3] ^ a.i[3]; + b.i[4] = m.i[4] ^ a.i[4]; + b.i[5] = m.i[5] ^ a.i[5]; + b.i[6] = m.i[6] ^ a.i[6]; + b.i[7] = m.i[7] ^ a.i[7]; + + return b; + } + + inline void increment_8x1( float * ALIGNED(16) p, const v8float &a ) + { + p[0] += a.f[0]; + p[1] += a.f[1]; + p[2] += a.f[2]; + p[3] += a.f[3]; + p[4] += a.f[4]; + p[5] += a.f[5]; + p[6] += a.f[6]; + p[7] += a.f[7]; + } + + inline void decrement_8x1( float * ALIGNED(16) p, const v8float &a ) + { + p[0] -= a.f[0]; + p[1] -= a.f[1]; + p[2] -= a.f[2]; + p[3] -= a.f[3]; + p[4] -= a.f[4]; + p[5] -= a.f[5]; + p[6] -= a.f[6]; + p[7] -= a.f[7]; + } + + inline void scale_8x1( float * ALIGNED(16) p, const v8float &a ) + { + p[0] *= a.f[0]; + p[1] *= a.f[1]; + p[2] *= a.f[2]; + p[3] *= a.f[3]; + p[4] *= a.f[4]; + p[5] *= a.f[5]; + p[6] *= a.f[6]; + p[7] *= a.f[7]; + } + +} // namespace v8 + +#endif // _v8_portable_h_ diff --git a/src/util/v8/v8_portable_v1.h b/src/util/v8/v8_portable_v1.h new file mode 100644 index 00000000..310c1390 --- /dev/null +++ b/src/util/v8/v8_portable_v1.h @@ -0,0 +1,1523 @@ +#ifndef _v8_portable_h_ +#define _v8_portable_h_ + +#ifndef IN_v8_h +#error "Do not include v8_portable.h directly; use v8.h" +#endif + +#include + +#define V8_ACCELERATION +#define V8_PORTABLE_ACCELERATION + +#ifndef ALIGNED +#define ALIGNED(n) +#endif + +// This does not work with gcc 5.3.1 and the -fopenmp-simd +// flag. Does not seem to work with -fopenmp either. Not +// sure why. It does work with the Intel compiler. Need +// to try later versions of gcc. 
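The comment above records that the `omp simd` form of the vectorization pragma is rejected by gcc 5.3.1 (with or without -fopenmp-simd) while the Intel compiler accepts the pragmas used in the definitions just below. As a hedged illustration only, and not how this patch actually defines the macro, one common way to handle such a portability issue is to guard the choice on the compiler and fall back to a no-op, leaving vectorization to the optimizer; the sketch assumes the standard __INTEL_COMPILER predefined macro:

    /* Illustrative sketch only -- not the definition chosen by this patch. */
    #if defined(__INTEL_COMPILER)
    #define ALWAYS_VECTORIZE \
      _Pragma( "simd" ) \
      _Pragma( "vector aligned" )
    #else
    /* gcc 5.3.1 reportedly mishandles these pragmas, so expand to nothing */
    /* and rely on the auto-vectorizer instead.                            */
    #define ALWAYS_VECTORIZE
    #endif

The actual definitions chosen by the patch follow.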
+// #define ALWAYS_VECTORIZE _Pragma( "omp simd" ) + +// #define ALWAYS_VECTORIZE _Pragma( "simd" ) + +#define ALWAYS_VECTORIZE \ + _Pragma( "simd" ) \ + _Pragma( "vector aligned" ) + +#define ALWAYS_INLINE __attribute__((always_inline)) + +namespace v8 +{ + class v8; + class v8int; + class v8float; + + //////////////// + // v8 base class + + class v8 + { + friend class v8int; + friend class v8float; + + // v8 miscellaneous friends + + friend inline int any( const v8 &a ) ALWAYS_INLINE; + friend inline int all( const v8 &a ) ALWAYS_INLINE; + + template<int n> + friend inline v8 splat( const v8 &a ) ALWAYS_INLINE; + + template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> + friend inline v8 shuffle( const v8 &a ) ALWAYS_INLINE; + + friend inline void swap( v8 &a, v8 &b ) ALWAYS_INLINE; + friend inline void transpose( v8 &a0, v8 &a1, v8 &a2, v8 &a3, + v8 &a4, v8 &a5, v8 &a6, v8 &a7 ) ALWAYS_INLINE; + + // v8int miscellaneous friends + + friend inline v8 czero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + friend inline v8 notczero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + friend inline v8 merge( const v8int &c, const v8 &a, const v8 &b ) ALWAYS_INLINE; + + // v8 memory manipulation friends + + friend inline void load_8x1( const void * ALIGNED(16) p, v8 &a ) ALWAYS_INLINE; + friend inline void store_8x1( const v8 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void stream_8x1( const v8 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_8x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_8x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) ALWAYS_INLINE; + friend inline void swap_8x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + // v8 transposed memory manipulation friends + // Note: Half aligned values are permissible in the 8x2_tr variants.
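// (For example, assuming a caller-side type like `struct alignas(8) pair { float a, b; };`,
//  a pointer to one such pair satisfies the 8-byte requirement of the 8x2_tr
//  variants, while the 8x3/8x4/8x8_tr variants expect 16-byte aligned rows.)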
+ + friend inline void load_8x1_tr( const void *a0, const void *a1, + const void *a2, const void *a3, + const void *a4, const void *a5, + const void *a6, const void *a7, + v8 &a ) ALWAYS_INLINE; + + friend inline void load_8x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + const void * ALIGNED(8) a4, + const void * ALIGNED(8) a5, + const void * ALIGNED(8) a6, + const void * ALIGNED(8) a7, + v8 &a, v8 &b ) ALWAYS_INLINE; + + friend inline void load_8x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c ) ALWAYS_INLINE; + + friend inline void load_8x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d ) ALWAYS_INLINE; + + friend inline void load_8x8_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d, + v8 &e, v8 &f, v8 &g, v8 &h ) ALWAYS_INLINE; + + friend inline void store_8x1_tr( const v8 &a, + void *a0, void *a1, void *a2, void *a3, + void *a4, void *a5, void *a6, void *a7 ) ALWAYS_INLINE; + + friend inline void store_8x2_tr( const v8 &a, const v8 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3, + void * ALIGNED(8) a4, + void * ALIGNED(8) a5, + void * ALIGNED(8) a6, + void * ALIGNED(8) a7 ) ALWAYS_INLINE; + + friend inline void store_8x3_tr( const v8 &a, const v8 &b, const v8 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3, + void * ALIGNED(16) a4, + void * ALIGNED(16) a5, + void * ALIGNED(16) a6, + void * ALIGNED(16) a7 ) ALWAYS_INLINE; + + friend inline void store_8x4_tr( const v8 &a, const v8 &b, + const v8 &c, const v8 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3, + void * ALIGNED(16) a4, + void * ALIGNED(16) a5, + void * ALIGNED(16) a6, + void * ALIGNED(16) a7 ) ALWAYS_INLINE; + + friend inline void store_8x8_tr( const v8 &a, const v8 &b, + const v8 &c, const v8 &d, + const v8 &e, const v8 &f, + const v8 &g, const v8 &h, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3, + void * ALIGNED(16) a4, + void * ALIGNED(16) a5, + void * ALIGNED(16) a6, + void * ALIGNED(16) a7 ) ALWAYS_INLINE; + + protected: + + union + { + int i[8]; + float f[8]; + }; + + public: + + v8() {} // Default constructor + + v8( const v8 &a ) // Copy constructor + { + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + i[j] = a.i[j]; + } + + ~v8() {} // Default destructor + }; + + // v8 miscellaneous functions + + inline int any( const v8 &a ) + { + return a.i[0] || a.i[1] || a.i[2] || a.i[3] || + a.i[4] || a.i[5] || a.i[6] || a.i[7]; + } + + inline int all( const v8 &a ) + { + return a.i[0] && a.i[1] && a.i[2] && a.i[3] && + a.i[4] && a.i[5] && a.i[6] && a.i[7]; + } + + template + inline v8 splat( const v8 & a ) + { + v8 b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 
8; j++ ) + b.i[j] = a.i[n]; + + return b; + } + + template + inline v8 shuffle( const v8 & a ) + { + v8 b; + + b.i[0] = a.i[i0]; + b.i[1] = a.i[i1]; + b.i[2] = a.i[i2]; + b.i[3] = a.i[i3]; + b.i[4] = a.i[i4]; + b.i[5] = a.i[i5]; + b.i[6] = a.i[i6]; + b.i[7] = a.i[i7]; + + return b; + } + +# define sw(x,y) x^=y, y^=x, x^=y + + inline void swap( v8 &a, v8 &b ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + sw( a.i[j], b.i[j] ); + } + + inline void transpose( v8 &a0, v8 &a1, v8 &a2, v8 &a3, + v8 &a4, v8 &a5, v8 &a6, v8 &a7 ) + { + sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); sw( a0.i[4],a4.i[0] ); sw( a0.i[5],a5.i[0] ); sw( a0.i[6],a6.i[0] ); sw( a0.i[7],a7.i[0] ); + sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); sw( a1.i[4],a4.i[1] ); sw( a1.i[5],a5.i[1] ); sw( a1.i[6],a6.i[1] ); sw( a1.i[7],a7.i[1] ); + sw( a2.i[3],a3.i[2] ); sw( a2.i[4],a4.i[2] ); sw( a2.i[5],a5.i[2] ); sw( a2.i[6],a6.i[2] ); sw( a2.i[7],a7.i[2] ); + sw( a3.i[4],a4.i[3] ); sw( a3.i[5],a5.i[3] ); sw( a3.i[6],a6.i[3] ); sw( a3.i[7],a7.i[3] ); + sw( a4.i[5],a5.i[4] ); sw( a4.i[6],a6.i[4] ); sw( a4.i[7],a7.i[4] ); + sw( a5.i[6],a6.i[5] ); sw( a5.i[7],a7.i[5] ); + sw( a6.i[7],a7.i[6] ); + } + +# undef sw + + // v8 memory manipulation functions + + inline void load_8x1( const void * ALIGNED(16) p, + v8 &a ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + a.i[j] = ((const int * ALIGNED(16))p)[j]; + } + + inline void store_8x1( const v8 &a, + void * ALIGNED(16) p ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + ((int * ALIGNED(16))p)[j] = a.i[j]; + } + + inline void stream_8x1( const v8 &a, + void * ALIGNED(16) p ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + ((int * ALIGNED(16))p)[j] = a.i[j]; + } + + inline void clear_8x1( void * ALIGNED(16) p ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + ((int * ALIGNED(16))p)[j] = 0; + } + + // FIXME: Ordering semantics + inline void copy_8x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + ((int * ALIGNED(16))dst)[j] = ((const int * ALIGNED(16))src)[j]; + } + + inline void swap_8x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) + { + int t; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + { + t = ((int * ALIGNED(16))a)[j]; + ((int * ALIGNED(16))a)[j] = ((int * ALIGNED(16))b)[j]; + ((int * ALIGNED(16))b)[j] = t; + } + } + + // v8 transposed memory manipulation functions + + inline void load_8x1_tr( const void *a0, const void *a1, + const void *a2, const void *a3, + const void *a4, const void *a5, + const void *a6, const void *a7, + v8 &a ) + { + a.i[0] = ((const int *)a0)[0]; + a.i[1] = ((const int *)a1)[0]; + a.i[2] = ((const int *)a2)[0]; + a.i[3] = ((const int *)a3)[0]; + a.i[4] = ((const int *)a4)[0]; + a.i[5] = ((const int *)a5)[0]; + a.i[6] = ((const int *)a6)[0]; + a.i[7] = ((const int *)a7)[0]; + } + + inline void load_8x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + const void * ALIGNED(8) a4, + const void * ALIGNED(8) a5, + const void * ALIGNED(8) a6, + const void * ALIGNED(8) a7, + v8 &a, v8 &b ) + { + a.i[0] = ((const int * ALIGNED(8))a0)[0]; + b.i[0] = ((const int * ALIGNED(8))a0)[1]; + + a.i[1] = ((const int * ALIGNED(8))a1)[0]; + b.i[1] = ((const int * ALIGNED(8))a1)[1]; + + a.i[2] = ((const int * ALIGNED(8))a2)[0]; + b.i[2] = ((const int * ALIGNED(8))a2)[1]; + + a.i[3] = ((const int * ALIGNED(8))a3)[0]; + b.i[3] = ((const int * ALIGNED(8))a3)[1]; + + a.i[4] = 
((const int * ALIGNED(8))a4)[0]; + b.i[4] = ((const int * ALIGNED(8))a4)[1]; + + a.i[5] = ((const int * ALIGNED(8))a5)[0]; + b.i[5] = ((const int * ALIGNED(8))a5)[1]; + + a.i[6] = ((const int * ALIGNED(8))a6)[0]; + b.i[6] = ((const int * ALIGNED(8))a6)[1]; + + a.i[7] = ((const int * ALIGNED(8))a7)[0]; + b.i[7] = ((const int * ALIGNED(8))a7)[1]; + } + + inline void load_8x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + + a.i[4] = ((const int * ALIGNED(16))a4)[0]; + b.i[4] = ((const int * ALIGNED(16))a4)[1]; + c.i[4] = ((const int * ALIGNED(16))a4)[2]; + + a.i[5] = ((const int * ALIGNED(16))a5)[0]; + b.i[5] = ((const int * ALIGNED(16))a5)[1]; + c.i[5] = ((const int * ALIGNED(16))a5)[2]; + + a.i[6] = ((const int * ALIGNED(16))a6)[0]; + b.i[6] = ((const int * ALIGNED(16))a6)[1]; + c.i[6] = ((const int * ALIGNED(16))a6)[2]; + + a.i[7] = ((const int * ALIGNED(16))a7)[0]; + b.i[7] = ((const int * ALIGNED(16))a7)[1]; + c.i[7] = ((const int * ALIGNED(16))a7)[2]; + } + + inline void load_8x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + d.i[0] = ((const int * ALIGNED(16))a0)[3]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + d.i[1] = ((const int * ALIGNED(16))a1)[3]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + d.i[2] = ((const int * ALIGNED(16))a2)[3]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + d.i[3] = ((const int * ALIGNED(16))a3)[3]; + + a.i[4] = ((const int * ALIGNED(16))a4)[0]; + b.i[4] = ((const int * ALIGNED(16))a4)[1]; + c.i[4] = ((const int * ALIGNED(16))a4)[2]; + d.i[4] = ((const int * ALIGNED(16))a4)[3]; + + a.i[5] = ((const int * ALIGNED(16))a5)[0]; + b.i[5] = ((const int * ALIGNED(16))a5)[1]; + c.i[5] = ((const int * ALIGNED(16))a5)[2]; + d.i[5] = ((const int * ALIGNED(16))a5)[3]; + + a.i[6] = ((const int * ALIGNED(16))a6)[0]; + b.i[6] = ((const int * ALIGNED(16))a6)[1]; + c.i[6] = ((const int * ALIGNED(16))a6)[2]; + d.i[6] = ((const int * ALIGNED(16))a6)[3]; + + a.i[7] = ((const int * ALIGNED(16))a7)[0]; + b.i[7] = ((const int * ALIGNED(16))a7)[1]; + c.i[7] = ((const int * ALIGNED(16))a7)[2]; + d.i[7] = ((const int * ALIGNED(16))a7)[3]; + } + + inline void load_8x8_tr( const 
void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + const void * ALIGNED(16) a4, + const void * ALIGNED(16) a5, + const void * ALIGNED(16) a6, + const void * ALIGNED(16) a7, + v8 &a, v8 &b, v8 &c, v8 &d, + v8 &e, v8 &f, v8 &g, v8 &h ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + d.i[0] = ((const int * ALIGNED(16))a0)[3]; + e.i[0] = ((const int * ALIGNED(16))a0)[4]; + f.i[0] = ((const int * ALIGNED(16))a0)[5]; + g.i[0] = ((const int * ALIGNED(16))a0)[6]; + h.i[0] = ((const int * ALIGNED(16))a0)[7]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + d.i[1] = ((const int * ALIGNED(16))a1)[3]; + e.i[1] = ((const int * ALIGNED(16))a1)[4]; + f.i[1] = ((const int * ALIGNED(16))a1)[5]; + g.i[1] = ((const int * ALIGNED(16))a1)[6]; + h.i[1] = ((const int * ALIGNED(16))a1)[7]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + d.i[2] = ((const int * ALIGNED(16))a2)[3]; + e.i[2] = ((const int * ALIGNED(16))a2)[4]; + f.i[2] = ((const int * ALIGNED(16))a2)[5]; + g.i[2] = ((const int * ALIGNED(16))a2)[6]; + h.i[2] = ((const int * ALIGNED(16))a2)[7]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + d.i[3] = ((const int * ALIGNED(16))a3)[3]; + e.i[3] = ((const int * ALIGNED(16))a3)[4]; + f.i[3] = ((const int * ALIGNED(16))a3)[5]; + g.i[3] = ((const int * ALIGNED(16))a3)[6]; + h.i[3] = ((const int * ALIGNED(16))a3)[7]; + + a.i[4] = ((const int * ALIGNED(16))a4)[0]; + b.i[4] = ((const int * ALIGNED(16))a4)[1]; + c.i[4] = ((const int * ALIGNED(16))a4)[2]; + d.i[4] = ((const int * ALIGNED(16))a4)[3]; + e.i[4] = ((const int * ALIGNED(16))a4)[4]; + f.i[4] = ((const int * ALIGNED(16))a4)[5]; + g.i[4] = ((const int * ALIGNED(16))a4)[6]; + h.i[4] = ((const int * ALIGNED(16))a4)[7]; + + a.i[5] = ((const int * ALIGNED(16))a5)[0]; + b.i[5] = ((const int * ALIGNED(16))a5)[1]; + c.i[5] = ((const int * ALIGNED(16))a5)[2]; + d.i[5] = ((const int * ALIGNED(16))a5)[3]; + e.i[5] = ((const int * ALIGNED(16))a5)[4]; + f.i[5] = ((const int * ALIGNED(16))a5)[5]; + g.i[5] = ((const int * ALIGNED(16))a5)[6]; + h.i[5] = ((const int * ALIGNED(16))a5)[7]; + + a.i[6] = ((const int * ALIGNED(16))a6)[0]; + b.i[6] = ((const int * ALIGNED(16))a6)[1]; + c.i[6] = ((const int * ALIGNED(16))a6)[2]; + d.i[6] = ((const int * ALIGNED(16))a6)[3]; + e.i[6] = ((const int * ALIGNED(16))a6)[4]; + f.i[6] = ((const int * ALIGNED(16))a6)[5]; + g.i[6] = ((const int * ALIGNED(16))a6)[6]; + h.i[6] = ((const int * ALIGNED(16))a6)[7]; + + a.i[7] = ((const int * ALIGNED(16))a7)[0]; + b.i[7] = ((const int * ALIGNED(16))a7)[1]; + c.i[7] = ((const int * ALIGNED(16))a7)[2]; + d.i[7] = ((const int * ALIGNED(16))a7)[3]; + e.i[7] = ((const int * ALIGNED(16))a7)[4]; + f.i[7] = ((const int * ALIGNED(16))a7)[5]; + g.i[7] = ((const int * ALIGNED(16))a7)[6]; + h.i[7] = ((const int * ALIGNED(16))a7)[7]; + } + + inline void store_8x1_tr( const v8 &a, + void *a0, void *a1, void *a2, void *a3, + void *a4, void *a5, void *a6, void *a7 ) + { + ((int *)a0)[0] = a.i[0]; + ((int *)a1)[0] = a.i[1]; + ((int *)a2)[0] = a.i[2]; + ((int *)a3)[0] = a.i[3]; + ((int *)a4)[0] = a.i[4]; + ((int *)a5)[0] = a.i[5]; + ((int *)a6)[0] = a.i[6]; + ((int *)a7)[0] = a.i[7]; + } 
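Taken together, the *_tr loads and stores implement an eight-way gather/scatter between row-major records in memory and the lane-per-record layout of a v8. A minimal usage sketch follows; the record type and helper function are hypothetical, introduced only for illustration and not taken from this patch:

    // Illustrative sketch only: a hypothetical 16-byte aligned record.
    struct alignas(16) rec { float x, y, z, w; };

    inline void gather_xyzw( const rec r[8],
                             v8::v8float &x, v8::v8float &y,
                             v8::v8float &z, v8::v8float &w )
    {
      // load_8x4_tr reads four consecutive floats from each of the eight
      // 16-byte aligned records and places field j of record k in lane k of
      // the j-th output vector, i.e. x holds r[0].x ... r[7].x and so on.
      v8::load_8x4_tr( &r[0], &r[1], &r[2], &r[3],
                       &r[4], &r[5], &r[6], &r[7],
                       x, y, z, w );
    }

The matching store_8x4_tr call scatters the four vectors back into the same eight records.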
+ + inline void store_8x2_tr( const v8 &a, const v8 &b, + void * ALIGNED(8) a0, void * ALIGNED(8) a1, + void * ALIGNED(8) a2, void * ALIGNED(8) a3, + void * ALIGNED(8) a4, void * ALIGNED(8) a5, + void * ALIGNED(8) a6, void * ALIGNED(8) a7 ) + { + ((int * ALIGNED(8))a0)[0] = a.i[0]; + ((int * ALIGNED(8))a0)[1] = b.i[0]; + + ((int * ALIGNED(8))a1)[0] = a.i[1]; + ((int * ALIGNED(8))a1)[1] = b.i[1]; + + ((int * ALIGNED(8))a2)[0] = a.i[2]; + ((int * ALIGNED(8))a2)[1] = b.i[2]; + + ((int * ALIGNED(8))a3)[0] = a.i[3]; + ((int * ALIGNED(8))a3)[1] = b.i[3]; + + ((int * ALIGNED(8))a4)[0] = a.i[4]; + ((int * ALIGNED(8))a4)[1] = b.i[4]; + + ((int * ALIGNED(8))a5)[0] = a.i[5]; + ((int * ALIGNED(8))a5)[1] = b.i[5]; + + ((int * ALIGNED(8))a6)[0] = a.i[6]; + ((int * ALIGNED(8))a6)[1] = b.i[6]; + + ((int * ALIGNED(8))a7)[0] = a.i[7]; + ((int * ALIGNED(8))a7)[1] = b.i[7]; + } + + inline void store_8x3_tr( const v8 &a, const v8 &b, const v8 &c, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3, + void * ALIGNED(16) a4, void * ALIGNED(16) a5, + void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * ALIGNED(16))a0)[1] = b.i[0]; + ((int * ALIGNED(16))a0)[2] = c.i[0]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; + ((int * ALIGNED(16))a1)[2] = c.i[1]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; + ((int * ALIGNED(16))a2)[2] = c.i[2]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; + ((int * ALIGNED(16))a3)[2] = c.i[3]; + + ((int * ALIGNED(16))a4)[0] = a.i[4]; + ((int * ALIGNED(16))a4)[1] = b.i[4]; + ((int * ALIGNED(16))a4)[2] = c.i[4]; + + ((int * ALIGNED(16))a5)[0] = a.i[5]; + ((int * ALIGNED(16))a5)[1] = b.i[5]; + ((int * ALIGNED(16))a5)[2] = c.i[5]; + + ((int * ALIGNED(16))a6)[0] = a.i[6]; + ((int * ALIGNED(16))a6)[1] = b.i[6]; + ((int * ALIGNED(16))a6)[2] = c.i[6]; + + ((int * ALIGNED(16))a7)[0] = a.i[7]; + ((int * ALIGNED(16))a7)[1] = b.i[7]; + ((int * ALIGNED(16))a7)[2] = c.i[7]; + } + + inline void store_8x4_tr( const v8 &a, const v8 &b, const v8 &c, const v8 &d, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3, + void * ALIGNED(16) a4, void * ALIGNED(16) a5, + void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * ALIGNED(16))a0)[1] = b.i[0]; + ((int * ALIGNED(16))a0)[2] = c.i[0]; + ((int * ALIGNED(16))a0)[3] = d.i[0]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; + ((int * ALIGNED(16))a1)[2] = c.i[1]; + ((int * ALIGNED(16))a1)[3] = d.i[1]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; + ((int * ALIGNED(16))a2)[2] = c.i[2]; + ((int * ALIGNED(16))a2)[3] = d.i[2]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; + ((int * ALIGNED(16))a3)[2] = c.i[3]; + ((int * ALIGNED(16))a3)[3] = d.i[3]; + + ((int * ALIGNED(16))a4)[0] = a.i[4]; + ((int * ALIGNED(16))a4)[1] = b.i[4]; + ((int * ALIGNED(16))a4)[2] = c.i[4]; + ((int * ALIGNED(16))a4)[3] = d.i[4]; + + ((int * ALIGNED(16))a5)[0] = a.i[5]; + ((int * ALIGNED(16))a5)[1] = b.i[5]; + ((int * ALIGNED(16))a5)[2] = c.i[5]; + ((int * ALIGNED(16))a5)[3] = d.i[5]; + + ((int * ALIGNED(16))a6)[0] = a.i[6]; + ((int * ALIGNED(16))a6)[1] = b.i[6]; + ((int * ALIGNED(16))a6)[2] = c.i[6]; + ((int * ALIGNED(16))a6)[3] = d.i[6]; + + ((int * ALIGNED(16))a7)[0] = a.i[7]; + ((int * ALIGNED(16))a7)[1] = b.i[7]; 
+ ((int * ALIGNED(16))a7)[2] = c.i[7]; + ((int * ALIGNED(16))a7)[3] = d.i[7]; + } + + inline void store_8x8_tr( const v8 &a, const v8 &b, const v8 &c, const v8 &d, + const v8 &e, const v8 &f, const v8 &g, const v8 &h, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3, + void * ALIGNED(16) a4, void * ALIGNED(16) a5, + void * ALIGNED(16) a6, void * ALIGNED(16) a7 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * ALIGNED(16))a0)[1] = b.i[0]; + ((int * ALIGNED(16))a0)[2] = c.i[0]; + ((int * ALIGNED(16))a0)[3] = d.i[0]; + ((int * ALIGNED(16))a0)[4] = e.i[0]; + ((int * ALIGNED(16))a0)[5] = f.i[0]; + ((int * ALIGNED(16))a0)[6] = g.i[0]; + ((int * ALIGNED(16))a0)[7] = h.i[0]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; + ((int * ALIGNED(16))a1)[2] = c.i[1]; + ((int * ALIGNED(16))a1)[3] = d.i[1]; + ((int * ALIGNED(16))a1)[4] = e.i[1]; + ((int * ALIGNED(16))a1)[5] = f.i[1]; + ((int * ALIGNED(16))a1)[6] = g.i[1]; + ((int * ALIGNED(16))a1)[7] = h.i[1]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; + ((int * ALIGNED(16))a2)[2] = c.i[2]; + ((int * ALIGNED(16))a2)[3] = d.i[2]; + ((int * ALIGNED(16))a2)[4] = e.i[2]; + ((int * ALIGNED(16))a2)[5] = f.i[2]; + ((int * ALIGNED(16))a2)[6] = g.i[2]; + ((int * ALIGNED(16))a2)[7] = h.i[2]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; + ((int * ALIGNED(16))a3)[2] = c.i[3]; + ((int * ALIGNED(16))a3)[3] = d.i[3]; + ((int * ALIGNED(16))a3)[4] = e.i[3]; + ((int * ALIGNED(16))a3)[5] = f.i[3]; + ((int * ALIGNED(16))a3)[6] = g.i[3]; + ((int * ALIGNED(16))a3)[7] = h.i[3]; + + ((int * ALIGNED(16))a4)[0] = a.i[4]; + ((int * ALIGNED(16))a4)[1] = b.i[4]; + ((int * ALIGNED(16))a4)[2] = c.i[4]; + ((int * ALIGNED(16))a4)[3] = d.i[4]; + ((int * ALIGNED(16))a4)[4] = e.i[4]; + ((int * ALIGNED(16))a4)[5] = f.i[4]; + ((int * ALIGNED(16))a4)[6] = g.i[4]; + ((int * ALIGNED(16))a4)[7] = h.i[4]; + + ((int * ALIGNED(16))a5)[0] = a.i[5]; + ((int * ALIGNED(16))a5)[1] = b.i[5]; + ((int * ALIGNED(16))a5)[2] = c.i[5]; + ((int * ALIGNED(16))a5)[3] = d.i[5]; + ((int * ALIGNED(16))a5)[4] = e.i[5]; + ((int * ALIGNED(16))a5)[5] = f.i[5]; + ((int * ALIGNED(16))a5)[6] = g.i[5]; + ((int * ALIGNED(16))a5)[7] = h.i[5]; + + ((int * ALIGNED(16))a6)[0] = a.i[6]; + ((int * ALIGNED(16))a6)[1] = b.i[6]; + ((int * ALIGNED(16))a6)[2] = c.i[6]; + ((int * ALIGNED(16))a6)[3] = d.i[6]; + ((int * ALIGNED(16))a6)[4] = e.i[6]; + ((int * ALIGNED(16))a6)[5] = f.i[6]; + ((int * ALIGNED(16))a6)[6] = g.i[6]; + ((int * ALIGNED(16))a6)[7] = h.i[6]; + + ((int * ALIGNED(16))a7)[0] = a.i[7]; + ((int * ALIGNED(16))a7)[1] = b.i[7]; + ((int * ALIGNED(16))a7)[2] = c.i[7]; + ((int * ALIGNED(16))a7)[3] = d.i[7]; + ((int * ALIGNED(16))a7)[4] = e.i[7]; + ((int * ALIGNED(16))a7)[5] = f.i[7]; + ((int * ALIGNED(16))a7)[6] = g.i[7]; + ((int * ALIGNED(16))a7)[7] = h.i[7]; + } + + ////////////// + // v8int class + + class v8int : public v8 + { + // v8int prefix unary operator friends + + friend inline v8int operator +( const v8int & a ) ALWAYS_INLINE; + friend inline v8int operator -( const v8int & a ) ALWAYS_INLINE; + friend inline v8int operator ~( const v8int & a ) ALWAYS_INLINE; + friend inline v8int operator !( const v8int & a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v8int prefix increment / decrement operator friends + + friend inline v8int operator ++( v8int & a ) ALWAYS_INLINE; + friend inline v8int operator --( v8int & a ) 
ALWAYS_INLINE; + + // v8int postfix increment / decrement operator friends + + friend inline v8int operator ++( v8int & a, int ) ALWAYS_INLINE; + friend inline v8int operator --( v8int & a, int ) ALWAYS_INLINE; + + // v8int binary operator friends + + friend inline v8int operator +( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator -( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator *( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator /( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator %( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator ^( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator &( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator |( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator <<( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator >>( const v8int &a, const v8int &b ) ALWAYS_INLINE; + + // v8int logical operator friends + + friend inline v8int operator <( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator >( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator ==( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator !=( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator <=( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator >=( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator &&( const v8int &a, const v8int &b ) ALWAYS_INLINE; + friend inline v8int operator ||( const v8int &a, const v8int &b ) ALWAYS_INLINE; + + // v8int miscellaneous friends + + friend inline v8int abs( const v8int &a ) ALWAYS_INLINE; + friend inline v8 czero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + friend inline v8 notczero( const v8int &c, const v8 &a ) ALWAYS_INLINE; + // FIXME: cswap, notcswap! 
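+ // The FIXME above flags that conditional-swap helpers (cswap/notcswap) are
+ // still missing. A sketch of what a portable cswap could look like, using the
+ // same per-lane loop idiom as the czero/notczero helpers declared just above
+ // (editorial illustration only; it is not added by this patch, and like the
+ // other helpers it would also need a friend declaration in v8):
+ //
+ //   inline void cswap( const v8int &c, v8 &a, v8 &b )
+ //   {
+ //     ALWAYS_VECTORIZE
+ //     for( int j = 0; j < 8; j++ )
+ //     {
+ //       const int t = ( a.i[j] ^ b.i[j] ) & c.i[j]; // per-lane swap mask
+ //       a.i[j] ^= t;   // lanes where c is all-ones exchange a and b,
+ //       b.i[j] ^= t;   // lanes where c is zero are left untouched
+ //     }
+ //   }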
+ friend inline v8 merge( const v8int &c, const v8 &t, const v8 &f ) ALWAYS_INLINE; + + // v8float unary operator friends + + friend inline v8int operator !( const v8float & a ) ALWAYS_INLINE; + + // v8float logical operator friends + + friend inline v8int operator <( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator >( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator ==( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator !=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator <=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator >=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator &&( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator ||( const v8float &a, const v8float &b ) ALWAYS_INLINE; + + // v8float miscellaneous friends + + friend inline v8float clear_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float set_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float toggle_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + + public: + + // v8int constructors / destructors + + v8int() {} // Default constructor + + v8int( const v8int &a ) // Copy constructor + { + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + i[j] = a.i[j]; + } + + v8int( const v8 &a ) // Init from mixed + { + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + i[j] = a.i[j]; + } + + v8int( int a ) // Init from scalar + { + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + i[j] = a; + } + + v8int( int i0, int i1, int i2, int i3, // Init from scalars + int i4, int i5, int i6, int i7 ) + { + i[0] = i0; + i[1] = i1; + i[2] = i2; + i[3] = i3; + i[4] = i4; + i[5] = i5; + i[6] = i6; + i[7] = i7; + } + + ~v8int() {} // Destructor + + // v8int assignment operators + +# define ASSIGN(op) \ + inline v8int &operator op( const v8int &b ) \ + { \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 8; j++ ) \ + i[j] op b.i[j]; \ + return *this; \ + } + + ASSIGN( =) + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + ASSIGN(%=) + ASSIGN(^=) + ASSIGN(&=) + ASSIGN(|=) + ASSIGN(<<=) + ASSIGN(>>=) + +# undef ASSIGN + + // v8int member access operator + + inline int &operator []( int n ) + { + return i[n]; + } + + inline int operator ()( int n ) + { + return i[n]; + } + }; + + // v8int prefix unary operators + +# define PREFIX_UNARY(op) \ + inline v8int operator op( const v8int & a ) \ + { \ + v8int b; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 8; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } + + PREFIX_UNARY(+) + PREFIX_UNARY(-) + + inline v8int operator !( const v8int & a ) + { + v8int b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + b.i[j] = - ( !a.i[j] ); + + return b; + } + + PREFIX_UNARY(~) + +# undef PREFIX_UNARY + + // v8int prefix increment / decrement + +# define PREFIX_INCDEC(op) \ + inline v8int operator op( v8int & a ) \ + { \ + v8int b; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 8; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } + + PREFIX_INCDEC(++) + PREFIX_INCDEC(--) + +# undef PREFIX_INCDEC + + // v8int postfix increment / decrement + +# define POSTFIX_INCDEC(op) \ + inline v8int operator op( v8int & a, int ) \ + { \ + v8int b; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 8; j++ ) \ + b.i[j] = ( a.i[j] op ); \ + return b; \ + } + + POSTFIX_INCDEC(++) + POSTFIX_INCDEC(--) + +# undef POSTFIX_INCDEC + + // v8int binary 
operators + +# define BINARY(op) \ + inline v8int operator op( const v8int &a, const v8int &b ) \ + { \ + v8int c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 8; j++ ) \ + c.i[j] = a.i[j] op b.i[j]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + BINARY(%) + BINARY(^) + BINARY(&) + BINARY(|) + BINARY(<<) + BINARY(>>) + +# undef BINARY + + // v8int logical operators + +# define LOGICAL(op) \ + inline v8int operator op( const v8int &a, const v8int &b ) \ + { \ + v8int c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 8; j++ ) \ + c.i[j] = - ( a.i[j] op b.i[j] ); \ + return c; \ + } + + LOGICAL(<) + LOGICAL(>) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v8int miscellaneous functions + + inline v8int abs( const v8int &a ) + { + v8int b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + b.i[j] = ( a.i[j] >= 0 ) ? a.i[j] : -a.i[j]; + + return b; + } + + inline v8 czero( const v8int &c, const v8 &a ) + { + v8 b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + b.i[j] = a.i[j] & ~c.i[j]; + + return b; + } + + inline v8 notczero( const v8int &c, const v8 &a ) + { + v8 b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + b.i[j] = a.i[j] & c.i[j]; + + return b; + } + + inline v8 merge( const v8int &c, const v8 &t, const v8 &f ) + { + v8 m; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); + + return m; + } + + //////////////// + // v8float class + + class v8float : public v8 + { + // v8float prefix unary operator friends + + friend inline v8float operator +( const v8float &a ) ALWAYS_INLINE; + friend inline v8float operator -( const v8float &a ) ALWAYS_INLINE; + friend inline v8float operator ~( const v8float &a ) ALWAYS_INLINE; + friend inline v8int operator !( const v8float &a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v8float prefix increment / decrement operator friends + + friend inline v8float operator ++( v8float &a ) ALWAYS_INLINE; + friend inline v8float operator --( v8float &a ) ALWAYS_INLINE; + + // v8float postfix increment / decrement operator friends + + friend inline v8float operator ++( v8float &a, int ) ALWAYS_INLINE; + friend inline v8float operator --( v8float &a, int ) ALWAYS_INLINE; + + // v8float binary operator friends + + friend inline v8float operator +( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8float operator -( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8float operator *( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8float operator /( const v8float &a, const v8float &b ) ALWAYS_INLINE; + + // v8float logical operator friends + + friend inline v8int operator <( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator >( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator ==( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator !=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator <=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator >=( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator &&( const v8float &a, const v8float &b ) ALWAYS_INLINE; + friend inline v8int operator ||( const v8float &a, const v8float &b ) ALWAYS_INLINE; + + // v8float math library friends + +# define CMATH_FR1(fn) friend inline 
v8float fn( const v8float &a ) ALWAYS_INLINE +# define CMATH_FR2(fn) friend inline v8float fn( const v8float &a, \ + const v8float &b ) ALWAYS_INLINE + + CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); + CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); + CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); + CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); + CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); + + CMATH_FR2(copysign); + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v8float miscellaneous friends + + friend inline v8float rsqrt_approx( const v8float &a ) ALWAYS_INLINE; + friend inline v8float rsqrt ( const v8float &a ) ALWAYS_INLINE; + friend inline v8float rcp_approx( const v8float &a ) ALWAYS_INLINE; + friend inline v8float rcp ( const v8float &a ) ALWAYS_INLINE; + friend inline v8float fma ( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; + friend inline v8float fms ( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; + friend inline v8float fnms( const v8float &a, const v8float &b, const v8float &c ) ALWAYS_INLINE; + friend inline v8float clear_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float set_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline v8float toggle_bits( const v8int &m, const v8float &a ) ALWAYS_INLINE; + friend inline void increment_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; + friend inline void decrement_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; + friend inline void scale_8x1( float * ALIGNED(16) p, const v8float &a ) ALWAYS_INLINE; + + public: + + // v8float constructors / destructors + + v8float() {} // Default constructor + + v8float( const v8float &a ) // Copy constructor + { + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + f[j] = a.f[j]; + } + + v8float( const v8 &a ) // Init from mixed + { + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + f[j] = a.f[j]; + } + + v8float( float a ) // Init from scalar + { + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + f[j] = a; + } + + v8float( float f0, float f1, float f2, float f3, + float f4, float f5, float f6, float f7 ) // Init from scalars + { + f[0] = f0; f[1] = f1; f[2] = f2; f[3] = f3; + f[4] = f4; f[5] = f5; f[6] = f6; f[7] = f7; + } + + ~v8float() {} // Destructor + + // v8float assignment operators + +# define ASSIGN(op) \ + inline v8float &operator op( const v8float &b ) \ + { \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 8; j++ ) \ + f[j] op b.f[j]; \ + return *this; \ + } + + ASSIGN(=) + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + +# undef ASSIGN + + // v8float member access operator + + inline float &operator []( int n ) + { + return f[n]; + } + + inline float operator ()( int n ) + { + return f[n]; + } + }; + + // v8float prefix unary operators + + inline v8float operator +( const v8float &a ) + { + v8float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + b.f[j] = +a.f[j]; + + return b; + } + + inline v8float operator -( const v8float &a ) + { + v8float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + b.f[j] = -a.f[j]; + + return b; + } + + inline v8int operator !( const v8float &a ) + { + v8int b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + b.i[j] = a.i[j] ? 
0 : -1; + + return b; + } + + // v8float prefix increment / decrement operators + + inline v8float operator ++( v8float &a ) + { + v8float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + b.f[j] = ++a.f[j]; + + return b; + } + + inline v8float operator --( v8float &a ) + { + v8float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + b.f[j] = --a.f[j]; + + return b; + } + + // v8float postfix increment / decrement operators + + inline v8float operator ++( v8float &a, int ) + { + v8float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + b.f[j] = a.f[j]++; + + return b; + } + + inline v8float operator --( v8float &a, int ) + { + v8float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + b.f[j] = a.f[j]--; + + return b; + } + + // v8float binary operators + +# define BINARY(op) \ + inline v8float operator op( const v8float &a, const v8float &b ) \ + { \ + v8float c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 8; j++ ) \ + c.f[j] = a.f[j] op b.f[j]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + +# undef BINARY + + // v8float logical operators + +# define LOGICAL(op) \ + inline v8int operator op( const v8float &a, const v8float &b ) \ + { \ + v8int c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 8; j++ ) \ + c.i[j] = - ( a.f[j] op b.f[j] ); \ + return c; \ + } + + LOGICAL(< ) + LOGICAL(> ) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v8float math library functions + +# define CMATH_FR1(fn) \ + inline v8float fn( const v8float &a ) \ + { \ + v8float b; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 8; j++ ) \ + b.f[j] = ::fn( a.f[j] ); \ + return b; \ + } + +# define CMATH_FR2(fn) \ + inline v8float fn( const v8float &a, const v8float &b ) \ + { \ + v8float c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 8; j++ ) \ + c.f[j] = ::fn( a.f[j], b.f[j] ); \ + return c; \ + } + + CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) + CMATH_FR1(ceil) CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) + CMATH_FR1(fabs) CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) + CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) + CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) + + inline v8float copysign( const v8float &a, const v8float &b ) + { + v8float c; + float t; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + { + t = ::fabs( a.f[j] ); + if( b.f[j] < 0 ) t = -t; + c.f[j] = t; + } + + return c; + } + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v8float miscellaneous functions + + inline v8float rsqrt_approx( const v8float &a ) + { + v8float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + b.f[j] = ::sqrt( 1.0f / a.f[j] ); + + return b; + } + + inline v8float rsqrt( const v8float &a ) + { + v8float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + b.f[j] = ::sqrt( 1.0f / a.f[j] ); + + return b; + } + + inline v8float rcp_approx( const v8float &a ) + { + v8float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + b.f[j] = 1.0f / a.f[j]; + + return b; + } + + inline v8float rcp( const v8float &a ) + { + v8float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + b.f[j] = 1.0f / a.f[j]; + + return b; + } + + inline v8float fma( const v8float &a, const v8float &b, const v8float &c ) + { + v8float d; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + d.f[j] = a.f[j] * b.f[j] + c.f[j]; + + return d; + } + + inline v8float fms( const v8float &a, const v8float &b, const v8float &c ) + { + v8float d; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + 
d.f[j] = a.f[j] * b.f[j] - c.f[j]; + + return d; + } + + inline v8float fnms( const v8float &a, const v8float &b, const v8float &c ) + { + v8float d; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + d.f[j] = c.f[j] - a.f[j] * b.f[j]; + + return d; + } + + inline v8float clear_bits( const v8int &m, const v8float &a ) + { + v8float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + b.i[j] = ( ~m.i[j] ) & a.i[j]; + + return b; + } + + inline v8float set_bits( const v8int &m, const v8float &a ) + { + v8float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + b.i[j] = m.i[j] | a.i[j]; + + return b; + } + + inline v8float toggle_bits( const v8int &m, const v8float &a ) + { + v8float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + b.i[j] = m.i[j] ^ a.i[j]; + + return b; + } + + inline void increment_8x1( float * ALIGNED(16) p, const v8float &a ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + p[j] += a.f[j]; + } + + inline void decrement_8x1( float * ALIGNED(16) p, const v8float &a ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + p[j] -= a.f[j]; + } + + inline void scale_8x1( float * ALIGNED(16) p, const v8float &a ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 8; j++ ) + p[j] *= a.f[j]; + } + +} // namespace v8 + +#endif // _v8_portable_h_ diff --git a/src/vpic/advance.cc b/src/vpic/advance.cc index 2a83e89b..4b2d13b6 100644 --- a/src/vpic/advance.cc +++ b/src/vpic/advance.cc @@ -205,4 +205,3 @@ int vpic_simulation::advance(void) { return 1; } - diff --git a/src/vpic/diagnostics.cc b/src/vpic/diagnostics.cc index ff50bcd3..9007045a 100644 --- a/src/vpic/diagnostics.cc +++ b/src/vpic/diagnostics.cc @@ -1,4 +1,4 @@ -/* +/* * Written by: * Kevin J. Bowers, Ph.D. * Plasma Physics Group (X-1) @@ -18,7 +18,7 @@ _iz = _iy/int(py); /* iz = iz */ \ _iy -= _iz*int(py); /* iy = iy */ \ (ix) = _ix; \ -} END_PRIMITIVE +} END_PRIMITIVE /*------------------------------------------------------------------------------ * Compute poynting flux at left boundary @@ -26,13 +26,14 @@ * Note: This should maybe be made more general by adding which face to use. * * Note: This implementation is taken from Brian's GB deck - * + * * Inputs: * e0 Peak instantaneous E field in "natural units" *----------------------------------------------------------------------------*/ // FIXME: THIS COULD BE WRITTEN MUCH CLEANER NOW double vpic_simulation::poynting_flux(double e0) { - double psum, gpsum; + double psum = 0.0; + double gpsum = 0.0; int stride = (grid->ny-1)*(grid->nz-1); float * pvec = new float[stride]; @@ -40,7 +41,7 @@ double vpic_simulation::poynting_flux(double e0) { if(!pvec) { ERROR(("Failed pvec allocation in poynting flux diagnostic")); } // if - + memset(pvec, 0, stride); int ix, k1, k2; @@ -67,7 +68,7 @@ double vpic_simulation::poynting_flux(double e0) { for(int i(0); i #include "vpic.h" #include "dumpmacros.h" #include "../util/io/FileUtils.h" - + /* -1 means no ranks talk */ #define VERBOSE_rank -1 // FIXME: NEW FIELDS IN THE GRID READ/WRITE WAS HACKED UP TO BE BACKWARD // COMPATIBLE WITH EXISTING EXTERNAL 3RD PARTY VISUALIZATION SOFTWARE. // IN THE LONG RUN, THIS EXTERNAL SOFTWARE WILL NEED TO BE UPDATED. 
- + int vpic_simulation::dump_mkdir(const char * dname) { return FileUtils::makeDirectory(dname); } // dump_mkdir @@ -33,7 +33,7 @@ int vpic_simulation::dump_cwd(char * dname, size_t size) { /***************************************************************************** * ASCII dump IO *****************************************************************************/ - + void vpic_simulation::dump_energies( const char *fname, int append ) { @@ -41,9 +41,9 @@ vpic_simulation::dump_energies( const char *fname, species_t *sp; FileIO fileIO; FileIOStatus status(fail); - + if( !fname ) ERROR(("Invalid file name")); - + if( rank()==0 ) { status = fileIO.open(fname, append ? io_append : io_write); if( status==fail ) ERROR(( "Could not open \"%s\".", fname )); @@ -55,34 +55,34 @@ vpic_simulation::dump_energies( const char *fname, fileIO.print( "\n" ); fileIO.print( "%% timestep = %e\n", grid->dt ); } - fileIO.print( "%li", (long)step() ); + fileIO.print( "%li ", (long)step() ); } } - + field_array->kernel->energy_f( en_f, field_array ); if( rank()==0 && status!=fail ) - fileIO.print( " %e %e %e %e %e %e", + fileIO.print( "%e %e %e %e %e %e", en_f[0], en_f[1], en_f[2], en_f[3], en_f[4], en_f[5] ); - + LIST_FOR_EACH(sp,species_list) { en_p = energy_p( sp, interpolator_array ); if( rank()==0 && status!=fail ) fileIO.print( " %e", en_p ); } - + if( rank()==0 && status!=fail ) { fileIO.print( "\n" ); if( fileIO.close() ) ERROR(("File close failed on dump energies!!!")); } } - + // Note: dump_species/materials assume that names do not contain any \n! - + void vpic_simulation::dump_species( const char *fname ) { species_t *sp; FileIO fileIO; - + if( rank() ) return; if( !fname ) ERROR(( "Invalid file name" )); MESSAGE(( "Dumping species to \"%s\"", fname )); @@ -92,7 +92,7 @@ vpic_simulation::dump_species( const char *fname ) { fileIO.print( "%s %i %e %e", sp->name, sp->id, sp->q, sp->m ); if( fileIO.close() ) ERROR(( "File close failed on dump species!!!" )); } - + void vpic_simulation::dump_materials( const char *fname ) { FileIO fileIO; @@ -110,11 +110,11 @@ vpic_simulation::dump_materials( const char *fname ) { m->sigmax, m->sigmay, m->sigmaz ); if( fileIO.close() ) ERROR(( "File close failed on dump materials!!!" )); } - + /***************************************************************************** * Binary dump IO *****************************************************************************/ - + /* enum dump_types { grid_dump = 0, @@ -124,7 +124,8 @@ enum dump_types { restart_dump = 4 }; */ - + +// TODO: should this be an enum? 
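The TODO above asks whether these tags should be an enum. A minimal sketch of a scoped-enum alternative to the integer constants that follow (editorial illustration only, not part of this patch; "dump_kind" is an illustrative name chosen to avoid clashing with the existing dump_type namespace):

    // Same tag values as the dump_type namespace below, as a scoped enum.
    enum class dump_kind : int {
      grid_dump     = 0,
      field_dump    = 1,
      hydro_dump    = 2,
      particle_dump = 3,
      restart_dump  = 4,
      history_dump  = 5
    };

    // Call sites such as WRITE_HEADER_V0 would then pass the tag as
    // static_cast<int>( dump_kind::grid_dump ) so the on-disk header format
    // (a plain int) stays unchanged.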
namespace dump_type { const int grid_dump = 0; const int field_dump = 1; @@ -133,20 +134,20 @@ namespace dump_type { const int restart_dump = 4; const int history_dump = 5; } // namespace - + void vpic_simulation::dump_grid( const char *fbase ) { char fname[256]; FileIO fileIO; int dim[4]; - + if( !fbase ) ERROR(( "Invalid filename" )); if( rank()==0 ) MESSAGE(( "Dumping grid to \"%s\"", fbase )); - + sprintf( fname, "%s.%i", fbase, rank() ); FileIOStatus status = fileIO.open(fname, io_write); if( status==fail ) ERROR(( "Could not open \"%s\".", fname )); - + /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ nxout = grid->nx; nyout = grid->ny; @@ -156,43 +157,43 @@ vpic_simulation::dump_grid( const char *fbase ) { dzout = grid->dz; WRITE_HEADER_V0( dump_type::grid_dump, -1, 0, fileIO ); - + dim[0] = 3; dim[1] = 3; dim[2] = 3; WRITE_ARRAY_HEADER( grid->bc, 3, dim, fileIO ); fileIO.write( grid->bc, dim[0]*dim[1]*dim[2] ); - + dim[0] = nproc()+1; WRITE_ARRAY_HEADER( grid->range, 1, dim, fileIO ); fileIO.write( grid->range, dim[0] ); - + dim[0] = 6; dim[1] = grid->nx+2; dim[2] = grid->ny+2; dim[3] = grid->nz+2; WRITE_ARRAY_HEADER( grid->neighbor, 4, dim, fileIO ); fileIO.write( grid->neighbor, dim[0]*dim[1]*dim[2]*dim[3] ); - + if( fileIO.close() ) ERROR(( "File close failed on dump grid!!!" )); } - + void vpic_simulation::dump_fields( const char *fbase, int ftag ) { char fname[256]; FileIO fileIO; int dim[3]; - + if( !fbase ) ERROR(( "Invalid filename" )); - + if( rank()==0 ) MESSAGE(( "Dumping fields to \"%s\"", fbase )); - + if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step(), rank() ); else sprintf( fname, "%s.%i", fbase, rank() ); - + FileIOStatus status = fileIO.open(fname, io_write); if( status==fail ) ERROR(( "Could not open \"%s\".", fname )); - + /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ nxout = grid->nx; nyout = grid->ny; @@ -202,7 +203,7 @@ vpic_simulation::dump_fields( const char *fbase, int ftag ) { dzout = grid->dz; WRITE_HEADER_V0( dump_type::field_dump, -1, 0, fileIO ); - + dim[0] = grid->nx+2; dim[1] = grid->ny+2; dim[2] = grid->nz+2; @@ -210,7 +211,7 @@ vpic_simulation::dump_fields( const char *fbase, int ftag ) { fileIO.write( field_array->f, dim[0]*dim[1]*dim[2] ); if( fileIO.close() ) ERROR(( "File close failed on dump fields!!!" 
)); } - + void vpic_simulation::dump_hydro( const char *sp_name, const char *fbase, @@ -219,24 +220,24 @@ vpic_simulation::dump_hydro( const char *sp_name, char fname[256]; FileIO fileIO; int dim[3]; - + sp = find_species_name( sp_name, species_list ); if( !sp ) ERROR(( "Invalid species \"%s\"", sp_name )); - + clear_hydro_array( hydro_array ); accumulate_hydro_p( hydro_array, sp, interpolator_array ); synchronize_hydro_array( hydro_array ); - + if( !fbase ) ERROR(( "Invalid filename" )); - + if( rank()==0 ) MESSAGE(("Dumping \"%s\" hydro fields to \"%s\"",sp->name,fbase)); - + if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step(), rank() ); else sprintf( fname, "%s.%i", fbase, rank() ); FileIOStatus status = fileIO.open(fname, io_write); if( status==fail) ERROR(( "Could not open \"%s\".", fname )); - + /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ nxout = grid->nx; nyout = grid->ny; @@ -246,7 +247,7 @@ vpic_simulation::dump_hydro( const char *sp_name, dzout = grid->dz; WRITE_HEADER_V0( dump_type::hydro_dump,sp->id,sp->q/sp->m,fileIO); - + dim[0] = grid->nx+2; dim[1] = grid->ny+2; dim[2] = grid->nz+2; @@ -254,7 +255,7 @@ vpic_simulation::dump_hydro( const char *sp_name, fileIO.write( hydro_array->h, dim[0]*dim[1]*dim[2] ); if( fileIO.close() ) ERROR(( "File close failed on dump hydro!!!" )); } - + void vpic_simulation::dump_particles( const char *sp_name, const char *fbase, @@ -265,22 +266,22 @@ vpic_simulation::dump_particles( const char *sp_name, int dim[1], buf_start; static particle_t * ALIGNED(128) p_buf = NULL; # define PBUF_SIZE 32768 // 1MB of particles - + sp = find_species_name( sp_name, species_list ); if( !sp ) ERROR(( "Invalid species name \"%s\".", sp_name )); - + if( !fbase ) ERROR(( "Invalid filename" )); - + if( !p_buf ) MALLOC_ALIGNED( p_buf, PBUF_SIZE, 128 ); - + if( rank()==0 ) MESSAGE(("Dumping \"%s\" particles to \"%s\"",sp->name,fbase)); - + if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step(), rank() ); else sprintf( fname, "%s.%i", fbase, rank() ); FileIOStatus status = fileIO.open(fname, io_write); if( status==fail ) ERROR(( "Could not open \"%s\"", fname )); - + /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ nxout = grid->nx; nyout = grid->ny; @@ -290,18 +291,18 @@ vpic_simulation::dump_particles( const char *sp_name, dzout = grid->dz; WRITE_HEADER_V0( dump_type::particle_dump, sp->id, sp->q/sp->m, fileIO ); - + dim[0] = sp->np; WRITE_ARRAY_HEADER( p_buf, 1, dim, fileIO ); - + // Copy a PBUF_SIZE hunk of the particle list into the particle // buffer, timecenter it and write it out. This is done this way to // guarantee the particle list unchanged while not requiring too // much memory. - + // FIXME: WITH A PIPELINED CENTER_P, PBUF NOMINALLY SHOULD BE QUITE // LARGE. 
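As a quick check on the "1MB of particles" comment attached to PBUF_SIZE above: assuming the usual 32-byte VPIC particle_t layout (dx, dy, dz, i plus ux, uy, uz, w, i.e. seven floats and one int32 — an assumption stated here, not spelled out in this diff), the buffer size works out to

    32768 particles * 32 bytes/particle = 1,048,576 bytes = 1 MiB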
- + particle_t * sp_p = sp->p; sp->p = p_buf; int sp_np = sp->np; sp->np = 0; int sp_max_np = sp->max_np; sp->max_np = PBUF_SIZE; @@ -310,14 +311,14 @@ vpic_simulation::dump_particles( const char *sp_name, COPY( sp->p, &sp_p[buf_start], sp->np ); center_p( sp, interpolator_array ); fileIO.write( sp->p, sp->np ); - } + } sp->p = sp_p; sp->np = sp_np; sp->max_np = sp_max_np; - + if( fileIO.close() ) ERROR(("File close failed on dump particles!!!")); } - + /*------------------------------------------------------------------------------ * New dump logic *---------------------------------------------------------------------------*/ @@ -390,69 +391,69 @@ void vpic_simulation::global_header( const char * base, std::vector dumpParams ) { if( rank() ) return; - + // Open the file for output char filename[256]; sprintf(filename, "%s.vpc", base); - + FileIO fileIO; FileIOStatus status; - + status = fileIO.open(filename, io_write); if(status == fail) ERROR(("Failed opening file: %s", filename)); - + print_hashed_comment(fileIO, "Header version information"); fileIO.print("VPIC_HEADER_VERSION 1.0.0\n\n"); - + print_hashed_comment(fileIO, "Header size for data file headers in bytes"); fileIO.print("DATA_HEADER_SIZE 123\n\n"); - + // Global grid inforation print_hashed_comment(fileIO, "Time step increment"); fileIO.print("GRID_DELTA_T %f\n\n", grid->dt); - + print_hashed_comment(fileIO, "GRID_CVAC"); fileIO.print("GRID_CVAC %f\n\n", grid->cvac); - + print_hashed_comment(fileIO, "GRID_EPS0"); fileIO.print("GRID_EPS0 %f\n\n", grid->eps0); - + print_hashed_comment(fileIO, "Grid extents in the x-dimension"); fileIO.print("GRID_EXTENTS_X %f %f\n\n", grid->x0, grid->x1); - + print_hashed_comment(fileIO, "Grid extents in the y-dimension"); fileIO.print("GRID_EXTENTS_Y %f %f\n\n", grid->y0, grid->y1); - + print_hashed_comment(fileIO, "Grid extents in the z-dimension"); fileIO.print("GRID_EXTENTS_Z %f %f\n\n", grid->z0, grid->z1); - + print_hashed_comment(fileIO, "Spatial step increment in x-dimension"); fileIO.print("GRID_DELTA_X %f\n\n", grid->dx); - + print_hashed_comment(fileIO, "Spatial step increment in y-dimension"); fileIO.print("GRID_DELTA_Y %f\n\n", grid->dy); - + print_hashed_comment(fileIO, "Spatial step increment in z-dimension"); fileIO.print("GRID_DELTA_Z %f\n\n", grid->dz); - + print_hashed_comment(fileIO, "Domain partitions in x-dimension"); fileIO.print("GRID_TOPOLOGY_X %d\n\n", px); - + print_hashed_comment(fileIO, "Domain partitions in y-dimension"); fileIO.print("GRID_TOPOLOGY_Y %d\n\n", py); - + print_hashed_comment(fileIO, "Domain partitions in z-dimension"); fileIO.print("GRID_TOPOLOGY_Z %d\n\n", pz); - + // Global data inforation assert(dumpParams.size() >= 2); - + print_hashed_comment(fileIO, "Field data information"); fileIO.print("FIELD_DATA_DIRECTORY %s\n", dumpParams[0]->baseDir); fileIO.print("FIELD_DATA_BASE_FILENAME %s\n", dumpParams[0]->baseFileName); - + // Create a variable list of field values to output. 
size_t numvars = std::min(dumpParams[0]->output_vars.bitsum(field_indeces, total_field_groups), @@ -461,20 +462,20 @@ vpic_simulation::global_header( const char * base, for(size_t v(0), c(0); voutput_vars.bitset(field_indeces[v])) varlist[c++] = v; - + // output variable list fileIO.print("FIELD_DATA_VARIABLES %d\n", numvars); - + for(size_t v(0); voutput_vars.bitsum(hydro_indeces, total_hydro_groups), total_hydro_groups); - + sprintf(species_comment, "Species(%d) data information", (int)i); print_hashed_comment(fileIO, species_comment); fileIO.print("SPECIES_DATA_DIRECTORY %s\n", dumpParams[i]->baseDir); fileIO.print("SPECIES_DATA_BASE_FILENAME %s\n", dumpParams[i]->baseFileName); - + fileIO.print("HYDRO_DATA_VARIABLES %d\n", numvars); - + varlist = new size_t[numvars]; for(size_t v(0), c(0); voutput_vars.bitset(hydro_indeces[v])) varlist[c++] = v; - + for(size_t v(0); vnx, istride) != 0) ERROR(("x stride must be an integer factor of nx")); @@ -545,9 +546,9 @@ vpic_simulation::field_dump( DumpParameters & dumpParams ) { ERROR(("y stride must be an integer factor of ny")); if(remainder(grid->nz, kstride) != 0) ERROR(("z stride must be an integer factor of nz")); - + int dim[3]; - + /* define to do C-style indexing */ # define f(x,y,z) f[ VOXEL(x,y,z, grid->nx,grid->ny,grid->nz) ] @@ -568,13 +569,13 @@ vpic_simulation::field_dump( DumpParameters & dumpParams ) { * plus every "stride" elements in that dimension. */ if(dumpParams.format == band) { - + WRITE_HEADER_V0(dump_type::field_dump, -1, 0, fileIO); - + dim[0] = nxout+2; dim[1] = nyout+2; dim[2] = nzout+2; - + if( rank()==VERBOSE_rank ) { std::cerr << "nxout: " << nxout << std::endl; std::cerr << "nyout: " << nyout << std::endl; @@ -583,19 +584,19 @@ vpic_simulation::field_dump( DumpParameters & dumpParams ) { std::cerr << "ny: " << grid->ny << std::endl; std::cerr << "nz: " << grid->nz << std::endl; } - + WRITE_ARRAY_HEADER(field_array->f, 3, dim, fileIO); - + // Create a variable list of field values to output. size_t numvars = std::min(dumpParams.output_vars.bitsum(), total_field_variables); size_t * varlist = new size_t[numvars]; - + for(size_t i(0), c(0); if, 3, dim, fileIO); - + if(istride == 1 && jstride == 1 && kstride == 1) fileIO.write(field_array->f, dim[0]*dim[1]*dim[2]); else @@ -649,7 +650,7 @@ vpic_simulation::field_dump( DumpParameters & dumpParams ) { } } } - + # undef f if( fileIO.close() ) ERROR(( "File close failed on field dump!!!" )); @@ -663,12 +664,12 @@ vpic_simulation::hydro_dump( const char * speciesname, char timeDir[256]; sprintf(timeDir, "%s/T.%ld", dumpParams.baseDir, (long)step()); dump_mkdir(timeDir); - + // Open the file for output char filename[256]; sprintf( filename, "%s/T.%ld/%s.%ld.%d", dumpParams.baseDir, (long)step(), dumpParams.baseFileName, (long)step(), rank() ); - + FileIO fileIO; FileIOStatus status; @@ -681,12 +682,12 @@ vpic_simulation::hydro_dump( const char * speciesname, clear_hydro_array( hydro_array ); accumulate_hydro_p( hydro_array, sp, interpolator_array ); synchronize_hydro_array( hydro_array ); - + // convenience const size_t istride(dumpParams.stride_x); const size_t jstride(dumpParams.stride_y); const size_t kstride(dumpParams.stride_z); - + // Check stride values. 
if(remainder(grid->nx, istride) != 0) ERROR(("x stride must be an integer factor of nx")); @@ -694,9 +695,9 @@ vpic_simulation::hydro_dump( const char * speciesname, ERROR(("y stride must be an integer factor of ny")); if(remainder(grid->nz, kstride) != 0) ERROR(("z stride must be an integer factor of nz")); - + int dim[3]; - + /* define to do C-style indexing */ # define hydro(x,y,z) hydro_array->h[VOXEL(x,y,z, grid->nx,grid->ny,grid->nz)] @@ -707,7 +708,7 @@ vpic_simulation::hydro_dump( const char * speciesname, dxout = (grid->dx)*istride; dyout = (grid->dy)*jstride; dzout = (grid->dz)*kstride; - + /* Banded output will write data as a single block-array as opposed to * the Array-of-Structure format that is used for native storage. * @@ -717,15 +718,15 @@ vpic_simulation::hydro_dump( const char * speciesname, * plus every "stride" elements in that dimension. */ if(dumpParams.format == band) { - + WRITE_HEADER_V0(dump_type::hydro_dump, sp->id, sp->q/sp->m, fileIO); - + dim[0] = nxout+2; dim[1] = nyout+2; dim[2] = nzout+2; - + WRITE_ARRAY_HEADER(hydro_array->h, 3, dim, fileIO); - + /* * Create a variable list of hydro values to output. */ @@ -734,7 +735,7 @@ vpic_simulation::hydro_dump( const char * speciesname, size_t * varlist = new size_t[numvars]; for(size_t i(0), c(0); iid, sp->q/sp->m, fileIO); - + dim[0] = nxout; dim[1] = nyout; dim[2] = nzout; - + WRITE_ARRAY_HEADER(hydro_array->h, 3, dim, fileIO); - + if(istride == 1 && jstride == 1 && kstride == 1) fileIO.write(hydro_array->h, dim[0]*dim[1]*dim[2]); @@ -784,7 +785,7 @@ vpic_simulation::hydro_dump( const char * speciesname, } } } - + # undef hydro if( fileIO.close() ) ERROR(( "File close failed on hydro dump!!!" )); diff --git a/src/vpic/dumpmacros.h b/src/vpic/dumpmacros.h index 23fb3281..9e46bf6b 100644 --- a/src/vpic/dumpmacros.h +++ b/src/vpic/dumpmacros.h @@ -1,7 +1,7 @@ #ifndef dumpmacros_h #define dumpmacros_h -/* FIXME: WHEN THESE MACROS WERE HOISTED AND VARIOUS HACKS DONE TO THEm +/* FIXME: WHEN THESE MACROS WERE HOISTED AND VARIOUS HACKS DONE TO THEM THEY BECAME _VERY_ _DANGEROUS. 
*/ #define WRITE_HEADER_V0(dump_type,sp_id,q_m,fileIO) do { \ diff --git a/src/vpic/vpic.cc b/src/vpic/vpic.cc index 3426e0a7..4f150afc 100644 --- a/src/vpic/vpic.cc +++ b/src/vpic/vpic.cc @@ -80,11 +80,25 @@ vpic_simulation::vpic_simulation() { num_div_e_round = 2; num_div_b_round = 2; - int n_rng = serial.n_pipeline; - if( n_rng #include @@ -24,16 +24,12 @@ #include "../util/bitfield.h" #include "../util/checksum.h" #include "../util/system.h" - -#ifndef USER_GLOBAL_SIZE -#define USER_GLOBAL_SIZE 16384 -#endif - + #ifndef NVARHISMX #define NVARHISMX 250 #endif // #include "dumpvars.h" - + typedef FileIO FILETYPE; const uint32_t all (0xffffffff); @@ -125,11 +121,11 @@ class vpic_simulation { void modify( const char *fname ); int advance( void ); void finalize( void ); - + private: - + // Directly initialized by user - + int verbose; // Should system be verbose int num_step; // Number of steps to take int num_comm_round; // Num comm round @@ -139,7 +135,7 @@ class vpic_simulation { int clean_div_b_interval; // How often to clean div b int num_div_b_round; // How many clean div b rounds per div b interval int sync_shared_interval; // How often to synchronize shared faces - + // FIXME: THESE INTERVALS SHOULDN'T BE PART OF vpic_simulation // THE BIG LIST FOLLOWING IT SHOULD BE CLEANED UP TOO @@ -148,7 +144,7 @@ class vpic_simulation { int hydro_interval; int field_interval; int particle_interval; - + size_t nxout, nyout, nzout; size_t px, py, pz; float dxout, dyout, dzout; @@ -171,9 +167,9 @@ class vpic_simulation { int stepdigit; int rankdigit; int ifenergies; - + // Helper initialized by user - + /* There are enough synchronous and local random number generators to permit the host thread plus all the pipeline threads for one dispatcher to simultaneously produce both synchronous and local @@ -196,15 +192,10 @@ class vpic_simulation { // emitter helpers collision_op_t * collision_op_list; // collision helpers - // User defined checkpt preserved variables - // Note: user_global is aliased with user_global_t (see deck_wrapper.cxx) - - char user_global[USER_GLOBAL_SIZE]; - /*---------------------------------------------------------------------------- * Diagnostics ---------------------------------------------------------------------------*/ - double poynting_flux(double e0); + double poynting_flux(double e0); /*---------------------------------------------------------------------------- * Check Sums @@ -222,7 +213,7 @@ class vpic_simulation { /////////////// // Dump helpers - + int dump_mkdir(const char * dname); int dump_cwd(char * dname, size_t size); @@ -230,7 +221,7 @@ class vpic_simulation { void dump_energies( const char *fname, int append = 1 ); void dump_materials( const char *fname ); void dump_species( const char *fname ); - + // Binary dumps void dump_grid( const char *fbase ); void dump_fields( const char *fbase, int fname_tag = 1 ); @@ -238,7 +229,7 @@ class vpic_simulation { int fname_tag = 1 ); void dump_particles( const char *sp_name, const char *fbase, int fname_tag = 1 ); - + // convenience functions for simlog output void create_field_list(char * strlist, DumpParameters & dumpParams); void create_hydro_list(char * strlist, DumpParameters & dumpParams); @@ -340,7 +331,7 @@ class vpic_simulation { // The below functions automatically create partition simple grids with // simple boundary conditions on the edges. 
- + inline void define_periodic_grid( double xl, double yl, double zl, double xh, double yh, double zh, @@ -351,7 +342,7 @@ class vpic_simulation { (int)gnx, (int)gny, (int)gnz, (int)gpx, (int)gpy, (int)gpz ); } - + inline void define_absorbing_grid( double xl, double yl, double zl, double xh, double yh, double zh, @@ -363,7 +354,7 @@ class vpic_simulation { (int)gpx, (int)gpy, (int)gpz, pbc ); } - + inline void define_reflecting_grid( double xl, double yl, double zl, double xh, double yh, double zh, @@ -374,33 +365,33 @@ class vpic_simulation { (int)gnx, (int)gny, (int)gnz, (int)gpx, (int)gpy, (int)gpz ); } - + // The below macros allow custom domains to be created - + // Creates a particle reflecting metal box in the local domain inline void size_domain( double lnx, double lny, double lnz ) { size_grid(grid,(int)lnx,(int)lny,(int)lnz); } - + // Attaches a local domain boundary to another domain inline void join_domain( int boundary, double rank ) { join_grid( grid, boundary, (int)rank ); } - + // Sets the field boundary condition of a local domain boundary inline void set_domain_field_bc( int boundary, int fbc ) { set_fbc( grid, boundary, fbc ); } - + // Sets the particle boundary condition of a local domain boundary inline void set_domain_particle_bc( int boundary, int pbc ) { set_pbc( grid, boundary, pbc ); } - + /////////////////// // Material helpers - + inline material_t * define_material( const char * name, double eps, @@ -413,7 +404,7 @@ class vpic_simulation { sigma, sigma, sigma, zeta, zeta, zeta ), &material_list ); } - + inline material_t * define_material( const char * name, double epsx, double epsy, double epsz, @@ -426,7 +417,7 @@ class vpic_simulation { sigmax, sigmay, sigmaz, zetax, zetay, zetaz ), &material_list ); } - + inline material_t * lookup_material( const char * name ) { return find_material_name( name, material_list ); @@ -436,10 +427,10 @@ class vpic_simulation { lookup_material( material_id id ) { return find_material_id( id, material_list ); } - + ////////////////////// // Field array helpers - + // If fa is provided, define_field_advance will use it (and take ownership // of the it). Otherwise the standard field array will be used with the // optionally provided radition damping parameter. @@ -447,7 +438,7 @@ class vpic_simulation { inline void define_field_array( field_array_t * fa = NULL, double damp = 0 ) { int nx1 = grid->nx + 1, ny1 = grid->ny+1, nz1 = grid->nz+1; - + if( grid->nx<1 || grid->ny<1 || grid->nz<1 ) ERROR(( "Define your grid before defining the field array" )); if( !material_list ) @@ -458,7 +449,7 @@ class vpic_simulation { interpolator_array = new_interpolator_array( grid ); accumulator_array = new_accumulator_array( grid ); hydro_array = new_hydro_array( grid ); - + // Pre-size communications buffers. 
This is done to get most memory // allocation over with before the simulation starts running @@ -468,7 +459,7 @@ class vpic_simulation { mp_size_recv_buffer(grid->mp,BOUNDARY( 0, 1, 0),nz1*nx1*sizeof(hydro_t)); mp_size_recv_buffer(grid->mp,BOUNDARY( 0, 0,-1),nx1*ny1*sizeof(hydro_t)); mp_size_recv_buffer(grid->mp,BOUNDARY( 0, 0, 1),nx1*ny1*sizeof(hydro_t)); - + mp_size_send_buffer(grid->mp,BOUNDARY(-1, 0, 0),ny1*nz1*sizeof(hydro_t)); mp_size_send_buffer(grid->mp,BOUNDARY( 1, 0, 0),ny1*nz1*sizeof(hydro_t)); mp_size_send_buffer(grid->mp,BOUNDARY( 0,-1, 0),nz1*nx1*sizeof(hydro_t)); @@ -478,10 +469,10 @@ class vpic_simulation { } // Other field helpers are provided by macros in deck_wrapper.cxx - + ////////////////// // Species helpers - + // FIXME: SILLY PROMOTIONS inline species_t * define_species( const char *name, @@ -500,11 +491,11 @@ class vpic_simulation { max_local_nm = 16*(MAX_PIPELINE+1); } return append_species( species( name, (float)q, (float)m, - (int)max_local_np, (int)max_local_nm, + (size_t)max_local_np, (size_t)max_local_nm, (int)sort_interval, (int)sort_out_of_place, grid ), &species_list ); } - + inline species_t * find_species( const char *name ) { return find_species_name( name, species_list ); @@ -514,20 +505,20 @@ class vpic_simulation { find_species( int32_t id ) { return find_species_id( id, species_list ); } - + /////////////////// // Particle helpers - + // Note: Don't use injection with aging during initialization - // Defaults in the declaration below enable backwards compatibility. + // Defaults in the declaration below enable backwards compatibility. void inject_particle( species_t * sp, double x, double y, double z, double ux, double uy, double uz, double w, double age = 0, int update_rhob = 1 ); - + // Inject particle raw is for power users! // No nannyism _at_ _all_: // - Availability of free stoarge is _not_ checked. @@ -536,7 +527,7 @@ class vpic_simulation { // - Injection with displacment may use up movers (i.e. don't use // injection with displacement during initialization). // This injection is _ultra_ _fast_. - + inline void inject_particle_raw( species_t * RESTRICT sp, float dx, float dy, float dz, int32_t i, @@ -545,7 +536,7 @@ class vpic_simulation { p->dx = dx; p->dy = dy; p->dz = dz; p->i = i; p->ux = ux; p->uy = uy; p->uz = uz; p->w = w; } - + // This variant does a raw inject and moves the particles inline void @@ -562,10 +553,10 @@ class vpic_simulation { if( update_rhob ) accumulate_rhob( field_array->f, p, grid, -sp->q ); sp->nm += move_p( sp->p, pm, accumulator_array->a, grid, sp->q ); } - + ////////////////////////////////// // Random number generator helpers - + // seed_rand seed the all the random number generators. The seed // used for the individual generators is based off the user provided // seed such each local generator in each process (rng[0:r-1]) gets @@ -588,12 +579,12 @@ class vpic_simulation { double dx = drand( rng ); return low*(1-dx) + high*dx; } - + // Normal random number with mean mu and standard deviation sigma inline double normal( rng_t * rng, double mu, double sigma ) { return mu + sigma*drandn( rng ); } - + ///////////////////////////////// // Emitter and particle bc helpers @@ -613,7 +604,7 @@ class vpic_simulation { // define_surface_emitter works and language limitations of // strict C++ prevent this.) 
- inline emitter_t * + inline emitter_t * define_emitter( emitter_t * e ) { return append_emitter( e, &emitter_list ); } @@ -630,18 +621,18 @@ class vpic_simulation { //////////////////////// // Miscellaneous helpers - + inline void abort( double code ) { nanodelay(2000000000); mp_abort((((int)code)<<17)+1); } - + // Truncate "a" to the nearest integer multiple of "b" inline double trunc_granular( double a, double b ) { return b*int(a/b); } - + // Compute the remainder of a/b inline double remainder( double a, double b ) { return std::remainder(a,b); } // remainder(a,b); - + // Compute the Courant length on a regular mesh inline double courant_length( double lx, double ly, double lz, double nx, double ny, double nz ) { @@ -651,7 +642,7 @@ class vpic_simulation { if( nz>1 ) w0 = nz/lz, w1 += w0*w0; return sqrt(1/w1); } - + ////////////////////////////////////////////////////////// // These friends are used by the checkpt / restore service @@ -661,7 +652,7 @@ class vpic_simulation { //////////////////////////////////////////////////////////// // User input deck provided functions (see deck_wrapper.cxx) - + void user_initialization( int argc, char **argv ); void user_particle_injection(void); void user_current_injection(void); @@ -669,5 +660,5 @@ class vpic_simulation { void user_diagnostics(void); void user_particle_collisions(void); }; - + #endif // vpic_h diff --git a/test/integrated/CMakeLists.txt b/test/integrated/CMakeLists.txt index b0938bd4..d78fb9b4 100644 --- a/test/integrated/CMakeLists.txt +++ b/test/integrated/CMakeLists.txt @@ -1,12 +1,3 @@ -# add the tests -set(MPI_NUM_RANKS 1) -set(ARGS "1 1") - -foreach(test accel cyclo inbndj interpe outbndj pcomm) - build_a_vpic(${test} ${CMAKE_CURRENT_SOURCE_DIR}/${test}.deck) -endforeach() -foreach(test accel cyclo inbndj interpe outbndj) - add_test(${test} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 1 ${MPIEXEC_PREFLAGS} ${test} ${MPIEXEC_POSTFLAGS} ${ARGS}) -endforeach() - -add_test(pcomm ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 8 ${MPIEXEC_PREFLAGS} pcomm ${MPIEXEC_POSTFLAGS} ${ARGS}) +add_subdirectory(particle_push) +add_subdirectory(legacy) +add_subdirectory(to_completion) diff --git a/test/integrated/config.py b/test/integrated/config.py deleted file mode 100644 index 05085fc8..00000000 --- a/test/integrated/config.py +++ /dev/null @@ -1,22 +0,0 @@ -#------------------------------------------------------------------------------# -# Unit test configuration -#------------------------------------------------------------------------------# - -bld.test('accel', - '${top_srcdir}/deck/main.cxx ${top_srcdir}/deck/wrapper.cxx', - '-DINPUT_DECK=accel.deck', [1]) -bld.test('cyclo', - '${top_srcdir}/deck/main.cxx ${top_srcdir}/deck/wrapper.cxx', - '-DINPUT_DECK=cyclo.deck', [1]) -bld.test('inbndj', - '${top_srcdir}/deck/main.cxx ${top_srcdir}/deck/wrapper.cxx', - '-DINPUT_DECK=inbndj.deck', [1]) -bld.test('interpe', - '${top_srcdir}/deck/main.cxx ${top_srcdir}/deck/wrapper.cxx', - '-DINPUT_DECK=interpe.deck', [1]) -bld.test('outbndj', - '${top_srcdir}/deck/main.cxx ${top_srcdir}/deck/wrapper.cxx', - '-DINPUT_DECK=outbndj.deck', [1]) -bld.test('pcomm', - '${top_srcdir}/deck/main.cxx ${top_srcdir}/deck/wrapper.cxx', - '-DINPUT_DECK=pcomm.deck', [8]) diff --git a/test/integrated/energy_comparison/CMakeLists.txt b/test/integrated/energy_comparison/CMakeLists.txt new file mode 100644 index 00000000..cb29533b --- /dev/null +++ b/test/integrated/energy_comparison/CMakeLists.txt @@ -0,0 +1,45 @@ +set(MPIEXEC_NUMPROC 1) +set(ARGS "") + +list(APPEND DEFAULT_ARG_TESTS + 
weibel # This test is a simple run which should not die + dump) # This is a simple run which should dump restart files + +list(APPEND RESTART_DECK dump) # Reuse existing deck and start half way +list(APPEND RESTART_BINARY restore) + +foreach(test ${DEFAULT_ARG_TESTS}) + build_a_vpic(${test} ${CMAKE_CURRENT_SOURCE_DIR}/${test}.deck) +endforeach() + +foreach(test ${DEFAULT_ARG_TESTS}) + add_test(${test} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_NUMPROC} + ${MPIEXEC_PREFLAGS} ${test} ${MPIEXEC_POSTFLAGS} ${ARGS}) +endforeach() + +# Try a parallel run +set (PARALLEL_TEST parallel) +build_a_vpic(${PARALLEL_TEST} ${CMAKE_CURRENT_SOURCE_DIR}/simple.deck) +add_test(${PARALLEL_TEST} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} + ${MPIEXEC_NUMPROC_PARALLEL} ${MPIEXEC_PREFLAGS} ${PARALLEL_TEST} + ${MPIEXEC_POSTFLAGS} ${ARGS}) + +# Try a threaded run +set (THREADED_TEST threaded) +list(APPEND THREADED_ARGS --tpp ${MPIEXEC_NUMPROC_PARALLEL}) + +build_a_vpic(${THREADED_TEST} ${CMAKE_CURRENT_SOURCE_DIR}/simple.deck) +add_test(${THREADED_TEST} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_NUMPROC} + ${MPIEXEC_PREFLAGS} ${THREADED_TEST} ${MPIEXEC_POSTFLAGS} ${THREADED_ARGS}) + +# TODO: Do we want to try an MPI + Threaded runs + +# Test Restart (restore) functionality + +list(APPEND CHECKPOINT_FILE "${CMAKE_CURRENT_BINARY_DIR}/checkpt.1") +list(APPEND RESTART_ARGS --restore ${CHECKPOINT_FILE}) + +build_a_vpic(${RESTART_BINARY} ${CMAKE_CURRENT_SOURCE_DIR}/${RESTART_DECK}.deck) +add_test(${RESTART_BINARY} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} + ${MPIEXEC_NUMPROC} ${MPIEXEC_PREFLAGS} ${RESTART_BINARY} + ${MPIEXEC_POSTFLAGS} ${RESTART_ARGS}) diff --git a/test/integrated/energy_comparison/weibel.deck b/test/integrated/energy_comparison/weibel.deck new file mode 100644 index 00000000..d4f5ade9 --- /dev/null +++ b/test/integrated/energy_comparison/weibel.deck @@ -0,0 +1,420 @@ +// Magnetic reconnection in a Harris equilibrium thin current sheet +// +// This input deck reproduces the PIC simulations found in: +// William Daughton. "Nonlinear dynamics of thin current sheets." Phys. +// Plasmas. 9(9): 3668-3678. September 2002. +// +// This input deck was written by: +// Kevin J Bowers, Ph.D. +// Plasma Physics Group (X-1) +// Applied Physics Division +// Los Alamos National Lab +// August 2003 - original version +// October 2003 - heavily revised to utilize input deck syntactic sugar +// March/April 2004 - rewritten for domain decomposition V4PIC + +// If you want to use global variables (for example, to store the dump +// intervals for your diagnostics section), it must be done in the globals +// section. Variables declared the globals section will be preserved across +// restart dumps. For example, if the globals section is: +// begin_globals { +// double variable; +// } end_globals +// the double "variable" will be visible to other input deck sections as +// "global->variable". Note: Variables declared in the globals section are set +// to zero before the user's initialization block is executed. Up to 16K +// of global variables can be defined. + +begin_globals { + double energies_interval; + double fields_interval; + double ehydro_interval; + double ihydro_interval; + double eparticle_interval; + double iparticle_interval; + double restart_interval; +}; + +begin_initialization { + // At this point, there is an empty grid and the random number generator is + // seeded with the rank. The grid, materials, species need to be defined. 
+ // Then the initial non-zero fields need to be loaded at time level 0 and the + // particles (position and momentum both) need to be loaded at time level 0. + + // Arguments can be passed from the command line to the input deck + // if( num_cmdline_arguments!=3 ) { + // sim_log( "Usage: " << cmdline_argument[0] << " mass_ratio seed" ); + // abort(0); + // } + seed_entropy(1); //seed_entropy( atoi( cmdline_argument[2] ) ); + + // Diagnostic messages can be passed written (usually to stderr) + sim_log( "Computing simulation parameters"); + + // Define the system of units for this problem (natural units) + //double L = 1; // Length normalization (sheet thickness) + double de = 1; // Length normalization (electron inertial length) + double ec = 1; // Charge normalization + double me = 1; // Mass normalization + double c = 1; // Speed of light + double eps0 = 1; // Permittivity of space + + // Physics parameters + double mi_me = 1836; //25; //atof(cmdline_argument[1]); // Ion mass / electron mass + double vthe = 0.25/sqrt(2.0); //0.0424264068711; //0.424264068711; // Electron thermal velocity + double vthi = 0.25/sqrt(2.0); //0.0424264068711; //0.424264068711; // Ion thermal velocity + double vthex =0.05/sqrt(2.0); //0.0141421356237; // 0.141421356237; // Electron thermal velocity in x-direction. + double vthix =0.05/sqrt(2.0); //0.0141421356237; // 0.141421356237;Ion thermal velocity in x-direction. + + double n0 = 1.0; // Background plasma density + double b0 = 0.0; // In plane magnetic field. + double tauwpe = 200000; // simulation wpe's to run + + // Numerical parameters + double topology_x = nproc(); // Number of domains in x, y, and z + double topology_y = 1; + double topology_z = 1; // For load balance, best to keep "1" or "2" for Harris sheet + double Lx = 2.09439510239320; //4.62*de; //6.7*de; //10.0*de; // How big should the box be in the x direction + double Ly = 1; //0.0721875*de; // How big should the box be in the y direction + double Lz = 1; //0.0721875*de; // How big should the box be in the z direction + double nx = 16; //64; //64; //32; // Global resolution in the x direction + double ny = 1; // Global resolution in the y direction + double nz = 1; //32; // Global resolution in the z direction + double nppc = 200; //800; //200; //2048; //1024; //128; // Average number of macro particles per cell (both species combined!) + double cfl_req = 0.99f; //0.99; // How close to Courant should we try to run + double wpedt_max = 0.36; // How big a timestep is allowed if Courant is not too restrictive + double damp = 0.0; // Level of radiation damping + + + // Derived quantities + double mi = me*mi_me; // Ion mass + double wpe = c/de; // electron plasma frequency + double wpi = wpe/sqrt(mi_me); // ion plasma frequency + double di = c/wpi; // ion inertial length + + double hx = Lx/nx; + double hy = Ly/ny; + double hz = Lz/nz; + + double Npe = n0*Ly*Lz*Lx; // Number physical electrons. 
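For orientation, with the parameters chosen above (de = 1, c = 1, mi_me = 1836, Lx ≈ 2.0944, nx = 16, vthe ≈ 0.177), the derived quantities evaluate to roughly the following; these are editorial back-of-envelope values, not output produced by the deck:

    wpe = c/de                  = 1
    wpi = wpe/sqrt(1836)        ≈ 0.0233
    di  = c/wpi                 ≈ 42.8 de
    hx  = Lx/nx ≈ 2.0944/16     ≈ 0.131 de
    hx/(vthe/wpe) ≈ 0.131/0.177 ≈ 0.74   (cell size in electron Debye lengths)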
+ double Npi = Npe; // Number of physical ions in box + double Ne = nppc*nx*ny*nz; // total macro electrons in box + + Ne = trunc_granular(Ne,nproc()); + double Ni = Ne; // Total macro ions in box + + double we = Npe/Ne; // Weight of a macro electron + double wi = Npi/Ni; // Weight of a macro ion + + + // Determine the timestep + double dg = courant_length(Lx,Ly,Lz,nx,ny,nz); // Courant length + double dt = cfl_req*dg/c; // Courant limited time step + // printf("in harris.cxx: dt=%.7f\n", dt); + // exit(1); + if( wpe*dt>wpedt_max ) dt=wpedt_max/wpe; // Override time step if plasma frequency limited + + //////////////////////////////////////// + // Setup high level simulation parmeters + + num_step = 700; //4000; // int(tauwpe/(wpe*dt)); + status_interval = 0; //2000; + sync_shared_interval = 0; //status_interval; + clean_div_e_interval = 0; //turn off cleaning (GY)//status_interval; + clean_div_b_interval = 0; //status_interval; //(GY) + + global->energies_interval = 1; //status_interval; + global->fields_interval = status_interval; + global->ehydro_interval = status_interval; + global->ihydro_interval = status_interval; + global->eparticle_interval = status_interval; // Do not dump + global->iparticle_interval = status_interval; // Do not dump + global->restart_interval = status_interval; // Do not dump + + /////////////////////////// + // Setup the space and time + + // Setup basic grid parameters + define_units( c, eps0 ); + define_timestep( dt ); + grid->dx = hx; + grid->dy = hy; + grid->dz = hz; + grid->dt = dt; + grid->cvac = c; + //grid->damp = damp; + + // Parition a periodic box among the processors sliced uniformly along y + // define_periodic_grid( -0.5*Lx, 0, 0, // Low corner + // 0.5*Lx, Ly, Lz, // High corner + // nx, ny, nz, // Resolution + // 1, nproc(), 1 ); // Topology + define_periodic_grid( 0, -0.5*Ly, -0.5*Lz, // Low corner + Lx, 0.5*Ly, 0.5*Lz, // High corner + nx, ny, nz, // Resolution + topology_x, topology_y, topology_z); // Topology + + // printf("in harris.cxx: g->neighbor[6*265]=%jd\n", grid->neighbor[6*265]); + // Override some of the boundary conditions to put a particle reflecting + // perfect electrical conductor on the -x and +x boundaries + // set_domain_field_bc( BOUNDARY(-1,0,0), pec_fields ); + // set_domain_field_bc( BOUNDARY( 1,0,0), pec_fields ); + // set_domain_particle_bc( BOUNDARY(-1,0,0), reflect_particles ); + // set_domain_particle_bc( BOUNDARY( 1,0,0), reflect_particles ); + + define_material( "vacuum", 1 ); + // Note: define_material defaults to isotropic materials with mu=1,sigma=0 + // Tensor electronic, magnetic and conductive materials are supported + // though. See "shapes" for how to define them and assign them to regions. + // Also, space is initially filled with the first material defined. + + // If you pass NULL to define field array, the standard field array will + // be used (if damp is not provided, no radiation damping will be used). 
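As a worked check of the timestep logic above, assuming courant_length() returns the usual CFL length 1/sqrt((nx/Lx)^2 + (ny/Ly)^2 + (nz/Lz)^2) (an assumption about that helper; its definition is not part of this patch): with Lx ~ 2.0944, nx = 16, Ly = Lz = 1 and ny = nz = 1, dg ~ 0.129, so dt = 0.99*dg ~ 0.127 and, with wpe = 1, wpe*dt ~ 0.13, well under wpedt_max = 0.36. This run is therefore Courant-limited and the plasma-frequency cap on dt never engages.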
+ define_field_array( NULL, damp ); + + //////////////////// + // Setup the species + + // Allow 50% more local_particles in case of non-uniformity + // VPIC will pick the number of movers to use for each species + // Both species use out-of-place sorting + // species_t * ion = define_species( "ion", ec, mi, 1.5*Ni/nproc(), -1, 40, 1 ); + // species_t * electron = define_species( "electron", -ec, me, 1.5*Ne/nproc(), -1, 20, 1 ); + //species_t *electron = define_species("electron",-ec,me,2.4*Ne/nproc(),-1,25,0); + //species_t *ion = define_species("ion", ec,mi,2.4*Ne/nproc(),-1,25,0); + + species_t *electron = define_species("electron",-ec,me,2.4*Ne/nproc(),-1,0,0); //turn off sorting (GY) + species_t *ion = define_species("ion", ec,mi,2.4*Ne/nproc(),-1,0,0); //(GY) + + /////////////////////////////////////////////////// + // Log diagnostic information about this simulation + + sim_log( "***********************************************" ); + sim_log ( "mi/me = " << mi_me ); + sim_log ( "tauwpe = " << tauwpe ); + sim_log ( "num_step = " << num_step ); + sim_log ( "Lx/di = " << Lx/di ); + sim_log ( "Lx/de = " << Lx/de ); + sim_log ( "Ly/di = " << Ly/di ); + sim_log ( "Ly/de = " << Ly/de ); + sim_log ( "Lz/di = " << Lz/di ); + sim_log ( "Lz/de = " << Lz/de ); + sim_log ( "nx = " << nx ); + sim_log ( "ny = " << ny ); + sim_log ( "nz = " << nz ); + sim_log ( "damp = " << damp ); + sim_log ( "courant = " << c*dt/dg ); + sim_log ( "nproc = " << nproc () ); + sim_log ( "nppc = " << nppc ); + sim_log ( " b0 = " << b0 ); + sim_log ( " di = " << di ); + sim_log ( " Ne = " << Ne ); + sim_log ( "total # of particles = " << 2*Ne ); + sim_log ( "dt*wpe = " << wpe*dt ); + sim_log ( "dx/de = " << Lx/(de*nx) ); + sim_log ( "dy/de = " << Ly/(de*ny) ); + sim_log ( "dz/de = " << Lz/(de*nz) ); + sim_log ( "dx/debye = " << (Lx/nx)/(vthe/wpe) ); + sim_log ( "n0 = " << n0 ); + sim_log ( "vthi/c = " << vthi/c ); + sim_log ( "vthe/c = " << vthe/c ); + sim_log( "" ); + + //////////////////////////// + // Load fields and particles + + // sim_log( "Loading fields" ); + + // set_region_field( everywhere, 0, 0, 0, // Electric field + // 0, -sn*b0*tanh(x/L), cs*b0*tanh(x/L) ); // Magnetic field + // Note: everywhere is a region that encompasses the entire simulation + // In general, regions are specied as logical equations (i.e. x>0 && x+y<2) + + sim_log( "Loading particles" ); + + // Do a fast load of the particles + //seed_rand( rng_seed*nproc() + rank() ); //Generators desynchronized + double xmin = grid->x0 , xmax = grid->x0+(grid->dx)*(grid->nx); + double ymin = grid->y0 , ymax = grid->y0+(grid->dy)*(grid->ny); + double zmin = grid->z0 , zmax = grid->z0+(grid->dz)*(grid->nz); + + sim_log( "-> Uniform Bi-Maxwellian" ); + + double n1,n2,n3; + + repeat ( Ne/nproc() ) { + + double x = uniform( rng(0), xmin, xmax ); + double y = uniform( rng(0), ymin, ymax ); + double z = uniform( rng(0), zmin, zmax ); + n1 = normal(rng(0),0,vthex); + n2 = normal(rng(0),0,vthe ); + n3 = normal(rng(0),0,vthe ); + + inject_particle( electron, x, y, z, + n1, + n2, + n3,we, 0, 0); + + n1 = normal(rng(0),0,vthix); + n2 = normal(rng(0),0,vthi ); + n3 = normal(rng(0),0,vthi ); + + inject_particle( ion, x, y, z, + n1, + n2, + n3,wi, 0 ,0 ); + + } + + sim_log( "Finished loading particles" ); + + //exit(1); + + // Upon completion of the initialization, the following occurs: + // - The synchronization error (tang E, norm B) is computed between domains + // and tang E / norm B are synchronized by averaging where discrepancies + // are encountered. 
+ // - The initial divergence error of the magnetic field is computed and + // one pass of cleaning is done (for good measure) + // - The bound charge density necessary to give the simulation an initially + // clean divergence e is computed. + // - The particle momentum is uncentered from u_0 to u_{-1/2} + // - The user diagnostics are called on the initial state + // - The physics loop is started + // + // The physics loop consists of: + // - Advance particles from x_0,u_{-1/2} to x_1,u_{1/2} + // - User particle injection at x_{1-age}, u_{1/2} (use inject_particles) + // - User current injection (adjust field(x,y,z).jfx, jfy, jfz) + // - Advance B from B_0 to B_{1/2} + // - Advance E from E_0 to E_1 + // - User field injection to E_1 (adjust field(x,y,z).ex,ey,ez,cbx,cby,cbz) + // - Advance B from B_{1/2} to B_1 + // - (periodically) Divergence clean electric field + // - (periodically) Divergence clean magnetic field + // - (periodically) Synchronize shared tang e and norm b + // - Increment the time step + // - Call user diagnostics + // - (periodically) Print a status message +} + +begin_diagnostics { + +# define should_dump(x) (global->x##_interval>0 && remainder(step(),global->x##_interval)==0) + + if( step()==-10 ) { + // A grid dump contains all grid parameters, field boundary conditions, + // particle boundary conditions and domain connectivity information. This + // is stored in a binary format. Each rank makes a grid dump + dump_grid("grid"); + + // A materials dump contains all the materials parameters. This is in a + // text format. Only rank 0 makes the materials dump + dump_materials("materials"); + + // A species dump contains the physics parameters of a species. This is in + // a text format. Only rank 0 makes the species dump + dump_species("species"); + } + + // Energy dumps store all the energies in various directions of E and B + // and the total kinetic (not including rest mass) energies of each + // species in a simple text format. By default, the energies are appended to + // the file. However, if a "0" is added to the dump_energies call, a new + // energies dump file will be created. The energies are in the units of the + // problem and are all time centered appropriately. Note: When restarting a + // simulation from a restart dump made at a prior time step to the last + // energies dump, the energies file will have a "hiccup" of intervening + // time levels. This "hiccup" will not occur if the simulation is aborted + // immediately following a restart dump. Energies dumps are in a text + // format and the layout is documented at the top of the file. Only rank 0 + // makes an energies dump. + if( should_dump(energies) ) { + dump_energies( "energies", step()==0 ? 0 : 1 ); + } + + // Field dumps store the raw electromagnetic fields, sources and material + // placement and a number of auxiliary fields. E, B and RHOB are + // time centered, JF and TCA are half a step old. Material fields are static + // and the remaining fields (DIV E ERR, DIV B ERR and RHOF) are for + // debugging purposes. By default, field dump filenames are tagged with + // step(). However, if a "0" is added to the call, the filename will not be + // tagged. The JF that gets stored is accumulated with a charge-conserving + // algorithm. As a result, JF is not valid until at least one timestep has + // been completed. Field dumps are in a binary format. Each rank makes a + // field dump. 
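For reference, the should_dump(x) macro defined above is just token pasting plus a periodicity test; written out long-hand for the energies case, the guarded call is equivalent to:

    // Long-hand equivalent of the should_dump(energies) guard above
    if( global->energies_interval > 0 &&
        remainder( step(), global->energies_interval ) == 0 ) {
      dump_energies( "energies", step()==0 ? 0 : 1 ); // "0" starts a new file, "1" appends
    }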
+ if( step()==-10 ) dump_fields("fields"); // Get first valid total J + if( should_dump(fields) ) dump_fields("fields"); + + // Hydro dumps store particle charge density, current density and + // stress-energy tensor. All these quantities are known at the time + // t = time(). All these quantities are accumulated trilinearly, + // node-centered. By default, species dump filenames are tagged with + // step(). However, if a "0" is added to the call, the filename will not + // be tagged. Note that the current density accumulated by this routine is + // purely diagnostic. It is not used by the simulation and it is not + // accumulated using a self-consistent charge-conserving method. Hydro dumps + // are in a binary format. Each rank makes a hydro dump. + if( should_dump(ehydro) ) dump_hydro("electron","ehydro"); + if( should_dump(ihydro) ) dump_hydro("ion", "ihydro"); + + // Particle dumps store the particle data for a given species. The data + // written is known at the time t = time(). By default, particle dumps + // are tagged with step(). However, if a "0" is added to the call, the + // filename will not be tagged. Particle dumps are in a binary format. + // Each rank makes a particle dump. + if( should_dump(eparticle) ) dump_particles("electron","eparticle"); + if( should_dump(iparticle) ) dump_particles("ion", "iparticle"); + + // A checkpt is made by calling checkpt( fbase, tag ) where fbase is a string + // and tag is an integer. A typical usage is: + // checkpt( "checkpt", step() ). + // This will cause each process to write its simulation state to a file + // whose name is based on fbase, tag and the node's rank. For the above + // usage, if called on step 314 on a 4 process run, the four files: + // checkpt.314.0, checkpt.314.1, checkpt.314.2, checkpt.314.3 + // will be written. The simulation can then be restarted from this point by + // invoking the application with "--restore checkpt.314". checkpt must be + // the _VERY_ LAST_ diagnostic called. If not, diagnostics performed after + // the checkpt but before the next timestep will be missed on restore. + // Restart dumps are in a binary format unique to each simulation. + + if( should_dump(restart) ) checkpt( "checkpt", step() ); + + // If you want to write a checkpt after a certain amount of simulation time, + // use uptime() in conjunction with checkpt. For example, this will cause + // the simulation state to be written after 7.5 hours of running to the + // same file every time (useful for dealing with quotas on big machines). 
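Tying the naming convention above to the test plumbing earlier in this patch: dump.deck in the to_completion suite sets restart_interval to 1 and calls checkpt( "checkpt", step() ), so on a single-rank run the first checkpoint is written as checkpt.1.0 (fbase "checkpt", tag 1, rank 0), which is presumably why the committed restart artifact is named checkpt.1.0 and why the restore tests are invoked with --restore .../checkpt.1:

    checkpt( "checkpt", step() );   // at step 1 on a 1-rank run -> checkpt.1.0
    // restart later by passing:  --restore checkpt.1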
+ //if( uptime()>=27000 ) { + // checkpt( "timeout", 0 ); + // abort(0); + //} + +# undef should_dump + +} + +begin_particle_injection { + + // No particle injection for this simulation + +} + +begin_current_injection { + + // No current injection for this simulation + +} + +begin_field_injection { + + // No field injection for this simulation + +} + +begin_particle_collisions{ + + // No collisions for this simulation + +} diff --git a/test/integrated/legacy/CMakeLists.txt b/test/integrated/legacy/CMakeLists.txt new file mode 100644 index 00000000..cdb9530c --- /dev/null +++ b/test/integrated/legacy/CMakeLists.txt @@ -0,0 +1,16 @@ +# add the tests +set(MPI_NUM_RANKS 1) +set(ARGS "1 1") + +list(APPEND DEFAULT_ARG_TESTS accel cyclo inbndj interpe outbndj) +list(APPEND ALL_TESTS ${DEFAULT_ARG_TESTS} pcomm) + +foreach(test ${ALL_TESTS}) + build_a_vpic(${test} ${CMAKE_CURRENT_SOURCE_DIR}/${test}.deck) +endforeach() + +foreach(test ${DEFAULT_ARG_TESTS}) + add_test(${test} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 1 ${MPIEXEC_PREFLAGS} ${test} ${MPIEXEC_POSTFLAGS} ${ARGS}) +endforeach() + +add_test(pcomm ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 8 ${MPIEXEC_PREFLAGS} pcomm ${MPIEXEC_POSTFLAGS} ${ARGS}) diff --git a/test/integrated/accel.deck b/test/integrated/legacy/accel.deck similarity index 100% rename from test/integrated/accel.deck rename to test/integrated/legacy/accel.deck diff --git a/test/integrated/cyclo.deck b/test/integrated/legacy/cyclo.deck similarity index 100% rename from test/integrated/cyclo.deck rename to test/integrated/legacy/cyclo.deck diff --git a/test/integrated/inbndj.deck b/test/integrated/legacy/inbndj.deck similarity index 100% rename from test/integrated/inbndj.deck rename to test/integrated/legacy/inbndj.deck diff --git a/test/integrated/interpe.deck b/test/integrated/legacy/interpe.deck similarity index 100% rename from test/integrated/interpe.deck rename to test/integrated/legacy/interpe.deck diff --git a/test/integrated/outbndj.deck b/test/integrated/legacy/outbndj.deck similarity index 100% rename from test/integrated/outbndj.deck rename to test/integrated/legacy/outbndj.deck diff --git a/test/integrated/legacy/pcomm.deck b/test/integrated/legacy/pcomm.deck new file mode 100644 index 00000000..e6a2c738 --- /dev/null +++ b/test/integrated/legacy/pcomm.deck @@ -0,0 +1,262 @@ +// Test particle communications +// +// Particle starts in center of the center cell of rank 0 and moves exactly +// +0.5, +0.5, +0.5 of a cell every step. After 10 steps it will land exactly +// on the corner shared by ranks 0-7. On the push for step 11, the particle +// will pass through 3 domains simultaneously at land in rank 7. After 20 steps +// the particle should be in the middle cell of rank 7. After 30 steps it will +// land exactly on the corner shared by ranks 0-7. On the push for step 31, +// the particle will pass through 3 domains simultaneously and land in rank 0. +// On step 40, the particle should end up exactly where it started. +// +// This input deck was written by: +// Kevin J Bowers, Ph.D. 
+// Plasma Physics Group (X-1) +// Applied Physics Division +// Los Alamos National Lab +// March/April 2004 - Original version written + +begin_globals { + int fail = 0; +}; + +const int NUM_PROC = 8; + +begin_initialization { + if( nproc( )!= NUM_PROC ) { + sim_log( "This test case requires 8 processors" ); abort(1); + } + + num_step = 40; + + define_units( 1, 1 ); + define_timestep( 0.5 ); + define_periodic_grid( 0, 0, 0, // Box low corner + 10, 10, 10, // Box high corner + 10, 10, 10, // Box resolution + 2, 2, 2 ); // Topology + define_material( "vacuum", 1 ); + define_field_array(); + species_t * sp = define_species( "test_species", 1, 1, 1, 1, 0, 0 ); + inject_particle( sp, 2.5, 2.5, 2.5, 1, 1, 1, 0, 0, 0 ); + global->fail = 0; +} + +int compare_expected_values( + int np, + float dx, // These could be packed into a particle initializer + float dy, + float dz, + float ux, + float uy, + float uz, + int vox, + int np_in, + particle_t& p_in + ) +{ + // TODO: figure out why Intel with O3 makes this wrong by ~10.5 FLT_EPS + // Set experimentally. Once processor under intel with -O3 gets + // -1.206994e-06 when it shoud be zer0 + const float eps = 11*FLT_EPSILON; + if ( + (np_in != np) || + (p_in.i != vox) || + (fabs(p_in.dx - dx) > eps) || (fabs(p_in.dy - dy) > eps) || (fabs(p_in.dz - dz) > eps) || + (fabs(p_in.ux - ux) > eps) || (fabs(p_in.uy - uy) > eps) || (fabs(p_in.uz - uz) > eps) + ) + { + printf("Error detected"); + printf("Np %d vs %d ", np_in, np); + printf("Dx %e vs %e ", p_in.dx, dx); + printf("Dy %e vs %e ", p_in.dy, dy); + printf("Dz %e vs %e ", p_in.dz, dz); + printf("Ux %e vs %e ", p_in.ux, ux); + printf("Uy %e vs %e ", p_in.uy, uy); + printf("Uz %e vs %e ", p_in.uz, uz); + printf("Vox %d vs %d \n", p_in.i, vox); + return 1; + } + else { + return 0; + } +} + + +begin_diagnostics { + // FIXME: the int comparison throughout this file is sketchy + + species_t * sp = find_species_name( "test_species", species_list ); + + if( step()==0 ) { + if( rank()==0 ) { + + global->fail += compare_expected_values( + 1, + 0.0, // dx + 0.0, // dy + 0.0, // dz + 1.0, // ux + 1.0, // uy + 1.0, // uz + voxel(3,3,3), // vox + sp->np, + sp->p[0] + ); + } + else if( sp->np!=0 ) + { + global->fail++; + } + } + + if( step()==10 ) { + if( rank()==0 ) { + global->fail += compare_expected_values( + 1, + 1.0, // dx + 1.0, // dy + 1.0, // dz + 1.0, // ux + 1.0, // uy + 1.0, // uz + voxel(5,5,5), // vox + sp->np, + sp->p[0] + ); + + } + else if( sp->np!=0 ) { + global->fail++; + } + } + + if( step()==11 ) { + if( rank()==7 ) { + global->fail += compare_expected_values( + 1, + -0.5, // dx + -0.5, // dy + -0.5, // dz + 1.0, // ux + 1.0, // uy + 1.0, // uz + voxel(1,1,1), // vox + sp->np, + sp->p[0] + ); + } + else if( sp->np!=0 ) + { + global->fail++; + } + } + + if( step()==20 ) { + if( rank()==7 ) { + global->fail += compare_expected_values( + 1, + 0.0, // dx + 0.0, // dy + 0.0, // dz + 1.0, // ux + 1.0, // uy + 1.0, // uz + voxel(3,3,3), // vox + sp->np, + sp->p[0] + ); + } + else if( sp->np!=0 ) { + global->fail++; + } + } + + if( step()==30 ) { + if( rank()==7 ) { + global->fail += compare_expected_values( + 1, + 1.0, // dx + 1.0, // dy + 1.0, // dz + 1.0, // ux + 1.0, // uy + 1.0, // uz + voxel(5,5,5), // vox + sp->np, + sp->p[0] + ); + } + else if( sp->np!=0 ) { + global->fail++; + } + } + + if( step()==31 ) { + if( rank()==0 ) { + global->fail += compare_expected_values( + 1, + -0.5, // dx + -0.5, // dy + -0.5, // dz + 1.0, // ux + 1.0, // uy + 1.0, // uz + voxel(1,1,1), // vox + sp->np, + sp->p[0] + 
); + } + else if( sp->np!=0 ) { + global->fail++; + } + } + + if( step()==40 ) { + if( rank()==0 ) { + global->fail += compare_expected_values( + 1, + 0.0, // dx + 0.0, // dy + 0.0, // dz + 1.0, // ux + 1.0, // uy + 1.0, // uz + voxel(3,3,3), // vox + sp->np, + sp->p[0] + ); + } + else if( sp->np!=0 ) { + global->fail++; + } + } + + if( step()==40 ) { + for( int i=0; i < NUM_PROC; i++ ) { + if( rank()==i ) { + if( global->fail ) { + sim_log_local( "FAIL" << global->fail ); abort(1); + } + sim_log_local( "pass" ); + } + barrier(); + } + halt_mp(); + exit(0); + } +} + +begin_particle_injection { +} + +begin_current_injection { +} + +begin_field_injection { +} + +begin_particle_collisions { +} + + diff --git a/test/integrated/particle_push/CMakeLists.txt b/test/integrated/particle_push/CMakeLists.txt new file mode 100644 index 00000000..cd488d7b --- /dev/null +++ b/test/integrated/particle_push/CMakeLists.txt @@ -0,0 +1,19 @@ +if (NO_EXPLICIT_VECTOR) + # add the tests + set(MPI_NUM_RANKS 1) + set(ARGS "1 1") + + set(TESTS "array_index") + + # Build + # TODO: This method of doing the tests is really bad at rebuilding them properly + foreach(test ${TESTS}) + MESSAGE("Build") + build_a_vpic(${test} ${CMAKE_CURRENT_SOURCE_DIR}/${test}.deck) + endforeach() + + # Add test + foreach(test ${TESTS}) + add_test(${test} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 1 ${MPIEXEC_PREFLAGS} ${test} ${MPIEXEC_POSTFLAGS} ${ARGS}) + endforeach() +endif(NO_EXPLICIT_VECTOR) diff --git a/test/integrated/particle_push/advance_p.h b/test/integrated/particle_push/advance_p.h new file mode 100644 index 00000000..0618cd00 --- /dev/null +++ b/test/integrated/particle_push/advance_p.h @@ -0,0 +1,255 @@ +// FIXME: PARTICLE MOVERS NEED TO BE OVERALLOCATED IN STRUCTORS TO +// ACCOUNT FOR SPLITTING THE MOVER ARRAY BETWEEN HOST AND PIPELINES + +#define IN_spa +#include "src/species_advance/standard/pipeline/spa_private.h" +#include "src/util/pipelines/pipelines_exec.h" + +void +advance_p2_pipeline_scalar( advance_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) { + particle_t * ALIGNED(128) p0 = args->p0; + accumulator_t * ALIGNED(128) a0 = args->a0; + const interpolator_t * ALIGNED(128) f0 = args->f0; + const grid_t * g = args->g; + + particle_t * ALIGNED(32) p; + particle_mover_t * ALIGNED(16) pm; + const interpolator_t * ALIGNED(16) f; + float * ALIGNED(16) a; + + const float qdt_2mc = args->qdt_2mc; + const float cdt_dx = args->cdt_dx; + const float cdt_dy = args->cdt_dy; + const float cdt_dz = args->cdt_dz; + const float qsp = args->qsp; + const float one = 1.; + const float one_third = 1./3.; + const float two_fifteenths = 2./15.; + + float dx, dy, dz, ux, uy, uz, q; + float hax, hay, haz, cbx, cby, cbz; + float v0, v1, v2, v3, v4, v5; + + int itmp, ii, n, nm, max_nm; + + DECLARE_ALIGNED_ARRAY( particle_mover_t, 16, local_pm, 1 ); + + // Determine which quads of particles quads this pipeline processes + + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, itmp, n ); + p = args->p0 + itmp; + + // Determine which movers are reserved for this pipeline + // Movers (16 bytes) should be reserved for pipelines in at least + // multiples of 8 such that the set of particle movers reserved for + // a pipeline is 128-byte aligned and a multiple of 128-byte in + // size. The host is guaranteed to get enough movers to process its + // particles with this allocation. 
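The DISTRIBUTE macro used above comes from the pipeline headers and is not shown in this patch. Purely to illustrate the kind of bundle-aligned split the surrounding comments describe (particles handed out in bundles of 16, movers in multiples of 8), a hypothetical stand-in could look like the sketch below; the real macro may differ in detail.

    // Hypothetical sketch only -- NOT the real DISTRIBUTE macro.  Pipeline 'rank'
    // of 'n_pipeline' workers gets a contiguous, bundle-aligned slice
    // [*first, *first + *n) of 'count' items.
    static void distribute_sketch( int count, int bundle, int rank, int n_pipeline,
                                   int* first, int* n )
    {
      int nb  = count / bundle;      // whole bundles available
      int per = nb / n_pipeline;     // bundles every pipeline gets
      int rem = nb % n_pipeline;     // leftover bundles, one each to the first 'rem' pipelines
      *first = bundle*( rank*per + ( rank<rem ? rank : rem ) );
      *n     = bundle*( per + ( rank<rem ? 1 : 0 ) );
      // Whatever does not fill a whole bundle (count % bundle items) is not assigned
      // here; per the comment in advance_p2 below, the host processor picks it up.
    }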
+ + max_nm = args->max_nm - (args->np&15); + if( max_nm<0 ) max_nm = 0; + DISTRIBUTE( max_nm, 8, pipeline_rank, n_pipeline, itmp, max_nm ); + if( pipeline_rank==n_pipeline ) max_nm = args->max_nm - itmp; + pm = args->pm + itmp; + nm = 0; + int ignore = 0; + + // Determine which accumulator array to use + // The host gets the first accumulator array + + if( pipeline_rank!=n_pipeline ) + a0 += (1+pipeline_rank)* + POW2_CEIL((args->nx+2)*(args->ny+2)*(args->nz+2),2); + + // Process particles for this pipeline + + particle_t * ALIGNED(32) p_ = p; + //for(;n;n--,p++) { + for(int i = 0; i < n; i++, p_++) { + dx = p[i].dx; // Load position + dy = p[i].dy; + dz = p[i].dz; + ii = p[i].i; + f = f0 + ii; // Interpolate E + hax = qdt_2mc*( ( f->ex + dy*f->dexdy ) + + dz*( f->dexdz + dy*f->d2exdydz ) ); + hay = qdt_2mc*( ( f->ey + dz*f->deydz ) + + dx*( f->deydx + dz*f->d2eydzdx ) ); + haz = qdt_2mc*( ( f->ez + dx*f->dezdx ) + + dy*( f->dezdy + dx*f->d2ezdxdy ) ); + cbx = f->cbx + dx*f->dcbxdx; // Interpolate B + cby = f->cby + dy*f->dcbydy; + cbz = f->cbz + dz*f->dcbzdz; + ux = p[i].ux; // Load momentum + uy = p[i].uy; + uz = p[i].uz; + q = p[i].w; + ux += hax; // Half advance E + uy += hay; + uz += haz; + v0 = qdt_2mc/sqrtf(one + (ux*ux + (uy*uy + uz*uz))); + /**/ // Boris - scalars + v1 = cbx*cbx + (cby*cby + cbz*cbz); + v2 = (v0*v0)*v1; + v3 = v0*(one+v2*(one_third+v2*two_fifteenths)); + v4 = v3/(one+v1*(v3*v3)); + v4 += v4; + v0 = ux + v3*( uy*cbz - uz*cby ); // Boris - uprime + v1 = uy + v3*( uz*cbx - ux*cbz ); + v2 = uz + v3*( ux*cby - uy*cbx ); + ux += v4*( v1*cbz - v2*cby ); // Boris - rotation + uy += v4*( v2*cbx - v0*cbz ); + uz += v4*( v0*cby - v1*cbx ); + ux += hax; // Half advance E + uy += hay; + uz += haz; + p[i].ux = ux; // Store momentum + p[i].uy = uy; + p[i].uz = uz; + v0 = one/sqrtf(one + (ux*ux+ (uy*uy + uz*uz))); + /**/ // Get norm displacement + ux *= cdt_dx; + uy *= cdt_dy; + uz *= cdt_dz; + ux *= v0; + uy *= v0; + uz *= v0; + v0 = dx + ux; // Streak midpoint (inbnds) + v1 = dy + uy; + v2 = dz + uz; + v3 = v0 + ux; // New position + v4 = v1 + uy; + v5 = v2 + uz; + + // FIXME-KJB: COULD SHORT CIRCUIT ACCUMULATION IN THE CASE WHERE QSP==0! + if( v3<=one && v4<=one && v5<=one && // Check if inbnds + -v3<=one && -v4<=one && -v5<=one ) { + + // Common case (inbnds). 
Note: accumulator values are 4 times + // the total physical charge that passed through the appropriate + // current quadrant in a time-step + + q *= qsp; + p[i].dx = v3; // Store new position + p[i].dy = v4; + p[i].dz = v5; + dx = v0; // Streak midpoint + dy = v1; + dz = v2; + v5 = q*ux*uy*uz*one_third; // Compute correction + a = (float *)( a0 + ii ); // Get accumulator + +# define ACCUMULATE_J(X,Y,Z,offset) \ + v4 = q*u##X; /* v4 = q ux */ \ + v1 = v4*d##Y; /* v1 = q ux dy */ \ + v0 = v4-v1; /* v0 = q ux (1-dy) */ \ + v1 += v4; /* v1 = q ux (1+dy) */ \ + v4 = one+d##Z; /* v4 = 1+dz */ \ + v2 = v0*v4; /* v2 = q ux (1-dy)(1+dz) */ \ + v3 = v1*v4; /* v3 = q ux (1+dy)(1+dz) */ \ + v4 = one-d##Z; /* v4 = 1-dz */ \ + v0 *= v4; /* v0 = q ux (1-dy)(1-dz) */ \ + v1 *= v4; /* v1 = q ux (1+dy)(1-dz) */ \ + v0 += v5; /* v0 = q ux [ (1-dy)(1-dz) + uy*uz/3 ] */ \ + v1 -= v5; /* v1 = q ux [ (1+dy)(1-dz) - uy*uz/3 ] */ \ + v2 -= v5; /* v2 = q ux [ (1-dy)(1+dz) - uy*uz/3 ] */ \ + v3 += v5; /* v3 = q ux [ (1+dy)(1+dz) + uy*uz/3 ] */ \ + a[offset+0] += v0; \ + a[offset+1] += v1; \ + a[offset+2] += v2; \ + a[offset+3] += v3 + + ACCUMULATE_J( x,y,z, 0 ); + ACCUMULATE_J( y,z,x, 4 ); + ACCUMULATE_J( z,x,y, 8 ); + +# undef ACCUMULATE_J + + } + else + { // Unlikely + local_pm->dispx = ux; + local_pm->dispy = uy; + local_pm->dispz = uz; + + // TODO: this could be something like i.. but that fails?! + local_pm->i = i + itmp; //p_ - p0; + + if( move_p( p0, local_pm, a0, g, qsp ) ) { // Unlikely + if( nm<max_nm ) pm[nm++] = local_pm[0]; + else ignore++; // Unlikely + } + } + } + + args->seg[pipeline_rank].pm = pm; + args->seg[pipeline_rank].max_nm = max_nm; + args->seg[pipeline_rank].nm = nm; + args->seg[pipeline_rank].n_ignored = ignore; +} + +void +advance_p2( /**/ species_t * RESTRICT sp, + /**/ accumulator_array_t * RESTRICT aa, + const interpolator_array_t * RESTRICT ia ) { + DECLARE_ALIGNED_ARRAY( advance_p_pipeline_args_t, 128, args, 1 ); + DECLARE_ALIGNED_ARRAY( particle_mover_seg_t, 128, seg, MAX_PIPELINE+1 ); + int rank; + + if( !sp || !aa || !ia || sp->g!=aa->g || sp->g!=ia->g ) + ERROR(( "Bad args" )); + + args->p0 = sp->p; + args->pm = sp->pm; + args->a0 = aa->a; + args->f0 = ia->i; + args->seg = seg; + args->g = sp->g; + + args->qdt_2mc = (sp->q*sp->g->dt)/(2*sp->m*sp->g->cvac); + args->cdt_dx = sp->g->cvac*sp->g->dt*sp->g->rdx; + args->cdt_dy = sp->g->cvac*sp->g->dt*sp->g->rdy; + args->cdt_dz = sp->g->cvac*sp->g->dt*sp->g->rdz; + args->qsp = sp->q; + + args->np = sp->np; + args->max_nm = sp->max_nm; + args->nx = sp->g->nx; + args->ny = sp->g->ny; + args->nz = sp->g->nz; + + // Have the host processor do the last incomplete bundle if necessary. + // Note: This is overlapped with the pipelined processing. As such, + // it uses an entire accumulator. Reserving an entire accumulator + // for the host processor to handle at most 15 particles is wasteful + // of memory. It is anticipated that it may be useful at some point + // in the future to have pipelines accumulating currents while the host + // processor is doing other more substantive work (e.g. accumulating + // currents from particles received from neighboring nodes). + // However, it is worth reconsidering this at some point in the + // future. + + EXEC_PIPELINES( advance_p2, args, 0 ); + WAIT_PIPELINES(); + + // FIXME: HIDEOUS HACK UNTIL BETTER PARTICLE MOVER SEMANTICS + // INSTALLED FOR DEALING WITH PIPELINES. COMPACT THE PARTICLE + // MOVERS TO ELIMINATE HOLES FROM THE PIPELINING. 
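// In other words: each pipeline wrote its movers into its own reserved,
// bundle-aligned slice of sp->pm, so after WAIT_PIPELINES() there can be gaps
// between the per-pipeline segments.  The loop below warns if a pipeline ran out
// of mover storage (n_ignored), slides each segment down so the movers form one
// contiguous run starting at sp->pm, and accumulates the total count into sp->nm.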
+ + sp->nm = 0; + for( rank=0; rank<=N_PIPELINE; rank++ ) { + if( args->seg[rank].n_ignored ) + WARNING(( "Pipeline %i ran out of storage for %i movers", + rank, args->seg[rank].n_ignored )); + if( sp->pm+sp->nm != args->seg[rank].pm ) + MOVE( sp->pm+sp->nm, args->seg[rank].pm, args->seg[rank].nm ); + sp->nm += args->seg[rank].nm; + } +} diff --git a/test/integrated/particle_push/array_index.deck b/test/integrated/particle_push/array_index.deck new file mode 100644 index 00000000..99669ecd --- /dev/null +++ b/test/integrated/particle_push/array_index.deck @@ -0,0 +1,136 @@ +#include "advance_p.h" + +// Test the "normal" pusher, vs one that uses traditional ([]) array syntax and +// loop structure + +begin_globals { +}; + +begin_initialization { + double L = 1e2; + int npart = 127; + int nstep = 100; + + define_units( 1, 1 ); + define_timestep( 1 ); + define_periodic_grid( 0, 0, 0, // Grid low corner + L, L, L, // Grid high corner + 1, 1, 1, // Grid resolution + 1, 1, 1 ); // Processor configuration + define_material( "vacuum", 1.0, 1.0, 0.0 ); + define_field_array(); + + field(1,1,1).ex = 1; + field(1,2,1).ex = 1; + field(1,1,2).ex = 1; + field(1,2,2).ex = 1; + + field(1,1,1).ey = 2; + field(1,1,2).ey = 2; + field(2,1,1).ey = 2; + field(2,1,2).ey = 2; + + field(1,1,1).ez = 3; + field(2,1,1).ez = 3; + field(1,2,1).ez = 3; + field(2,2,1).ez = 3; + + species_t * sp = + define_species( "test_species", 1., 1., npart, npart, 0, 0 ); + + species_t* sp2 = + define_species( "test_species2", 1., 1., npart, npart, 0, 0 ); + + repeat(npart) + { + float x = uniform( rng(0), 0, L); + float y = uniform( rng(0), 0, L); + float z = uniform( rng(0), 0, L); + + // Put two sets of particle in the exact same space + inject_particle( sp2, x, y, z, 0., 0., 0., 1., 0., 0); + inject_particle( sp , x, y, z, 0., 0., 0., 1., 0., 0); + } + + // Create a second accumulator_array + accumulator_array_t* accumulator_array2 = new_accumulator_array( grid ); + + clear_accumulator_array(accumulator_array); + clear_accumulator_array(accumulator_array2); + + // Hack into vpic internals + int failed = 0; + load_interpolator_array( interpolator_array, field_array ); + for( int n=0; nn_pipeline+1; n++) + { + accumulator_t* a = accumulator_array->a + (n * accumulator_array2->stride); + accumulator_t* a2 = accumulator_array2->a + (n * accumulator_array2->stride); + for (int i = 0; i < grid->nv; i++) + { + if ( + (a[i].jx[0] != a2[i].jx[0]) || + (a[i].jx[1] != a2[i].jx[1]) || + (a[i].jx[2] != a2[i].jx[2]) || + (a[i].jx[3] != a2[i].jx[3]) || + (a[i].jy[0] != a2[i].jy[0]) || + (a[i].jy[1] != a2[i].jy[1]) || + (a[i].jy[2] != a2[i].jy[2]) || + (a[i].jy[3] != a2[i].jy[3]) || + (a[i].jz[0] != a2[i].jz[0]) || + (a[i].jz[1] != a2[i].jz[1]) || + (a[i].jz[2] != a2[i].jz[2]) || + (a[i].jz[3] != a2[i].jz[3]) + ) + { + sim_log(" Failed at " << i ); + failed++; + } + } + if( failed ) { sim_log( "FAIL" ); abort(1); } + } + + for ( int m=0; mp[m].ux != 1*(n+1) || + sp->p[m].uy != 2*(n+1) || + sp->p[m].uz != 3*(n+1) ) { + failed++; + sim_log( n << " " << + m << " " << + sp->p[m].i << " " << + sp->p[m].dx << " " << + sp->p[m].dy << " " << + sp->p[m].dz << " " << + sp->p[m].ux << " " << + sp->p[m].uy << " " << + sp->p[m].uz << " " << + sp->p[m].w ); + } + } + } + + if( failed ) { sim_log( "FAIL" ); abort(1); } + sim_log( "pass" ); + halt_mp(); + exit(0); +} + +begin_diagnostics { +} + +begin_particle_injection { +} + +begin_current_injection { +} + +begin_field_injection { +} + +begin_particle_collisions { +} diff --git a/test/integrated/pcomm.deck 
b/test/integrated/pcomm.deck deleted file mode 100644 index b03e0926..00000000 --- a/test/integrated/pcomm.deck +++ /dev/null @@ -1,149 +0,0 @@ -// Test particle communications -// -// Particle starts in center of the center cell of rank 0 and moves exactly -// +0.5, +0.5, +0.5 of a cell every step. After 10 steps it will land exactly -// on the corner shared by ranks 0-7. On the push for step 11, the particle -// will pass through 3 domains simultaneously at land in rank 7. After 20 steps -// the particle should be in the middle cell of rank 7. After 30 steps it will -// land exactly on the corner shared by ranks 0-7. On the push for step 31, -// the particle will pass through 3 domains simultaneously and land in rank 0. -// On step 40, the particle should end up exactly where it started. -// -// This input deck was written by: -// Kevin J Bowers, Ph.D. -// Plasma Physics Group (X-1) -// Applied Physics Division -// Los Alamos National Lab -// March/April 2004 - Original version written - -begin_globals { - int fail; -}; - -begin_initialization { - if( nproc()!=8 ) { - sim_log( "This test case requires 8 processors" ); abort(1); - } - - num_step = 40; - - define_units( 1, 1 ); - define_timestep( 0.5 ); - define_periodic_grid( 0, 0, 0, // Box low corner - 10, 10, 10, // Box high corner - 10, 10, 10, // Box resolution - 2, 2, 2 ); // Topology - define_material( "vacuum", 1 ); - define_field_array(); - species_t * sp = define_species( "test_species", 1, 1, 1, 1, 0, 0 ); - inject_particle( sp, 2.5, 2.5, 2.5, 1, 1, 1, 0, 0, 0 ); - global->fail = 0; -} - -begin_diagnostics { - species_t * sp = find_species_name( "test_species", species_list ); - - if( step()==0 ) { - if( rank()==0 ) { - if( sp->np!=1 || - sp->p[0].i!=voxel(3,3,3) || - sp->p[0].dx!=0 || sp->p[0].dy!=0 || sp->p[0].dz!=0 || - sp->p[0].ux!=1 || sp->p[0].uy!=1 || sp->p[0].uz!=1 ) global->fail++; - } else { - if( sp->np!=0 ) global->fail++; - } - } - - if( step()==10 ) { - if( rank()==0 ) { - if( sp->np!=1 || - sp->p[0].i!=voxel(5,5,5) || - sp->p[0].dx!=1 || sp->p[0].dy!=1 || sp->p[0].dz!=1 || - sp->p[0].ux!=1 || sp->p[0].uy!=1 || sp->p[0].uz!=1 ) global->fail++; - } else { - if( sp->np!=0 ) global->fail++; - } - } - - if( step()==11 ) { - if( rank()==7 ) { - if( sp->np!=1 || - sp->p[0].i!=voxel(1,1,1) || - sp->p[0].dx!=-0.5 || sp->p[0].dy!=-0.5 || sp->p[0].dz!=-0.5 || - sp->p[0].ux!=1 || sp->p[0].uy!=1 || sp->p[0].uz!=1 ) global->fail++; - } else { - if( sp->np!=0 ) global->fail++; - } - } - - if( step()==20 ) { - if( rank()==7 ) { - if( sp->np!=1 || - sp->p[0].i!=voxel(3,3,3) || - sp->p[0].dx!=0 || sp->p[0].dy!=0 || sp->p[0].dz!=0 || - sp->p[0].ux!=1 || sp->p[0].uy!=1 || sp->p[0].uz!=1 ) global->fail++; - } else { - if( sp->np!=0 ) global->fail++; - - } - } - - if( step()==30 ) { - if( rank()==7 ) { - if( sp->np!=1 || - sp->p[0].i!=voxel(5,5,5) || - sp->p[0].dx!=1 || sp->p[0].dy!=1 || sp->p[0].dz!=1 || - sp->p[0].ux!=1 || sp->p[0].uy!=1 || sp->p[0].uz!=1 ) global->fail++; - } else { - if( sp->np!=0 ) global->fail++; - } - } - - if( step()==31 ) { - if( rank()==0 ) { - if( sp->np!=1 || - sp->p[0].i!=voxel(1,1,1) || - sp->p[0].dx!=-0.5 || sp->p[0].dy!=-0.5 || sp->p[0].dz!=-0.5 || - sp->p[0].ux!=1 || sp->p[0].uy!=1 || sp->p[0].uz!=1 ) global->fail++; - } else { - if( sp->np!=0 ) global->fail++; - } - } - - if( step()==40 ) { - if( rank()==0 ) { - if( sp->np!=1 || - sp->p[0].i!=voxel(3,3,3) || - sp->p[0].dx!=0 || sp->p[0].dy!=0 || sp->p[0].dz!=0 || - sp->p[0].ux!=1 || sp->p[0].uy!=1 || sp->p[0].uz!=1 ) global->fail++; - } else { - if( 
sp->np!=0 ) global->fail++; - } - } - - if( step()==40 ) { - for( int i=0; i<8; i++ ) { - if( rank()==i ) { - if( global->fail ) { sim_log_local( "FAIL" ); abort(1); } - sim_log_local( "pass" ); - } - barrier(); - } - halt_mp(); - exit(0); - } -} - -begin_particle_injection { -} - -begin_current_injection { -} - -begin_field_injection { -} - -begin_particle_collisions { -} - - diff --git a/test/integrated/to_completion/CMakeLists.txt b/test/integrated/to_completion/CMakeLists.txt new file mode 100644 index 00000000..d09a3620 --- /dev/null +++ b/test/integrated/to_completion/CMakeLists.txt @@ -0,0 +1,54 @@ +# WARNING: None of these tests test correctness, only that they don't die. + +set(MPIEXEC_NUMPROC 1) + +# TODO: If would be good if we could detect that the machine supports the +# desired number of threads +set(MPIEXEC_NUMPROC_PARALLEL 8) + +set(ARGS "") + +list(APPEND DEFAULT_ARG_TESTS + simple # This test is a simple run which should not die + dump # This is a simple run which should dump restart files + reconnection_test # This is a simple reconnection run + ) + +list(APPEND RESTART_DECK dump) # Reuse existing deck and start half way +list(APPEND RESTART_BINARY restore) + +foreach(test ${DEFAULT_ARG_TESTS}) + build_a_vpic(${test} ${CMAKE_CURRENT_SOURCE_DIR}/${test}.deck) +endforeach() + +foreach(test ${DEFAULT_ARG_TESTS}) + add_test(${test} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_NUMPROC} + ${MPIEXEC_PREFLAGS} ${test} ${MPIEXEC_POSTFLAGS} ${ARGS}) +endforeach() + +# Try a parallel run +set (PARALLEL_TEST parallel) +build_a_vpic(${PARALLEL_TEST} ${CMAKE_CURRENT_SOURCE_DIR}/simple.deck) +add_test(${PARALLEL_TEST} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} + ${MPIEXEC_NUMPROC_PARALLEL} ${MPIEXEC_PREFLAGS} ${PARALLEL_TEST} + ${MPIEXEC_POSTFLAGS} ${ARGS}) + +# Try a threaded run +set (THREADED_TEST threaded) +list(APPEND THREADED_ARGS --tpp ${MPIEXEC_NUMPROC_PARALLEL}) + +build_a_vpic(${THREADED_TEST} ${CMAKE_CURRENT_SOURCE_DIR}/simple.deck) +add_test(${THREADED_TEST} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_NUMPROC} + ${MPIEXEC_PREFLAGS} ${THREADED_TEST} ${MPIEXEC_POSTFLAGS} ${THREADED_ARGS}) + +# TODO: Do we want to try an MPI + Threaded runs + +# Test Restart (restore) functionality + +list(APPEND CHECKPOINT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/checkpt.1") +list(APPEND RESTART_ARGS --restore ${CHECKPOINT_FILE}) + +build_a_vpic(${RESTART_BINARY} ${CMAKE_CURRENT_SOURCE_DIR}/${RESTART_DECK}.deck) +add_test(${RESTART_BINARY} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} + ${MPIEXEC_NUMPROC} ${MPIEXEC_PREFLAGS} ${RESTART_BINARY} + ${MPIEXEC_POSTFLAGS} ${RESTART_ARGS}) diff --git a/test/integrated/to_completion/checkpt.1.0 b/test/integrated/to_completion/checkpt.1.0 new file mode 100644 index 00000000..d657bef0 Binary files /dev/null and b/test/integrated/to_completion/checkpt.1.0 differ diff --git a/test/integrated/to_completion/dump.deck b/test/integrated/to_completion/dump.deck new file mode 100644 index 00000000..9e970fca --- /dev/null +++ b/test/integrated/to_completion/dump.deck @@ -0,0 +1,340 @@ +// Dummy test deck to make sure ensure a small run can successfully operate +// (based on harris, but modified. May not make much physical sense) + +begin_globals { + double energies_interval; + double fields_interval; + double ehydro_interval; + double ihydro_interval; + double eparticle_interval; + double iparticle_interval; + double restart_interval; +}; + +begin_initialization { + // At this point, there is an empty grid and the random number generator is + // seeded with the rank. 
The grid, materials, species need to be defined. + // Then the initial non-zero fields need to be loaded at time level 0 and the + // particles (position and momentum both) need to be loaded at time level 0. + + double input_mass_ratio; + int input_seed; + + // Set sensible defaults + input_mass_ratio = 1.0; + input_seed = 0; + + seed_entropy( input_seed ); + + // Diagnostic messages can be passed written (usually to stderr) + sim_log( "Computing simulation parameters"); + + // Define the system of units for this problem (natural units) + double L = 1; // Length normalization (sheet thickness) + double ec = 1; // Charge normalization + double me = 1; // Mass normalization + double c = 1; // Speed of light + double eps0 = 1; // Permittivity of space + + // Physics parameters + double mi_me = input_mass_ratio; // Ion mass / electron mass + double rhoi_L = 1; // Ion thermal gyroradius / Sheet thickness + double Ti_Te = 1; // Ion temperature / electron temperature + double wpe_wce = 3; // Electron plasma freq / electron cycltron freq + double theta = 0; // Orientation of the simulation wrt current sheet + + // Numerical parameters + double Lx = 16*L; // How big should the box be in the x direction + double Ly = 16*L; // How big should the box be in the y direction + double Lz = 16*L; // How big should the box be in the z direction + double nx = 8; // Global resolution in the x direction + double ny = 8; // Global resolution in the y direction + double nz = 1; // Global resolution in the z direction + double nppc = 16; // Average number of macro particles per cell (both species combined!) + double cfl_req = 0.99; // How close to Courant should we try to run + double wpedt_max = 0.36; // How big a timestep is allowed if Courant is not too restrictive + double damp = 0.001; // Level of radiation damping + + // Derived quantities + double mi = me*mi_me; // Ion mass + double kTe = me*c*c/(2*wpe_wce*wpe_wce*(1+Ti_Te)); // Electron temperature + double kTi = kTe*Ti_Te; // Ion temperature + double vthi = sqrt(2*kTi/mi); // Ion thermal velocity (B.D. convention) + double wci = vthi/(rhoi_L*L); // Ion cyclotron frequency + double wce = wci*mi_me; // Electron cyclotron frequency + double wpe = wce*wpe_wce; // Electron plasma frequency + double vdre = c*c*wce/(wpe*wpe*L*(1+Ti_Te)); // Electron drift velocity + double vdri = -Ti_Te*vdre; // Ion drift velocity + double b0 = me*wce/ec; // Asymptotic magnetic field strength + double n0 = me*eps0*wpe*wpe/(ec*ec); // Peak electron density (also peak ion density) + double Npe = 2*n0*Ly*Lz*L*tanh(0.5*Lx/L); // Number of physical electrons in box + double Npi = Npe; // Number of physical ions in box + double Ne = 0.5*nppc*nx*ny*nz; // Total macro electrons in box + Ne = trunc_granular(Ne,nproc()); // Make it divisible by number of processors + double Ni = Ne; // Total macro ions in box + double we = Npe/Ne; // Weight of a macro electron + double wi = Npi/Ni; // Weight of a macro ion + double gdri = 1/sqrt(1-vdri*vdri/(c*c)); // gamma of ion drift frame + double gdre = 1/sqrt(1-vdre*vdre/(c*c)); // gamma of electron drift frame + double udri = vdri*gdri; // 4-velocity of ion drift frame + double udre = vdre*gdre; // 4-velocity of electron drift frame + double uthi = sqrt(kTi/mi)/c; // Normalized ion thermal velocity (K.B. convention) + double uthe = sqrt(kTe/me)/c; // Normalized electron thermal velocity (K.B. 
convention) + double cs = cos(theta); + double sn = sin(theta); + + // Determine the timestep + double dg = courant_length(Lx,Ly,Lz,nx,ny,nz); // Courant length + double dt = cfl_req*dg/c; // Courant limited time step + if( wpe*dt>wpedt_max ) dt=wpedt_max/wpe; // Override time step if plasma frequency limited + + //////////////////////////////////////// + // Setup high level simulation parmeters + + num_step = 2; + status_interval = 1; + + global->energies_interval = status_interval; + global->fields_interval = status_interval; + global->ehydro_interval = status_interval; + global->ihydro_interval = status_interval; + global->eparticle_interval = status_interval; + global->iparticle_interval = status_interval; + global->restart_interval = status_interval; + + /////////////////////////// + // Setup the space and time + + // Setup basic grid parameters + define_units( c, eps0 ); + define_timestep( dt ); + + // Parition a periodic box among the processors sliced uniformly along y + define_periodic_grid( -0.5*Lx, 0, 0, // Low corner + 0.5*Lx, Ly, Lz, // High corner + nx, ny, nz, // Resolution + 1, nproc(), 1 ); // Topology + + // Override some of the boundary conditions to put a particle reflecting + // perfect electrical conductor on the -x and +x boundaries + set_domain_field_bc( BOUNDARY(-1,0,0), pec_fields ); + set_domain_field_bc( BOUNDARY( 1,0,0), pec_fields ); + set_domain_particle_bc( BOUNDARY(-1,0,0), reflect_particles ); + set_domain_particle_bc( BOUNDARY( 1,0,0), reflect_particles ); + + define_material( "vacuum", 1 ); + // Note: define_material defaults to isotropic materials with mu=1,sigma=0 + // Tensor electronic, magnetic and conductive materials are supported + // though. See "shapes" for how to define them and assign them to regions. + // Also, space is initially filled with the first material defined. + + // If you pass NULL to define field array, the standard field array will + // be used (if damp is not provided, no radiation damping will be used). + define_field_array( NULL, damp ); + + //////////////////// + // Setup the species + + // Allow 50% more local_particles in case of non-uniformity + // VPIC will pick the number of movers to use for each species + // Both species use out-of-place sorting + species_t * ion = define_species( "ion", ec, mi, 1.5*Ni/nproc(), -1, 40, 1 ); + species_t * electron = define_species( "electron", -ec, me, 1.5*Ne/nproc(), -1, 20, 1 ); + + /////////////////////////////////////////////////// + // Log diagnostic information about this simulation + + //////////////////////////// + // Load fields and particles + + sim_log( "Loading fields" ); + + set_region_field( everywhere, 0, 0, 0, // Electric field + 0, -sn*b0*tanh(x/L), cs*b0*tanh(x/L) ); // Magnetic field + // Note: everywhere is a region that encompasses the entire simulation + // In general, regions are specied as logical equations (i.e. 
x>0 && x+y<2) + + sim_log( "Loading particles" ); + + double ymin = rank()*Ly/nproc(), ymax = (rank()+1)*Ly/nproc(); + + repeat( Ni/nproc() ) { + double x, y, z, ux, uy, uz, d0; + + // Pick an appropriately distributed random location for the pair + do { + x = L*atanh( uniform( rng(0), -1, 1 ) ); + } while( x<=-0.5*Lx || x>=0.5*Lx ); + y = uniform( rng(0), ymin, ymax ); + z = uniform( rng(0), 0, Lz ); + + // For the ion, pick an isothermal normalized momentum in the drift frame + // (this is a proper thermal equilibrium in the non-relativistic limit), + // boost it from the drift frame to the frame with the magnetic field + // along z and then rotate it into the lab frame. Then load the particle. + // Repeat the process for the electron. + + ux = normal( rng(0), 0, uthi ); + uy = normal( rng(0), 0, uthi ); + uz = normal( rng(0), 0, uthi ); + d0 = gdri*uy + sqrt(ux*ux+uy*uy+uz*uz+1)*udri; + uy = d0*cs - uz*sn; + uz = d0*sn + uz*cs; + inject_particle( ion, x, y, z, ux, uy, uz, wi, 0, 0 ); + + ux = normal( rng(0), 0, uthe ); + uy = normal( rng(0), 0, uthe ); + uz = normal( rng(0), 0, uthe ); + d0 = gdre*uy + sqrt(ux*ux+uy*uy+uz*uz+1)*udre; + uy = d0*cs - uz*sn; + uz = d0*sn + uz*cs; + inject_particle( electron, x, y, z, ux, uy, uz, we, 0, 0 ); + } + + // Upon completion of the initialization, the following occurs: + // - The synchronization error (tang E, norm B) is computed between domains + // and tang E / norm B are synchronized by averaging where discrepancies + // are encountered. + // - The initial divergence error of the magnetic field is computed and + // one pass of cleaning is done (for good measure) + // - The bound charge density necessary to give the simulation an initially + // clean divergence e is computed. + // - The particle momentum is uncentered from u_0 to u_{-1/2} + // - The user diagnostics are called on the initial state + // - The physics loop is started + // + // The physics loop consists of: + // - Advance particles from x_0,u_{-1/2} to x_1,u_{1/2} + // - User particle injection at x_{1-age}, u_{1/2} (use inject_particles) + // - User current injection (adjust field(x,y,z).jfx, jfy, jfz) + // - Advance B from B_0 to B_{1/2} + // - Advance E from E_0 to E_1 + // - User field injection to E_1 (adjust field(x,y,z).ex,ey,ez,cbx,cby,cbz) + // - Advance B from B_{1/2} to B_1 + // - (periodically) Divergence clean electric field + // - (periodically) Divergence clean magnetic field + // - (periodically) Synchronize shared tang e and norm b + // - Increment the time step + // - Call user diagnostics + // - (periodically) Print a status message +} + +begin_diagnostics { + +# define should_dump(x) (global->x##_interval>0 && remainder(step(),global->x##_interval)==0) + + int DUMP_FLAG = 1; + + if( step() == DUMP_FLAG ) { + // A grid dump contains all grid parameters, field boundary conditions, + // particle boundary conditions and domain connectivity information. This + // is stored in a binary format. Each rank makes a grid dump + dump_grid("grid"); + + // A materials dump contains all the materials parameters. This is in a + // text format. Only rank 0 makes the materials dump + dump_materials("materials"); + + // A species dump contains the physics parameters of a species. This is in + // a text format. Only rank 0 makes the species dump + dump_species("species"); + } + + // Energy dumps store all the energies in various directions of E and B + // and the total kinetic (not including rest mass) energies of each species + // species in a simple text format. 
By default, the energies are appended to + // the file. However, if a "0" is added to the dump_energies call, a new + // energies dump file will be created. The energies are in the units of the + // problem and are all time centered appropriately. Note: When restarting a + // simulation from a restart dump made at a prior time step to the last + // energies dump, the energies file will have a "hiccup" of intervening + // time levels. This "hiccup" will not occur if the simulation is aborted + // immediately following a restart dump. Energies dumps are in a text + // format and the layout is documented at the top of the file. Only rank 0 + // makes an energies dump. + if( should_dump(energies) ) dump_energies( "energies", step()==0 ? 0 : 1 ); + + // Field dumps store the raw electromagnetic fields, sources and material + // placement and a number of auxiliary fields. E, B and RHOB are + // time centered, JF and TCA are half a step old. Material fields are static + // and the remaining fields (DIV E ERR, DIV B ERR and RHOF) are for + // debugging purposes. By default, field dump filenames are tagged with + // step(). However, if a "0" is added to the call, the filename will not be + // tagged. The JF that gets stored is accumulated with a charge-conserving + // algorithm. As a result, JF is not valid until at least one timestep has + // been completed. Field dumps are in a binary format. Each rank makes a + // field dump. + if( step()== DUMP_FLAG ) dump_fields("fields"); // Get first valid total J + if( should_dump(fields) ) dump_fields("fields"); + + // Hydro dumps store particle charge density, current density and + // stress-energy tensor. All these quantities are known at the time + // t = time(). All these quantities are accumulated trilinearly, + // node-centered. By default, species dump filenames are tagged with + // step(). However, if a "0" is added to the call, the filename will not + // be tagged. Note that the current density accumulated by this routine is + // purely diagnostic. It is not used by the simulation and it is not + // accumulated using a self-consistent charge-conserving method. Hydro dumps + // are in a binary format. Each rank makes a hydro dump. + if( should_dump(ehydro) ) dump_hydro("electron","ehydro"); + if( should_dump(ihydro) ) dump_hydro("ion", "ihydro"); + + // Particle dumps store the particle data for a given species. The data + // written is known at the time t = time(). By default, particle dumps + // are tagged with step(). However, if a "0" is added to the call, the + // filename will not be tagged. Particle dumps are in a binary format. + // Each rank makes a particle dump. + if( should_dump(eparticle) ) dump_particles("electron","eparticle"); + if( should_dump(iparticle) ) dump_particles("ion", "iparticle"); + + // A checkpt is made by calling checkpt( fbase, tag ) where fbase is a string + // and tag is an integer. A typical usage is: + // checkpt( "checkpt", step() ). + // This will cause each process to write its simulation state to a file + // whose name is based on fbase, tag and the node's rank. For the above + // usage, if called on step 314 on a 4 process run, the four files: + // checkpt.314.0, checkpt.314.1, checkpt.314.2, checkpt.314.3 + // will be written. The simulation can then be restarted from this point by + // invoking the application with "--restore checkpt.314". checkpt must be + // the _VERY_ LAST_ diagnostic called. If not, diagnostics performed after + // the checkpt but before the next timestep will be missed on restore. 
+ // Restart dumps are in a binary format unique to each simulation. + + if( should_dump(restart) ) checkpt( "checkpt", step() ); + + // If you want to write a checkpt after a certain amount of simulation time, + // use uptime() in conjunction with checkpt. For example, this will cause + // the simulation state to be written after 7.5 hours of running to the + // same file every time (useful for dealing with quotas on big machines). + //if( uptime()>=27000 ) { + // checkpt( "timeout", 0 ); + // abort(0); + //} + +# undef should_dump + +} + +begin_particle_injection { + + // No particle injection for this simulation + +} + +begin_current_injection { + + // No current injection for this simulation + +} + +begin_field_injection { + + // No field injection for this simulation + +} + +begin_particle_collisions{ + + // No collisions for this simulation + +} diff --git a/test/integrated/to_completion/reconnection_test.deck b/test/integrated/to_completion/reconnection_test.deck new file mode 100644 index 00000000..39169534 --- /dev/null +++ b/test/integrated/to_completion/reconnection_test.deck @@ -0,0 +1,630 @@ +// Reconnection Problem +// Physical Motivation: Hotspots in Pictor A +// Numerically: 2d, sigma ~ 1, low beta +// double force free sheets +// double periodic, +// duplicate species above and below current sheet +// Author: Patrick Kilian +/////////////////////////////////////////////////////////////////////// +#include <vector> + +// naming convention for the hydro dump files +#define HYDRO_FILE_FORMAT "hydro/T.%d/%s.%d.%d" + +begin_globals { + int energies_interval; + int fields_interval; + int hydro_interval; + int particle_interval; + + double topology_x; // domain topology + double topology_y; + double topology_z; + + // Variables for output configuration + DumpParameters fdParams; + DumpParameters he1dParams; + DumpParameters he2dParams; + DumpParameters hH1dParams; + DumpParameters hH2dParams; + std::vector<DumpParameters *> outputParams; +}; + + +begin_initialization { + int smalltest = 1; + + // use natural PIC units + double ec = 1.; // Charge normalization + double me = 1.; // Mass normalization + double c = 1.; // Speed of light + double de = 1.; // Length normalization (electron inertial length) + double eps0 = 1.; // Permittivity of space + + // key parameters + double mi_me = 1836.2; // natural mass ratio + double nd_nb = 0.; // Over density in the initial CS + double b_drift = 0.0; // Beta of drift in initial CS + double sigma_i = 3.0; // cold magnetization + double sigma_e = sigma_i*mi_me;// electron cold magnetization + double th_be = sigma_e/200.; // background electron temperature (as a fraction of rest energy) + double th_bi = sigma_i/200.; // background ion temperature + + // derived quantities + //double mu0 = 1./(c*c*eps0);// Permeability + double wpe = c / de; // electron plasma frequency + double nbe0 = wpe*wpe * me *eps0 / (ec*ec); // background electron density + double nbi0 = nbe0; // neutral background + double nde0 = nd_nb * nbe0; // peak electron density + double ndi0 = nd_nb * nbi0; // peak ion density + double mi = mi_me * me; // proton mass + double wpi = sqrt(nbi0 * ec*ec / (eps0 * mi)); + double di = c / wpi; // Ion inertial length + double Ti_Te = (th_bi * mi*c*c) / (th_be * me*c*c);// Temperature ratio + double g_drift = 1./sqrt(1. - b_drift*b_drift); + double vthe = sqrt(th_be * me*c*c / me); // Horrible non-relativistic approximation + double vthi = sqrt(th_bi * mi*c*c / mi); // for ions probably ok + + // reconnecting magnetic field + double B0 = 1. 
* sqrt(sigma_i * nbi0 * mi * c*c / eps0); + // guide field + double Bg = 0. * sqrt(sigma_i * nbi0 * mi * c*c / eps0); + // magnetic field perturbation strength + double dB = 0.01 * B0; + + // non-relativistic electron gyro radius + //double rho_e0 = me * c*c / (ec * B0); + // non-relativistic ion gyro radius + double rho_i0 = mi * c*c / (ec * B0); + // non-relativistic electron gyro frequency + double wce = ec * B0 / (me * c); + double wci = ec * B0 / (mi * c); + double wpe_wce = wpe/wce; + // relativistic gyro radius + double rho_c = sigma_i * rho_i0; // this is also about sigma_e * rho_e0 + // relativistic gyro frequency + double W_c = c / rho_c; + + // system size + double Lx = 120. * rho_c; // large enough + if (smalltest) { + double smallLx = 1.2 * rho_c; + sim_log("Using Lx = "<energies_interval = energies_interval; + global->fields_interval = fields_interval; + global->hydro_interval = hydro_interval; + global->particle_interval = particle_interval; + + global->topology_x = topology_x; + global->topology_y = topology_y; + global->topology_z = topology_z; + + // Setup basic grid parameters + grid->dx = dx; + grid->dy = dy; + grid->dz = dz; + grid->dt = dt; + grid->cvac = c; + grid->eps0 = eps0; + + // Define the grid + define_periodic_grid( 0, 0, 0, // Low corner + Lx, Ly, Lz, // High corner + nx, ny, nz, // Resolution + topology_x, topology_y, topology_z); // Topology + + // Setup materials + sim_log("Setting up materials. "); + define_material( "vacuum", 1 ); + define_field_array(NULL); + + // Setup the species + sim_log("Setting up species. "); + double nmax = 4.0*Ne/nproc(); + double nmovers = 0.1*nmax; + double sort_method = 1; // 0=in place and 1=out of place + species_t *electron1 = define_species("electron1",-ec, me, nmax, nmovers, electron_sort_interval, sort_method); + species_t *electron2 = define_species("electron2",-ec, me, nmax, nmovers, electron_sort_interval, sort_method); + species_t *ion1 = define_species("ion1", ec, mi, nmax, nmovers, ion_sort_interval, sort_method); + species_t *ion2 = define_species("ion2", ec, mi, nmax, nmovers, ion_sort_interval, sort_method); + + /////////////////////////////////////////////////// + // Log diagnostic information about this simulation + + sim_log( "***********************************************" ); + sim_log("* Topology: "<x0 , xmax = grid->x0+(grid->dx)*(grid->nx); + double ymin = grid->y0 , ymax = grid->y0+(grid->dy)*(grid->ny); + double zmin = grid->z0 , zmax = grid->z0+(grid->dz)*(grid->nz); + + int particle_number = 0; + //int rank_int = int(rank()); + repeat ( Ne/nproc() ) { + double x, y, z, ux, uy, uz; + double gu, ua, ub; + species_t* target_species; + + x = uniform( rng(0), xmin, xmax); + y = uniform( rng(0), ymin, ymax); + z = uniform( rng(0), zmin, zmax); + +# define VDY -0.5*(B0/delta) * (pow(cosh((z-0.75*Lz)/delta), -2.) - pow(cosh((z-0.25*Lz)/delta), -2.)) +# define VDX VDY*BX/BY +# define VD sqrt(VDX*VDX + VDY*VDY) +# define GVD 1./sqrt(1.-VD*VD/(c*c)) + + if(!(VDX < 1.) || !(VDY < 1.) 
){ + sim_log("x = "<fdParams.format = band; + sim_log ( "Fields output format = band" ); + + global->he1dParams.format = band; + global->he2dParams.format = band; + sim_log ( "Electron species output format = band" ); + + global->hH1dParams.format = band; + global->hH2dParams.format = band; + sim_log ( "Ion species output format = band" ); + + // relative path to fields data from global header + sprintf(global->fdParams.baseDir, "fields"); + + // base file name for fields output + sprintf(global->fdParams.baseFileName, "fields"); + + global->fdParams.stride_x = 1; + global->fdParams.stride_y = 1; + global->fdParams.stride_z = 1; + + // add field parameters to list + global->outputParams.push_back(&global->fdParams); + + sim_log ( "Fields x-stride " << global->fdParams.stride_x ); + sim_log ( "Fields y-stride " << global->fdParams.stride_y ); + sim_log ( "Fields z-stride " << global->fdParams.stride_z ); + + // relative path to electron species data from global header + sprintf(global->he1dParams.baseDir, "hydro"); + sprintf(global->he2dParams.baseDir, "hydro"); + + // base file name for fields output + sprintf(global->he1dParams.baseFileName, "ehydro1"); + sprintf(global->he2dParams.baseFileName, "ehydro2"); + + global->he1dParams.stride_x = 1; + global->he1dParams.stride_y = 1; + global->he1dParams.stride_z = 1; + global->he2dParams.stride_x = 1; + global->he2dParams.stride_y = 1; + global->he2dParams.stride_z = 1; + + // add electron species parameters to list + global->outputParams.push_back(&global->he1dParams); + global->outputParams.push_back(&global->he2dParams); + + sim_log ( "Electron species x-stride " << global->he1dParams.stride_x ); + sim_log ( "Electron species y-stride " << global->he1dParams.stride_y ); + sim_log ( "Electron species z-stride " << global->he1dParams.stride_z ); + + // relative path to electron species data from global header + sprintf(global->hH1dParams.baseDir, "hydro"); + sprintf(global->hH2dParams.baseDir, "hydro"); + + // base file name for fields output + sprintf(global->hH1dParams.baseFileName, "Hhydro1"); + sprintf(global->hH2dParams.baseFileName, "Hhydro2"); + + global->hH1dParams.stride_x = 1; + global->hH1dParams.stride_y = 1; + global->hH1dParams.stride_z = 1; + global->hH2dParams.stride_x = 1; + global->hH2dParams.stride_y = 1; + global->hH2dParams.stride_z = 1; + + sim_log ( "Ion species x-stride " << global->hH1dParams.stride_x ); + sim_log ( "Ion species y-stride " << global->hH1dParams.stride_y ); + sim_log ( "Ion species z-stride " << global->hH1dParams.stride_z ); + + // add electron species parameters to list + global->outputParams.push_back(&global->hH1dParams); + global->outputParams.push_back(&global->hH2dParams); + + global->fdParams.output_variables( electric | magnetic ); + global->he1dParams.output_variables( current_density | charge_density | stress_tensor ); + global->he2dParams.output_variables( current_density | charge_density | stress_tensor ); + global->hH1dParams.output_variables( current_density | charge_density | stress_tensor ); + global->hH2dParams.output_variables( current_density | charge_density | stress_tensor ); + + sim_log("*** Finished with user-specified initialization ***"); +} //begin_initialization + +#define should_dump(x) \ + (global->x##_interval>0 && remainder(step(), global->x##_interval) == 0) + +begin_diagnostics { + /*-------------------------------------------------------------------------- + * Normal rundata dump + *------------------------------------------------------------------------*/ + 
if(step()==0) { + dump_mkdir("fields"); + dump_mkdir("hydro"); + dump_mkdir("rundata"); + dump_mkdir("particle"); + + dump_grid("rundata/grid"); + dump_materials("rundata/materials"); + dump_species("rundata/species"); + global_header("global", global->outputParams); + } + + // Normal rundata energies dump + if(should_dump(energies)) { + dump_energies("rundata/energies", step() == 0 ? 0 : 1); + } + + // Field data output + if(step() == 1 || should_dump(fields)) { + field_dump(global->fdParams); + } + + // Hydro data output + if(should_dump(hydro)) { + hydro_dump("electron1", global->he1dParams); + hydro_dump("electron2", global->he2dParams); + hydro_dump("ion1", global->hH1dParams); + hydro_dump("ion2", global->hH2dParams); + } + + // Dump particle data + char subdir[36]; + if(should_dump(particle)) + { + sprintf(subdir,"particle/T.%lld",step()); + dump_mkdir(subdir); + sprintf(subdir,"particle/T.%lld/eparticle1",step()); + dump_particles("electron1",subdir); + sprintf(subdir,"particle/T.%lld/eparticle2",step()); + dump_particles("electron2",subdir); + sprintf(subdir,"particle/T.%lld/Hparticle1",step()); + dump_particles("ion1",subdir); + sprintf(subdir,"particle/T.%lld/Hparticle2",step()); + dump_particles("ion2",subdir); + } + +} // end diagnostics + + + +begin_particle_injection { + // no particle injection +} + +begin_current_injection { + // No current injection for this simulation +} + +begin_field_injection { + // No field injection +} + +begin_particle_collisions { + // No particle collisions in this simulation +} + diff --git a/test/integrated/to_completion/simple.deck b/test/integrated/to_completion/simple.deck new file mode 100644 index 00000000..4ad1c657 --- /dev/null +++ b/test/integrated/to_completion/simple.deck @@ -0,0 +1,243 @@ +// Dummy test deck to make sure ensure a small run can successfully operate +// (based on harris, but modified. May not make much physical sense) + +begin_globals { + double energies_interval; + double fields_interval; + double ehydro_interval; + double ihydro_interval; + double eparticle_interval; + double iparticle_interval; + double restart_interval; +}; + +begin_initialization { + // At this point, there is an empty grid and the random number generator is + // seeded with the rank. The grid, materials, species need to be defined. + // Then the initial non-zero fields need to be loaded at time level 0 and the + // particles (position and momentum both) need to be loaded at time level 0. 
+ + double input_mass_ratio; + int input_seed; + + // Set sensible defaults + input_mass_ratio = 1.0; + input_seed = 0; + + seed_entropy( input_seed ); + + // Diagnostic messages can be passed written (usually to stderr) + sim_log( "Computing simulation parameters"); + + // Define the system of units for this problem (natural units) + double L = 1; // Length normalization (sheet thickness) + double ec = 1; // Charge normalization + double me = 1; // Mass normalization + double c = 1; // Speed of light + double eps0 = 1; // Permittivity of space + + // Physics parameters + double mi_me = input_mass_ratio; // Ion mass / electron mass + double rhoi_L = 1; // Ion thermal gyroradius / Sheet thickness + double Ti_Te = 1; // Ion temperature / electron temperature + double wpe_wce = 3; // Electron plasma freq / electron cycltron freq + double theta = 0; // Orientation of the simulation wrt current sheet + + // Numerical parameters + double Lx = 16*L; // How big should the box be in the x direction + double Ly = 16*L; // How big should the box be in the y direction + double Lz = 16*L; // How big should the box be in the z direction + double nx = 8; // Global resolution in the x direction + double ny = 8; // Global resolution in the y direction + double nz = 1; // Global resolution in the z direction + double nppc = 16; // Average number of macro particles per cell (both species combined!) + double cfl_req = 0.99; // How close to Courant should we try to run + double wpedt_max = 0.36; // How big a timestep is allowed if Courant is not too restrictive + double damp = 0.001; // Level of radiation damping + + // Derived quantities + double mi = me*mi_me; // Ion mass + double kTe = me*c*c/(2*wpe_wce*wpe_wce*(1+Ti_Te)); // Electron temperature + double kTi = kTe*Ti_Te; // Ion temperature + double vthi = sqrt(2*kTi/mi); // Ion thermal velocity (B.D. convention) + double wci = vthi/(rhoi_L*L); // Ion cyclotron frequency + double wce = wci*mi_me; // Electron cyclotron frequency + double wpe = wce*wpe_wce; // Electron plasma frequency + double vdre = c*c*wce/(wpe*wpe*L*(1+Ti_Te)); // Electron drift velocity + double vdri = -Ti_Te*vdre; // Ion drift velocity + double b0 = me*wce/ec; // Asymptotic magnetic field strength + double n0 = me*eps0*wpe*wpe/(ec*ec); // Peak electron density (also peak ion density) + double Npe = 2*n0*Ly*Lz*L*tanh(0.5*Lx/L); // Number of physical electrons in box + double Npi = Npe; // Number of physical ions in box + double Ne = 0.5*nppc*nx*ny*nz; // Total macro electrons in box + Ne = trunc_granular(Ne,nproc()); // Make it divisible by number of processors + double Ni = Ne; // Total macro ions in box + double we = Npe/Ne; // Weight of a macro electron + double wi = Npi/Ni; // Weight of a macro ion + double gdri = 1/sqrt(1-vdri*vdri/(c*c)); // gamma of ion drift frame + double gdre = 1/sqrt(1-vdre*vdre/(c*c)); // gamma of electron drift frame + double udri = vdri*gdri; // 4-velocity of ion drift frame + double udre = vdre*gdre; // 4-velocity of electron drift frame + double uthi = sqrt(kTi/mi)/c; // Normalized ion thermal velocity (K.B. convention) + double uthe = sqrt(kTe/me)/c; // Normalized electron thermal velocity (K.B. 
convention) + double cs = cos(theta); + double sn = sin(theta); + + // Determine the timestep + double dg = courant_length(Lx,Ly,Lz,nx,ny,nz); // Courant length + double dt = cfl_req*dg/c; // Courant limited time step + if( wpe*dt>wpedt_max ) dt=wpedt_max/wpe; // Override time step if plasma frequency limited + + //////////////////////////////////////// + // Setup high level simulation parameters + + num_step = 500; + status_interval = 1; + + clean_div_e_interval = status_interval; + clean_div_b_interval = status_interval; + + /////////////////////////// + // Setup the space and time + + // Setup basic grid parameters + define_units( c, eps0 ); + define_timestep( dt ); + + // Partition a periodic box among the processors sliced uniformly along y + define_periodic_grid( -0.5*Lx, 0, 0, // Low corner + 0.5*Lx, Ly, Lz, // High corner + nx, ny, nz, // Resolution + 1, nproc(), 1 ); // Topology + + // Override some of the boundary conditions to put a particle reflecting + // perfect electrical conductor on the -x and +x boundaries + set_domain_field_bc( BOUNDARY(-1,0,0), pec_fields ); + set_domain_field_bc( BOUNDARY( 1,0,0), pec_fields ); + set_domain_particle_bc( BOUNDARY(-1,0,0), reflect_particles ); + set_domain_particle_bc( BOUNDARY( 1,0,0), reflect_particles ); + + define_material( "vacuum", 1 ); + // Note: define_material defaults to isotropic materials with mu=1,sigma=0 + // Tensor electronic, magnetic and conductive materials are supported + // though. See "shapes" for how to define them and assign them to regions. + // Also, space is initially filled with the first material defined. + + // If you pass NULL to define field array, the standard field array will + // be used (if damp is not provided, no radiation damping will be used). + define_field_array( NULL, damp ); + + //////////////////// + // Setup the species + + // Allow 50% more local_particles in case of non-uniformity + // VPIC will pick the number of movers to use for each species + // Both species use out-of-place sorting + species_t * ion = define_species( "ion", ec, mi, 1.5*Ni/nproc(), -1, 40, 1 ); + species_t * electron = define_species( "electron", -ec, me, 1.5*Ne/nproc(), -1, 20, 1 ); + + /////////////////////////////////////////////////// + // Log diagnostic information about this simulation + + //////////////////////////// + // Load fields and particles + + sim_log( "Loading fields" ); + + set_region_field( everywhere, 0, 0, 0, // Electric field + 0, -sn*b0*tanh(x/L), cs*b0*tanh(x/L) ); // Magnetic field + // Note: everywhere is a region that encompasses the entire simulation + // In general, regions are specified as logical equations (i.e. x>0 && x+y<2) + + sim_log( "Loading particles" ); + + double ymin = rank()*Ly/nproc(), ymax = (rank()+1)*Ly/nproc(); + + repeat( Ni/nproc() ) { + double x, y, z, ux, uy, uz, d0; + + // Pick an appropriately distributed random location for the pair + do { + x = L*atanh( uniform( rng(0), -1, 1 ) ); + } while( x<=-0.5*Lx || x>=0.5*Lx ); + y = uniform( rng(0), ymin, ymax ); + z = uniform( rng(0), 0, Lz ); + + // For the ion, pick an isothermal normalized momentum in the drift frame + // (this is a proper thermal equilibrium in the non-relativistic limit), + // boost it from the drift frame to the frame with the magnetic field + // along z and then rotate it into the lab frame. Then load the particle. + // Repeat the process for the electron.
+ + ux = normal( rng(0), 0, uthi ); + uy = normal( rng(0), 0, uthi ); + uz = normal( rng(0), 0, uthi ); + d0 = gdri*uy + sqrt(ux*ux+uy*uy+uz*uz+1)*udri; + uy = d0*cs - uz*sn; + uz = d0*sn + uz*cs; + inject_particle( ion, x, y, z, ux, uy, uz, wi, 0, 0 ); + + ux = normal( rng(0), 0, uthe ); + uy = normal( rng(0), 0, uthe ); + uz = normal( rng(0), 0, uthe ); + d0 = gdre*uy + sqrt(ux*ux+uy*uy+uz*uz+1)*udre; + uy = d0*cs - uz*sn; + uz = d0*sn + uz*cs; + inject_particle( electron, x, y, z, ux, uy, uz, we, 0, 0 ); + } + + // Upon completion of the initialization, the following occurs: + // - The synchronization error (tang E, norm B) is computed between domains + // and tang E / norm B are synchronized by averaging where discrepancies + // are encountered. + // - The initial divergence error of the magnetic field is computed and + // one pass of cleaning is done (for good measure) + // - The bound charge density necessary to give the simulation an initially + // clean divergence e is computed. + // - The particle momentum is uncentered from u_0 to u_{-1/2} + // - The user diagnostics are called on the initial state + // - The physics loop is started + // + // The physics loop consists of: + // - Advance particles from x_0,u_{-1/2} to x_1,u_{1/2} + // - User particle injection at x_{1-age}, u_{1/2} (use inject_particles) + // - User current injection (adjust field(x,y,z).jfx, jfy, jfz) + // - Advance B from B_0 to B_{1/2} + // - Advance E from E_0 to E_1 + // - User field injection to E_1 (adjust field(x,y,z).ex,ey,ez,cbx,cby,cbz) + // - Advance B from B_{1/2} to B_1 + // - (periodically) Divergence clean electric field + // - (periodically) Divergence clean magnetic field + // - (periodically) Synchronize shared tang e and norm b + // - Increment the time step + // - Call user diagnostics + // - (periodically) Print a status message +} + +begin_diagnostics { + +} + +begin_particle_injection { + + // No particle injection for this simulation + +} + +begin_current_injection { + + // No current injection for this simulation + +} + +begin_field_injection { + + // No field injection for this simulation + +} + +begin_particle_collisions{ + + // No collisions for this simulation + +} diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt new file mode 100644 index 00000000..bb75b807 --- /dev/null +++ b/test/unit/CMakeLists.txt @@ -0,0 +1,2 @@ +add_subdirectory(particle_push) +add_subdirectory(energy_comparison) diff --git a/test/unit/energy_comparison/CMakeLists.txt b/test/unit/energy_comparison/CMakeLists.txt new file mode 100644 index 00000000..be96860f --- /dev/null +++ b/test/unit/energy_comparison/CMakeLists.txt @@ -0,0 +1,10 @@ +LIST(APPEND TESTS weibel_driver) + +list(APPEND gold_file "${CMAKE_CURRENT_SOURCE_DIR}/energies_gold") +add_definitions(-DGOLD_ENERGY_FILE=${gold_file}) + +foreach (test ${TESTS}) + add_executable(${test} ./${test}.cc) + target_link_libraries(${test} vpic) + add_test(NAME ${test} COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 1 ${MPIEXEC_PREFLAGS} ./${test}) +endforeach(test) diff --git a/test/unit/energy_comparison/compare_energies.h b/test/unit/energy_comparison/compare_energies.h new file mode 100644 index 00000000..a0417613 --- /dev/null +++ b/test/unit/energy_comparison/compare_energies.h @@ -0,0 +1,337 @@ +#include +#include +#include + +#include // epsilon for limit +#include // pair + +#include // TODO: Remove + +namespace test_utils { +/** + * @brief Helper function to write collective errors to file for further analysis + * + * @param errs The vector of all 
errors + * @param field_per_line The number of values to write per file line + */ +void write_error_ouput( std::vector<double> errs, int field_per_line, std::string err_file_base_name) +{ + int counter = 0; + std::ofstream outputFile(err_file_base_name); + + for (auto e : errs) + { + counter++; + outputFile << counter << " " << e*100.0 << " "; // Convert to percent and dump + if (counter % field_per_line == 0) + { + outputFile << std::endl; + } + } + outputFile.close(); +} + +/** + * @brief Helper function to compare numbers and calculate an absolute error + * + * @param A The first value to compare + * @param B The second value to compare + * + * @return The calculated error + */ +double calculate_abs_error(double A, double B) +{ + return std::abs(A-B) / std::min(A,B); +} + +/** + * @brief Helper function to compare numbers and calculate a relative error + * + * @param A The first value to compare + * @param B The second value to compare + * + * @return The calculated error + */ +double calculate_relative_error(double A, double B) +{ + return std::abs(A-B); +} + +/** + * @brief Function to compare errors to a given tolerance, and decide if it's within range + * + * @param A The first value to compare + * @param B The second value to compare + * @param relative_tolerance The relative tolerance to use when comparing + * + * @return A pair containing true/false if it's within tolerance, and the calculated error + */ +std::pair<bool, double> compare_error(double A, double B, double relative_tolerance) +{ + bool within_tol = false; + double err = 0.0; + + // Right now this is pretty arbitrary.. + double abs_threshhold = 10 * std::numeric_limits<double>::epsilon(); + + // Calculate if we're within tolerances + // If we're close to zero, do absolute + if (std::abs(std::min(A,B)) < abs_threshhold) + { + err = calculate_relative_error(A, B); + + // Finding a relative error to 0 doesn't make much + // sense, so let's do absolute error instead + if ( err < std::numeric_limits<double>::epsilon() ) + { + within_tol = true; + } + else { + within_tol = false; + } + } + else { // Do absolute error + + err = calculate_abs_error(A, B); + + if (err < relative_tolerance) + { + within_tol = true; + } + else { + within_tol = false; + } + } + return { within_tol, err }; +} + +enum FIELD_ENUM { + Individual = 0, // Track each field individually + Sum // Sum the masked fields +}; + +/** + * @brief Function to compare the contents of two energy files + * + * @param file_a First file to compare + * @param file_b Second file to compare + * @param relative_tolerance Relative tolerance which is acceptable + * @param field_mask A mask to specify which fields in the file to use + * @param field_enum A flag to specify whether masked fields are compared individually or summed + * @param write_err_output If you should write the error output to a file + * @param err_file_base_name Base filename for writing output + * @param num_lines_to_skip The number of lines to skip into the file + * + * @NOTE A typical energy file is: + * + * and the bit maps go accordingly with being the LSB. + * A mask for b fields only would be 0x000001110 + * + * @NOTE We could use bitsets for the masking but they're generally slower + * + * @return True if they match (within tol), false if not + */ +bool compare_energies( + const std::string file_a, + const std::string file_b, + const double relative_tolerance, + const unsigned short field_mask = 0b1111111111111111, /// short has 16 bits, assume all are true + const FIELD_ENUM field_enum = FIELD_ENUM::Individual, /// compare masked fields individually or as a sum + const int write_err_ouput = 0, // If the run should dump the errors to disk + const std::string err_file_base_name = "err.out", // File name to write errors to + const int num_lines_to_skip = 0 // Most energy files have 3 lines of padding +) +{ + // TODO: I could easily have a policy here based on the type of the field_mask + std::vector<double> errs; + + const int DEFAULT_FILED_COUNT = 7; + + unsigned short agg_total = 0; + unsigned short v = field_mask; + // Count set bits + for (agg_total = 0; v; agg_total++) + { + v &= v - 1; // clear the least significant bit set + } + + try { + + bool match = true; + + std::string line1 = ""; + std::string line2 = ""; + + std::ifstream f1 (file_a); + std::ifstream f2 (file_b); + + double max_err = 0.0; + int max_err_line = -1; + + // This is for counting the number of tokens on a line (changes + // based on number of species). It can likely be done much better + int line_token_count = 0; + + if (!f1.is_open()) + { + std::cerr << "Unable to open file f1 " << file_a << std::endl; + return false; + } + else if (!f2.is_open()) + { + std::cerr << "Unable to open file f2 " << file_b << std::endl; + return false; + } + else // Perform test + { + + // Perform skipping + for (int i = 0; i < num_lines_to_skip; i++) + { + getline(f1,line1); + getline(f2,line2); + } + + int counter = num_lines_to_skip; + + // Do processing + while ( getline(f1,line1) ) + { + getline(f2,line2); + + // Tokenize lines + std::stringstream linestream1(line1); + std::string item1; + + std::stringstream linestream2(line2); + std::string item2; + + int used_line_token_count = 0; + int total_line_token_count = 0; + + double sum_A = 0.0; + double sum_B = 0.0; + std::pair<bool, double> returned_err; + returned_err.second = -1.0; // set a dummy value to show uninit + + int agg_count = 0; + while (getline(linestream1, item1, ' ')) + { + bool write_this_err_ouput = write_err_ouput; + //std::cout << "Setting write_this_err_ouput to " << write_this_err_ouput << std::endl; + + getline(linestream2, item2, ' '); + total_line_token_count++; + + // Use this field + //std::cout << "this_line " << this_line_token_count << " mask " << field_mask << std::endl; + + // Take the value one, and shift it to generate the mask to compare + unsigned short this_line_token_mask = 1 << (total_line_token_count - 1); // Set correct highest bit on + //this_line_token_mask |= this_line_token_mask-1; // Set lower bits on + + // If this field is within our requested mask, use it + if (this_line_token_mask & field_mask) + { + used_line_token_count++; + //std::cout << "Parsing field " << used_line_token_count << " val " << item1 << std::endl; + + double A = std::stod(item1); + double B = std::stod(item2); + + if ( + (field_enum == FIELD_ENUM::Sum) && // Need to aggregate + (agg_count < agg_total) // Not done aggregating yet + ) + { + // Need to aggregate..
+ sum_A += A; + sum_B += B; + agg_count++; + + //std::cout << "sum a " << sum_A << " += " << A << std::endl; + //std::cout << "sum b " << sum_B << " += " << B << std::endl; + + // Don't write this particular one + write_this_err_ouput = false; + + if (agg_count == agg_total) { // final_aggregation + returned_err = compare_error(sum_A, sum_B, relative_tolerance); + write_this_err_ouput = true; + } + } + else // We can just compare this val + { + + returned_err = compare_error(A, B, relative_tolerance); + + } + + if (returned_err.second != -1.0) // Has some value set + { + bool returned_match = returned_err.first; + + if (!returned_match) { + match = false; + } + + double err = returned_err.second; + + // Track max absolute error + if (err > max_err) + { + max_err = err; + max_err_line = counter; + } + + + // If we track the errors, track this one + if (write_this_err_ouput) + { + errs.push_back(err); + } + } + } + else { + //std::cout << "Skipping field " << this_line_token_mask << " val " << item1 << std::endl; + } + } + line_token_count = used_line_token_count; + counter++; + } + + f1.close(); + f2.close(); + } + + //std::cout << "Field mask : " << field_mask << std::endl; + //std::cout << "Fields used : " << line_token_count << std::endl; + + std::cout << "Max found err was " << max_err*100 << "% on line " << max_err_line << " (Threshold: " << + relative_tolerance*100 << "%)" << std::endl; + + if (write_err_ouput) + { + int err_per_line = line_token_count; + if (field_enum == FIELD_ENUM::Sum) // Need to aggregate + { + err_per_line /= agg_total; // Reduce by aggregation factor + } + + std::cout << "Writing error output " << errs.size() << std::endl; + write_error_ouput( errs, err_per_line, err_file_base_name); + } + + + return match; + } + catch (const std::exception &exc) // Catching all is bad form, but OK for now.. 
+ { + // catch anything thrown within try block that derives from std::exception + std::cerr << exc.what(); + return false; + } + +} + +} // namespace diff --git a/test/unit/energy_comparison/energies_gold b/test/unit/energy_comparison/energies_gold new file mode 100644 index 00000000..e5842cce --- /dev/null +++ b/test/unit/energy_comparison/energies_gold @@ -0,0 +1,701 @@ +0 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1.184982e+02 6.493993e-02 +1 2.114318e-07 2.748296e-06 4.165634e-06 0.000000e+00 2.148136e-06 1.153632e-06 1.184982e+02 6.493579e-02 +2 8.295282e-07 5.955656e-06 3.916506e-06 0.000000e+00 6.172393e-06 2.885251e-06 1.184982e+02 6.493138e-02 +3 1.749687e-06 6.462033e-06 6.261195e-06 0.000000e+00 3.832411e-06 6.372292e-06 1.184982e+02 6.492944e-02 +4 2.958083e-06 7.773431e-06 5.334200e-06 0.000000e+00 3.181687e-06 8.651884e-06 1.184982e+02 6.492829e-02 +5 4.393735e-06 1.143827e-05 9.035736e-06 0.000000e+00 3.038687e-06 6.426210e-06 1.184982e+02 6.492813e-02 +6 6.096144e-06 1.484543e-05 9.051144e-06 0.000000e+00 5.724866e-06 3.549470e-06 1.184982e+02 6.492780e-02 +7 7.929523e-06 1.520083e-05 1.202037e-05 0.000000e+00 2.797590e-06 3.021667e-06 1.184982e+02 6.493011e-02 +8 9.712584e-06 1.553720e-05 1.108577e-05 0.000000e+00 1.977976e-06 2.976940e-06 1.184982e+02 6.493329e-02 +9 1.153069e-05 1.947669e-05 1.693664e-05 0.000000e+00 1.995379e-06 2.581437e-06 1.184982e+02 6.493407e-02 +10 1.307797e-05 2.160053e-05 1.642253e-05 0.000000e+00 5.323156e-06 3.715039e-06 1.184982e+02 6.493514e-02 +11 1.425408e-05 2.028152e-05 1.776440e-05 0.000000e+00 3.602827e-06 6.519449e-06 1.184982e+02 6.493978e-02 +12 1.518056e-05 2.007149e-05 1.598566e-05 0.000000e+00 2.949663e-06 9.395593e-06 1.184981e+02 6.494548e-02 +13 1.567814e-05 2.164696e-05 1.805553e-05 0.000000e+00 3.513625e-06 7.419815e-06 1.184981e+02 6.495274e-02 +14 1.569166e-05 2.231079e-05 1.611286e-05 0.000000e+00 5.318751e-06 4.131231e-06 1.184981e+02 6.496359e-02 +15 1.505206e-05 2.036765e-05 1.678849e-05 0.000000e+00 2.259511e-06 1.995403e-06 1.184981e+02 6.497721e-02 +16 1.399251e-05 1.869547e-05 1.367504e-05 0.000000e+00 5.199316e-07 8.195201e-07 1.184981e+02 6.499133e-02 +17 1.249974e-05 1.924676e-05 1.544571e-05 0.000000e+00 8.085756e-07 1.394086e-06 1.184981e+02 6.500123e-02 +18 1.073298e-05 1.715052e-05 1.095502e-05 0.000000e+00 5.231911e-06 4.621071e-06 1.184981e+02 6.500884e-02 +19 8.758880e-06 1.234872e-05 9.518768e-06 0.000000e+00 4.439173e-06 8.381782e-06 1.184981e+02 6.501846e-02 +20 7.054467e-06 9.272719e-06 6.591506e-06 0.000000e+00 3.697312e-06 1.044818e-05 1.184981e+02 6.502863e-02 +21 5.824408e-06 7.654984e-06 7.165079e-06 0.000000e+00 2.821531e-06 8.677758e-06 1.184981e+02 6.503871e-02 +22 5.106047e-06 5.642151e-06 4.509504e-06 0.000000e+00 4.625907e-06 4.916184e-06 1.184981e+02 6.504632e-02 +23 4.765343e-06 2.987584e-06 4.786002e-06 0.000000e+00 2.935849e-06 4.075174e-06 1.184981e+02 6.505213e-02 +24 5.030705e-06 1.906972e-06 2.389091e-06 0.000000e+00 2.374656e-06 3.427287e-06 1.184981e+02 6.505607e-02 +25 5.792468e-06 3.800821e-06 4.299995e-06 0.000000e+00 1.993837e-06 3.713679e-06 1.184981e+02 6.505483e-02 +26 6.730379e-06 4.854365e-06 1.817357e-06 0.000000e+00 4.371554e-06 5.440971e-06 1.184981e+02 6.505128e-02 +27 7.921957e-06 5.114067e-06 3.678592e-06 0.000000e+00 3.968256e-06 9.192113e-06 1.184981e+02 6.504590e-02 +28 8.767811e-06 7.314804e-06 4.691720e-06 0.000000e+00 3.469039e-06 1.144050e-05 1.184981e+02 6.503975e-02 +29 9.441552e-06 9.970639e-06 7.684310e-06 0.000000e+00 
3.348704e-06 9.854202e-06 1.184981e+02 6.503242e-02 +30 1.010771e-05 1.181095e-05 7.852642e-06 0.000000e+00 4.349028e-06 5.856208e-06 1.184981e+02 6.502560e-02 +31 1.076089e-05 1.269817e-05 1.050850e-05 0.000000e+00 2.420436e-06 3.194498e-06 1.184981e+02 6.501920e-02 +32 1.134675e-05 1.389674e-05 1.088471e-05 0.000000e+00 1.336126e-06 2.360580e-06 1.184981e+02 6.501092e-02 +33 1.176388e-05 1.646795e-05 1.377408e-05 0.000000e+00 1.103848e-06 2.713855e-06 1.184981e+02 6.500045e-02 +34 1.215815e-05 1.836085e-05 1.374047e-05 0.000000e+00 4.446045e-06 6.522613e-06 1.184981e+02 6.498804e-02 +35 1.240326e-05 1.943054e-05 1.675348e-05 0.000000e+00 4.361800e-06 1.068783e-05 1.184981e+02 6.497749e-02 +36 1.242936e-05 2.098746e-05 1.761356e-05 0.000000e+00 4.108722e-06 1.294349e-05 1.184981e+02 6.496898e-02 +37 1.239419e-05 2.224936e-05 1.818587e-05 0.000000e+00 3.476282e-06 1.092535e-05 1.184981e+02 6.496343e-02 +38 1.213073e-05 2.264307e-05 1.658901e-05 0.000000e+00 3.429445e-06 7.036580e-06 1.184981e+02 6.496005e-02 +39 1.162989e-05 2.243661e-05 1.690455e-05 0.000000e+00 3.012411e-06 5.049023e-06 1.184981e+02 6.495737e-02 +40 1.103116e-05 2.088681e-05 1.519252e-05 0.000000e+00 3.360401e-06 5.015700e-06 1.184982e+02 6.495483e-02 +41 1.036127e-05 2.005796e-05 1.479215e-05 0.000000e+00 2.812548e-06 5.024893e-06 1.184982e+02 6.495301e-02 +42 9.744529e-06 1.833048e-05 1.284700e-05 0.000000e+00 3.180777e-06 7.582183e-06 1.184982e+02 6.495162e-02 +43 9.263170e-06 1.491333e-05 1.231457e-05 0.000000e+00 3.352863e-06 1.128084e-05 1.184982e+02 6.495069e-02 +44 8.908231e-06 1.109706e-05 9.750280e-06 0.000000e+00 4.022006e-06 1.335878e-05 1.184982e+02 6.495083e-02 +45 8.689926e-06 7.993414e-06 7.228397e-06 0.000000e+00 3.728930e-06 1.194650e-05 1.184982e+02 6.495284e-02 +46 8.623843e-06 6.257969e-06 5.142891e-06 0.000000e+00 3.181845e-06 7.491831e-06 1.184982e+02 6.495683e-02 +47 8.544442e-06 5.039086e-06 4.940162e-06 0.000000e+00 2.548429e-06 4.732586e-06 1.184982e+02 6.495949e-02 +48 8.658175e-06 3.665292e-06 3.700680e-06 0.000000e+00 2.780130e-06 4.194308e-06 1.184982e+02 6.496131e-02 +49 8.932635e-06 4.188016e-06 3.218705e-06 0.000000e+00 2.596445e-06 4.997936e-06 1.184982e+02 6.496167e-02 +50 9.354697e-06 5.983561e-06 3.264117e-06 0.000000e+00 3.669311e-06 8.454366e-06 1.184982e+02 6.496033e-02 +51 9.564652e-06 5.983758e-06 3.808870e-06 0.000000e+00 4.277975e-06 1.334353e-05 1.184982e+02 6.495947e-02 +52 9.973970e-06 5.763224e-06 3.441982e-06 0.000000e+00 4.647834e-06 1.573895e-05 1.184982e+02 6.495831e-02 +53 1.077490e-05 6.908530e-06 3.979456e-06 0.000000e+00 4.941954e-06 1.371142e-05 1.184982e+02 6.495753e-02 +54 1.162351e-05 9.526445e-06 6.372980e-06 0.000000e+00 3.779930e-06 9.804602e-06 1.184982e+02 6.495901e-02 +55 1.230223e-05 1.102725e-05 8.923143e-06 0.000000e+00 3.660578e-06 7.864807e-06 1.184982e+02 6.496026e-02 +56 1.315551e-05 1.126178e-05 1.038434e-05 0.000000e+00 4.881735e-06 7.615148e-06 1.184982e+02 6.496014e-02 +57 1.400204e-05 1.378678e-05 1.140362e-05 0.000000e+00 4.702933e-06 8.384719e-06 1.184981e+02 6.495994e-02 +58 1.473320e-05 1.741066e-05 1.400107e-05 0.000000e+00 3.461314e-06 1.059767e-05 1.184981e+02 6.496023e-02 +59 1.544506e-05 1.860019e-05 1.472486e-05 0.000000e+00 4.101008e-06 1.511014e-05 1.184981e+02 6.496108e-02 +60 1.604218e-05 1.927665e-05 1.536366e-05 0.000000e+00 4.819529e-06 1.692470e-05 1.184981e+02 6.496370e-02 +61 1.622100e-05 2.197242e-05 1.644278e-05 0.000000e+00 4.980495e-06 1.462011e-05 1.184981e+02 6.496983e-02 +62 1.617461e-05 2.488059e-05 1.859620e-05 
0.000000e+00 3.688920e-06 1.079571e-05 1.184981e+02 6.497799e-02 +63 1.556817e-05 2.435463e-05 1.813104e-05 0.000000e+00 4.303580e-06 8.168597e-06 1.184981e+02 6.498631e-02 +64 1.464078e-05 2.173178e-05 1.673341e-05 0.000000e+00 5.550074e-06 8.098364e-06 1.184981e+02 6.499464e-02 +65 1.350230e-05 2.097137e-05 1.468624e-05 0.000000e+00 5.943262e-06 9.957077e-06 1.184981e+02 6.500331e-02 +66 1.234629e-05 2.038444e-05 1.517590e-05 0.000000e+00 4.901240e-06 1.321150e-05 1.184981e+02 6.501291e-02 +67 1.101322e-05 1.587321e-05 1.234141e-05 0.000000e+00 5.606571e-06 1.868974e-05 1.184981e+02 6.502270e-02 +68 9.583259e-06 1.171974e-05 1.093836e-05 0.000000e+00 6.338473e-06 2.174756e-05 1.184981e+02 6.503183e-02 +69 8.165632e-06 1.060355e-05 9.458377e-06 0.000000e+00 7.027531e-06 1.907472e-05 1.184981e+02 6.504153e-02 +70 6.959553e-06 9.833284e-06 9.044541e-06 0.000000e+00 6.519257e-06 1.549005e-05 1.184981e+02 6.505126e-02 +71 5.815111e-06 7.023828e-06 5.706147e-06 0.000000e+00 7.206372e-06 1.345502e-05 1.184981e+02 6.506082e-02 +72 5.166924e-06 4.247131e-06 3.958775e-06 0.000000e+00 7.761976e-06 1.267580e-05 1.184981e+02 6.506843e-02 +73 4.803115e-06 5.674442e-06 2.346622e-06 0.000000e+00 8.405699e-06 1.362215e-05 1.184981e+02 6.507111e-02 +74 5.039940e-06 7.274602e-06 4.515458e-06 0.000000e+00 5.938402e-06 1.647480e-05 1.184981e+02 6.507079e-02 +75 5.775290e-06 4.951154e-06 2.313985e-06 0.000000e+00 6.537798e-06 2.140251e-05 1.184981e+02 6.506838e-02 +76 6.624293e-06 3.872294e-06 3.426980e-06 0.000000e+00 6.993488e-06 2.334551e-05 1.184981e+02 6.506541e-02 +77 7.635322e-06 6.004856e-06 4.108564e-06 0.000000e+00 7.268501e-06 2.025680e-05 1.184981e+02 6.506314e-02 +78 8.875641e-06 7.552754e-06 6.178997e-06 0.000000e+00 6.945930e-06 1.638347e-05 1.184981e+02 6.505877e-02 +79 1.026827e-05 7.302621e-06 5.386863e-06 0.000000e+00 9.019517e-06 1.406337e-05 1.184981e+02 6.505133e-02 +80 1.195132e-05 8.100085e-06 7.947816e-06 0.000000e+00 9.807945e-06 1.449388e-05 1.184981e+02 6.504185e-02 +81 1.349678e-05 1.271949e-05 9.544657e-06 0.000000e+00 1.023278e-05 1.646793e-05 1.184981e+02 6.503186e-02 +82 1.476121e-05 1.660877e-05 1.430818e-05 0.000000e+00 8.156377e-06 2.084803e-05 1.184981e+02 6.502117e-02 +83 1.566660e-05 1.650345e-05 1.257526e-05 0.000000e+00 8.947742e-06 2.693621e-05 1.184981e+02 6.501116e-02 +84 1.600851e-05 1.827601e-05 1.518740e-05 0.000000e+00 8.535914e-06 2.949201e-05 1.184981e+02 6.500263e-02 +85 1.593330e-05 2.231372e-05 1.585441e-05 0.000000e+00 1.014545e-05 2.681005e-05 1.184981e+02 6.499477e-02 +86 1.547755e-05 2.391857e-05 1.762964e-05 0.000000e+00 1.034006e-05 2.246395e-05 1.184981e+02 6.498974e-02 +87 1.478723e-05 2.283182e-05 1.608284e-05 0.000000e+00 1.200172e-05 1.946662e-05 1.184981e+02 6.498744e-02 +88 1.376564e-05 2.194535e-05 1.798900e-05 0.000000e+00 1.156620e-05 1.868497e-05 1.184981e+02 6.498555e-02 +89 1.258873e-05 2.250807e-05 1.675023e-05 0.000000e+00 1.215965e-05 1.986826e-05 1.184981e+02 6.498258e-02 +90 1.129421e-05 2.081833e-05 1.778534e-05 0.000000e+00 9.441940e-06 2.370037e-05 1.184981e+02 6.497870e-02 +91 9.902586e-06 1.569493e-05 1.239515e-05 0.000000e+00 9.804146e-06 2.942611e-05 1.184981e+02 6.497579e-02 +92 8.334670e-06 1.349396e-05 1.277649e-05 0.000000e+00 9.552206e-06 3.088699e-05 1.184981e+02 6.497600e-02 +93 6.616452e-06 1.370592e-05 1.067496e-05 0.000000e+00 1.044417e-05 2.763205e-05 1.184981e+02 6.497955e-02 +94 5.380042e-06 1.189486e-05 1.012293e-05 0.000000e+00 1.098050e-05 2.339315e-05 1.184981e+02 6.498347e-02 +95 4.723974e-06 9.209914e-06 
6.795317e-06 0.000000e+00 1.371696e-05 2.124474e-05 1.184981e+02 6.498499e-02 +96 4.300907e-06 7.963299e-06 7.318347e-06 0.000000e+00 1.432005e-05 2.175061e-05 1.184981e+02 6.498451e-02 +97 4.267742e-06 8.188468e-06 4.238744e-06 0.000000e+00 1.448553e-05 2.463766e-05 1.184981e+02 6.498309e-02 +98 4.857752e-06 6.681046e-06 4.673723e-06 0.000000e+00 1.174225e-05 2.992827e-05 1.184981e+02 6.498084e-02 +99 5.995379e-06 3.640535e-06 5.063447e-07 0.000000e+00 1.282934e-05 3.679356e-05 1.184981e+02 6.497759e-02 +100 7.860662e-06 3.560924e-06 3.582564e-06 0.000000e+00 1.156355e-05 3.888639e-05 1.184981e+02 6.497466e-02 +101 9.860412e-06 5.059921e-06 3.300192e-06 0.000000e+00 1.362165e-05 3.555205e-05 1.184981e+02 6.497239e-02 +102 1.153142e-05 4.883432e-06 5.296063e-06 0.000000e+00 1.462104e-05 3.130236e-05 1.184981e+02 6.497110e-02 +103 1.289927e-05 5.061299e-06 4.567073e-06 0.000000e+00 1.641775e-05 2.784097e-05 1.184981e+02 6.497177e-02 +104 1.389964e-05 7.065030e-06 7.788923e-06 0.000000e+00 1.607232e-05 2.768081e-05 1.184981e+02 6.497099e-02 +105 1.448034e-05 1.029706e-05 7.082680e-06 0.000000e+00 1.654359e-05 2.890398e-05 1.184981e+02 6.496786e-02 +106 1.480399e-05 1.307918e-05 1.081692e-05 0.000000e+00 1.333789e-05 3.331039e-05 1.184981e+02 6.496443e-02 +107 1.494903e-05 1.490932e-05 1.001407e-05 0.000000e+00 1.330480e-05 3.838874e-05 1.184981e+02 6.496218e-02 +108 1.474430e-05 1.844622e-05 1.511521e-05 0.000000e+00 1.209890e-05 4.002863e-05 1.184981e+02 6.496259e-02 +109 1.404679e-05 2.177889e-05 1.510434e-05 0.000000e+00 1.394179e-05 3.698895e-05 1.184981e+02 6.496668e-02 +110 1.308282e-05 2.248792e-05 1.698198e-05 0.000000e+00 1.524973e-05 3.347804e-05 1.184981e+02 6.497317e-02 +111 1.215977e-05 2.252644e-05 1.589239e-05 0.000000e+00 1.839191e-05 3.188696e-05 1.184981e+02 6.497918e-02 +112 1.132240e-05 2.208240e-05 1.786404e-05 0.000000e+00 1.821354e-05 3.252581e-05 1.184981e+02 6.498450e-02 +113 1.045024e-05 2.148506e-05 1.565670e-05 0.000000e+00 1.780138e-05 3.611192e-05 1.184981e+02 6.498976e-02 +114 9.518518e-06 2.039419e-05 1.762115e-05 0.000000e+00 1.527215e-05 4.183031e-05 1.184981e+02 6.499482e-02 +115 8.460304e-06 1.802592e-05 1.462402e-05 0.000000e+00 1.606040e-05 4.877663e-05 1.184981e+02 6.500168e-02 +116 7.610501e-06 1.657575e-05 1.591295e-05 0.000000e+00 1.458227e-05 5.037100e-05 1.184980e+02 6.501011e-02 +117 6.991693e-06 1.554243e-05 1.180681e-05 0.000000e+00 1.612175e-05 4.654916e-05 1.184980e+02 6.502038e-02 +118 6.588638e-06 1.409423e-05 1.090583e-05 0.000000e+00 1.761966e-05 4.193850e-05 1.184980e+02 6.503195e-02 +119 6.341682e-06 1.248385e-05 7.932327e-06 0.000000e+00 2.005455e-05 3.956180e-05 1.184980e+02 6.504314e-02 +120 6.215525e-06 1.048741e-05 8.019719e-06 0.000000e+00 1.918933e-05 3.961565e-05 1.184980e+02 6.505234e-02 +121 6.206110e-06 8.600038e-06 4.690203e-06 0.000000e+00 1.893273e-05 4.140344e-05 1.184980e+02 6.505803e-02 +122 6.430191e-06 7.413793e-06 6.034761e-06 0.000000e+00 1.585688e-05 4.465298e-05 1.184980e+02 6.506023e-02 +123 6.748747e-06 4.792934e-06 2.883819e-06 0.000000e+00 1.654489e-05 4.903315e-05 1.184980e+02 6.506081e-02 +124 7.635272e-06 2.830598e-06 3.860691e-06 0.000000e+00 1.537727e-05 5.072659e-05 1.184980e+02 6.506146e-02 +125 8.812719e-06 2.958521e-06 1.307768e-06 0.000000e+00 1.663870e-05 4.885172e-05 1.184980e+02 6.506187e-02 +126 1.028769e-05 3.895842e-06 3.502594e-06 0.000000e+00 1.859817e-05 4.702726e-05 1.184980e+02 6.506083e-02 +127 1.180471e-05 4.826407e-06 3.886297e-06 0.000000e+00 2.135167e-05 4.647095e-05 1.184980e+02 
6.505729e-02 +128 1.344129e-05 5.783837e-06 6.319783e-06 0.000000e+00 2.132243e-05 4.750129e-05 1.184980e+02 6.505071e-02 +129 1.502819e-05 8.521960e-06 6.099748e-06 0.000000e+00 2.140093e-05 5.086682e-05 1.184980e+02 6.504051e-02 +130 1.610554e-05 1.248035e-05 9.798755e-06 0.000000e+00 1.825660e-05 5.635497e-05 1.184980e+02 6.502980e-02 +131 1.668697e-05 1.425726e-05 9.245042e-06 0.000000e+00 1.853694e-05 6.305256e-05 1.184980e+02 6.501928e-02 +132 1.657435e-05 1.595176e-05 1.210818e-05 0.000000e+00 1.790768e-05 6.458068e-05 1.184980e+02 6.501086e-02 +133 1.588091e-05 1.940286e-05 1.240770e-05 0.000000e+00 1.909093e-05 5.989742e-05 1.184980e+02 6.500578e-02 +134 1.516632e-05 2.187945e-05 1.614019e-05 0.000000e+00 2.075387e-05 5.576763e-05 1.184980e+02 6.500198e-02 +135 1.426317e-05 2.161762e-05 1.664197e-05 0.000000e+00 2.262219e-05 5.425004e-05 1.184980e+02 6.499907e-02 +136 1.313061e-05 2.014295e-05 1.744943e-05 0.000000e+00 2.184530e-05 5.507642e-05 1.184980e+02 6.499438e-02 +137 1.165515e-05 2.020463e-05 1.622897e-05 0.000000e+00 2.102136e-05 5.707687e-05 1.184980e+02 6.498758e-02 +138 1.027969e-05 2.122141e-05 1.790209e-05 0.000000e+00 1.876050e-05 5.870523e-05 1.184981e+02 6.498107e-02 +139 9.151161e-06 1.929146e-05 1.543646e-05 0.000000e+00 1.988921e-05 6.197194e-05 1.184981e+02 6.497554e-02 +140 8.401576e-06 1.785713e-05 1.541897e-05 0.000000e+00 1.928642e-05 6.404110e-05 1.184981e+02 6.497144e-02 +141 7.969762e-06 1.907077e-05 1.379573e-05 0.000000e+00 1.994505e-05 6.355704e-05 1.184981e+02 6.496934e-02 +142 7.323910e-06 1.868470e-05 1.414345e-05 0.000000e+00 2.159843e-05 6.414481e-05 1.184981e+02 6.496863e-02 +143 7.101512e-06 1.524429e-05 1.074126e-05 0.000000e+00 2.455412e-05 6.545456e-05 1.184981e+02 6.496834e-02 +144 7.193146e-06 1.134943e-05 8.465925e-06 0.000000e+00 2.530894e-05 6.718651e-05 1.184981e+02 6.496633e-02 +145 7.847141e-06 1.010744e-05 6.094936e-06 0.000000e+00 2.555471e-05 6.981448e-05 1.184981e+02 6.496189e-02 +146 8.881358e-06 9.051650e-06 6.319833e-06 0.000000e+00 2.308835e-05 7.434114e-05 1.184981e+02 6.495808e-02 +147 1.011800e-05 4.825424e-06 3.851280e-06 0.000000e+00 2.181072e-05 7.954680e-05 1.184981e+02 6.495623e-02 +148 1.151227e-05 2.327883e-06 3.519499e-06 0.000000e+00 2.158597e-05 8.074635e-05 1.184981e+02 6.495501e-02 +149 1.286553e-05 3.844424e-06 3.265090e-06 0.000000e+00 2.378478e-05 7.692219e-05 1.184981e+02 6.495497e-02 +150 1.398386e-05 4.214553e-06 3.877557e-06 0.000000e+00 2.542661e-05 7.373807e-05 1.184981e+02 6.495734e-02 +151 1.494346e-05 2.747631e-06 2.450232e-06 0.000000e+00 2.645267e-05 7.378929e-05 1.184981e+02 6.496010e-02 +152 1.551962e-05 3.703921e-06 3.093129e-06 0.000000e+00 2.661219e-05 7.506217e-05 1.184981e+02 6.496011e-02 +153 1.569375e-05 7.998523e-06 5.070456e-06 0.000000e+00 2.561213e-05 7.598636e-05 1.184981e+02 6.495828e-02 +154 1.527651e-05 1.169593e-05 7.926117e-06 0.000000e+00 2.417678e-05 7.698790e-05 1.184981e+02 6.495613e-02 +155 1.480520e-05 1.186157e-05 8.472266e-06 0.000000e+00 2.395509e-05 7.942609e-05 1.184981e+02 6.495463e-02 +156 1.443039e-05 1.371435e-05 1.006767e-05 0.000000e+00 2.408586e-05 8.148187e-05 1.184981e+02 6.495354e-02 +157 1.364542e-05 1.811526e-05 1.252849e-05 0.000000e+00 2.603951e-05 8.270129e-05 1.184981e+02 6.495306e-02 +158 1.305798e-05 1.934681e-05 1.394184e-05 0.000000e+00 2.824983e-05 8.545799e-05 1.184980e+02 6.495431e-02 +159 1.248267e-05 1.787404e-05 1.407376e-05 0.000000e+00 2.967717e-05 8.849301e-05 1.184980e+02 6.495697e-02 +160 1.205226e-05 1.854096e-05 1.577172e-05 
0.000000e+00 3.082631e-05 9.004262e-05 1.184980e+02 6.495907e-02 +161 1.188758e-05 2.121659e-05 1.781086e-05 0.000000e+00 3.124367e-05 9.142901e-05 1.184980e+02 6.496041e-02 +162 1.180356e-05 2.166630e-05 1.798904e-05 0.000000e+00 2.935305e-05 9.370242e-05 1.184980e+02 6.496406e-02 +163 1.214516e-05 1.954199e-05 1.651676e-05 0.000000e+00 2.665216e-05 9.733222e-05 1.184980e+02 6.497057e-02 +164 1.272012e-05 1.971176e-05 1.537534e-05 0.000000e+00 2.666685e-05 9.763183e-05 1.184980e+02 6.497817e-02 +165 1.337472e-05 2.181861e-05 1.573549e-05 0.000000e+00 2.961933e-05 9.471334e-05 1.184980e+02 6.498604e-02 +166 1.407344e-05 2.005031e-05 1.381797e-05 0.000000e+00 3.241843e-05 9.296139e-05 1.184980e+02 6.499629e-02 +167 1.499478e-05 1.593676e-05 1.188485e-05 0.000000e+00 3.268202e-05 9.375230e-05 1.184980e+02 6.500691e-02 +168 1.576771e-05 1.386851e-05 1.062976e-05 0.000000e+00 3.217885e-05 9.465565e-05 1.184980e+02 6.501580e-02 +169 1.652889e-05 1.259293e-05 9.910030e-06 0.000000e+00 3.316532e-05 9.531114e-05 1.184980e+02 6.502139e-02 +170 1.661954e-05 9.061469e-06 6.763577e-06 0.000000e+00 3.295240e-05 9.679586e-05 1.184980e+02 6.502619e-02 +171 1.659207e-05 4.743582e-06 4.765769e-06 0.000000e+00 2.936072e-05 9.916072e-05 1.184980e+02 6.503157e-02 +172 1.645079e-05 3.804639e-06 3.164678e-06 0.000000e+00 3.059698e-05 1.025897e-04 1.184980e+02 6.503324e-02 +173 1.631701e-05 4.984278e-06 4.604168e-06 0.000000e+00 3.373130e-05 1.046973e-04 1.184980e+02 6.503347e-02 +174 1.632076e-05 3.866597e-06 2.861987e-06 0.000000e+00 3.766186e-05 1.088662e-04 1.184980e+02 6.503216e-02 +175 1.640694e-05 2.752606e-06 3.063587e-06 0.000000e+00 3.745626e-05 1.134271e-04 1.184980e+02 6.503011e-02 +176 1.624584e-05 4.806657e-06 3.302024e-06 0.000000e+00 3.776848e-05 1.152802e-04 1.184980e+02 6.502625e-02 +177 1.613666e-05 7.510312e-06 4.716636e-06 0.000000e+00 3.915788e-05 1.166883e-04 1.184980e+02 6.501885e-02 +178 1.596136e-05 8.749908e-06 4.153924e-06 0.000000e+00 3.813228e-05 1.196055e-04 1.184980e+02 6.501132e-02 +179 1.596574e-05 9.633099e-06 6.631713e-06 0.000000e+00 3.400896e-05 1.217066e-04 1.184980e+02 6.500567e-02 +180 1.607998e-05 1.208214e-05 8.015670e-06 0.000000e+00 3.367470e-05 1.223222e-04 1.184980e+02 6.499865e-02 +181 1.625294e-05 1.513687e-05 1.205184e-05 0.000000e+00 3.667846e-05 1.201889e-04 1.184980e+02 6.498879e-02 +182 1.623981e-05 1.493873e-05 1.158159e-05 0.000000e+00 4.073605e-05 1.184452e-04 1.184980e+02 6.498055e-02 +183 1.607395e-05 1.506021e-05 1.370484e-05 0.000000e+00 3.960438e-05 1.202295e-04 1.184980e+02 6.497314e-02 +184 1.592726e-05 1.740680e-05 1.458307e-05 0.000000e+00 3.935333e-05 1.211136e-04 1.184980e+02 6.496347e-02 +185 1.559330e-05 1.957744e-05 1.635205e-05 0.000000e+00 3.956491e-05 1.224481e-04 1.184980e+02 6.495159e-02 +186 1.512386e-05 2.112024e-05 1.566574e-05 0.000000e+00 3.951650e-05 1.262156e-04 1.184980e+02 6.493993e-02 +187 1.447338e-05 2.174998e-05 1.813882e-05 0.000000e+00 3.631313e-05 1.297609e-04 1.184980e+02 6.493007e-02 +188 1.360220e-05 2.250888e-05 1.661892e-05 0.000000e+00 3.726854e-05 1.337804e-04 1.184980e+02 6.492074e-02 +189 1.291824e-05 2.276172e-05 1.772043e-05 0.000000e+00 3.996087e-05 1.369939e-04 1.184980e+02 6.491172e-02 +190 1.244955e-05 2.027587e-05 1.375933e-05 0.000000e+00 4.470140e-05 1.403563e-04 1.184980e+02 6.490440e-02 +191 1.228397e-05 1.756625e-05 1.390743e-05 0.000000e+00 4.465469e-05 1.452348e-04 1.184980e+02 6.489904e-02 +192 1.196174e-05 1.552100e-05 1.140714e-05 0.000000e+00 4.520870e-05 1.473143e-04 1.184980e+02 6.489490e-02 
+193 1.191884e-05 1.292525e-05 1.081647e-05 0.000000e+00 4.490782e-05 1.488781e-04 1.184980e+02 6.489071e-02 +194 1.216907e-05 1.019635e-05 7.940321e-06 0.000000e+00 4.427291e-05 1.515243e-04 1.184980e+02 6.488828e-02 +195 1.251340e-05 7.317615e-06 8.463279e-06 0.000000e+00 4.055276e-05 1.532238e-04 1.184980e+02 6.488897e-02 +196 1.305866e-05 4.983402e-06 4.448139e-06 0.000000e+00 4.073198e-05 1.534340e-04 1.184980e+02 6.489010e-02 +197 1.348361e-05 4.644833e-06 4.811950e-06 0.000000e+00 4.231742e-05 1.512750e-04 1.184980e+02 6.489001e-02 +198 1.416408e-05 4.064728e-06 1.514373e-06 0.000000e+00 4.663548e-05 1.505102e-04 1.184980e+02 6.488978e-02 +199 1.522713e-05 3.972512e-06 3.585000e-06 0.000000e+00 4.666953e-05 1.515950e-04 1.184980e+02 6.489043e-02 +200 1.640793e-05 5.111195e-06 2.513281e-06 0.000000e+00 4.675708e-05 1.532483e-04 1.184980e+02 6.488917e-02 +201 1.714572e-05 6.344286e-06 4.196244e-06 0.000000e+00 4.525965e-05 1.553465e-04 1.184980e+02 6.488562e-02 +202 1.777088e-05 8.089202e-06 3.856276e-06 0.000000e+00 4.508367e-05 1.596605e-04 1.184980e+02 6.488126e-02 +203 1.802502e-05 8.518893e-06 7.007988e-06 0.000000e+00 4.372474e-05 1.654124e-04 1.184980e+02 6.487652e-02 +204 1.781432e-05 8.740743e-06 5.163865e-06 0.000000e+00 4.605821e-05 1.695040e-04 1.184980e+02 6.487274e-02 +205 1.704451e-05 1.121824e-05 9.225617e-06 0.000000e+00 4.638470e-05 1.726281e-04 1.184980e+02 6.487005e-02 +206 1.622760e-05 1.308446e-05 9.355849e-06 0.000000e+00 4.978474e-05 1.755455e-04 1.184980e+02 6.486867e-02 +207 1.556391e-05 1.417920e-05 1.370283e-05 0.000000e+00 5.092474e-05 1.795565e-04 1.184980e+02 6.486848e-02 +208 1.474437e-05 1.594397e-05 1.335671e-05 0.000000e+00 5.242321e-05 1.816820e-04 1.184980e+02 6.486977e-02 +209 1.360105e-05 1.863988e-05 1.569783e-05 0.000000e+00 5.018827e-05 1.827610e-04 1.184980e+02 6.487233e-02 +210 1.265653e-05 2.160609e-05 1.551244e-05 0.000000e+00 4.850402e-05 1.846486e-04 1.184980e+02 6.487737e-02 +211 1.171881e-05 2.218478e-05 1.790504e-05 0.000000e+00 4.651917e-05 1.862353e-04 1.184980e+02 6.488503e-02 +212 1.083113e-05 2.187168e-05 1.496384e-05 0.000000e+00 4.760518e-05 1.865178e-04 1.184980e+02 6.489377e-02 +213 1.009088e-05 2.335240e-05 1.793830e-05 0.000000e+00 4.654069e-05 1.838709e-04 1.184980e+02 6.490313e-02 +214 9.233927e-06 2.246273e-05 1.570109e-05 0.000000e+00 4.908777e-05 1.835718e-04 1.184980e+02 6.491189e-02 +215 8.587394e-06 1.878521e-05 1.643788e-05 0.000000e+00 4.987120e-05 1.857317e-04 1.184980e+02 6.492088e-02 +216 8.288775e-06 1.582118e-05 1.242598e-05 0.000000e+00 5.137792e-05 1.881981e-04 1.184979e+02 6.492848e-02 +217 8.561525e-06 1.434679e-05 1.226109e-05 0.000000e+00 4.940410e-05 1.926037e-04 1.184979e+02 6.493297e-02 +218 9.389615e-06 1.294611e-05 9.674739e-06 0.000000e+00 4.873565e-05 1.978934e-04 1.184979e+02 6.493679e-02 +219 1.036987e-05 9.382294e-06 9.497384e-06 0.000000e+00 4.849665e-05 2.051026e-04 1.184979e+02 6.493893e-02 +220 1.128605e-05 7.003108e-06 5.146001e-06 0.000000e+00 5.199790e-05 2.106272e-04 1.184979e+02 6.494011e-02 +221 1.191285e-05 8.001417e-06 7.087823e-06 0.000000e+00 5.109817e-05 2.134529e-04 1.184979e+02 6.494134e-02 +222 1.251746e-05 7.068918e-06 3.661389e-06 0.000000e+00 5.321318e-05 2.175553e-04 1.184979e+02 6.494219e-02 +223 1.315543e-05 4.494564e-06 3.721553e-06 0.000000e+00 5.475299e-05 2.216447e-04 1.184979e+02 6.494157e-02 +224 1.363436e-05 4.246501e-06 9.727194e-07 0.000000e+00 5.601326e-05 2.236982e-04 1.184979e+02 6.494036e-02 +225 1.404433e-05 6.275135e-06 3.374352e-06 0.000000e+00 
5.345356e-05 2.244164e-04 1.184979e+02 6.493835e-02 +226 1.392847e-05 7.145635e-06 3.332075e-06 0.000000e+00 5.231976e-05 2.262410e-04 1.184979e+02 6.493554e-02 +227 1.389463e-05 5.278219e-06 5.263156e-06 0.000000e+00 5.063022e-05 2.288265e-04 1.184979e+02 6.493287e-02 +228 1.361133e-05 5.417304e-06 4.042778e-06 0.000000e+00 5.095233e-05 2.294680e-04 1.184979e+02 6.492828e-02 +229 1.348262e-05 8.909613e-06 8.581480e-06 0.000000e+00 4.967032e-05 2.277265e-04 1.184979e+02 6.492132e-02 +230 1.314408e-05 1.015595e-05 7.692012e-06 0.000000e+00 5.247015e-05 2.281659e-04 1.184979e+02 6.491196e-02 +231 1.237025e-05 1.044388e-05 1.019641e-05 0.000000e+00 5.269277e-05 2.321894e-04 1.184979e+02 6.490174e-02 +232 1.115669e-05 1.397198e-05 1.065293e-05 0.000000e+00 5.295487e-05 2.366187e-04 1.184979e+02 6.489048e-02 +233 9.752835e-06 1.912548e-05 1.511745e-05 0.000000e+00 5.221333e-05 2.416914e-04 1.184979e+02 6.487610e-02 +234 8.605290e-06 2.137043e-05 1.552171e-05 0.000000e+00 5.325057e-05 2.487190e-04 1.184979e+02 6.486006e-02 +235 7.792419e-06 2.063225e-05 1.666107e-05 0.000000e+00 5.338143e-05 2.569528e-04 1.184980e+02 6.484502e-02 +236 7.195132e-06 2.142159e-05 1.496936e-05 0.000000e+00 5.521459e-05 2.628231e-04 1.184980e+02 6.483227e-02 +237 6.888390e-06 2.357607e-05 1.793990e-05 0.000000e+00 5.466270e-05 2.664936e-04 1.184980e+02 6.482033e-02 +238 6.805050e-06 2.155490e-05 1.542123e-05 0.000000e+00 5.690264e-05 2.709374e-04 1.184980e+02 6.481061e-02 +239 6.750359e-06 1.768390e-05 1.571475e-05 0.000000e+00 5.666586e-05 2.743553e-04 1.184980e+02 6.480448e-02 +240 6.703271e-06 1.692516e-05 1.421589e-05 0.000000e+00 5.683803e-05 2.754377e-04 1.184980e+02 6.479944e-02 +241 6.574859e-06 1.651063e-05 1.505981e-05 0.000000e+00 5.529616e-05 2.747465e-04 1.184980e+02 6.479631e-02 +242 6.463050e-06 1.337168e-05 1.171432e-05 0.000000e+00 5.507278e-05 2.748243e-04 1.184980e+02 6.479504e-02 +243 6.162572e-06 9.587882e-06 9.711242e-06 0.000000e+00 5.326817e-05 2.761332e-04 1.184980e+02 6.479634e-02 +244 5.945123e-06 9.164305e-06 6.755044e-06 0.000000e+00 5.185726e-05 2.757937e-04 1.184980e+02 6.479895e-02 +245 6.070609e-06 1.045109e-05 7.581968e-06 0.000000e+00 5.116644e-05 2.759351e-04 1.184980e+02 6.479679e-02 +246 6.429951e-06 8.054248e-06 4.456960e-06 0.000000e+00 5.338279e-05 2.791566e-04 1.184980e+02 6.479230e-02 +247 7.205727e-06 5.500609e-06 4.219219e-06 0.000000e+00 5.325261e-05 2.848417e-04 1.184980e+02 6.478884e-02 +248 8.322260e-06 6.066741e-06 3.131718e-06 0.000000e+00 5.326394e-05 2.898353e-04 1.184980e+02 6.478479e-02 +249 9.684156e-06 6.601148e-06 3.788431e-06 0.000000e+00 5.260184e-05 2.945321e-04 1.184980e+02 6.477855e-02 +250 1.128570e-05 4.433596e-06 2.000451e-06 0.000000e+00 5.396610e-05 3.016443e-04 1.184980e+02 6.477032e-02 +251 1.310173e-05 3.125120e-06 2.805912e-06 0.000000e+00 5.463136e-05 3.086983e-04 1.184980e+02 6.476202e-02 +252 1.501812e-05 4.925280e-06 3.717492e-06 0.000000e+00 5.516285e-05 3.150621e-04 1.184980e+02 6.475449e-02 +253 1.689296e-05 7.519889e-06 6.838041e-06 0.000000e+00 5.462306e-05 3.188408e-04 1.184980e+02 6.474864e-02 +254 1.862692e-05 7.781667e-06 6.787587e-06 0.000000e+00 5.579883e-05 3.233503e-04 1.184980e+02 6.474463e-02 +255 2.004176e-05 9.078100e-06 8.856317e-06 0.000000e+00 5.518956e-05 3.265135e-04 1.184980e+02 6.474287e-02 +256 2.139267e-05 1.348243e-05 1.033544e-05 0.000000e+00 5.484510e-05 3.253773e-04 1.184980e+02 6.474288e-02 +257 2.225975e-05 1.728099e-05 1.251345e-05 0.000000e+00 5.356608e-05 3.226933e-04 1.184980e+02 6.474578e-02 +258 
2.301420e-05 1.842333e-05 1.288889e-05 0.000000e+00 5.244702e-05 3.212631e-04 1.184980e+02 6.475070e-02 +259 2.320958e-05 1.972724e-05 1.518920e-05 0.000000e+00 5.142514e-05 3.206830e-04 1.184980e+02 6.475609e-02 +260 2.242806e-05 2.180290e-05 1.645087e-05 0.000000e+00 5.078400e-05 3.214700e-04 1.184980e+02 6.476096e-02 +261 2.101970e-05 2.244771e-05 1.743651e-05 0.000000e+00 4.973813e-05 3.239564e-04 1.184980e+02 6.476461e-02 +262 1.935195e-05 2.035194e-05 1.614836e-05 0.000000e+00 5.039450e-05 3.291325e-04 1.184979e+02 6.476787e-02 +263 1.779515e-05 1.867473e-05 1.596669e-05 0.000000e+00 5.034546e-05 3.368465e-04 1.184979e+02 6.477051e-02 +264 1.667467e-05 1.885797e-05 1.544840e-05 0.000000e+00 5.089765e-05 3.424158e-04 1.184979e+02 6.477316e-02 +265 1.537483e-05 1.793678e-05 1.457540e-05 0.000000e+00 5.126289e-05 3.472284e-04 1.184979e+02 6.477620e-02 +266 1.418454e-05 1.558760e-05 1.287774e-05 0.000000e+00 5.084897e-05 3.539335e-04 1.184979e+02 6.477984e-02 +267 1.291831e-05 1.402697e-05 1.210074e-05 0.000000e+00 5.146592e-05 3.604534e-04 1.184979e+02 6.478278e-02 +268 1.203817e-05 1.292486e-05 1.032458e-05 0.000000e+00 5.245798e-05 3.663206e-04 1.184979e+02 6.478541e-02 +269 1.127373e-05 1.174008e-05 7.747421e-06 0.000000e+00 5.183222e-05 3.706436e-04 1.184979e+02 6.479000e-02 +270 1.056513e-05 9.682309e-06 5.757357e-06 0.000000e+00 5.076800e-05 3.745797e-04 1.184979e+02 6.479558e-02 +271 1.006956e-05 7.720897e-06 4.536173e-06 0.000000e+00 5.024209e-05 3.771698e-04 1.184979e+02 6.480107e-02 +272 9.673895e-06 6.937720e-06 4.338265e-06 0.000000e+00 5.042337e-05 3.753614e-04 1.184979e+02 6.480589e-02 +273 9.456650e-06 5.479337e-06 3.359635e-06 0.000000e+00 5.006724e-05 3.716715e-04 1.184979e+02 6.481066e-02 +274 9.404496e-06 3.577141e-06 3.128884e-06 0.000000e+00 4.794913e-05 3.700032e-04 1.184979e+02 6.481449e-02 +275 9.748479e-06 2.672566e-06 3.195059e-06 0.000000e+00 4.701256e-05 3.702766e-04 1.184979e+02 6.481484e-02 +276 9.972259e-06 2.452314e-06 3.326401e-06 0.000000e+00 4.768690e-05 3.725313e-04 1.184979e+02 6.481010e-02 +277 1.043894e-05 4.364222e-06 3.273732e-06 0.000000e+00 4.770193e-05 3.770828e-04 1.184979e+02 6.480148e-02 +278 1.108211e-05 7.023667e-06 5.718255e-06 0.000000e+00 4.555248e-05 3.844821e-04 1.184979e+02 6.479132e-02 +279 1.168431e-05 9.179251e-06 6.913763e-06 0.000000e+00 4.624762e-05 3.931202e-04 1.184979e+02 6.477817e-02 +280 1.200165e-05 1.245040e-05 9.541981e-06 0.000000e+00 4.753379e-05 3.994836e-04 1.184979e+02 6.476353e-02 +281 1.224947e-05 1.507197e-05 1.033833e-05 0.000000e+00 4.799130e-05 4.048441e-04 1.184979e+02 6.475038e-02 +282 1.234301e-05 1.695932e-05 1.242500e-05 0.000000e+00 4.635134e-05 4.105416e-04 1.184979e+02 6.473806e-02 +283 1.209599e-05 1.763935e-05 1.335099e-05 0.000000e+00 4.692345e-05 4.158974e-04 1.184979e+02 6.472575e-02 +284 1.175198e-05 1.800315e-05 1.461475e-05 0.000000e+00 4.825256e-05 4.194532e-04 1.184979e+02 6.471485e-02 +285 1.129161e-05 1.961735e-05 1.500543e-05 0.000000e+00 4.760208e-05 4.220778e-04 1.184979e+02 6.470658e-02 +286 1.082435e-05 2.022494e-05 1.779407e-05 0.000000e+00 4.475258e-05 4.247101e-04 1.184979e+02 6.470065e-02 +287 1.047554e-05 1.846070e-05 1.648253e-05 0.000000e+00 4.442934e-05 4.254864e-04 1.184979e+02 6.469736e-02 +288 1.010337e-05 1.787128e-05 1.689916e-05 0.000000e+00 4.410055e-05 4.211245e-04 1.184979e+02 6.469641e-02 +289 9.560269e-06 1.830397e-05 1.477652e-05 0.000000e+00 4.468469e-05 4.160830e-04 1.184979e+02 6.469653e-02 +290 9.054678e-06 1.776279e-05 1.473493e-05 0.000000e+00 4.215049e-05 
4.127535e-04 1.184980e+02 6.469803e-02 +291 8.937753e-06 1.585793e-05 1.252031e-05 0.000000e+00 4.142438e-05 4.132184e-04 1.184980e+02 6.469799e-02 +292 9.102109e-06 1.433228e-05 1.131935e-05 0.000000e+00 4.189159e-05 4.156700e-04 1.184980e+02 6.469540e-02 +293 9.433155e-06 1.464432e-05 9.212736e-06 0.000000e+00 4.226822e-05 4.200498e-04 1.184980e+02 6.469069e-02 +294 9.760922e-06 1.271591e-05 9.699130e-06 0.000000e+00 4.016184e-05 4.277615e-04 1.184980e+02 6.468624e-02 +295 1.034802e-05 8.225139e-06 5.179977e-06 0.000000e+00 4.118594e-05 4.366230e-04 1.184980e+02 6.468112e-02 +296 1.105491e-05 5.944705e-06 4.896194e-06 0.000000e+00 4.172626e-05 4.421366e-04 1.184980e+02 6.467487e-02 +297 1.122419e-05 5.469309e-06 2.873649e-06 0.000000e+00 4.275459e-05 4.451323e-04 1.184980e+02 6.467122e-02 +298 1.151602e-05 3.885736e-06 3.877891e-06 0.000000e+00 4.060796e-05 4.494952e-04 1.184980e+02 6.466889e-02 +299 1.224114e-05 1.656401e-06 2.445993e-06 0.000000e+00 4.077679e-05 4.519993e-04 1.184980e+02 6.466708e-02 +300 1.292253e-05 1.845285e-06 3.121199e-06 0.000000e+00 4.161807e-05 4.529318e-04 1.184980e+02 6.466513e-02 +301 1.374072e-05 5.302546e-06 3.111205e-06 0.000000e+00 4.155873e-05 4.532788e-04 1.184980e+02 6.466385e-02 +302 1.460778e-05 6.786162e-06 6.117287e-06 0.000000e+00 3.879070e-05 4.546283e-04 1.184980e+02 6.466384e-02 +303 1.552272e-05 6.821334e-06 3.691919e-06 0.000000e+00 3.909836e-05 4.532331e-04 1.184980e+02 6.466588e-02 +304 1.620216e-05 1.004938e-05 7.373777e-06 0.000000e+00 3.794437e-05 4.486841e-04 1.184980e+02 6.466845e-02 +305 1.692011e-05 1.432773e-05 8.392457e-06 0.000000e+00 3.885703e-05 4.421090e-04 1.184980e+02 6.467213e-02 +306 1.758976e-05 1.473448e-05 1.192132e-05 0.000000e+00 3.730560e-05 4.394876e-04 1.184980e+02 6.467482e-02 +307 1.800975e-05 1.401349e-05 1.136578e-05 0.000000e+00 3.702327e-05 4.408586e-04 1.184980e+02 6.467551e-02 +308 1.822602e-05 1.525914e-05 1.373017e-05 0.000000e+00 3.668276e-05 4.438521e-04 1.184979e+02 6.467419e-02 +309 1.812178e-05 1.835044e-05 1.402333e-05 0.000000e+00 3.721782e-05 4.493767e-04 1.184979e+02 6.467060e-02 +310 1.786376e-05 1.833094e-05 1.702058e-05 0.000000e+00 3.576329e-05 4.574906e-04 1.184979e+02 6.466843e-02 +311 1.760878e-05 1.664363e-05 1.390225e-05 0.000000e+00 3.813003e-05 4.664465e-04 1.184979e+02 6.466759e-02 +312 1.693668e-05 1.900879e-05 1.718771e-05 0.000000e+00 3.612078e-05 4.711451e-04 1.184979e+02 6.466922e-02 +313 1.588841e-05 2.100794e-05 1.583777e-05 0.000000e+00 3.701666e-05 4.738392e-04 1.184979e+02 6.467388e-02 +314 1.523141e-05 1.917508e-05 1.649474e-05 0.000000e+00 3.603900e-05 4.769027e-04 1.184979e+02 6.468076e-02 +315 1.497669e-05 1.647467e-05 1.262080e-05 0.000000e+00 3.678219e-05 4.782396e-04 1.184979e+02 6.468982e-02 +316 1.473391e-05 1.649149e-05 1.264315e-05 0.000000e+00 3.628832e-05 4.776635e-04 1.184979e+02 6.470069e-02 +317 1.437845e-05 1.667918e-05 1.050579e-05 0.000000e+00 3.519625e-05 4.760742e-04 1.184979e+02 6.471214e-02 +318 1.403674e-05 1.301428e-05 1.092757e-05 0.000000e+00 3.412736e-05 4.755815e-04 1.184979e+02 6.472240e-02 +319 1.348467e-05 8.543116e-06 5.990457e-06 0.000000e+00 3.646420e-05 4.746007e-04 1.184979e+02 6.473319e-02 +320 1.308219e-05 7.987080e-06 7.881944e-06 0.000000e+00 3.346244e-05 4.696366e-04 1.184979e+02 6.474544e-02 +321 1.260409e-05 6.807943e-06 4.605829e-06 0.000000e+00 3.353699e-05 4.655222e-04 1.184979e+02 6.475596e-02 +322 1.199373e-05 2.961183e-06 4.289799e-06 0.000000e+00 3.396580e-05 4.652070e-04 1.184979e+02 6.476131e-02 +323 1.139186e-05 
1.269555e-06 1.239493e-06 0.000000e+00 3.558201e-05 4.670945e-04 1.184979e+02 6.476231e-02 +324 1.057621e-05 3.605030e-06 3.237398e-06 0.000000e+00 3.303181e-05 4.704598e-04 1.184979e+02 6.476156e-02 +325 9.949060e-06 6.085723e-06 2.601224e-06 0.000000e+00 3.229932e-05 4.758944e-04 1.184979e+02 6.475693e-02 +326 1.008871e-05 5.727820e-06 5.011811e-06 0.000000e+00 3.264357e-05 4.838408e-04 1.184979e+02 6.474788e-02 +327 1.084705e-05 5.738840e-06 2.671431e-06 0.000000e+00 3.589466e-05 4.915645e-04 1.184979e+02 6.473772e-02 +328 1.192517e-05 9.196337e-06 7.281320e-06 0.000000e+00 3.290092e-05 4.953175e-04 1.184979e+02 6.472873e-02 +329 1.295244e-05 1.075595e-05 6.236231e-06 0.000000e+00 3.189584e-05 4.974813e-04 1.184979e+02 6.472131e-02 +330 1.389825e-05 9.979620e-06 9.007821e-06 0.000000e+00 3.172459e-05 4.992614e-04 1.184979e+02 6.471378e-02 +331 1.452189e-05 1.112456e-05 9.232443e-06 0.000000e+00 3.406877e-05 4.990813e-04 1.184979e+02 6.470682e-02 +332 1.489289e-05 1.440445e-05 1.351905e-05 0.000000e+00 3.151808e-05 4.960371e-04 1.184979e+02 6.470341e-02 +333 1.479413e-05 1.625305e-05 1.367809e-05 0.000000e+00 3.078667e-05 4.921462e-04 1.184979e+02 6.470074e-02 +334 1.447829e-05 1.601856e-05 1.599872e-05 0.000000e+00 3.076806e-05 4.892507e-04 1.184979e+02 6.469687e-02 +335 1.396654e-05 1.671113e-05 1.407196e-05 0.000000e+00 3.237199e-05 4.857396e-04 1.184979e+02 6.469524e-02 +336 1.290990e-05 1.955467e-05 1.772372e-05 0.000000e+00 3.064321e-05 4.804555e-04 1.184979e+02 6.469514e-02 +337 1.133471e-05 2.034709e-05 1.528239e-05 0.000000e+00 3.128450e-05 4.761221e-04 1.184979e+02 6.469597e-02 +338 9.561444e-06 1.910298e-05 1.614645e-05 0.000000e+00 3.063590e-05 4.749495e-04 1.184979e+02 6.469600e-02 +339 7.972071e-06 1.893948e-05 1.440559e-05 0.000000e+00 3.211940e-05 4.764301e-04 1.184979e+02 6.469344e-02 +340 6.887854e-06 1.880741e-05 1.543108e-05 0.000000e+00 3.017217e-05 4.785278e-04 1.184979e+02 6.468982e-02 +341 5.951887e-06 1.653670e-05 1.200159e-05 0.000000e+00 2.962770e-05 4.832380e-04 1.184979e+02 6.468464e-02 +342 5.496037e-06 1.306173e-05 1.134671e-05 0.000000e+00 2.966583e-05 4.909598e-04 1.184979e+02 6.467818e-02 +343 5.608876e-06 1.029726e-05 7.694006e-06 0.000000e+00 3.133897e-05 4.966976e-04 1.184979e+02 6.467344e-02 +344 6.229923e-06 8.868442e-06 8.831884e-06 0.000000e+00 2.945003e-05 4.984918e-04 1.184979e+02 6.467087e-02 +345 7.395028e-06 6.994186e-06 5.259472e-06 0.000000e+00 2.911775e-05 4.979709e-04 1.184979e+02 6.467056e-02 +346 9.107503e-06 4.830453e-06 5.365243e-06 0.000000e+00 2.752586e-05 4.964976e-04 1.184979e+02 6.467266e-02 +347 1.115116e-05 4.413937e-06 3.314732e-06 0.000000e+00 2.832105e-05 4.932987e-04 1.184979e+02 6.467476e-02 +348 1.361847e-05 5.074367e-06 3.833853e-06 0.000000e+00 2.768723e-05 4.891425e-04 1.184979e+02 6.467541e-02 +349 1.567396e-05 5.272326e-06 1.656109e-06 0.000000e+00 2.837903e-05 4.845217e-04 1.184979e+02 6.467627e-02 +350 1.719064e-05 5.695671e-06 3.347001e-06 0.000000e+00 2.755043e-05 4.804883e-04 1.184979e+02 6.467907e-02 +351 1.790939e-05 5.845781e-06 2.690786e-06 0.000000e+00 2.793406e-05 4.767266e-04 1.184979e+02 6.468280e-02 +352 1.762048e-05 6.754481e-06 5.566141e-06 0.000000e+00 2.712512e-05 4.716676e-04 1.184979e+02 6.468576e-02 +353 1.675962e-05 7.516870e-06 4.997998e-06 0.000000e+00 2.889944e-05 4.671817e-04 1.184979e+02 6.468845e-02 +354 1.548433e-05 7.687119e-06 7.668269e-06 0.000000e+00 2.830598e-05 4.662770e-04 1.184979e+02 6.469064e-02 +355 1.431412e-05 8.652871e-06 8.066246e-06 0.000000e+00 2.788332e-05 4.682466e-04 
1.184979e+02 6.469116e-02 +356 1.342894e-05 1.064988e-05 1.038381e-05 0.000000e+00 2.649414e-05 4.708501e-04 1.184979e+02 6.468903e-02 +357 1.247694e-05 1.302942e-05 1.095239e-05 0.000000e+00 2.671983e-05 4.766384e-04 1.184979e+02 6.468431e-02 +358 1.130481e-05 1.531926e-05 1.432085e-05 0.000000e+00 2.633209e-05 4.832585e-04 1.184979e+02 6.468096e-02 +359 1.015681e-05 1.630398e-05 1.444671e-05 0.000000e+00 2.760820e-05 4.885631e-04 1.184979e+02 6.468057e-02 +360 9.135046e-06 1.776935e-05 1.588357e-05 0.000000e+00 2.625285e-05 4.888293e-04 1.184979e+02 6.468478e-02 +361 8.219218e-06 1.961050e-05 1.509533e-05 0.000000e+00 2.478953e-05 4.858920e-04 1.184979e+02 6.469369e-02 +362 7.353334e-06 1.959969e-05 1.626496e-05 0.000000e+00 2.340134e-05 4.829418e-04 1.184979e+02 6.470520e-02 +363 6.664065e-06 1.858890e-05 1.515251e-05 0.000000e+00 2.436567e-05 4.786050e-04 1.184979e+02 6.471653e-02 +364 6.372792e-06 1.794468e-05 1.479936e-05 0.000000e+00 2.466374e-05 4.732165e-04 1.184979e+02 6.472768e-02 +365 6.442233e-06 1.714949e-05 1.351545e-05 0.000000e+00 2.407237e-05 4.685866e-04 1.184979e+02 6.473931e-02 +366 6.990918e-06 1.495759e-05 1.353731e-05 0.000000e+00 2.404273e-05 4.652225e-04 1.184979e+02 6.475030e-02 +367 7.830309e-06 1.089403e-05 1.057242e-05 0.000000e+00 2.556588e-05 4.621599e-04 1.184979e+02 6.476173e-02 +368 8.631484e-06 8.871407e-06 8.635766e-06 0.000000e+00 2.474270e-05 4.583540e-04 1.184979e+02 6.477260e-02 +369 9.626822e-06 8.930323e-06 6.829414e-06 0.000000e+00 2.429970e-05 4.545791e-04 1.184979e+02 6.478195e-02 +370 1.080557e-05 7.497673e-06 6.098281e-06 0.000000e+00 2.409679e-05 4.551401e-04 1.184979e+02 6.478814e-02 +371 1.206824e-05 6.035248e-06 4.175647e-06 0.000000e+00 2.439154e-05 4.577338e-04 1.184979e+02 6.479121e-02 +372 1.355399e-05 6.149531e-06 3.599927e-06 0.000000e+00 2.292680e-05 4.609467e-04 1.184979e+02 6.479139e-02 +373 1.506409e-05 6.935898e-06 3.093088e-06 0.000000e+00 2.094558e-05 4.661992e-04 1.184979e+02 6.478953e-02 +374 1.664526e-05 5.862578e-06 3.332642e-06 0.000000e+00 2.117634e-05 4.714110e-04 1.184979e+02 6.478560e-02 +375 1.814816e-05 3.257894e-06 2.216668e-06 0.000000e+00 2.418512e-05 4.744400e-04 1.184978e+02 6.478074e-02 +376 1.975005e-05 3.871637e-06 2.426199e-06 0.000000e+00 2.371069e-05 4.732530e-04 1.184978e+02 6.477941e-02 +377 2.122496e-05 6.499837e-06 4.583136e-06 0.000000e+00 1.925079e-05 4.683902e-04 1.184979e+02 6.478238e-02 +378 2.214942e-05 6.121195e-06 6.149981e-06 0.000000e+00 1.834848e-05 4.637588e-04 1.184979e+02 6.478432e-02 +379 2.248921e-05 6.246184e-06 7.134666e-06 0.000000e+00 2.111720e-05 4.578889e-04 1.184979e+02 6.478287e-02 +380 2.260698e-05 9.074887e-06 8.687650e-06 0.000000e+00 2.163419e-05 4.505653e-04 1.184979e+02 6.478221e-02 +381 2.237000e-05 1.246133e-05 1.077465e-05 0.000000e+00 2.069219e-05 4.446378e-04 1.184979e+02 6.478196e-02 +382 2.165554e-05 1.381220e-05 1.211609e-05 0.000000e+00 2.112869e-05 4.410391e-04 1.184979e+02 6.478049e-02 +383 2.036909e-05 1.420495e-05 1.269653e-05 0.000000e+00 2.254958e-05 4.378223e-04 1.184979e+02 6.477910e-02 +384 1.868226e-05 1.762896e-05 1.383082e-05 0.000000e+00 2.285351e-05 4.332361e-04 1.184979e+02 6.477764e-02 +385 1.691887e-05 2.108639e-05 1.668543e-05 0.000000e+00 2.045927e-05 4.296912e-04 1.184979e+02 6.477661e-02 +386 1.512223e-05 1.941313e-05 1.587911e-05 0.000000e+00 2.053317e-05 4.301807e-04 1.184979e+02 6.477409e-02 +387 1.321465e-05 1.776016e-05 1.551711e-05 0.000000e+00 2.054286e-05 4.321725e-04 1.184979e+02 6.476980e-02 +388 1.136391e-05 1.830754e-05 
1.499724e-05 0.000000e+00 1.894679e-05 4.352394e-04 1.184979e+02 6.476543e-02 +389 1.005192e-05 1.781064e-05 1.505495e-05 0.000000e+00 1.699848e-05 4.393079e-04 1.184979e+02 6.476142e-02 +390 9.070564e-06 1.476344e-05 1.326127e-05 0.000000e+00 1.717590e-05 4.428849e-04 1.184979e+02 6.475852e-02 +391 8.765891e-06 1.177313e-05 1.179773e-05 0.000000e+00 1.890144e-05 4.440969e-04 1.184979e+02 6.475636e-02 +392 9.353165e-06 1.246911e-05 1.019541e-05 0.000000e+00 1.914337e-05 4.402790e-04 1.184979e+02 6.475824e-02 +393 1.054408e-05 1.278817e-05 1.069253e-05 0.000000e+00 1.572208e-05 4.345572e-04 1.184979e+02 6.476283e-02 +394 1.199569e-05 9.333892e-06 6.525754e-06 0.000000e+00 1.579473e-05 4.297207e-04 1.184979e+02 6.476823e-02 +395 1.397544e-05 7.595621e-06 5.390588e-06 0.000000e+00 1.688031e-05 4.224848e-04 1.184979e+02 6.477433e-02 +396 1.594342e-05 8.630160e-06 4.187829e-06 0.000000e+00 1.863127e-05 4.153663e-04 1.184979e+02 6.477865e-02 +397 1.759889e-05 7.985760e-06 4.396924e-06 0.000000e+00 1.829285e-05 4.103449e-04 1.184979e+02 6.478412e-02 +398 1.886776e-05 4.859389e-06 2.569185e-06 0.000000e+00 1.887312e-05 4.077386e-04 1.184979e+02 6.478920e-02 +399 1.992775e-05 2.708467e-06 2.790036e-06 0.000000e+00 1.879376e-05 4.065682e-04 1.184979e+02 6.479120e-02 +400 2.064756e-05 3.912182e-06 2.216940e-06 0.000000e+00 1.884682e-05 4.030030e-04 1.184979e+02 6.479366e-02 +401 2.087596e-05 4.441208e-06 4.924962e-06 0.000000e+00 1.752571e-05 4.008795e-04 1.184979e+02 6.479499e-02 +402 2.068695e-05 2.766064e-06 2.830396e-06 0.000000e+00 1.865633e-05 4.025752e-04 1.184979e+02 6.479472e-02 +403 2.039406e-05 4.445049e-06 5.733298e-06 0.000000e+00 1.681361e-05 4.050626e-04 1.184979e+02 6.479485e-02 +404 1.957251e-05 8.490789e-06 7.118890e-06 0.000000e+00 1.556668e-05 4.080807e-04 1.184979e+02 6.479402e-02 +405 1.830085e-05 1.080329e-05 9.950465e-06 0.000000e+00 1.396052e-05 4.118145e-04 1.184979e+02 6.479432e-02 +406 1.700102e-05 1.150183e-05 9.895261e-06 0.000000e+00 1.492765e-05 4.143400e-04 1.184979e+02 6.479675e-02 +407 1.553966e-05 1.323984e-05 1.228571e-05 0.000000e+00 1.549236e-05 4.140022e-04 1.184979e+02 6.480012e-02 +408 1.431270e-05 1.692868e-05 1.258037e-05 0.000000e+00 1.611982e-05 4.097370e-04 1.184979e+02 6.480617e-02 +409 1.365683e-05 1.848834e-05 1.613336e-05 0.000000e+00 1.391618e-05 4.043644e-04 1.184979e+02 6.481386e-02 +410 1.343385e-05 1.718677e-05 1.388645e-05 0.000000e+00 1.527756e-05 3.997441e-04 1.184979e+02 6.482198e-02 +411 1.326401e-05 1.772621e-05 1.696517e-05 0.000000e+00 1.546958e-05 3.935336e-04 1.184979e+02 6.483282e-02 +412 1.306248e-05 1.851029e-05 1.595975e-05 0.000000e+00 1.762262e-05 3.867244e-04 1.184979e+02 6.484469e-02 +413 1.334181e-05 1.679606e-05 1.633067e-05 0.000000e+00 1.805851e-05 3.822904e-04 1.184979e+02 6.485632e-02 +414 1.366401e-05 1.434949e-05 1.348569e-05 0.000000e+00 1.910570e-05 3.805913e-04 1.184978e+02 6.486858e-02 +415 1.390646e-05 1.374664e-05 1.372587e-05 0.000000e+00 1.772541e-05 3.795549e-04 1.184978e+02 6.487857e-02 +416 1.418786e-05 1.468511e-05 1.101134e-05 0.000000e+00 1.831808e-05 3.780374e-04 1.184978e+02 6.488667e-02 +417 1.453484e-05 1.441149e-05 1.196848e-05 0.000000e+00 1.693976e-05 3.776781e-04 1.184978e+02 6.489335e-02 +418 1.516072e-05 1.198488e-05 7.423931e-06 0.000000e+00 1.855905e-05 3.802140e-04 1.184978e+02 6.489808e-02 +419 1.618475e-05 1.098572e-05 8.579992e-06 0.000000e+00 1.651874e-05 3.835670e-04 1.184978e+02 6.490264e-02 +420 1.726021e-05 9.891325e-06 5.115647e-06 0.000000e+00 1.538530e-05 3.860036e-04 1.184978e+02 
6.490764e-02 +421 1.799280e-05 7.142076e-06 4.643153e-06 0.000000e+00 1.344044e-05 3.882325e-04 1.184978e+02 6.491276e-02 +422 1.847779e-05 4.570957e-06 2.123073e-06 0.000000e+00 1.437295e-05 3.892033e-04 1.184978e+02 6.491828e-02 +423 1.882902e-05 3.399756e-06 3.589194e-06 0.000000e+00 1.505935e-05 3.882021e-04 1.184978e+02 6.492211e-02 +424 1.889056e-05 3.531039e-06 1.880991e-06 0.000000e+00 1.656662e-05 3.835863e-04 1.184978e+02 6.492551e-02 +425 1.898973e-05 3.449773e-06 4.635500e-06 0.000000e+00 1.515804e-05 3.769447e-04 1.184978e+02 6.492900e-02 +426 1.893497e-05 2.919288e-06 2.282442e-06 0.000000e+00 1.680551e-05 3.716202e-04 1.184978e+02 6.493163e-02 +427 1.861593e-05 4.338519e-06 5.795771e-06 0.000000e+00 1.734531e-05 3.656210e-04 1.184978e+02 6.493354e-02 +428 1.808772e-05 6.477892e-06 4.784099e-06 0.000000e+00 2.045741e-05 3.603468e-04 1.184978e+02 6.493412e-02 +429 1.735015e-05 8.539207e-06 7.587670e-06 0.000000e+00 2.032005e-05 3.557145e-04 1.184978e+02 6.493481e-02 +430 1.640191e-05 1.076815e-05 8.086260e-06 0.000000e+00 2.163700e-05 3.535462e-04 1.184978e+02 6.493312e-02 +431 1.510747e-05 1.292279e-05 1.180087e-05 0.000000e+00 1.996543e-05 3.526076e-04 1.184978e+02 6.492943e-02 +432 1.379926e-05 1.488355e-05 1.123173e-05 0.000000e+00 2.034713e-05 3.517710e-04 1.184978e+02 6.492333e-02 +433 1.287943e-05 1.638642e-05 1.484567e-05 0.000000e+00 1.947645e-05 3.524817e-04 1.184978e+02 6.491444e-02 +434 1.218516e-05 1.594569e-05 1.317009e-05 0.000000e+00 2.120029e-05 3.554179e-04 1.184978e+02 6.490597e-02 +435 1.172897e-05 1.597935e-05 1.637438e-05 0.000000e+00 1.959477e-05 3.578365e-04 1.184978e+02 6.489884e-02 +436 1.167454e-05 1.633229e-05 1.455812e-05 0.000000e+00 1.870651e-05 3.584234e-04 1.184978e+02 6.489383e-02 +437 1.192383e-05 1.685299e-05 1.655441e-05 0.000000e+00 1.710005e-05 3.585649e-04 1.184978e+02 6.489045e-02 +438 1.249901e-05 1.679000e-05 1.509692e-05 0.000000e+00 1.849547e-05 3.583120e-04 1.184978e+02 6.488787e-02 +439 1.320384e-05 1.598210e-05 1.572344e-05 0.000000e+00 1.905102e-05 3.557323e-04 1.184979e+02 6.488658e-02 +440 1.420161e-05 1.619318e-05 1.228746e-05 0.000000e+00 2.161495e-05 3.500024e-04 1.184979e+02 6.488630e-02 +441 1.545738e-05 1.667802e-05 1.310947e-05 0.000000e+00 2.126997e-05 3.437816e-04 1.184979e+02 6.488654e-02 +442 1.677398e-05 1.494592e-05 9.293079e-06 0.000000e+00 2.377337e-05 3.383263e-04 1.184979e+02 6.488932e-02 +443 1.791538e-05 1.254606e-05 9.892126e-06 0.000000e+00 2.467072e-05 3.332228e-04 1.184979e+02 6.489191e-02 +444 1.885188e-05 1.076852e-05 6.085864e-06 0.000000e+00 2.784101e-05 3.286610e-04 1.184979e+02 6.489395e-02 +445 1.965084e-05 9.150345e-06 6.777640e-06 0.000000e+00 2.729641e-05 3.253332e-04 1.184979e+02 6.489797e-02 +446 2.073495e-05 6.185625e-06 3.980952e-06 0.000000e+00 2.749593e-05 3.253576e-04 1.184979e+02 6.490015e-02 +447 2.165807e-05 3.182414e-06 3.780398e-06 0.000000e+00 2.720414e-05 3.264059e-04 1.184979e+02 6.489926e-02 +448 2.220240e-05 2.641779e-06 1.189557e-06 0.000000e+00 2.808115e-05 3.265761e-04 1.184979e+02 6.489749e-02 +449 2.204430e-05 4.090053e-06 3.618442e-06 0.000000e+00 2.613138e-05 3.286944e-04 1.184979e+02 6.489486e-02 +450 2.154049e-05 3.622045e-06 2.149981e-06 0.000000e+00 2.716938e-05 3.322086e-04 1.184979e+02 6.489272e-02 +451 2.110219e-05 3.658333e-06 4.452225e-06 0.000000e+00 2.638281e-05 3.347370e-04 1.184979e+02 6.489145e-02 +452 2.126424e-05 5.921234e-06 3.686708e-06 0.000000e+00 2.643633e-05 3.362541e-04 1.184979e+02 6.489056e-02 +453 2.135796e-05 8.491606e-06 6.971936e-06 
0.000000e+00 2.521079e-05 3.362856e-04 1.184979e+02 6.489218e-02 +454 2.164506e-05 9.066946e-06 6.720669e-06 0.000000e+00 2.597920e-05 3.358678e-04 1.184979e+02 6.489533e-02 +455 2.217733e-05 9.559274e-06 9.202378e-06 0.000000e+00 2.729430e-05 3.326743e-04 1.184979e+02 6.489814e-02 +456 2.231221e-05 1.224430e-05 9.739512e-06 0.000000e+00 3.141673e-05 3.261816e-04 1.184979e+02 6.490108e-02 +457 2.225643e-05 1.498537e-05 1.405533e-05 0.000000e+00 3.151002e-05 3.206686e-04 1.184978e+02 6.490561e-02 +458 2.216740e-05 1.397677e-05 1.352937e-05 0.000000e+00 3.356596e-05 3.175166e-04 1.184978e+02 6.491229e-02 +459 2.176996e-05 1.304925e-05 1.531971e-05 0.000000e+00 3.583981e-05 3.131867e-04 1.184978e+02 6.491910e-02 +460 2.139166e-05 1.535808e-05 1.469839e-05 0.000000e+00 3.900153e-05 3.083961e-04 1.184978e+02 6.492619e-02 +461 2.045986e-05 1.728792e-05 1.682936e-05 0.000000e+00 3.926439e-05 3.067369e-04 1.184978e+02 6.493437e-02 +462 1.910396e-05 1.680893e-05 1.492984e-05 0.000000e+00 3.923770e-05 3.085421e-04 1.184978e+02 6.494253e-02 +463 1.803026e-05 1.707687e-05 1.535974e-05 0.000000e+00 3.812132e-05 3.112528e-04 1.184978e+02 6.494888e-02 +464 1.769632e-05 1.904773e-05 1.396510e-05 0.000000e+00 3.969828e-05 3.127590e-04 1.184978e+02 6.495247e-02 +465 1.817324e-05 1.946738e-05 1.488979e-05 0.000000e+00 3.811828e-05 3.151002e-04 1.184978e+02 6.495702e-02 +466 1.859601e-05 1.549925e-05 1.119621e-05 0.000000e+00 3.737620e-05 3.194829e-04 1.184978e+02 6.496243e-02 +467 1.984703e-05 1.218966e-05 9.775039e-06 0.000000e+00 3.714840e-05 3.220526e-04 1.184978e+02 6.496672e-02 +468 2.149225e-05 1.209358e-05 7.996026e-06 0.000000e+00 3.835375e-05 3.227669e-04 1.184978e+02 6.497202e-02 +469 2.313865e-05 1.022372e-05 7.927552e-06 0.000000e+00 3.766150e-05 3.229142e-04 1.184978e+02 6.497874e-02 +470 2.450245e-05 6.365045e-06 5.074658e-06 0.000000e+00 3.860981e-05 3.216153e-04 1.184978e+02 6.498643e-02 +471 2.560891e-05 4.537848e-06 4.661932e-06 0.000000e+00 4.176023e-05 3.176088e-04 1.184978e+02 6.499348e-02 +472 2.595598e-05 4.962201e-06 3.400385e-06 0.000000e+00 4.581172e-05 3.110663e-04 1.184978e+02 6.499966e-02 +473 2.603307e-05 4.870230e-06 3.944617e-06 0.000000e+00 4.685687e-05 3.053156e-04 1.184978e+02 6.500592e-02 +474 2.585256e-05 2.350839e-06 1.859070e-06 0.000000e+00 4.895098e-05 3.022649e-04 1.184978e+02 6.501116e-02 +475 2.555804e-05 2.807366e-06 2.354704e-06 0.000000e+00 5.141433e-05 2.989155e-04 1.184978e+02 6.501392e-02 +476 2.450294e-05 6.813642e-06 3.877902e-06 0.000000e+00 5.455142e-05 2.954205e-04 1.184978e+02 6.501415e-02 +477 2.269610e-05 7.726242e-06 5.639489e-06 0.000000e+00 5.613313e-05 2.959002e-04 1.184978e+02 6.501282e-02 +478 1.995874e-05 6.955024e-06 5.424231e-06 0.000000e+00 5.559143e-05 2.989191e-04 1.184978e+02 6.501036e-02 +479 1.715039e-05 8.164284e-06 7.730597e-06 0.000000e+00 5.492241e-05 3.024231e-04 1.184978e+02 6.500487e-02 +480 1.445963e-05 1.053737e-05 8.975535e-06 0.000000e+00 5.555280e-05 3.045775e-04 1.184978e+02 6.499776e-02 +481 1.175369e-05 1.141326e-05 1.106791e-05 0.000000e+00 5.401578e-05 3.076551e-04 1.184978e+02 6.499023e-02 +482 9.408573e-06 1.031382e-05 1.160375e-05 0.000000e+00 5.332219e-05 3.125933e-04 1.184978e+02 6.498237e-02 +483 7.597084e-06 1.201205e-05 1.340742e-05 0.000000e+00 5.298553e-05 3.136621e-04 1.184978e+02 6.497553e-02 +484 6.530657e-06 1.590200e-05 1.594369e-05 0.000000e+00 5.536701e-05 3.121390e-04 1.184978e+02 6.496804e-02 +485 5.933364e-06 1.670816e-05 1.636618e-05 0.000000e+00 5.763837e-05 3.107534e-04 1.184978e+02 6.496236e-02 
+486 6.095549e-06 1.652101e-05 1.542957e-05 0.000000e+00 5.910720e-05 3.075284e-04 1.184978e+02 6.495885e-02 +487 6.747048e-06 1.845314e-05 1.595482e-05 0.000000e+00 6.316558e-05 3.031380e-04 1.184978e+02 6.495414e-02 +488 7.648938e-06 1.992258e-05 1.549907e-05 0.000000e+00 6.816788e-05 2.965507e-04 1.184978e+02 6.495022e-02 +489 8.749662e-06 1.897818e-05 1.474984e-05 0.000000e+00 7.077629e-05 2.909911e-04 1.184978e+02 6.494748e-02 +490 1.045232e-05 1.600029e-05 1.321147e-05 0.000000e+00 7.173868e-05 2.889426e-04 1.184978e+02 6.494514e-02 +491 1.234825e-05 1.463306e-05 1.161173e-05 0.000000e+00 7.413316e-05 2.864970e-04 1.184978e+02 6.494197e-02 +492 1.419823e-05 1.384063e-05 1.133477e-05 0.000000e+00 7.768286e-05 2.849054e-04 1.184978e+02 6.493715e-02 +493 1.611747e-05 1.029685e-05 8.005879e-06 0.000000e+00 8.000766e-05 2.869956e-04 1.184978e+02 6.493291e-02 +494 1.786482e-05 7.523165e-06 6.096563e-06 0.000000e+00 7.851622e-05 2.915187e-04 1.184978e+02 6.492961e-02 +495 1.927191e-05 7.120307e-06 5.212088e-06 0.000000e+00 7.760778e-05 2.956781e-04 1.184978e+02 6.492569e-02 +496 1.992272e-05 6.695143e-06 4.284955e-06 0.000000e+00 7.794644e-05 2.992593e-04 1.184978e+02 6.492072e-02 +497 2.006296e-05 5.657047e-06 3.254625e-06 0.000000e+00 7.658416e-05 3.029960e-04 1.184978e+02 6.491797e-02 +498 1.969560e-05 4.180866e-06 3.047437e-06 0.000000e+00 7.462055e-05 3.073571e-04 1.184978e+02 6.491751e-02 +499 1.926461e-05 4.600565e-06 2.144335e-06 0.000000e+00 7.600262e-05 3.078338e-04 1.184978e+02 6.491700e-02 +500 1.854546e-05 5.781658e-06 3.854050e-06 0.000000e+00 7.864257e-05 3.051972e-04 1.184978e+02 6.491596e-02 +501 1.755931e-05 5.468832e-06 2.920718e-06 0.000000e+00 8.325511e-05 3.027716e-04 1.184978e+02 6.491588e-02 +502 1.650757e-05 5.710919e-06 4.988150e-06 0.000000e+00 8.652704e-05 2.991139e-04 1.184978e+02 6.491824e-02 +503 1.557103e-05 6.970430e-06 6.660095e-06 0.000000e+00 9.072593e-05 2.943139e-04 1.184978e+02 6.492063e-02 +504 1.472811e-05 7.623034e-06 8.493629e-06 0.000000e+00 9.573132e-05 2.891218e-04 1.184978e+02 6.492177e-02 +505 1.422618e-05 8.289415e-06 9.343871e-06 0.000000e+00 9.924982e-05 2.847550e-04 1.184978e+02 6.492339e-02 +506 1.417882e-05 8.854255e-06 1.161864e-05 0.000000e+00 9.884674e-05 2.834393e-04 1.184978e+02 6.492560e-02 +507 1.460576e-05 1.077649e-05 1.147496e-05 0.000000e+00 1.008171e-04 2.829040e-04 1.184978e+02 6.492598e-02 +508 1.523121e-05 1.410827e-05 1.460095e-05 0.000000e+00 1.028551e-04 2.825964e-04 1.184978e+02 6.492515e-02 +509 1.592480e-05 1.638911e-05 1.404834e-05 0.000000e+00 1.050479e-04 2.855182e-04 1.184978e+02 6.492460e-02 +510 1.676537e-05 1.843063e-05 1.663181e-05 0.000000e+00 1.035001e-04 2.909718e-04 1.184978e+02 6.492416e-02 +511 1.790949e-05 1.956549e-05 1.608673e-05 0.000000e+00 1.023534e-04 2.962441e-04 1.184978e+02 6.492452e-02 +512 1.872174e-05 1.940326e-05 1.592019e-05 0.000000e+00 1.004482e-04 3.007088e-04 1.184978e+02 6.492661e-02 +513 1.877545e-05 1.896575e-05 1.456555e-05 0.000000e+00 9.946060e-05 3.045884e-04 1.184978e+02 6.492994e-02 +514 1.875032e-05 1.698998e-05 1.481532e-05 0.000000e+00 9.788707e-05 3.078144e-04 1.184978e+02 6.493508e-02 +515 1.909706e-05 1.480994e-05 1.179820e-05 0.000000e+00 9.991394e-05 3.077909e-04 1.184978e+02 6.494128e-02 +516 1.968953e-05 1.407685e-05 1.283803e-05 0.000000e+00 1.031284e-04 3.040207e-04 1.184977e+02 6.494653e-02 +517 2.033877e-05 1.256771e-05 9.587109e-06 0.000000e+00 1.094675e-04 3.002860e-04 1.184977e+02 6.495242e-02 +518 2.075231e-05 1.065013e-05 9.930571e-06 0.000000e+00 
1.140439e-04 2.960041e-04 1.184977e+02 6.495921e-02 +519 2.112110e-05 8.473171e-06 6.424913e-06 0.000000e+00 1.197520e-04 2.911761e-04 1.184977e+02 6.496637e-02 +520 2.131812e-05 7.615631e-06 5.142166e-06 0.000000e+00 1.231946e-04 2.868304e-04 1.184977e+02 6.497331e-02 +521 2.117201e-05 7.869133e-06 3.365270e-06 0.000000e+00 1.255356e-04 2.837377e-04 1.184977e+02 6.497960e-02 +522 2.072488e-05 6.724289e-06 4.177600e-06 0.000000e+00 1.255019e-04 2.831267e-04 1.184977e+02 6.498403e-02 +523 1.972670e-05 5.363941e-06 1.571799e-06 0.000000e+00 1.273830e-04 2.840659e-04 1.184977e+02 6.498516e-02 +524 1.842968e-05 6.279174e-06 4.121930e-06 0.000000e+00 1.270238e-04 2.857530e-04 1.184977e+02 6.498316e-02 +525 1.752483e-05 6.073766e-06 2.426467e-06 0.000000e+00 1.287915e-04 2.901860e-04 1.184977e+02 6.497907e-02 +526 1.685082e-05 4.568691e-06 4.923326e-06 0.000000e+00 1.282250e-04 2.961101e-04 1.184977e+02 6.497288e-02 +527 1.578859e-05 3.405625e-06 3.596311e-06 0.000000e+00 1.276079e-04 3.014898e-04 1.184977e+02 6.496738e-02 +528 1.439510e-05 4.781522e-06 6.487411e-06 0.000000e+00 1.236876e-04 3.053135e-04 1.184977e+02 6.496377e-02 +529 1.310293e-05 7.158382e-06 7.902262e-06 0.000000e+00 1.230121e-04 3.085467e-04 1.184977e+02 6.495785e-02 +530 1.193373e-05 7.967666e-06 1.119990e-05 0.000000e+00 1.235350e-04 3.101811e-04 1.184977e+02 6.495074e-02 +531 1.140265e-05 9.216593e-06 1.016447e-05 0.000000e+00 1.274295e-04 3.091906e-04 1.184977e+02 6.494449e-02 +532 1.124230e-05 1.358080e-05 1.430141e-05 0.000000e+00 1.305404e-04 3.037276e-04 1.184977e+02 6.493780e-02 +533 1.133757e-05 1.666591e-05 1.340040e-05 0.000000e+00 1.377421e-04 2.987768e-04 1.184977e+02 6.493127e-02 +534 1.163363e-05 1.698474e-05 1.636093e-05 0.000000e+00 1.429747e-04 2.944974e-04 1.184977e+02 6.492440e-02 +535 1.223464e-05 1.762516e-05 1.460721e-05 0.000000e+00 1.498770e-04 2.900445e-04 1.184978e+02 6.491669e-02 +536 1.269909e-05 1.949732e-05 1.708569e-05 0.000000e+00 1.529565e-04 2.871526e-04 1.184978e+02 6.491089e-02 +537 1.305757e-05 1.969892e-05 1.623494e-05 0.000000e+00 1.551300e-04 2.853261e-04 1.184978e+02 6.490644e-02 +538 1.380390e-05 1.685321e-05 1.638358e-05 0.000000e+00 1.539041e-04 2.864533e-04 1.184978e+02 6.490240e-02 +539 1.461211e-05 1.450457e-05 1.244972e-05 0.000000e+00 1.543278e-04 2.889348e-04 1.184978e+02 6.489735e-02 +540 1.590975e-05 1.554665e-05 1.403429e-05 0.000000e+00 1.536527e-04 2.910898e-04 1.184978e+02 6.488952e-02 +541 1.770301e-05 1.487968e-05 1.057267e-05 0.000000e+00 1.553322e-04 2.971148e-04 1.184978e+02 6.488114e-02 +542 1.935292e-05 1.184563e-05 1.072148e-05 0.000000e+00 1.535778e-04 3.045242e-04 1.184978e+02 6.487483e-02 +543 2.076529e-05 1.087309e-05 6.913676e-06 0.000000e+00 1.515750e-04 3.088513e-04 1.184978e+02 6.487206e-02 +544 2.248110e-05 1.162060e-05 8.010319e-06 0.000000e+00 1.479846e-04 3.136002e-04 1.184978e+02 6.486991e-02 +545 2.400958e-05 1.048615e-05 5.359916e-06 0.000000e+00 1.491763e-04 3.163776e-04 1.184978e+02 6.486775e-02 +546 2.535944e-05 7.049470e-06 4.603108e-06 0.000000e+00 1.517588e-04 3.173574e-04 1.184978e+02 6.486586e-02 +547 2.614901e-05 5.590627e-06 1.293823e-06 0.000000e+00 1.574931e-04 3.165461e-04 1.184978e+02 6.486505e-02 +548 2.663333e-05 6.768623e-06 4.209319e-06 0.000000e+00 1.602640e-04 3.096739e-04 1.184978e+02 6.486812e-02 +549 2.636393e-05 5.630843e-06 2.534013e-06 0.000000e+00 1.657497e-04 3.042402e-04 1.184978e+02 6.487201e-02 +550 2.510998e-05 2.235640e-06 4.679790e-06 0.000000e+00 1.711246e-04 3.013245e-04 1.184978e+02 6.487444e-02 +551 
2.334594e-05 2.309129e-06 3.009614e-06 0.000000e+00 1.786513e-04 2.963125e-04 1.184978e+02 6.487644e-02 +552 2.107673e-05 4.446503e-06 6.572253e-06 0.000000e+00 1.816737e-04 2.953413e-04 1.184978e+02 6.487800e-02 +553 1.864900e-05 5.165099e-06 6.046159e-06 0.000000e+00 1.827680e-04 2.950840e-04 1.184978e+02 6.488180e-02 +554 1.646997e-05 5.669962e-06 8.284915e-06 0.000000e+00 1.799672e-04 2.964708e-04 1.184978e+02 6.488562e-02 +555 1.444045e-05 9.079022e-06 8.437270e-06 0.000000e+00 1.796240e-04 3.001226e-04 1.184978e+02 6.488658e-02 +556 1.234157e-05 1.411764e-05 1.341022e-05 0.000000e+00 1.765633e-04 3.034635e-04 1.184978e+02 6.488646e-02 +557 1.052939e-05 1.529590e-05 1.280048e-05 0.000000e+00 1.762899e-04 3.095204e-04 1.184977e+02 6.488548e-02 +558 9.323692e-06 1.464761e-05 1.510695e-05 0.000000e+00 1.739005e-04 3.173090e-04 1.184977e+02 6.488459e-02 +559 8.659572e-06 1.639727e-05 1.388775e-05 0.000000e+00 1.716506e-04 3.207769e-04 1.184977e+02 6.488637e-02 +560 8.699480e-06 1.807214e-05 1.688885e-05 0.000000e+00 1.677189e-04 3.229393e-04 1.184977e+02 6.488949e-02 +561 9.566065e-06 1.706285e-05 1.522065e-05 0.000000e+00 1.686806e-04 3.253685e-04 1.184977e+02 6.489166e-02 +562 1.067463e-05 1.582782e-05 1.614844e-05 0.000000e+00 1.731251e-04 3.241581e-04 1.184977e+02 6.489365e-02 +563 1.187188e-05 1.648675e-05 1.463008e-05 0.000000e+00 1.803114e-04 3.216031e-04 1.184977e+02 6.489654e-02 +564 1.287251e-05 1.763814e-05 1.609430e-05 0.000000e+00 1.825806e-04 3.145327e-04 1.184977e+02 6.490393e-02 +565 1.408212e-05 1.552208e-05 1.244777e-05 0.000000e+00 1.858936e-04 3.081360e-04 1.184977e+02 6.491315e-02 +566 1.566786e-05 1.331092e-05 1.128192e-05 0.000000e+00 1.909649e-04 3.051035e-04 1.184977e+02 6.491909e-02 +567 1.757173e-05 1.391979e-05 8.257135e-06 0.000000e+00 1.981699e-04 3.010829e-04 1.184977e+02 6.492356e-02 +568 1.977589e-05 1.366121e-05 9.097571e-06 0.000000e+00 1.997338e-04 2.999200e-04 1.184977e+02 6.492784e-02 +569 2.217748e-05 1.152920e-05 5.702542e-06 0.000000e+00 1.990998e-04 3.017924e-04 1.184977e+02 6.493207e-02 +570 2.433363e-05 9.383727e-06 5.597391e-06 0.000000e+00 1.953947e-04 3.045825e-04 1.184977e+02 6.493617e-02 +571 2.658207e-05 7.853440e-06 3.436501e-06 0.000000e+00 1.935057e-04 3.087061e-04 1.184977e+02 6.493815e-02 +572 2.839483e-05 6.635643e-06 4.157794e-06 0.000000e+00 1.900996e-04 3.131291e-04 1.184977e+02 6.493760e-02 +573 2.979742e-05 3.501592e-06 1.799310e-06 0.000000e+00 1.884599e-04 3.193163e-04 1.184977e+02 6.493584e-02 +574 3.053693e-05 1.647844e-06 2.621264e-06 0.000000e+00 1.866176e-04 3.252748e-04 1.184977e+02 6.493414e-02 +575 3.061290e-05 2.681896e-06 2.845743e-06 0.000000e+00 1.847654e-04 3.281660e-04 1.184977e+02 6.493337e-02 +576 2.993279e-05 3.659688e-06 5.632069e-06 0.000000e+00 1.824350e-04 3.289716e-04 1.184977e+02 6.493153e-02 +577 2.808018e-05 4.245318e-06 5.012002e-06 0.000000e+00 1.849626e-04 3.281638e-04 1.184977e+02 6.492882e-02 +578 2.577546e-05 6.499467e-06 7.585482e-06 0.000000e+00 1.891692e-04 3.261381e-04 1.184977e+02 6.492629e-02 +579 2.311595e-05 8.907510e-06 8.078814e-06 0.000000e+00 1.956206e-04 3.208739e-04 1.184977e+02 6.492471e-02 +580 2.074079e-05 1.153486e-05 1.076619e-05 0.000000e+00 1.995884e-04 3.133953e-04 1.184977e+02 6.492283e-02 +581 1.878464e-05 1.287095e-05 1.144485e-05 0.000000e+00 2.040925e-04 3.077730e-04 1.184977e+02 6.491979e-02 +582 1.748956e-05 1.378858e-05 1.362246e-05 0.000000e+00 2.088940e-04 3.033650e-04 1.184977e+02 6.491658e-02 +583 1.655729e-05 1.501208e-05 1.483734e-05 0.000000e+00 2.127982e-04 
3.009486e-04 1.184977e+02 6.491064e-02 +584 1.608586e-05 1.532516e-05 1.656503e-05 0.000000e+00 2.138224e-04 3.006186e-04 1.184977e+02 6.490249e-02 +585 1.611642e-05 1.537847e-05 1.535616e-05 0.000000e+00 2.131890e-04 3.024245e-04 1.184977e+02 6.489484e-02 +586 1.680566e-05 1.628366e-05 1.658623e-05 0.000000e+00 2.093875e-04 3.068367e-04 1.184977e+02 6.488715e-02 +587 1.760237e-05 1.664075e-05 1.506915e-05 0.000000e+00 2.072550e-04 3.119550e-04 1.184977e+02 6.488044e-02 +588 1.837546e-05 1.763158e-05 1.510569e-05 0.000000e+00 2.033000e-04 3.161208e-04 1.184977e+02 6.487416e-02 +589 1.885129e-05 1.805351e-05 1.421696e-05 0.000000e+00 2.011006e-04 3.223970e-04 1.184977e+02 6.486719e-02 +590 1.897439e-05 1.713664e-05 1.310725e-05 0.000000e+00 1.993853e-04 3.270636e-04 1.184977e+02 6.486352e-02 +591 1.912374e-05 1.607190e-05 1.162781e-05 0.000000e+00 1.993925e-04 3.286217e-04 1.184977e+02 6.486126e-02 +592 1.942177e-05 1.473822e-05 9.527926e-06 0.000000e+00 2.004906e-04 3.277004e-04 1.184977e+02 6.485975e-02 +593 1.942880e-05 1.336913e-05 6.803074e-06 0.000000e+00 2.017146e-04 3.240843e-04 1.184977e+02 6.486099e-02 +594 1.945779e-05 1.133814e-05 6.556827e-06 0.000000e+00 2.068119e-04 3.199217e-04 1.184977e+02 6.486255e-02 +595 1.962911e-05 8.218170e-06 4.720970e-06 0.000000e+00 2.132545e-04 3.142438e-04 1.184977e+02 6.486550e-02 +596 2.004355e-05 6.615923e-06 4.190290e-06 0.000000e+00 2.172956e-04 3.072441e-04 1.184977e+02 6.486863e-02 +597 2.011324e-05 4.898435e-06 4.030937e-06 0.000000e+00 2.205056e-04 3.022233e-04 1.184977e+02 6.487169e-02 +598 1.986334e-05 2.528786e-06 2.620958e-06 0.000000e+00 2.233363e-04 2.994557e-04 1.184977e+02 6.487555e-02 +599 1.951813e-05 1.739543e-06 2.812379e-06 0.000000e+00 2.259686e-04 2.972973e-04 1.184977e+02 6.487805e-02 +600 1.885206e-05 3.204200e-06 2.835149e-06 0.000000e+00 2.258550e-04 2.967135e-04 1.184977e+02 6.488028e-02 +601 1.823684e-05 5.582044e-06 4.302815e-06 0.000000e+00 2.210460e-04 2.982201e-04 1.184977e+02 6.488233e-02 +602 1.704419e-05 6.850293e-06 6.570825e-06 0.000000e+00 2.177580e-04 3.024097e-04 1.184977e+02 6.488258e-02 +603 1.581394e-05 7.454809e-06 7.456944e-06 0.000000e+00 2.166572e-04 3.078766e-04 1.184977e+02 6.488142e-02 +604 1.525221e-05 9.965474e-06 8.998853e-06 0.000000e+00 2.114551e-04 3.125501e-04 1.184977e+02 6.488038e-02 +605 1.513795e-05 1.129658e-05 1.125497e-05 0.000000e+00 2.058873e-04 3.175580e-04 1.184977e+02 6.488053e-02 +606 1.546619e-05 1.048456e-05 1.115542e-05 0.000000e+00 2.045196e-04 3.213538e-04 1.184977e+02 6.488160e-02 +607 1.568998e-05 1.117403e-05 1.346170e-05 0.000000e+00 2.051760e-04 3.205743e-04 1.184977e+02 6.488322e-02 +608 1.573833e-05 1.391357e-05 1.442054e-05 0.000000e+00 2.083412e-04 3.155823e-04 1.184977e+02 6.488733e-02 +609 1.588559e-05 1.582032e-05 1.662015e-05 0.000000e+00 2.089107e-04 3.093938e-04 1.184977e+02 6.489436e-02 +610 1.635312e-05 1.586041e-05 1.683126e-05 0.000000e+00 2.123839e-04 3.029366e-04 1.184977e+02 6.490163e-02 +611 1.706775e-05 1.612422e-05 1.612316e-05 0.000000e+00 2.186495e-04 2.964119e-04 1.184977e+02 6.490890e-02 +612 1.813246e-05 1.900705e-05 1.540385e-05 0.000000e+00 2.226685e-04 2.893818e-04 1.184977e+02 6.491607e-02 +613 1.905447e-05 1.987856e-05 1.585363e-05 0.000000e+00 2.244155e-04 2.846300e-04 1.184977e+02 6.492275e-02 +614 1.975421e-05 1.792234e-05 1.271118e-05 0.000000e+00 2.269241e-04 2.827586e-04 1.184977e+02 6.493014e-02 +615 2.032935e-05 1.760457e-05 1.297082e-05 0.000000e+00 2.268789e-04 2.808000e-04 1.184977e+02 6.493879e-02 +616 2.129062e-05 
1.805505e-05 1.104319e-05 0.000000e+00 2.255317e-04 2.802836e-04 1.184977e+02 6.494745e-02 +617 2.247423e-05 1.577150e-05 1.068375e-05 0.000000e+00 2.200369e-04 2.821751e-04 1.184977e+02 6.495524e-02 +618 2.364261e-05 1.112483e-05 7.422985e-06 0.000000e+00 2.169937e-04 2.863739e-04 1.184977e+02 6.496104e-02 +619 2.389792e-05 8.089759e-06 5.386435e-06 0.000000e+00 2.153950e-04 2.914872e-04 1.184976e+02 6.496545e-02 +620 2.328233e-05 8.145743e-06 4.168635e-06 0.000000e+00 2.119473e-04 2.943824e-04 1.184976e+02 6.497031e-02 +621 2.205035e-05 6.220445e-06 4.976999e-06 0.000000e+00 2.065843e-04 2.971119e-04 1.184976e+02 6.497515e-02 +622 2.089833e-05 3.108851e-06 2.065017e-06 0.000000e+00 2.064912e-04 2.986919e-04 1.184976e+02 6.497964e-02 +623 1.986454e-05 3.888966e-06 3.915562e-06 0.000000e+00 2.087545e-04 2.955608e-04 1.184976e+02 6.498374e-02 +624 1.859357e-05 6.277868e-06 3.001288e-06 0.000000e+00 2.133848e-04 2.891705e-04 1.184976e+02 6.498792e-02 +625 1.721312e-05 6.178342e-06 4.739663e-06 0.000000e+00 2.155710e-04 2.827750e-04 1.184976e+02 6.499269e-02 +626 1.584558e-05 5.111060e-06 3.476241e-06 0.000000e+00 2.200339e-04 2.764001e-04 1.184976e+02 6.499641e-02 +627 1.436778e-05 6.414966e-06 5.486123e-06 0.000000e+00 2.246146e-04 2.698181e-04 1.184976e+02 6.499800e-02 +628 1.271073e-05 9.120236e-06 7.340498e-06 0.000000e+00 2.294534e-04 2.632894e-04 1.184976e+02 6.499702e-02 +629 1.113662e-05 8.634443e-06 1.071874e-05 0.000000e+00 2.306081e-04 2.596202e-04 1.184976e+02 6.499334e-02 +630 1.015077e-05 7.118243e-06 9.466232e-06 0.000000e+00 2.325955e-04 2.588208e-04 1.184977e+02 6.498840e-02 +631 9.698788e-06 9.593308e-06 1.321694e-05 0.000000e+00 2.314973e-04 2.579111e-04 1.184977e+02 6.498396e-02 +632 9.510577e-06 1.273953e-05 1.304552e-05 0.000000e+00 2.296157e-04 2.585990e-04 1.184977e+02 6.497937e-02 +633 9.782350e-06 1.362591e-05 1.559290e-05 0.000000e+00 2.248965e-04 2.618699e-04 1.184977e+02 6.497337e-02 +634 1.045280e-05 1.448200e-05 1.428090e-05 0.000000e+00 2.225766e-04 2.669346e-04 1.184977e+02 6.496611e-02 +635 1.137808e-05 1.749919e-05 1.664363e-05 0.000000e+00 2.198971e-04 2.699931e-04 1.184977e+02 6.495832e-02 +636 1.199685e-05 2.033829e-05 1.639721e-05 0.000000e+00 2.178087e-04 2.716072e-04 1.184977e+02 6.495154e-02 +637 1.229659e-05 1.960452e-05 1.713419e-05 0.000000e+00 2.134675e-04 2.722383e-04 1.184977e+02 6.494653e-02 +638 1.252133e-05 1.825422e-05 1.287564e-05 0.000000e+00 2.141559e-04 2.709165e-04 1.184977e+02 6.494265e-02 +639 1.268587e-05 1.946013e-05 1.419746e-05 0.000000e+00 2.162790e-04 2.665723e-04 1.184977e+02 6.494127e-02 +640 1.279740e-05 1.950350e-05 1.133431e-05 0.000000e+00 2.220063e-04 2.602793e-04 1.184977e+02 6.494235e-02 +641 1.309604e-05 1.609543e-05 1.164505e-05 0.000000e+00 2.248326e-04 2.550818e-04 1.184977e+02 6.494419e-02 +642 1.320875e-05 1.291179e-05 8.044887e-06 0.000000e+00 2.292120e-04 2.491274e-04 1.184977e+02 6.494724e-02 +643 1.323630e-05 1.153247e-05 8.652675e-06 0.000000e+00 2.322880e-04 2.425800e-04 1.184977e+02 6.495078e-02 +644 1.297050e-05 9.583945e-06 5.673375e-06 0.000000e+00 2.360562e-04 2.366586e-04 1.184977e+02 6.495329e-02 +645 1.246829e-05 6.718462e-06 5.055046e-06 0.000000e+00 2.347898e-04 2.339971e-04 1.184977e+02 6.495522e-02 +646 1.223368e-05 5.165042e-06 1.382135e-06 0.000000e+00 2.338530e-04 2.335078e-04 1.184977e+02 6.495807e-02 +647 1.215576e-05 6.735961e-06 4.119505e-06 0.000000e+00 2.311762e-04 2.333778e-04 1.184977e+02 6.496000e-02 +648 1.199852e-05 7.354028e-06 2.648855e-06 0.000000e+00 2.296753e-04 2.347493e-04 
1.184977e+02 6.496167e-02 +649 1.166371e-05 5.757214e-06 4.721681e-06 0.000000e+00 2.242888e-04 2.395936e-04 1.184977e+02 6.496364e-02 +650 1.159699e-05 5.059598e-06 3.066158e-06 0.000000e+00 2.209943e-04 2.438858e-04 1.184977e+02 6.496518e-02 +651 1.169087e-05 5.556494e-06 6.012267e-06 0.000000e+00 2.180989e-04 2.454085e-04 1.184977e+02 6.496571e-02 +652 1.174318e-05 5.351491e-06 5.263919e-06 0.000000e+00 2.173361e-04 2.451144e-04 1.184977e+02 6.496673e-02 +653 1.136038e-05 5.172216e-06 8.049180e-06 0.000000e+00 2.132022e-04 2.428242e-04 1.184977e+02 6.497000e-02 +654 1.142894e-05 6.053315e-06 7.982875e-06 0.000000e+00 2.132746e-04 2.402463e-04 1.184977e+02 6.497295e-02 +655 1.221963e-05 9.066959e-06 1.286363e-05 0.000000e+00 2.158346e-04 2.343495e-04 1.184977e+02 6.497630e-02 +656 1.383604e-05 1.117630e-05 1.259119e-05 0.000000e+00 2.226430e-04 2.289344e-04 1.184977e+02 6.498021e-02 +657 1.580334e-05 1.235605e-05 1.528435e-05 0.000000e+00 2.252616e-04 2.247881e-04 1.184977e+02 6.498474e-02 +658 1.788925e-05 1.451707e-05 1.384585e-05 0.000000e+00 2.282699e-04 2.194625e-04 1.184977e+02 6.499051e-02 +659 1.963239e-05 1.710191e-05 1.670091e-05 0.000000e+00 2.305542e-04 2.141356e-04 1.184977e+02 6.499618e-02 +660 2.128019e-05 1.874592e-05 1.493771e-05 0.000000e+00 2.337794e-04 2.088948e-04 1.184977e+02 6.500300e-02 +661 2.260336e-05 2.002919e-05 1.685108e-05 0.000000e+00 2.300218e-04 2.055798e-04 1.184977e+02 6.501107e-02 +662 2.369094e-05 1.977564e-05 1.512008e-05 0.000000e+00 2.264520e-04 2.055941e-04 1.184977e+02 6.501996e-02 +663 2.476374e-05 1.951183e-05 1.648169e-05 0.000000e+00 2.234712e-04 2.063314e-04 1.184976e+02 6.502715e-02 +664 2.579345e-05 1.808927e-05 1.289527e-05 0.000000e+00 2.228025e-04 2.089788e-04 1.184976e+02 6.503331e-02 +665 2.602959e-05 1.573508e-05 1.264974e-05 0.000000e+00 2.175906e-04 2.128424e-04 1.184976e+02 6.504262e-02 +666 2.556506e-05 1.377508e-05 9.010497e-06 0.000000e+00 2.137356e-04 2.158856e-04 1.184976e+02 6.505325e-02 +667 2.468514e-05 1.239953e-05 9.736435e-06 0.000000e+00 2.113247e-04 2.161291e-04 1.184976e+02 6.506308e-02 +668 2.349232e-05 1.122675e-05 6.221403e-06 0.000000e+00 2.121634e-04 2.136834e-04 1.184976e+02 6.507357e-02 +669 2.190120e-05 1.075790e-05 6.899534e-06 0.000000e+00 2.087912e-04 2.102137e-04 1.184976e+02 6.508583e-02 +670 2.025420e-05 9.012172e-06 4.023059e-06 0.000000e+00 2.085711e-04 2.064906e-04 1.184976e+02 6.509812e-02 +671 1.813847e-05 8.152308e-06 4.279532e-06 0.000000e+00 2.121159e-04 2.016035e-04 1.184976e+02 6.510787e-02 +672 1.601559e-05 7.835640e-06 1.863326e-06 0.000000e+00 2.199442e-04 1.968063e-04 1.184976e+02 6.511541e-02 +673 1.420817e-05 6.841116e-06 3.517890e-06 0.000000e+00 2.231003e-04 1.937287e-04 1.184976e+02 6.512306e-02 +674 1.286355e-05 5.300546e-06 2.358576e-06 0.000000e+00 2.246764e-04 1.900240e-04 1.184976e+02 6.513026e-02 +675 1.216255e-05 4.131755e-06 5.247152e-06 0.000000e+00 2.257288e-04 1.861216e-04 1.184976e+02 6.513250e-02 +676 1.179202e-05 4.044802e-06 4.309726e-06 0.000000e+00 2.278416e-04 1.836829e-04 1.184976e+02 6.513201e-02 +677 1.195977e-05 4.537136e-06 7.645212e-06 0.000000e+00 2.227132e-04 1.821939e-04 1.184976e+02 6.513341e-02 +678 1.239154e-05 4.071918e-06 7.199446e-06 0.000000e+00 2.178363e-04 1.838671e-04 1.184976e+02 6.513303e-02 +679 1.231919e-05 5.537157e-06 9.410909e-06 0.000000e+00 2.140407e-04 1.857870e-04 1.184976e+02 6.512834e-02 +680 1.193660e-05 9.093785e-06 1.042511e-05 0.000000e+00 2.129543e-04 1.865163e-04 1.184976e+02 6.512291e-02 +681 1.152100e-05 1.195176e-05 
1.406347e-05 0.000000e+00 2.087525e-04 1.893244e-04 1.184976e+02 6.511797e-02 +682 1.168947e-05 1.331435e-05 1.399653e-05 0.000000e+00 2.049579e-04 1.913017e-04 1.184976e+02 6.511492e-02 +683 1.188669e-05 1.559446e-05 1.602955e-05 0.000000e+00 2.028421e-04 1.908243e-04 1.184976e+02 6.511078e-02 +684 1.226738e-05 1.841946e-05 1.461408e-05 0.000000e+00 2.035288e-04 1.901775e-04 1.184976e+02 6.510508e-02 +685 1.285315e-05 2.029728e-05 1.683562e-05 0.000000e+00 2.003028e-04 1.874841e-04 1.184976e+02 6.510320e-02 +686 1.302167e-05 1.884517e-05 1.480113e-05 0.000000e+00 2.001309e-04 1.846921e-04 1.184976e+02 6.510316e-02 +687 1.322441e-05 1.890267e-05 1.497707e-05 0.000000e+00 2.026808e-04 1.803351e-04 1.184976e+02 6.510195e-02 +688 1.352930e-05 1.982588e-05 1.465256e-05 0.000000e+00 2.088772e-04 1.749470e-04 1.184976e+02 6.510102e-02 +689 1.401497e-05 1.773944e-05 1.460491e-05 0.000000e+00 2.126339e-04 1.713474e-04 1.184976e+02 6.510150e-02 +690 1.429918e-05 1.468427e-05 1.123442e-05 0.000000e+00 2.139797e-04 1.685452e-04 1.184976e+02 6.510301e-02 +691 1.425835e-05 1.391022e-05 1.006609e-05 0.000000e+00 2.120122e-04 1.657735e-04 1.184976e+02 6.510576e-02 +692 1.415251e-05 1.445084e-05 7.354985e-06 0.000000e+00 2.089497e-04 1.656477e-04 1.184976e+02 6.510775e-02 +693 1.438692e-05 1.390746e-05 8.030845e-06 0.000000e+00 2.043078e-04 1.673042e-04 1.184976e+02 6.510920e-02 +694 1.427268e-05 1.057460e-05 5.309240e-06 0.000000e+00 2.009238e-04 1.694991e-04 1.184977e+02 6.511152e-02 +695 1.407685e-05 1.004546e-05 4.675305e-06 0.000000e+00 1.956378e-04 1.707298e-04 1.184977e+02 6.511345e-02 +696 1.371678e-05 9.885874e-06 4.579868e-06 0.000000e+00 1.910072e-04 1.700603e-04 1.184977e+02 6.511664e-02 +697 1.375349e-05 6.584825e-06 3.780792e-06 0.000000e+00 1.872724e-04 1.700764e-04 1.184977e+02 6.511967e-02 +698 1.401307e-05 3.040346e-06 2.042982e-06 0.000000e+00 1.857995e-04 1.707980e-04 1.184977e+02 6.512097e-02 +699 1.466005e-05 3.144132e-06 2.992850e-06 0.000000e+00 1.852661e-04 1.697761e-04 1.184977e+02 6.512152e-02 +700 1.517398e-05 4.333828e-06 4.055554e-06 0.000000e+00 1.840417e-04 1.695849e-04 1.184977e+02 6.512253e-02 diff --git a/test/unit/energy_comparison/weibel_driver.cc b/test/unit/energy_comparison/weibel_driver.cc new file mode 100644 index 00000000..6058ccad --- /dev/null +++ b/test/unit/energy_comparison/weibel_driver.cc @@ -0,0 +1,393 @@ +//#define CATCH_CONFIG_MAIN // This tells Catch to provide a main() +#define CATCH_CONFIG_RUNNER // We will provide a custom main +#include "catch.hpp" + +// TODO: this import may ultimately be a bad idea, but it lets you paste an input deck in... + +#include "deck/wrapper.h" + +#include "src/species_advance/species_advance.h" +#include "src/vpic/vpic.h" + +#include "compare_energies.h" + +begin_globals { + double energies_interval; + double fields_interval; + double ehydro_interval; + double ihydro_interval; + double eparticle_interval; + double iparticle_interval; + double restart_interval; +}; + +std::string energy_file_name = "./energies"; + +std::string energy_gold_file_name = EXPAND_AND_STRINGIFY( GOLD_ENERGY_FILE ); + +void vpic_simulation::user_diagnostics() { + dump_energies(energy_file_name.c_str(), 1); +} + +begin_initialization { +// AKA: +//void +//vpic_simulation::user_initialization( int num_cmdline_arguments, + //char ** cmdline_argument ) +//{ + // At this point, there is an empty grid and the random number generator is + // seeded with the rank. The grid, materials, species need to be defined. 
+ // Then the initial non-zero fields need to be loaded at time level 0 and the + // particles (position and momentum both) need to be loaded at time level 0. + + // Arguments can be passed from the command line to the input deck + // if( num_cmdline_arguments!=3 ) { + // sim_log( "Usage: " << cmdline_argument[0] << " mass_ratio seed" ); + // abort(0); + // } + seed_entropy(1); //seed_entropy( atoi( cmdline_argument[2] ) ); + + // Diagnostic messages can be passed written (usually to stderr) + sim_log( "Computing simulation parameters"); + + // Define the system of units for this problem (natural units) + //double L = 1; // Length normalization (sheet thickness) + double de = 1; // Length normalization (electron inertial length) + double ec = 1; // Charge normalization + double me = 1; // Mass normalization + double c = 1; // Speed of light + double eps0 = 1; // Permittivity of space + + // Physics parameters + double mi_me = 1836; //25; //atof(cmdline_argument[1]); // Ion mass / electron mass + double vthe = 0.25/sqrt(2.0); //0.0424264068711; //0.424264068711; // Electron thermal velocity + double vthi = 0.25/sqrt(2.0); //0.0424264068711; //0.424264068711; // Ion thermal velocity + double vthex =0.05/sqrt(2.0); //0.0141421356237; // 0.141421356237; // Electron thermal velocity in x-direction. + double vthix =0.05/sqrt(2.0); //0.0141421356237; // 0.141421356237;Ion thermal velocity in x-direction. + + double n0 = 1.0; // Background plasma density + double b0 = 0.0; // In plane magnetic field. + double tauwpe = 200000; // simulation wpe's to run + + // Numerical parameters + double topology_x = nproc(); // Number of domains in x, y, and z + double topology_y = 1; + double topology_z = 1; // For load balance, best to keep "1" or "2" for Harris sheet + double Lx = 2.09439510239320; //4.62*de; //6.7*de; //10.0*de; // How big should the box be in the x direction + double Ly = 1; //0.0721875*de; // How big should the box be in the y direction + double Lz = 1; //0.0721875*de; // How big should the box be in the z direction + double nx = 16; //64; //64; //32; // Global resolution in the x direction + double ny = 1; // Global resolution in the y direction + double nz = 1; //32; // Global resolution in the z direction + double nppc = 200; //800; //200; //2048; //1024; //128; // Average number of macro particles per cell (both species combined!) + double cfl_req = 0.99f; //0.99; // How close to Courant should we try to run + double wpedt_max = 0.36; // How big a timestep is allowed if Courant is not too restrictive + double damp = 0.0; // Level of radiation damping + + + // Derived quantities + double mi = me*mi_me; // Ion mass + double wpe = c/de; // electron plasma frequency + double wpi = wpe/sqrt(mi_me); // ion plasma frequency + double di = c/wpi; // ion inertial length + + double hx = Lx/nx; + double hy = Ly/ny; + double hz = Lz/nz; + + double Npe = n0*Ly*Lz*Lx; // Number physical electrons. 
+ double Npi = Npe; // Number of physical ions in box + double Ne = nppc*nx*ny*nz; // total macro electrons in box + + Ne = trunc_granular(Ne,nproc()); + double Ni = Ne; // Total macro ions in box + + double we = Npe/Ne; // Weight of a macro electron + double wi = Npi/Ni; // Weight of a macro ion + + + // Determine the timestep + double dg = courant_length(Lx,Ly,Lz,nx,ny,nz); // Courant length + double dt = cfl_req*dg/c; // Courant limited time step + // printf("in harris.cxx: dt=%.7f\n", dt); + // exit(1); + if( wpe*dt>wpedt_max ) dt=wpedt_max/wpe; // Override time step if plasma frequency limited + + //////////////////////////////////////// + // Setup high level simulation parmeters + + num_step = 700; //4000; // int(tauwpe/(wpe*dt)); + status_interval = 0; //2000; + sync_shared_interval = 0; //status_interval; + clean_div_e_interval = 0; //turn off cleaning (GY)//status_interval; + clean_div_b_interval = 0; //status_interval; //(GY) + + global->energies_interval = 1; //status_interval; + global->fields_interval = status_interval; + global->ehydro_interval = status_interval; + global->ihydro_interval = status_interval; + global->eparticle_interval = status_interval; // Do not dump + global->iparticle_interval = status_interval; // Do not dump + global->restart_interval = status_interval; // Do not dump + + /////////////////////////// + // Setup the space and time + + // Setup basic grid parameters + define_units( c, eps0 ); + define_timestep( dt ); + grid->dx = hx; + grid->dy = hy; + grid->dz = hz; + grid->dt = dt; + grid->cvac = c; + //grid->damp = damp; + + // Parition a periodic box among the processors sliced uniformly along y + // define_periodic_grid( -0.5*Lx, 0, 0, // Low corner + // 0.5*Lx, Ly, Lz, // High corner + // nx, ny, nz, // Resolution + // 1, nproc(), 1 ); // Topology + define_periodic_grid( 0, -0.5*Ly, -0.5*Lz, // Low corner + Lx, 0.5*Ly, 0.5*Lz, // High corner + nx, ny, nz, // Resolution + topology_x, topology_y, topology_z); // Topology + + // printf("in harris.cxx: g->neighbor[6*265]=%jd\n", grid->neighbor[6*265]); + // Override some of the boundary conditions to put a particle reflecting + // perfect electrical conductor on the -x and +x boundaries + // set_domain_field_bc( BOUNDARY(-1,0,0), pec_fields ); + // set_domain_field_bc( BOUNDARY( 1,0,0), pec_fields ); + // set_domain_particle_bc( BOUNDARY(-1,0,0), reflect_particles ); + // set_domain_particle_bc( BOUNDARY( 1,0,0), reflect_particles ); + + define_material( "vacuum", 1 ); + // Note: define_material defaults to isotropic materials with mu=1,sigma=0 + // Tensor electronic, magnetic and conductive materials are supported + // though. See "shapes" for how to define them and assign them to regions. + // Also, space is initially filled with the first material defined. + + // If you pass NULL to define field array, the standard field array will + // be used (if damp is not provided, no radiation damping will be used). 
+ define_field_array( NULL, damp ); + + //////////////////// + // Setup the species + + // Allow 50% more local_particles in case of non-uniformity + // VPIC will pick the number of movers to use for each species + // Both species use out-of-place sorting + // species_t * ion = define_species( "ion", ec, mi, 1.5*Ni/nproc(), -1, 40, 1 ); + // species_t * electron = define_species( "electron", -ec, me, 1.5*Ne/nproc(), -1, 20, 1 ); + //species_t *electron = define_species("electron",-ec,me,2.4*Ne/nproc(),-1,25,0); + //species_t *ion = define_species("ion", ec,mi,2.4*Ne/nproc(),-1,25,0); + + species_t *electron = define_species("electron",-ec,me,2.4*Ne/nproc(),-1,0,0); //turn off sorting (GY) + species_t *ion = define_species("ion", ec,mi,2.4*Ne/nproc(),-1,0,0); //(GY) + + /////////////////////////////////////////////////// + // Log diagnostic information about this simulation + + sim_log( "***********************************************" ); + sim_log ( "mi/me = " << mi_me ); + sim_log ( "tauwpe = " << tauwpe ); + sim_log ( "num_step = " << num_step ); + sim_log ( "Lx/di = " << Lx/di ); + sim_log ( "Lx/de = " << Lx/de ); + sim_log ( "Ly/di = " << Ly/di ); + sim_log ( "Ly/de = " << Ly/de ); + sim_log ( "Lz/di = " << Lz/di ); + sim_log ( "Lz/de = " << Lz/de ); + sim_log ( "nx = " << nx ); + sim_log ( "ny = " << ny ); + sim_log ( "nz = " << nz ); + sim_log ( "damp = " << damp ); + sim_log ( "courant = " << c*dt/dg ); + sim_log ( "nproc = " << nproc () ); + sim_log ( "nppc = " << nppc ); + sim_log ( " b0 = " << b0 ); + sim_log ( " di = " << di ); + sim_log ( " Ne = " << Ne ); + sim_log ( "total # of particles = " << 2*Ne ); + sim_log ( "dt*wpe = " << wpe*dt ); + sim_log ( "dx/de = " << Lx/(de*nx) ); + sim_log ( "dy/de = " << Ly/(de*ny) ); + sim_log ( "dz/de = " << Lz/(de*nz) ); + sim_log ( "dx/debye = " << (Lx/nx)/(vthe/wpe) ); + sim_log ( "n0 = " << n0 ); + sim_log ( "vthi/c = " << vthi/c ); + sim_log ( "vthe/c = " << vthe/c ); + sim_log( "" ); + + //////////////////////////// + // Load fields and particles + + // sim_log( "Loading fields" ); + + // set_region_field( everywhere, 0, 0, 0, // Electric field + // 0, -sn*b0*tanh(x/L), cs*b0*tanh(x/L) ); // Magnetic field + // Note: everywhere is a region that encompasses the entire simulation + // In general, regions are specied as logical equations (i.e. x>0 && x+y<2) + + sim_log( "Loading particles" ); + + // Do a fast load of the particles + //seed_rand( rng_seed*nproc() + rank() ); //Generators desynchronized + double xmin = grid->x0 , xmax = grid->x0+(grid->dx)*(grid->nx); + double ymin = grid->y0 , ymax = grid->y0+(grid->dy)*(grid->ny); + double zmin = grid->z0 , zmax = grid->z0+(grid->dz)*(grid->nz); + + sim_log( "-> Uniform Bi-Maxwellian" ); + + double n1,n2,n3; + + repeat ( Ne/nproc() ) { + + double x = uniform( rng(0), xmin, xmax ); + double y = uniform( rng(0), ymin, ymax ); + double z = uniform( rng(0), zmin, zmax ); + n1 = normal(rng(0),0,vthex); + n2 = normal(rng(0),0,vthe ); + n3 = normal(rng(0),0,vthe ); + + inject_particle( electron, x, y, z, + n1, + n2, + n3,we, 0, 0); + + n1 = normal(rng(0),0,vthix); + n2 = normal(rng(0),0,vthi ); + n3 = normal(rng(0),0,vthi ); + + inject_particle( ion, x, y, z, + n1, + n2, + n3,wi, 0 ,0 ); + + } + + sim_log( "Finished loading particles" ); + + //exit(1); + + // Upon completion of the initialization, the following occurs: + // - The synchronization error (tang E, norm B) is computed between domains + // and tang E / norm B are synchronized by averaging where discrepancies + // are encountered. 
+ // - The initial divergence error of the magnetic field is computed and + // one pass of cleaning is done (for good measure) + // - The bound charge density necessary to give the simulation an initially + // clean divergence e is computed. + // - The particle momentum is uncentered from u_0 to u_{-1/2} + // - The user diagnostics are called on the initial state + // - The physics loop is started + // + // The physics loop consists of: + // - Advance particles from x_0,u_{-1/2} to x_1,u_{1/2} + // - User particle injection at x_{1-age}, u_{1/2} (use inject_particles) + // - User current injection (adjust field(x,y,z).jfx, jfy, jfz) + // - Advance B from B_0 to B_{1/2} + // - Advance E from E_0 to E_1 + // - User field injection to E_1 (adjust field(x,y,z).ex,ey,ez,cbx,cby,cbz) + // - Advance B from B_{1/2} to B_1 + // - (periodically) Divergence clean electric field + // - (periodically) Divergence clean magnetic field + // - (periodically) Synchronize shared tang e and norm b + // - Increment the time step + // - Call user diagnostics + // - (periodically) Print a status message +} + +TEST_CASE( "Check if Weibel gives correct energy (within tol)", "[energy]" ) +{ + // Before we run this, we must make sure we remove the energy file + std::ofstream ofs; + ofs.open(energy_file_name, std::ofstream::out | std::ofstream::trunc); + ofs.close(); + + // Init and run sim + vpic_simulation simulation = vpic_simulation(); + + // TODO: We should do this in a safer manner + simulation.initialize( 0, NULL ); + + while( simulation.advance() ); + + simulation.finalize(); + + if( world_rank==0 ) log_printf( "normal exit\n" ); + + std::cout << "Comparing " << energy_file_name << " to " << + energy_gold_file_name << std::endl; + + // Compare energies to make sure everything worked out OK (within 1%) + const unsigned short e_mask = 0b0000001110; + const unsigned short b_mask = 0b0001110000; + const unsigned short particle_mask = 0b011000000; + + SECTION("e_field") { + // Test the sum of the e_field + REQUIRE( + test_utils::compare_energies(energy_file_name, energy_gold_file_name, + 0.3, e_mask, test_utils::FIELD_ENUM::Sum, 1, "Weibel.e.out") + ); + } + + SECTION("b_field") { + // Test the sum of the b_field + REQUIRE( + test_utils::compare_energies(energy_file_name, energy_gold_file_name, + 0.03, b_mask, test_utils::FIELD_ENUM::Sum, 1, "Weibel.b.out") + ); + } + + + SECTION("particle_energy") { + // Test particle energies individually + REQUIRE( + test_utils::compare_energies(energy_file_name, energy_gold_file_name, + 0.01, particle_mask, test_utils::FIELD_ENUM::Sum, 1, "Weibel.p.out") + ); + + } + +} + +begin_particle_injection { + + // No particle injection for this simulation + +} + +begin_current_injection { + + // No current injection for this simulation + +} + +begin_field_injection { + + // No field injection for this simulation + +} + +begin_particle_collisions{ + + // No collisions for this simulation + +} + +// Manually implement catch main +int main( int argc, char* argv[] ) +{ + + // Setup + boot_services( &argc, &argv ); + + int result = Catch::Session().run( argc, argv ); + + // clean-up... 
halt_services(); + + return result; +} diff --git a/test/unit/particle_push/CMakeLists.txt b/test/unit/particle_push/CMakeLists.txt new file mode 100644 index 00000000..cbff3d3a --- /dev/null +++ b/test/unit/particle_push/CMakeLists.txt @@ -0,0 +1,5 @@ +if (NO_EXPLICIT_VECTOR) + add_executable(array_syntax ./array_syntax.cc) + target_link_libraries(array_syntax vpic) + add_test(NAME array_syntax COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 1 ${MPIEXEC_PREFLAGS} ./array_syntax) +endif(NO_EXPLICIT_VECTOR) diff --git a/test/unit/particle_push/array_syntax.cc b/test/unit/particle_push/array_syntax.cc new file mode 100644 index 00000000..ce47456a --- /dev/null +++ b/test/unit/particle_push/array_syntax.cc @@ -0,0 +1,147 @@ +#define CATCH_CONFIG_MAIN // This tells Catch to provide a main() +#include "catch.hpp" + +#include "src/species_advance/species_advance.h" +#include "src/vpic/vpic.h" +#include "test/integrated/particle_push/advance_p.h" + +void vpic_simulation::user_diagnostics() {} + +void +vpic_simulation::user_initialization( int num_cmdline_arguments, + char ** cmdline_argument ) +{ + double L = 1e2; + int npart = 127; + int nstep = 100; + + define_units( 1, 1 ); + define_timestep( 1 ); + define_periodic_grid( 0, 0, 0, // Grid low corner + L, L, L, // Grid high corner + 1, 1, 1, // Grid resolution + 1, 1, 1 ); // Processor configuration + define_material( "vacuum", 1.0, 1.0, 0.0 ); + define_field_array(); + + field(1,1,1).ex = 1; + field(1,2,1).ex = 1; + field(1,1,2).ex = 1; + field(1,2,2).ex = 1; + + field(1,1,1).ey = 2; + field(1,1,2).ey = 2; + field(2,1,1).ey = 2; + field(2,1,2).ey = 2; + + field(1,1,1).ez = 3; + field(2,1,1).ez = 3; + field(1,2,1).ez = 3; + field(2,2,1).ez = 3; + + species_t * sp = + define_species( "test_species", 1., 1., npart, npart, 0, 0 ); + + species_t* sp2 = + define_species( "test_species2", 1., 1., npart, npart, 0, 0 ); + + for (int i = 0; i < npart; i++) + { + float x = uniform( rng(0), 0, L); + float y = uniform( rng(0), 0, L); + float z = uniform( rng(0), 0, L); + + // Put two sets of particles in the exact same space + inject_particle( sp2, x, y, z, 0., 0., 0., 1., 0., 0); + inject_particle( sp , x, y, z, 0., 0., 0., 1., 0., 0); + } + + // Create a second accumulator_array + accumulator_array_t* accumulator_array2 = new_accumulator_array( grid ); + + clear_accumulator_array(accumulator_array); + clear_accumulator_array(accumulator_array2); + + // Hack into vpic internals + int failed = 0; + load_interpolator_array( interpolator_array, field_array ); + for( int n=0; n < accumulator_array->n_pipeline+1; n++) + { + accumulator_t* a = accumulator_array->a + (n * accumulator_array2->stride); + accumulator_t* a2 = accumulator_array2->a + (n * accumulator_array2->stride); + for (int i = 0; i < grid->nv; i++) + { + if ( + (a[i].jx[0] != a2[i].jx[0]) || + (a[i].jx[1] != a2[i].jx[1]) || + (a[i].jx[2] != a2[i].jx[2]) || + (a[i].jx[3] != a2[i].jx[3]) || + (a[i].jy[0] != a2[i].jy[0]) || + (a[i].jy[1] != a2[i].jy[1]) || + (a[i].jy[2] != a2[i].jy[2]) || + (a[i].jy[3] != a2[i].jy[3]) || + (a[i].jz[0] != a2[i].jz[0]) || + (a[i].jz[1] != a2[i].jz[1]) || + (a[i].jz[2] != a2[i].jz[2]) || + (a[i].jz[3] != a2[i].jz[3]) + ) + { + std::cout << " Failed at " << i << std::endl; + failed++; + } + } + if( failed ) + { std::cout << "FAIL" << std::endl; + } + REQUIRE_FALSE(failed); + } + + std::cout << "pass" << std::endl; +} + +TEST_CASE( "vectors can be sized and resized", "[vector]" ) { + + std::vector<int> v( 5 ); + + REQUIRE( v.size() == 5 ); + REQUIRE( v.capacity() >= 5 ); + + //boot_checkpt(NULL,
NULL); + int pargc = 0; + char str[] = "bin/vpic"; + char **pargv = (char **) malloc(sizeof(char **)); + pargv[0] = str; + //serial.boot(&pargc, &pargv); + //thread.boot(&pargc, &pargv); + boot_services( &pargc, &pargv ); + + SECTION( "resizing bigger changes size and capacity" ) + { + int num_particles = 64; + v.resize( 10 ); + + REQUIRE( v.size() == 10 ); + REQUIRE( v.capacity() >= 10 ); + + // initialize all the variables + //particle_t *p_arr = particle_arr(num_particles, 1); + //species_t *sp = make_species(p_arr, 1, 1); + + vpic_simulation* simulation = new vpic_simulation; + simulation->initialize( pargc, pargv ); + + simulation->finalize(); + delete simulation; + if( world_rank==0 ) log_printf( "normal exit\n" ); + + halt_mp(); + } +}
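The Weibel test above drives its pass/fail decision through test_utils::compare_energies: each bit in e_mask, b_mask, and particle_mask selects columns of the energies dump (step, then electric-field, magnetic-field, and per-species kinetic-energy columns, as the gold data above suggests), and the selected columns must agree with the gold file to within the given tolerance. The following is a minimal, self-contained sketch of that column-mask idea only; the helper names (load_rows, columns_match) and the "./energies.gold" path are hypothetical stand-ins for the GOLD_ENERGY_FILE macro, and the sketch omits the FIELD_ENUM::Sum mode and the per-test output files (Weibel.e.out, etc.) that the real harness in compare_energies.h handles.

// Sketch only: standalone column comparison in the spirit of the Weibel test.
// load_rows/columns_match are hypothetical names, not part of the VPIC tree.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Parse a whitespace-separated energies file into rows of doubles,
// skipping comment lines that start with '%' or '#'.
static std::vector<std::vector<double>> load_rows( const std::string& path )
{
    std::vector<std::vector<double>> rows;
    std::ifstream in( path );
    std::string line;
    while ( std::getline( in, line ) )
    {
        if ( line.empty() || line[0] == '%' || line[0] == '#' ) continue;
        std::istringstream ss( line );
        std::vector<double> row;
        for ( double v; ss >> v; ) row.push_back( v );
        if ( !row.empty() ) rows.push_back( row );
    }
    return rows;
}

// Require the columns selected by 'mask' (bit i selects column i) to agree
// between the two files to a relative tolerance 'tol' on every common row.
static bool columns_match( const std::string& a, const std::string& b,
                           unsigned mask, double tol )
{
    const auto ra = load_rows( a );
    const auto rb = load_rows( b );
    const std::size_t nrow = std::min( ra.size(), rb.size() );
    for ( std::size_t r = 0; r < nrow; r++ )
    {
        const std::size_t ncol = std::min( ra[r].size(), rb[r].size() );
        for ( std::size_t c = 0; c < ncol && c < 8*sizeof(mask); c++ )
        {
            if ( !( mask & ( 1u << c ) ) ) continue;
            const double denom = std::max( std::fabs( rb[r][c] ), 1e-30 );
            if ( std::fabs( ra[r][c] - rb[r][c] ) / denom > tol ) return false;
        }
    }
    return true;
}

int main()
{
    // Same masks and tolerances as the Weibel deck: E-field and B-field columns.
    const bool e_ok = columns_match( "./energies", "./energies.gold", 0b0000001110u, 0.3 );
    const bool b_ok = columns_match( "./energies", "./energies.gold", 0b0001110000u, 0.03 );
    std::cout << ( e_ok && b_ok ? "energies within tolerance\n"
                                : "energies differ beyond tolerance\n" );
    return ( e_ok && b_ok ) ? 0 : 1;
}

With the masks used above, 0b0000001110 keeps the three columns following the step number and 0b0001110000 the next three, which correspond to the electric- and magnetic-field energies under the usual dump_energies column ordering.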