From f3fd13059cc7b7f10a5829c655f5fd927a5cb7ff Mon Sep 17 00:00:00 2001 From: Amanda Bienz Date: Thu, 11 Jul 2024 15:11:30 -0600 Subject: [PATCH] Split into RAPtor and RAPtor-sparse --- .gitmodules | 6 + CMakeLists.txt | 52 +- config/raptor-sparseConfig.cmake.in | 8 + raptor-sparse | 1 + raptor/CMakeLists.txt | 22 +- raptor/aggregation/aggregate.hpp | 3 +- raptor/aggregation/candidates.hpp | 3 +- raptor/aggregation/mis.hpp | 3 +- raptor/aggregation/par_aggregate.hpp | 3 +- raptor/aggregation/par_candidates.hpp | 3 +- raptor/aggregation/par_mis.hpp | 3 +- raptor/aggregation/par_prolongation.hpp | 4 +- raptor/aggregation/prolongation.hpp | 4 +- raptor/core/CMakeLists.txt | 47 - raptor/core/README.md | 1 - raptor/core/comm_data.cpp | 88 - raptor/core/comm_data.hpp | 1428 ------------- raptor/core/comm_mat.cpp | 810 -------- raptor/core/comm_pkg.cpp | 203 -- raptor/core/comm_pkg.hpp | 1841 ----------------- raptor/core/matrix.cpp | 1438 ------------- raptor/core/matrix.hpp | 1353 ------------ raptor/core/mpi_types.cpp | 331 --- raptor/core/mpi_types.hpp | 133 -- raptor/core/par_matrix.cpp | 1116 ---------- raptor/core/par_matrix.hpp | 852 -------- raptor/core/par_vector.cpp | 123 -- raptor/core/par_vector.hpp | 179 -- raptor/core/partition.hpp | 349 ---- raptor/core/tap_comm.cpp | 1158 ----------- raptor/core/tests/CMakeLists.txt | 56 - raptor/core/tests/test_block_matrix.cpp | 212 -- raptor/core/tests/test_bsr_matrix.cpp | 79 - raptor/core/tests/test_matrix.cpp | 64 - .../core/tests/test_par_block_conversion.cpp | 105 - raptor/core/tests/test_par_block_matrix.cpp | 136 -- raptor/core/tests/test_par_bsr.cpp | 148 -- raptor/core/tests/test_par_comm.cpp | 83 - raptor/core/tests/test_par_matrix.cpp | 90 - raptor/core/tests/test_par_transpose.cpp | 50 - raptor/core/tests/test_par_vector.cpp | 70 - raptor/core/tests/test_tap_comm.cpp | 81 - raptor/core/tests/test_transpose.cpp | 45 - raptor/core/topology.hpp | 174 -- raptor/core/types.hpp | 77 - raptor/core/utilities.hpp | 
211 -- raptor/core/vector.cpp | 167 -- raptor/core/vector.hpp | 224 -- raptor/gallery/CMakeLists.txt | 47 - raptor/gallery/diffusion.cpp | 83 - raptor/gallery/diffusion.hpp | 61 - raptor/gallery/laplacian27pt.cpp | 36 - raptor/gallery/laplacian27pt.hpp | 30 - raptor/gallery/matrix_IO.cpp | 112 - raptor/gallery/matrix_IO.hpp | 24 - raptor/gallery/matrix_market.cpp | 538 ----- raptor/gallery/matrix_market.hpp | 137 -- raptor/gallery/par_matrix_IO.cpp | 187 -- raptor/gallery/par_matrix_IO.hpp | 26 - raptor/gallery/par_matrix_market.cpp | 309 --- raptor/gallery/par_matrix_market.hpp | 23 - raptor/gallery/par_random.cpp | 30 - raptor/gallery/par_random.hpp | 19 - raptor/gallery/par_stencil.cpp | 228 -- raptor/gallery/par_stencil.hpp | 19 - raptor/gallery/random.cpp | 28 - raptor/gallery/random.hpp | 19 - raptor/gallery/stencil.cpp | 196 -- raptor/gallery/stencil.hpp | 20 - raptor/gallery/tests/CMakeLists.txt | 34 - raptor/gallery/tests/test_aniso.cpp | 58 - raptor/gallery/tests/test_laplacian.cpp | 54 - raptor/gallery/tests/test_matrix_market.cpp | 39 - raptor/gallery/tests/test_par_aniso.cpp | 99 - raptor/gallery/tests/test_par_laplacian.cpp | 98 - .../gallery/tests/test_par_matrix_market.cpp | 50 - raptor/gallery/tests/test_stencil.cpp | 94 - raptor/krylov/bicgstab.hpp | 4 +- raptor/krylov/cg.hpp | 4 +- raptor/krylov/par_bicgstab.hpp | 4 +- raptor/krylov/par_cg.hpp | 4 +- raptor/krylov/partial_inner.hpp | 3 +- raptor/multilevel/level.hpp | 4 +- raptor/multilevel/multilevel.hpp | 6 +- raptor/multilevel/par_level.hpp | 4 +- raptor/multilevel/par_multilevel.hpp | 6 +- .../CMakeLists 2.txt} | 0 raptor/precondition/CMakeLists.txt | 37 + .../par_diag_scale.cpp | 0 .../par_diag_scale.hpp | 3 +- .../linalg => precondition}/par_relax.cpp | 2 - .../linalg => precondition}/par_relax.hpp | 3 +- .../{util/linalg => precondition}/relax.cpp | 4 +- .../{util/linalg => precondition}/relax.hpp | 3 +- raptor/precondition/tests/CMakeLists.txt | 53 + raptor/{core => 
precondition}/tests/README.md | 0 .../tests/test_bsr_gs_aniso.cpp | 0 .../tests/test_bsr_gs_laplacian.cpp | 0 .../tests/test_bsr_jacobi_aniso.cpp | 0 .../tests/test_bsr_jacobi_laplacian.cpp | 0 .../tests/test_bsr_spmv_aniso.cpp | 0 .../tests/test_bsr_spmv_laplacian.cpp | 0 .../tests/test_bsr_spmv_random.cpp | 0 .../tests/test_gs_aniso.cpp | 0 .../tests/test_gs_laplacian.cpp | 0 .../tests/test_jacobi_aniso.cpp | 0 .../tests/test_jacobi_laplacian.cpp | 0 .../tests/test_par_add.cpp | 0 .../tests/test_par_scale_aniso.cpp | 0 .../tests/test_par_spmv_aniso.cpp | 0 .../tests/test_par_spmv_laplacian.cpp | 0 .../tests/test_par_spmv_random.cpp | 0 .../tests/test_parmetis.cpp | 0 .../tests/test_ptscotch.cpp | 0 .../tests/test_repartition.cpp | 0 .../tests/test_sor_aniso.cpp | 0 .../tests/test_sor_laplacian.cpp | 0 .../tests/test_spmv_aniso.cpp | 0 .../tests/test_spmv_laplacian.cpp | 0 .../tests/test_spmv_random.cpp | 0 .../tests/test_tap_spmv_aniso.cpp | 0 .../tests/test_tap_spmv_laplacian.cpp | 0 .../tests/test_tap_spmv_random.cpp | 0 raptor/profiling/profile_comm.cpp | 2 +- raptor/raptor.hpp | 60 +- raptor/ruge_stuben/cf_splitting.hpp | 3 +- raptor/ruge_stuben/interpolation.cpp | 1 - raptor/ruge_stuben/interpolation.hpp | 3 +- raptor/ruge_stuben/par_cf_splitting.hpp | 3 +- raptor/ruge_stuben/par_interpolation.cpp | 3 +- raptor/ruge_stuben/par_interpolation.hpp | 3 +- raptor/tests/compare.hpp | 3 +- raptor/tests/par_compare.hpp | 3 +- raptor/util/linalg/CMakeLists.txt | 46 - raptor/util/linalg/add.cpp | 114 - raptor/util/linalg/external/CMakeLists.txt | 34 - .../util/linalg/external/parmetis_wrapper.hpp | 112 - .../util/linalg/external/ptscotch_wrapper.hpp | 105 - raptor/util/linalg/matmult.cpp | 352 ---- raptor/util/linalg/par_add.cpp | 309 --- raptor/util/linalg/par_matmult.cpp | 563 ----- raptor/util/linalg/par_spmv.cpp | 342 --- raptor/util/linalg/repartition.cpp | 392 ---- raptor/util/linalg/repartition.hpp | 22 - raptor/util/linalg/spmv.cpp | 437 ---- 
raptor/util/tests/CMakeLists.txt | 155 -- raptor/util/tests/README.md | 1 - 147 files changed, 162 insertions(+), 19385 deletions(-) create mode 100644 .gitmodules create mode 100644 config/raptor-sparseConfig.cmake.in create mode 160000 raptor-sparse delete mode 100644 raptor/core/CMakeLists.txt delete mode 100644 raptor/core/README.md delete mode 100644 raptor/core/comm_data.cpp delete mode 100644 raptor/core/comm_data.hpp delete mode 100644 raptor/core/comm_mat.cpp delete mode 100644 raptor/core/comm_pkg.cpp delete mode 100644 raptor/core/comm_pkg.hpp delete mode 100644 raptor/core/matrix.cpp delete mode 100644 raptor/core/matrix.hpp delete mode 100644 raptor/core/mpi_types.cpp delete mode 100644 raptor/core/mpi_types.hpp delete mode 100644 raptor/core/par_matrix.cpp delete mode 100644 raptor/core/par_matrix.hpp delete mode 100644 raptor/core/par_vector.cpp delete mode 100644 raptor/core/par_vector.hpp delete mode 100644 raptor/core/partition.hpp delete mode 100644 raptor/core/tap_comm.cpp delete mode 100644 raptor/core/tests/CMakeLists.txt delete mode 100644 raptor/core/tests/test_block_matrix.cpp delete mode 100644 raptor/core/tests/test_bsr_matrix.cpp delete mode 100644 raptor/core/tests/test_matrix.cpp delete mode 100644 raptor/core/tests/test_par_block_conversion.cpp delete mode 100644 raptor/core/tests/test_par_block_matrix.cpp delete mode 100644 raptor/core/tests/test_par_bsr.cpp delete mode 100644 raptor/core/tests/test_par_comm.cpp delete mode 100644 raptor/core/tests/test_par_matrix.cpp delete mode 100644 raptor/core/tests/test_par_transpose.cpp delete mode 100644 raptor/core/tests/test_par_vector.cpp delete mode 100644 raptor/core/tests/test_tap_comm.cpp delete mode 100644 raptor/core/tests/test_transpose.cpp delete mode 100644 raptor/core/topology.hpp delete mode 100644 raptor/core/types.hpp delete mode 100644 raptor/core/utilities.hpp delete mode 100644 raptor/core/vector.cpp delete mode 100644 raptor/core/vector.hpp delete mode 100644 
raptor/gallery/CMakeLists.txt delete mode 100644 raptor/gallery/diffusion.cpp delete mode 100644 raptor/gallery/diffusion.hpp delete mode 100644 raptor/gallery/laplacian27pt.cpp delete mode 100644 raptor/gallery/laplacian27pt.hpp delete mode 100644 raptor/gallery/matrix_IO.cpp delete mode 100644 raptor/gallery/matrix_IO.hpp delete mode 100644 raptor/gallery/matrix_market.cpp delete mode 100644 raptor/gallery/matrix_market.hpp delete mode 100644 raptor/gallery/par_matrix_IO.cpp delete mode 100644 raptor/gallery/par_matrix_IO.hpp delete mode 100644 raptor/gallery/par_matrix_market.cpp delete mode 100644 raptor/gallery/par_matrix_market.hpp delete mode 100644 raptor/gallery/par_random.cpp delete mode 100644 raptor/gallery/par_random.hpp delete mode 100644 raptor/gallery/par_stencil.cpp delete mode 100644 raptor/gallery/par_stencil.hpp delete mode 100644 raptor/gallery/random.cpp delete mode 100644 raptor/gallery/random.hpp delete mode 100644 raptor/gallery/stencil.cpp delete mode 100644 raptor/gallery/stencil.hpp delete mode 100644 raptor/gallery/tests/CMakeLists.txt delete mode 100644 raptor/gallery/tests/test_aniso.cpp delete mode 100644 raptor/gallery/tests/test_laplacian.cpp delete mode 100644 raptor/gallery/tests/test_matrix_market.cpp delete mode 100644 raptor/gallery/tests/test_par_aniso.cpp delete mode 100644 raptor/gallery/tests/test_par_laplacian.cpp delete mode 100644 raptor/gallery/tests/test_par_matrix_market.cpp delete mode 100644 raptor/gallery/tests/test_stencil.cpp rename raptor/{util/CMakeLists.txt => precondition/CMakeLists 2.txt} (100%) create mode 100644 raptor/precondition/CMakeLists.txt rename raptor/{util/linalg => precondition}/par_diag_scale.cpp (100%) rename raptor/{util/linalg => precondition}/par_diag_scale.hpp (86%) rename raptor/{util/linalg => precondition}/par_relax.cpp (99%) rename raptor/{util/linalg => precondition}/par_relax.hpp (90%) rename raptor/{util/linalg => precondition}/relax.cpp (98%) rename raptor/{util/linalg => 
precondition}/relax.hpp (94%) create mode 100644 raptor/precondition/tests/CMakeLists.txt rename raptor/{core => precondition}/tests/README.md (100%) rename raptor/{util => precondition}/tests/test_bsr_gs_aniso.cpp (100%) rename raptor/{util => precondition}/tests/test_bsr_gs_laplacian.cpp (100%) rename raptor/{util => precondition}/tests/test_bsr_jacobi_aniso.cpp (100%) rename raptor/{util => precondition}/tests/test_bsr_jacobi_laplacian.cpp (100%) rename raptor/{util => precondition}/tests/test_bsr_spmv_aniso.cpp (100%) rename raptor/{util => precondition}/tests/test_bsr_spmv_laplacian.cpp (100%) rename raptor/{util => precondition}/tests/test_bsr_spmv_random.cpp (100%) rename raptor/{util => precondition}/tests/test_gs_aniso.cpp (100%) rename raptor/{util => precondition}/tests/test_gs_laplacian.cpp (100%) rename raptor/{util => precondition}/tests/test_jacobi_aniso.cpp (100%) rename raptor/{util => precondition}/tests/test_jacobi_laplacian.cpp (100%) rename raptor/{util => precondition}/tests/test_par_add.cpp (100%) rename raptor/{util => precondition}/tests/test_par_scale_aniso.cpp (100%) rename raptor/{util => precondition}/tests/test_par_spmv_aniso.cpp (100%) rename raptor/{util => precondition}/tests/test_par_spmv_laplacian.cpp (100%) rename raptor/{util => precondition}/tests/test_par_spmv_random.cpp (100%) rename raptor/{util => precondition}/tests/test_parmetis.cpp (100%) rename raptor/{util => precondition}/tests/test_ptscotch.cpp (100%) rename raptor/{util => precondition}/tests/test_repartition.cpp (100%) rename raptor/{util => precondition}/tests/test_sor_aniso.cpp (100%) rename raptor/{util => precondition}/tests/test_sor_laplacian.cpp (100%) rename raptor/{util => precondition}/tests/test_spmv_aniso.cpp (100%) rename raptor/{util => precondition}/tests/test_spmv_laplacian.cpp (100%) rename raptor/{util => precondition}/tests/test_spmv_random.cpp (100%) rename raptor/{util => precondition}/tests/test_tap_spmv_aniso.cpp (100%) rename raptor/{util => 
precondition}/tests/test_tap_spmv_laplacian.cpp (100%) rename raptor/{util => precondition}/tests/test_tap_spmv_random.cpp (100%) delete mode 100644 raptor/util/linalg/CMakeLists.txt delete mode 100644 raptor/util/linalg/add.cpp delete mode 100644 raptor/util/linalg/external/CMakeLists.txt delete mode 100644 raptor/util/linalg/external/parmetis_wrapper.hpp delete mode 100644 raptor/util/linalg/external/ptscotch_wrapper.hpp delete mode 100644 raptor/util/linalg/matmult.cpp delete mode 100644 raptor/util/linalg/par_add.cpp delete mode 100644 raptor/util/linalg/par_matmult.cpp delete mode 100644 raptor/util/linalg/par_spmv.cpp delete mode 100644 raptor/util/linalg/repartition.cpp delete mode 100644 raptor/util/linalg/repartition.hpp delete mode 100644 raptor/util/linalg/spmv.cpp delete mode 100644 raptor/util/tests/CMakeLists.txt delete mode 100644 raptor/util/tests/README.md diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..eb8f8d57 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "external/googletest"] + path = external/googletest + url = https://github.com/google/googletest.git +[submodule "raptor-sparse"] + path = raptor-sparse + url = https://github.com/raptor-library/raptor-sparse.git diff --git a/CMakeLists.txt b/CMakeLists.txt index b292551a..027a1808 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,10 @@ enable_language(CXX) set(CMAKE_CXX_STANDARD 11) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wredundant-decls -Wcast-align -Wshadow") -#SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flto -funroll-loops") + + +add_compile_definitions(RAPTOR_SPARSE_TEST_FOLDER="${CMAKE_SOURCE_DIR}/raptor-sparse/src/tests/") +add_compile_definitions(RAPTOR_TEST_FOLDER="${CMAKE_SOURCE_DIR}/raptor/tests/") include(FeatureSummary) @@ -18,7 +21,6 @@ option(WITH_HYPRE "Add Hypre" OFF) option(WITH_MUELU "Add Trilinos MueLu" OFF) option(WITH_MFEM "Add MFEM" OFF) option(WITH_PETSC "Add Petsc" OFF) -option(WITH_AMPI "Using AMPI" OFF) 
option(WITH_MPI "Using MPI" ON) option(WITH_HOSTFILE "Use a Hostfile with MPI" OFF) @@ -26,7 +28,6 @@ add_feature_info(hypre WITH_HYPRE "Hypre preconditioner") add_feature_info(ml WITH_MUELU "Trilinos MueLu preconditioner") add_feature_info(mfem WITH_MFEM "MFEM matrix gallery") add_feature_info(petsc WITH_PETSC "Petsc Interface") -add_feature_info(ampi WITH_AMPI "Compile with AMPI") add_feature_info(crayxe CRAYXE "Compile on CrayXE") add_feature_info(bgq BGQ "Compile on BGQ") add_feature_info(ptscotch WITH_PTSCOTCH "Enable PTScotch Partitioning") @@ -45,8 +46,9 @@ if (WITH_MPI) SET(MPIRUN mpirun) endif (WITH_MPI) -#include_directories("external") set(raptor_INCDIR ${CMAKE_CURRENT_SOURCE_DIR}/raptor) +set(raptor_sparse_DIR raptor-sparse/src) +set(raptor_sparse_INCDIR ${CMAKE_CURRENT_SOURCE_DIR}/${raptor_sparse_DIR}) set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) SET(HYPRE_DIR "" CACHE STRING "Directory of HYPRE") @@ -58,37 +60,12 @@ SET(PETSC_DIR "" CACHE STRING "Directory of petsc") SET(PTSCOTCH_DIR "" CACHE STRING "Directory of Ptscotch") SET(PARMETIS_DIR "" CACHE STRING "Directory of ParMetis") SET(HOSTFILE "" CACHE STRING "Set Hostfile") +SET(BLAS_PATH "" CACHE STRING "/usr/linb/x86_64-linux-gnu") +SET(LAPACK_PATH "" CACHE STRING "/usr/linb/x86_64-linux-gnu") -if (CRAYXE) - set(EXTERNAL_LIBS "") - set(MPIRUN aprun) - set(CMAKE_AR "gcc-ar-4.7") - set(CMAKE_CXX_ARCHIVE_CREATE " qcs ") - set(CMAKE_CXX_ARCHIVE_FINISH true) -elseif (BGQ) - find_library(LAPACK_LIB NAMES liblapack lapack HINTS - "/soft/libraries/alcf/current/xl/LAPACK/lib") - find_library(BLAS_LIB NAMES libblas blas HINTS - "/soft/libraries/alcf/current/xl/BLAS/lib") - find_library(ESSL_LIB NAMES libesslbg esslbg HINTS - "/soft/libraries/essl/current/essl/5.1/lib64") - find_library(XLF_LIB NAMES libxlf90_r.a xlf90_r HINTS - "/soft/compilers/ibmcmp-may2016/xlf/bg/14.1/bglib64") - find_library(XLOPT_LIB NAMES libxlopt.a xlopt HINTS - "/soft/compilers/ibmcmp-may2016/xlf/bg/14.1/bglib64") - 
find_library(XLFMATH_LIB NAMES libxlfmath.a xlfmath HINTS - "/soft/compilers/ibmcmp-may2016/xlf/bg/14.1/bglib64") - find_library(XL_LIB NAMES libxl.a xl HINTS - "/soft/compilers/ibmcmp-may2016/xlf/bg/14.1/bglib64") - find_library(XLSMP_LIB NAMES libxlomp_ser.a xlomp_ser HINTS - "/soft/compilers/ibmcmp-may2016/xlsmp/bg/3.1/bglib64") - set(EXTERNAL_LIBS ${LAPACK_LIB} ${BLAS_LIB} ${ESSL_LIB} ${XLF_LIB} - ${XLOPT_LIB} ${XLFMATH_LIB} ${XL_LIB} ${XLSMP_LIB}) -else() - find_library(LAPACK_LIB NAMES liblapack.so.3 lapack HINTS "/usr/lib/x86_64-linux-gnu/") - find_library(BLAS_LIB NAMES libblas.so.3 blas HINTS "/usr/lib/x86_64-linux-gnu/") - set(EXTERNAL_LIBS ${LAPACK_LIB} ${BLAS_LIB}) -endif() +find_library(LAPACK_LIB NAMES liblapack.so.3 lapack HINTS ${LAPACK_PATH}) +find_library(BLAS_LIB NAMES libblas.so.3 blas HINTS ${BLAS_PATH}) +set(EXTERNAL_LIBS ${LAPACK_LIB} ${BLAS_LIB}) if (WITH_HOSTFILE) find_file (FILE_OF_HOST, ${HOSTFILE}) @@ -180,10 +157,9 @@ if(WITH_PETSC) endif(PETSC_FOUND) endif(WITH_PETSC) -if (WITH_AMPI) - add_definitions(-DUSE_AMPI) -endif(WITH_AMPI) - +include_directories(${CMAKE_SOURCE_DIR}/raptor-sparse) +include_directories(${CMAKE_SOURCE_DIR}/raptor-sparse/src) +add_subdirectory(raptor-sparse/src) add_subdirectory(raptor) if (BUILD_EXAMPLES) diff --git a/config/raptor-sparseConfig.cmake.in b/config/raptor-sparseConfig.cmake.in new file mode 100644 index 00000000..9cb5d2e7 --- /dev/null +++ b/config/raptor-sparseConfig.cmake.in @@ -0,0 +1,8 @@ +@PACKAGE_INIT@ + +if(NOT TARGET raptor-sparse AND NOT raptor-sparse_BINARY_DIR) + include("${CMAKE_CURRENT_LIST_DIR}/raptor-sparseTargets.cmake") + endif() + +find_package(Threads REQUIRED) +find_package(MPI COMPONENTS CXX REQUIRED) diff --git a/raptor-sparse b/raptor-sparse new file mode 160000 index 00000000..0463db14 --- /dev/null +++ b/raptor-sparse @@ -0,0 +1 @@ +Subproject commit 0463db14e9f99bac651217ba614e0f99a5bc2394 diff --git a/raptor/CMakeLists.txt b/raptor/CMakeLists.txt index 353ca2aa..bf01cfcd 
100644 --- a/raptor/CMakeLists.txt +++ b/raptor/CMakeLists.txt @@ -1,6 +1,4 @@ -add_subdirectory(core) -add_subdirectory(gallery) -add_subdirectory(util) +add_subdirectory(precondition) add_subdirectory(ruge_stuben) add_subdirectory(aggregation) add_subdirectory(multilevel) @@ -17,9 +15,10 @@ else() ) endif() -add_library(raptor ${core_SOURCES} ${core_HEADERS} - ${gallery_SOURCES} ${gallery_HEADERS} ${ext_gallery_HEADERS} - ${util_SOURCES} ${util_HEADERS} +message(STATUS ${sparse_SOURCES}) + +add_library(raptor + ${precond_SOURCES} ${precond_HEADERS} ${par_SOURCES} strength.cpp ${ruge_stuben_SOURCES} ${ruge_stuben_HEADERS} ${aggregation_SOURCES} ${aggregation_HEADERS} @@ -28,7 +27,7 @@ add_library(raptor ${core_SOURCES} ${core_HEADERS} ${profile_SOURCES} ${profile_HEADERS} ${external_SOURCES} ${external_HEADERS}) -target_link_libraries(raptor PUBLIC ${MPI_C_LIBRARIES} ${MFEM_LIBRARIES} ${METIS_LIBRARIES} ${HYPRE_LIBRARIES} +target_link_libraries(raptor PUBLIC raptor-sparse ${MPI_C_LIBRARIES} ${MFEM_LIBRARIES} ${METIS_LIBRARIES} ${HYPRE_LIBRARIES} ${MUELU_LIBRARIES} ${PETSC_LIBRARIES} ${PTSCOTCH_LIBRARIES} ${PARMETIS_LIBRARIES} ${EXTERNAL_LIBS}) target_include_directories(raptor @@ -42,11 +41,8 @@ install(TARGETS raptor EXPORT raptorTargets DESTINATION ${CMAKE_INSTALL_LIBDIR}) install(FILES raptor.hpp DESTINATION "include/raptor") -install(FILES ${core_HEADERS} DESTINATION "include/raptor/core") -install(FILES ${gallery_HEADERS} DESTINATION "include/raptor/gallery") install(FILES ${ext_gallery_HEADERS} DESTINATION "include/raptor/gallery/external") -install(FILES ${util_HEADERS} DESTINATION "include/raptor/util/linalg") -install(FILES ${ext_util_HEADERS} DESTINATION "include/raptor/util/linalg/external") +install(FILES ${precond_HEADERS} DESTINATION "include/raptor/precondition") install(FILES ${ruge_stuben_HEADERS} DESTINATION "include/raptor/ruge_stuben") install(FILES ${aggregation_HEADERS} DESTINATION "include/raptor/aggregation") install(FILES 
${multilevel_HEADERS} DESTINATION "include/raptor/multilevel") @@ -72,12 +68,10 @@ install( if(ENABLE_UNIT_TESTS) add_subdirectory(tests) - add_subdirectory(core/tests) - add_subdirectory(util/tests) + add_subdirectory(precondition/tests) add_subdirectory(ruge_stuben/tests) add_subdirectory(aggregation/tests) add_subdirectory(multilevel/tests) - add_subdirectory(gallery/tests) add_subdirectory(krylov/tests) add_subdirectory(external/tests) endif() diff --git a/raptor/aggregation/aggregate.hpp b/raptor/aggregation/aggregate.hpp index 1c8754e1..dc700657 100644 --- a/raptor/aggregation/aggregate.hpp +++ b/raptor/aggregation/aggregate.hpp @@ -3,8 +3,7 @@ #ifndef RAPTOR_AGGREGATION_AGGREGATE_HPP #define RAPTOR_AGGREGATION_AGGREGATE_HPP -#include "raptor/core/types.hpp" -#include "raptor/core/matrix.hpp" +#include "raptor-sparse.hpp" #include "mis.hpp" namespace raptor { diff --git a/raptor/aggregation/candidates.hpp b/raptor/aggregation/candidates.hpp index d0e79ef1..6b4cbe53 100644 --- a/raptor/aggregation/candidates.hpp +++ b/raptor/aggregation/candidates.hpp @@ -3,8 +3,7 @@ #ifndef RAPTOR_AGGREGATION_CANDIDATES_HPP #define RAPTOR_AGGREGATION_CANDIDATES_HPP -#include "raptor/core/types.hpp" -#include "raptor/core/matrix.hpp" +#include "raptor-sparse.hpp" namespace raptor { // TODO -- currently only accepts constant vector diff --git a/raptor/aggregation/mis.hpp b/raptor/aggregation/mis.hpp index a9417211..2fc14bc0 100644 --- a/raptor/aggregation/mis.hpp +++ b/raptor/aggregation/mis.hpp @@ -3,8 +3,7 @@ #ifndef RAPTOR_AGGREGATION_MIS_HPP #define RAPTOR_AGGREGATION_MIS_HPP -#include "raptor/core/types.hpp" -#include "raptor/core/matrix.hpp" +#include "raptor-sparse.hpp" namespace raptor { diff --git a/raptor/aggregation/par_aggregate.hpp b/raptor/aggregation/par_aggregate.hpp index 9a5ada41..90057eb7 100644 --- a/raptor/aggregation/par_aggregate.hpp +++ b/raptor/aggregation/par_aggregate.hpp @@ -3,8 +3,7 @@ #ifndef RAPTOR_AGGREGATION_PAR_AGGREGATE_HPP #define 
RAPTOR_AGGREGATION_PAR_AGGREGATE_HPP -#include "raptor/core/types.hpp" -#include "raptor/core/par_matrix.hpp" +#include "raptor-sparse.hpp" #include "par_mis.hpp" namespace raptor { diff --git a/raptor/aggregation/par_candidates.hpp b/raptor/aggregation/par_candidates.hpp index d033acae..6404b60a 100644 --- a/raptor/aggregation/par_candidates.hpp +++ b/raptor/aggregation/par_candidates.hpp @@ -3,8 +3,7 @@ #ifndef RAPTOR_AGGREGATION_PAR_CANDIDATES_HPP #define RAPTOR_AGGREGATION_PAR_CANDIDATES_HPP -#include "raptor/core/types.hpp" -#include "raptor/core/par_matrix.hpp" +#include "raptor-sparse.hpp" namespace raptor { // TODO -- currently only accepts constant vector diff --git a/raptor/aggregation/par_mis.hpp b/raptor/aggregation/par_mis.hpp index 33f854eb..0f27054b 100644 --- a/raptor/aggregation/par_mis.hpp +++ b/raptor/aggregation/par_mis.hpp @@ -3,8 +3,7 @@ #ifndef RAPTOR_AGGREGATION_PAR_MIS_HPP #define RAPTOR_AGGREGATION_PAR_MIS_HPP -#include "raptor/core/types.hpp" -#include "raptor/core/par_matrix.hpp" +#include "raptor-sparse.hpp" namespace raptor { diff --git a/raptor/aggregation/par_prolongation.hpp b/raptor/aggregation/par_prolongation.hpp index c349faa5..56b7cd9b 100644 --- a/raptor/aggregation/par_prolongation.hpp +++ b/raptor/aggregation/par_prolongation.hpp @@ -3,9 +3,7 @@ #ifndef RAPTOR_AGGREGATION_PAR_PROLONGATION_HPP #define RAPTOR_AGGREGATION_PAR_PROLONGATION_HPP -#include "raptor/core/types.hpp" -#include "raptor/core/par_matrix.hpp" -#include "raptor/core/par_vector.hpp" +#include "raptor-sparse.hpp" namespace raptor { ParCSRMatrix* jacobi_prolongation(ParCSRMatrix* A, ParCSRMatrix* T, bool tap_comm = false, diff --git a/raptor/aggregation/prolongation.hpp b/raptor/aggregation/prolongation.hpp index 8394b565..386adac9 100644 --- a/raptor/aggregation/prolongation.hpp +++ b/raptor/aggregation/prolongation.hpp @@ -3,9 +3,7 @@ #ifndef RAPTOR_AGGREGATION_PROLONGATION_HPP #define RAPTOR_AGGREGATION_PROLONGATION_HPP -#include "raptor/core/types.hpp" 
-#include "raptor/core/matrix.hpp" -#include "raptor/core/vector.hpp" +#include "raptor-sparse.hpp" namespace raptor { CSRMatrix* jacobi_prolongation(CSRMatrix* A, CSRMatrix* T, double omega = 4.0/3, diff --git a/raptor/core/CMakeLists.txt b/raptor/core/CMakeLists.txt deleted file mode 100644 index 543c32cc..00000000 --- a/raptor/core/CMakeLists.txt +++ /dev/null @@ -1,47 +0,0 @@ -# Include the directory itself as a path to include directories -set(CMAKE_INCLUDE_CURRENT_DIR ON) - -# Create a variable called core_SOURCES containing all .cpp files: -if (WITH_MPI) - set(par_core_HEADERS - core/mpi_types.hpp - core/topology.hpp - core/partition.hpp - core/comm_data.hpp - core/comm_pkg.hpp - core/par_vector.hpp - core/par_matrix.hpp - ) - set(par_core_SOURCES - core/mpi_types.cpp - core/comm_data.cpp - core/tap_comm.cpp - core/comm_pkg.cpp - core/comm_mat.cpp - core/par_vector.cpp - core/par_matrix.cpp - ) -else () - set(par_core_HEADERS - "" - ) - - set (par_core_SOURCES - "" - ) -endif() - -set(core_SOURCES - core/vector.cpp - core/matrix.cpp - ${par_core_SOURCES} - PARENT_SCOPE - ) -set(core_HEADERS - core/types.hpp - core/vector.hpp - core/matrix.hpp - core/utilities.hpp - ${par_core_HEADERS} - PARENT_SCOPE - ) diff --git a/raptor/core/README.md b/raptor/core/README.md deleted file mode 100644 index c10090a9..00000000 --- a/raptor/core/README.md +++ /dev/null @@ -1 +0,0 @@ -vector, matrix, interface, and partition definitions diff --git a/raptor/core/comm_data.cpp b/raptor/core/comm_data.cpp deleted file mode 100644 index afb9c0df..00000000 --- a/raptor/core/comm_data.cpp +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "comm_data.hpp" - -namespace raptor -{ -template<> -std::vector& CommData::get_buffer(const int block_size) -{ - return buffer; -} -template<> -std::vector& CommData::get_buffer(const int block_size) -{ - return int_buffer; -} 
-template<> -std::vector& CommData::get_buffer(const int block_size) -{ - return pack_buffer; -} - -template<> -RAPtor_MPI_Datatype CommData::get_type() -{ - return RAPtor_MPI_INT; -} -template<> -RAPtor_MPI_Datatype CommData::get_type() -{ - return RAPtor_MPI_DOUBLE; -} - -template<> -void CommData::send(const int* values, int key, RAPtor_MPI_Comm mpi_comm, const int block_size, - std::function init_result_func, int init_result_func_val) -{ - int_send(values, key, mpi_comm, block_size, init_result_func, - init_result_func_val); -} -template<> -void CommData::send(const double* values, int key, RAPtor_MPI_Comm mpi_comm, const int block_size, - std::function init_result_func, double init_result_func_val) -{ - double_send(values, key, mpi_comm, block_size, init_result_func, - init_result_func_val); -} - -template<> -void CommData::send(const int* values, int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& states, std::function compare_func, - int* n_send_ptr, const int block_size) -{ - int_send(values, key, mpi_comm, states, compare_func, n_send_ptr, block_size); -} -template<> -void CommData::send(const double* values, int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& states, std::function compare_func, - int* n_send_ptr, const int block_size) -{ - double_send(values, key, mpi_comm, states, compare_func, n_send_ptr, block_size); -} - - -template <> -void CommData::recv(int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& off_proc_states, - std::function compare_func, - int* s_recv_ptr, int* n_recv_ptr, const int block_size) -{ - int_recv(key, mpi_comm, off_proc_states, compare_func, - s_recv_ptr, n_recv_ptr, block_size); -} - -template <> -void CommData::recv(int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& off_proc_states, - std::function compare_func, - int* s_recv_ptr, int* n_recv_ptr, const int block_size) -{ - double_recv(key, mpi_comm, off_proc_states, compare_func, - s_recv_ptr, n_recv_ptr, block_size); -} - - - -} diff --git 
a/raptor/core/comm_data.hpp b/raptor/core/comm_data.hpp deleted file mode 100644 index 474af490..00000000 --- a/raptor/core/comm_data.hpp +++ /dev/null @@ -1,1428 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#ifndef RAPTOR_CORE_COMMDATA_HPP -#define RAPTOR_CORE_COMMDATA_HPP - -#define WITH_RAPtor_MPI 1 - -#include -#include "mpi_types.hpp" -#include "vector.hpp" -#include "matrix.hpp" -#include "utilities.hpp" - -/************************************************************** - ***** CommData Class - ************************************************************** - **************************************************************/ -namespace raptor -{ - // Forward Declaration -class CommData -{ -public: - /************************************************************** - ***** CommData Class Constructor - ************************************************************** - ***** Initializes an empty CommData, setting number and size of - ***** messages to zero. 
- - **************************************************************/ - CommData() - { - num_msgs = 0; - size_msgs = 0; - indptr.emplace_back(0); - } - - CommData(CommData* data) - { - num_msgs = data->num_msgs; - size_msgs = data->size_msgs; - std::copy(data->procs.begin(), data->procs.end(), - std::back_inserter(procs)); - std::copy(data->indptr.begin(), data->indptr.end(), - std::back_inserter(indptr)); - - if (num_msgs) - { - requests.resize(num_msgs); - } - - if (size_msgs) - { - buffer.resize(size_msgs); - int_buffer.resize(size_msgs); - } - } - - /************************************************************** - ***** ParComm Class Destructor - ************************************************************** - ***** - **************************************************************/ - virtual ~CommData() - { - }; - - virtual void add_msg(int proc, int msg_size, int* msg_indices = NULL) = 0; - - void finalize() - { - if (num_msgs) - { - requests.resize(num_msgs); - } - if (size_msgs) - { - buffer.resize(size_msgs); - int_buffer.resize(size_msgs); - } - } - - virtual void probe(int size, int key, RAPtor_MPI_Comm mpi_comm) = 0; - - virtual CommData* copy() = 0; - virtual CommData* copy(const std::vector& col_to_new) = 0; - - template - static RAPtor_MPI_Datatype get_type(); - - - template - std::vector& get_buffer(const int block_size = 1); - - template - void send(const T* values, int key, RAPtor_MPI_Comm mpi_comm, const int block_size = 1, - std::function init_result_func = &sum_func, - T init_result_func_val = 0); - virtual void int_send(const int* values, int key, RAPtor_MPI_Comm mpi_comm, const int block_size, - std::function init_result_func, - int init_result_func_val) = 0; - virtual void double_send(const double* values, int key, RAPtor_MPI_Comm mpi_comm, const int block_size, - std::function init_result_func, - double init_result_func_val) = 0; - - template - void send(const T* values, int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& states, 
std::function compare_func, - int* n_send_ptr, const int block_size = 1); - virtual void int_send(const int* values, int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& states, std::function compare_func, - int* n_send_ptr, const int block_size) = 0; - virtual void double_send(const double* values, int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& states, std::function compare_func, - int* n_send_ptr, const int block_size) = 0; - - - virtual void send(char* send_buffer, - const int* rowptr, - const int* col_indices, - const double* values, - int key, RAPtor_MPI_Comm mpi_comm, - const int block_size = 1) = 0; - virtual void send(char* send_buffer, - const int* rowptr, - const int* col_indices, - double const* const* values, - int key, RAPtor_MPI_Comm mpi_comm, - const int block_size = 1) = 0; - virtual int get_msg_size(const int* rowptr, - const bool has_vals, RAPtor_MPI_Comm mpi_comm, - const int block_size = 1) = 0; - - - template - void recv(int key, RAPtor_MPI_Comm mpi_comm, const int block_size = 1) - { - if (num_msgs == 0) return; - - int proc, start, end; - int size = size_msgs * block_size; - RAPtor_MPI_Datatype datatype = get_type(); - std::vector& buf = get_buffer(); - if ((int) buf.size() < size) buf.resize(size); - - for (int i = 0; i < num_msgs; i++) - { - proc = procs[i]; - start = indptr[i]; - end = indptr[i+1]; - RAPtor_MPI_Irecv(&(buf[start*block_size]), (end - start) * block_size, datatype, - proc, key, mpi_comm, &(requests[i])); - } - } - - template - void recv(int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& off_proc_states, - std::function compare_func, - int* s_recv_ptr, int* n_recv_ptr, const int block_size = 1); - virtual void int_recv(int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& off_proc_states, - std::function compare_func, - int* s_recv_ptr, int* n_recv_ptr, const int block_size = 1) = 0; - virtual void double_recv(int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& off_proc_states, - std::function 
compare_func, - int* s_recv_ptr, int* n_recv_ptr, const int block_size = 1) = 0; - - void recv(CSRMatrix* recv_mat, int key, RAPtor_MPI_Comm mpi_comm, const int block_size = 1, - const bool vals = true) - { - if (num_msgs == 0) return; - - int proc, start, end, size; - int ctr, row_size, row_count; - int count, recv_size; - RAPtor_MPI_Status recv_status; - std::vector recv_buffer; - - recv_size = 0; - row_count = 0; - for (int i = 0; i < num_msgs; i++) - { - proc = procs[i]; - start = indptr[i]; - end = indptr[i+1]; - size = end - start; - - // Recv message of any size from proc - RAPtor_MPI_Probe(proc, key, mpi_comm, &recv_status); - RAPtor_MPI_Get_count(&recv_status, RAPtor_MPI_PACKED, &count); - - // Resize recv_buffer as needed - if (count > (int) recv_buffer.size()) - { - recv_buffer.resize(count); - } - RAPtor_MPI_Recv(&(recv_buffer[0]), count, RAPtor_MPI_PACKED, proc, key, - mpi_comm, &recv_status); - - // Go through recv, adding indices to matrix recv_mat - ctr = 0; - for (int j = 0; j < size; j++) - { - RAPtor_MPI_Unpack(recv_buffer.data(), count, &ctr, &row_size, 1, RAPtor_MPI_INT, - mpi_comm); - recv_mat->idx1[row_count + 1] = recv_size + row_size; - row_count++; - recv_mat->idx2.resize(recv_size + row_size); - RAPtor_MPI_Unpack(recv_buffer.data(), count, &ctr, &recv_mat->idx2[recv_size], - row_size, RAPtor_MPI_INT, mpi_comm); - - if (vals) - { - if (block_size > 1) - { - BSRMatrix* recv_mat_bsr = (BSRMatrix*) recv_mat; - recv_mat_bsr->block_vals.resize(recv_size + row_size); - for (int k = 0; k < row_size; k++) - { - recv_mat_bsr->block_vals[recv_size + k] = new double[block_size]; - RAPtor_MPI_Unpack(recv_buffer.data(), count, &ctr, - recv_mat_bsr->block_vals[recv_size + k], - block_size, RAPtor_MPI_DOUBLE, mpi_comm); - } - } - else - { - recv_mat->vals.resize(recv_size + row_size); - RAPtor_MPI_Unpack(recv_buffer.data(), count, &ctr, &recv_mat->vals[recv_size], - row_size, RAPtor_MPI_DOUBLE, mpi_comm); - } - } - recv_size += row_size; - } - } - 
recv_mat->nnz = recv_mat->idx2.size(); - } - - - void waitall() - { - if (num_msgs) - { - RAPtor_MPI_Waitall(num_msgs, requests.data(), RAPtor_MPI_STATUSES_IGNORE); - } - } - void waitall(int n_msgs) - { - if (n_msgs) - { - RAPtor_MPI_Waitall(n_msgs, requests.data(), RAPtor_MPI_STATUSES_IGNORE); - } - } - - void pack_values(const double* values, int row_start, int size, char* send_buffer, - int bytes, int* ctr, RAPtor_MPI_Comm mpi_comm, int block_size) - { - RAPtor_MPI_Pack(&(values[row_start]), size, RAPtor_MPI_DOUBLE, send_buffer, - bytes, ctr, mpi_comm); - } - void pack_values(double const* const* values, int row_start, int size, - char* send_buffer, int bytes, int* ctr, RAPtor_MPI_Comm mpi_comm, int block_size) - { - for (int i = 0; i < size; i++) - { - RAPtor_MPI_Pack(values[row_start + i], block_size, RAPtor_MPI_DOUBLE, send_buffer, - bytes, ctr, mpi_comm); - } - } - - template - void unpack(std::vector& unpacked_buffer, RAPtor_MPI_Comm mpi_comm, const int block_size = 1) - { - if (num_msgs == 0) return; - - int position = 0; - int flat_size = size_msgs * block_size; - if (unpacked_buffer.size() < flat_size) unpacked_buffer.resize(flat_size); - RAPtor_MPI_Datatype datatype = get_type(); - RAPtor_MPI_Unpack(pack_buffer.data(), pack_buffer.size(), &position, - unpacked_buffer.data(), flat_size, datatype, mpi_comm); - } - - void reset_buffer() - { - pack_buffer.resize(size_msgs); - } - - int num_msgs; - int size_msgs; - std::vector procs; - std::vector indptr; - std::vector requests; - std::vector buffer; - std::vector int_buffer; - std::vector pack_buffer; - -}; - -class ContigData : public CommData -{ -public: - ContigData() : CommData() - { - } - - ContigData(ContigData* data) : CommData(data) - { - - } - - ~ContigData() - { - } - - ContigData* copy() - { - return new ContigData(this); - } - ContigData* copy(const std::vector& col_to_new) - { - bool comm_proc; - int proc, start, end; - int new_idx; - - ContigData* data = new ContigData(); - - data->size_msgs 
= 0; - for (int i = 0; i < num_msgs; i++) - { - comm_proc = false; - proc = procs[i]; - start = indptr[i]; - end = indptr[i+1]; - for (int j = start; j < end; j++) - { - new_idx = col_to_new[j]; - if (new_idx != -1) - { - comm_proc = true; - data->size_msgs++; - } - } - if (comm_proc) - { - data->procs.emplace_back(proc); - data->indptr.emplace_back(data->size_msgs); - } - } - data->num_msgs = data->procs.size(); - data->finalize(); - - return data; - } - - void add_msg(int proc, int msg_size, int* msg_indices = NULL) - { - int last_ptr = indptr[num_msgs]; - procs.emplace_back(proc); - indptr.emplace_back(last_ptr + msg_size); - - num_msgs++; - size_msgs += msg_size; - } - - void probe(int n_recv, int key, RAPtor_MPI_Comm mpi_comm) - { - int size; - RAPtor_MPI_Status recv_status; - - size_msgs = 0; - indptr[0] = 0; - for (int i = 0; i < n_recv; i++) - { - RAPtor_MPI_Recv(&size, 1, RAPtor_MPI_INT, RAPtor_MPI_ANY_SOURCE, key, - mpi_comm, &recv_status); - procs.emplace_back(recv_status.RAPtor_MPI_SOURCE); - size_msgs += size; - indptr.emplace_back(size_msgs); - } - num_msgs = procs.size(); - finalize(); - } - - - void int_send(const int* values, int key, RAPtor_MPI_Comm mpi_comm, const int block_size, - std::function init_result_func, - int init_result_func_val) - { - send(values, key, mpi_comm, block_size, init_result_func, - init_result_func_val); - } - void double_send(const double* values, int key, RAPtor_MPI_Comm mpi_comm, const int block_size, - std::function init_result_func, - double init_result_func_val) - { - send(values, key, mpi_comm, block_size, init_result_func, - init_result_func_val); - } - - void int_send(const int* values, int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& states, std::function compare_func, - int* n_send_ptr, const int block_size) - { - send(values, key, mpi_comm, states, compare_func, n_send_ptr, block_size); - } - void double_send(const double* values, int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& states, 
std::function compare_func, - int* n_send_ptr, const int block_size) - { - send(values, key, mpi_comm, states, compare_func, n_send_ptr, block_size); - } - - template - void send(const T* values, int key, RAPtor_MPI_Comm mpi_comm, const int block_size = 1, - std::function init_result_func = &sum_func, - T init_result_func_val = 0) - { - if (num_msgs == 0) return; - - int start, end; - int proc; - - RAPtor_MPI_Datatype datatype = get_type(); - - for (int i = 0; i < num_msgs; i++) - { - proc = procs[i]; - start = indptr[i]; - end = indptr[i+1]; - RAPtor_MPI_Isend(&(values[start*block_size]), (end - start) * block_size, - datatype, proc, key, mpi_comm, &(requests[i])); - } - } - - - template - void send(const T* values, int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& states, std::function compare_func, - int* n_send_ptr, const int block_size = 1) - { - if (num_msgs == 0) - { - *n_send_ptr = 0; - return; - } - - int n_sends; - int proc, start, end, idx; - int ctr, prev_ctr; - bool comparison; - int size = size_msgs * block_size; - - RAPtor_MPI_Datatype datatype = get_type(); - std::vector& buf = get_buffer(); - if ((int)buf.size() < size) buf.resize(size); - - n_sends = 0; - ctr = 0; - prev_ctr = 0; - for (int i = 0; i < num_msgs; i++) - { - proc = procs[i]; - start = indptr[i]; - end = indptr[i+1]; - for (int j = start; j < end; j++) - { - comparison = false; - idx = j * block_size; - for (int k = 0; k < block_size; k++) - { - if (compare_func(states[idx + k])) - { - comparison = true; - break; - } - } - if (comparison) - { - for (int k = 0; k < block_size; k++) - { - buf[ctr++] = values[idx+k]; - } - } - } - size = ctr - prev_ctr; - if (size) - { - RAPtor_MPI_Isend(&(buf[prev_ctr]), size, datatype, - proc, key, mpi_comm, &(requests[n_sends++])); - prev_ctr = ctr; - } - } - *n_send_ptr = n_sends; - } - - void send(char* send_buffer, - const int* rowptr, - const int* col_indices, - const double* values, - int key, RAPtor_MPI_Comm mpi_comm, - const int 
block_size = 1) - { - send_helper(send_buffer, rowptr, col_indices, values, key, - mpi_comm, block_size); - } - - void send(char* send_buffer, - const int* rowptr, - const int* col_indices, - double const* const* values, - int key, RAPtor_MPI_Comm mpi_comm, - const int block_size = 1) - { - send_helper(send_buffer, rowptr, col_indices, values, key, - mpi_comm, block_size); - } - - int get_msg_size(const int* rowptr, const bool has_vals, RAPtor_MPI_Comm mpi_comm, - const int block_size = 1) - { - int start, end; - int row_start, row_end; - int num_ints, num_doubles; - int double_bytes, bytes; - - // Calculate total msg size - start = indptr[0]; - end = indptr[num_msgs]; - row_start = rowptr[start]; - row_end = rowptr[end]; - num_ints = (row_end - row_start) + (end - start); - num_doubles = (row_end - row_start) * block_size; - RAPtor_MPI_Pack_size(num_ints, RAPtor_MPI_INT, mpi_comm, &bytes); - - if (has_vals) - { - RAPtor_MPI_Pack_size(num_doubles, RAPtor_MPI_DOUBLE, mpi_comm, &double_bytes); - bytes += double_bytes; - } - - return bytes; - } - - // values can be double* (CSRMatrix) or double** (BSRMatrix) - template - void send_helper(char* send_buffer, - const int* rowptr, - const int* col_indices, - const T& values, - int key, RAPtor_MPI_Comm mpi_comm, - const int block_size = 1) - { - if (num_msgs == 0) return; - - int start, end, proc; - int ctr, prev_ctr, size; - int row_start, row_end; - int bytes; - - bytes = get_msg_size(rowptr, values, mpi_comm, block_size); - - ctr = 0; - prev_ctr = 0; - for (int i = 0; i < num_msgs; i++) - { - proc = procs[i]; - start = indptr[i]; - end = indptr[i+1]; - for (int j = start; j < end; j++) - { - row_start = rowptr[j]; - row_end = rowptr[j+1]; - size = row_end - row_start; - RAPtor_MPI_Pack(&size, 1, RAPtor_MPI_INT, send_buffer, bytes, - &ctr, mpi_comm); - RAPtor_MPI_Pack(&(col_indices[row_start]), size, RAPtor_MPI_INT, - send_buffer, bytes, &ctr, mpi_comm); - if (values) - { - pack_values(values, row_start, size, 
send_buffer, bytes, - &ctr, mpi_comm, block_size); - } - } - RAPtor_MPI_Isend(&(send_buffer[prev_ctr]), ctr - prev_ctr, RAPtor_MPI_PACKED, proc, - key, mpi_comm, &(requests[i])); - prev_ctr = ctr; - } - } - - void int_recv(int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& off_proc_states, - std::function compare_func, - int* s_recv_ptr, int* n_recv_ptr, const int block_size = 1) - { - cond_recv(key, mpi_comm, off_proc_states, compare_func, s_recv_ptr, - n_recv_ptr, block_size); - } - void double_recv(int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& off_proc_states, - std::function compare_func, - int* s_recv_ptr, int* n_recv_ptr, const int block_size = 1) - { - cond_recv(key, mpi_comm, off_proc_states, compare_func, s_recv_ptr, - n_recv_ptr, block_size); - } - - template - void cond_recv(int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& off_proc_states, - std::function compare_func, - int* s_recv_ptr, int* n_recv_ptr, const int block_size = 1) - { - if (num_msgs == 0) - { - *s_recv_ptr = 0; - *n_recv_ptr = 0; - return; - } - - int n_recvs, ctr, prev_ctr; - int proc, start, end, idx; - int size = size_msgs * block_size; - - RAPtor_MPI_Datatype datatype = get_type(); - std::vector& buf = get_buffer(); - if ((int)buf.size() < size) buf.resize(size); - - n_recvs = 0; - ctr = 0; - prev_ctr = 0; - for (int i = 0; i < num_msgs; i++) - { - proc = procs[i]; - start = indptr[i]; - end = indptr[i+1]; - for (int j = start; j < end; j++) - { - idx = j * block_size; - for (int k = 0; k < block_size; k++) - { - if (compare_func(off_proc_states[idx + k])) - { - ctr += block_size; - break; - } - } - } - if (ctr - prev_ctr) - { - RAPtor_MPI_Irecv(&(buf[prev_ctr]), ctr - prev_ctr, datatype, - proc, key, mpi_comm, &(requests[n_recvs++])); - prev_ctr = ctr; - } - } - - *n_recv_ptr = n_recvs; - *s_recv_ptr = ctr; - } - -}; - -class NonContigData : public CommData -{ -public: - NonContigData() : CommData() - { - } - - NonContigData(NonContigData* data) : CommData(data) 
- { - std::copy(data->indices.begin(), data->indices.end(), - std::back_inserter(indices)); - } - - ~NonContigData() - { - } - - NonContigData* copy() - { - return new NonContigData(this); - } - - NonContigData* copy(const std::vector& col_to_new) - { - bool comm_proc; - int proc, start, end; - int idx, new_idx; - - NonContigData* data = new NonContigData(); - - data->size_msgs = 0; - for (int i = 0; i < num_msgs; i++) - { - comm_proc = false; - proc = procs[i]; - start = indptr[i]; - end = indptr[i+1]; - for (int j = start; j < end; j++) - { - idx = indices[j]; - new_idx = col_to_new[idx]; - if (new_idx != -1) - { - comm_proc = true; - data->indices.emplace_back(new_idx); - } - } - if (comm_proc) - { - data->procs.emplace_back(proc); - data->indptr.emplace_back(data->indices.size()); - } - } - data->size_msgs = data->indices.size(); - data->num_msgs = data->procs.size(); - data->finalize(); - - return data; - } - - void add_msg(int proc, - int msg_size, - int* msg_indices = NULL) - { - int last_ptr = indptr[num_msgs]; - procs.emplace_back(proc); - indptr.emplace_back(last_ptr + msg_size); - if (msg_indices) - { - for (int i = 0; i < msg_size; i++) - { - indices.emplace_back(msg_indices[i]); - } - } - - num_msgs++; - size_msgs += msg_size; - } - - void probe(int size, int key, RAPtor_MPI_Comm mpi_comm) - { - int proc, count; - int size_recvd; - RAPtor_MPI_Status recv_status; - - size_msgs = size; - indices.resize(size_msgs); - indptr[0] = 0; - size_recvd = 0; - while (size_recvd < size_msgs) - { - RAPtor_MPI_Probe(RAPtor_MPI_ANY_SOURCE, key, mpi_comm, &recv_status); - proc = recv_status.RAPtor_MPI_SOURCE; - RAPtor_MPI_Get_count(&recv_status, RAPtor_MPI_INT, &count); - RAPtor_MPI_Recv(&(indices[size_recvd]), count, RAPtor_MPI_INT, proc, - key, mpi_comm, &recv_status); - size_recvd += count; - procs.emplace_back(proc); - indptr.emplace_back(size_recvd); - } - num_msgs = procs.size(); - finalize(); - } - - void int_send(const int* values, int key, RAPtor_MPI_Comm 
mpi_comm, const int block_size, - std::function init_result_func, - int init_result_func_val) - { - send(values, key, mpi_comm, block_size, init_result_func, - init_result_func_val); - } - void double_send(const double* values, int key, RAPtor_MPI_Comm mpi_comm, const int block_size, - std::function init_result_func, - double init_result_func_val) - { - send(values, key, mpi_comm, block_size, init_result_func, - init_result_func_val); - } - void int_send(const int* values, int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& states, std::function compare_func, - int* n_send_ptr, const int block_size) - { - send(values, key, mpi_comm, states, compare_func, n_send_ptr, block_size); - } - void double_send(const double* values, int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& states, std::function compare_func, - int* n_send_ptr, const int block_size) - { - send(values, key, mpi_comm, states, compare_func, n_send_ptr, block_size); - } - - template - void send(const T* values, int key, RAPtor_MPI_Comm mpi_comm, const int block_size = 1, - std::function init_result_func = &sum_func, - T init_result_func_val = 0) - { - if (num_msgs == 0) return; - - - int start, end; - int proc, idx, pos; - int size = size_msgs * block_size; - - RAPtor_MPI_Datatype datatype = get_type(); - std::vector& buf = get_buffer(); - if ((int)buf.size() < size) buf.resize(size); - - for (int i = 0; i < num_msgs; i++) - { - proc = procs[i]; - start = indptr[i]; - end = indptr[i+1]; - for (int j = start; j < end; j++) - { - idx = indices[j] * block_size; - pos = j * block_size; - for (int k = 0; k < block_size; k++) - { - buf[pos + k] = values[idx + k]; - } - } - RAPtor_MPI_Isend(&(buf[start*block_size]), (end - start) * block_size, - datatype, proc, key, mpi_comm, &(requests[i])); - } - } - - template - void send(const T* values, int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& states, std::function compare_func, - int* n_send_ptr, const int block_size = 1) - { - if (num_msgs == 0) 
- { - *n_send_ptr = 0; - return; - } - - int n_sends; - int proc, start, end; - int idx; - int ctr, prev_ctr; - bool comparison; - int size = size_msgs * block_size; - - RAPtor_MPI_Datatype datatype = get_type(); - std::vector& buf = get_buffer(); - if ((int)buf.size() < size) buf.resize(size); - - n_sends = 0; - ctr = 0; - prev_ctr = 0; - for (int i = 0; i < num_msgs; i++) - { - proc = procs[i]; - start = indptr[i]; - end = indptr[i+1]; - for (int j = start; j < end; j++) - { - idx = indices[j] * block_size; - comparison = false; - for (int k = 0; k < block_size; k++) - { - // If compare true for any idx in block - // Add full block to message - if (compare_func(states[idx + k])) - { - comparison = true; - break; - } - } - if (comparison) - { - for (int k = 0; k < block_size; k++) - { - buf[ctr++] = values[idx+k]; - } - } - } - if (ctr - prev_ctr) - { - RAPtor_MPI_Isend(&(buf[prev_ctr]), ctr - prev_ctr, datatype, - proc, key, mpi_comm, &(requests[n_sends++])); - prev_ctr = ctr; - } - } - - *n_send_ptr = n_sends; - } - - void send(char* send_buffer, - const int* rowptr, - const int* col_indices, - const double* values, - int key, RAPtor_MPI_Comm mpi_comm, - const int block_size = 1) - { - send_helper(send_buffer, rowptr, col_indices, values, key, - mpi_comm, block_size); - } - - void send(char* send_buffer, - const int* rowptr, - const int* col_indices, - double const* const* values, - int key, RAPtor_MPI_Comm mpi_comm, - const int block_size = 1) - { - send_helper(send_buffer, rowptr, col_indices, values, key, - mpi_comm, block_size); - } - - int get_msg_size(const int* rowptr, const bool has_vals, RAPtor_MPI_Comm mpi_comm, - const int block_size = 1) - { - int num_ints, num_doubles; - int double_bytes, bytes; - - // Calculate message size - num_ints = indptr[num_msgs] - indptr[0]; - num_doubles = 0; - for (std::vector::iterator it = indices.begin(); - it != indices.end(); ++it) - { - num_doubles += (rowptr[*it+1] - rowptr[*it]); - } - num_ints += num_doubles; - 
RAPtor_MPI_Pack_size(num_ints, RAPtor_MPI_INT, mpi_comm, &bytes); - - if (has_vals) - { - RAPtor_MPI_Pack_size(num_doubles * block_size, RAPtor_MPI_DOUBLE, mpi_comm, &double_bytes); - bytes += double_bytes; - } - - return bytes; - } - - template - void send_helper(char* send_buffer, - const int* rowptr, - const int* col_indices, - const T& values, - int key, RAPtor_MPI_Comm mpi_comm, - const int block_size = 1) - { - if (num_msgs == 0) return; - - int start, end, proc; - int ctr, prev_ctr, size; - int row, row_start, row_end; - int bytes; - - // Resize send buffer - bytes = get_msg_size(rowptr, values, mpi_comm, block_size); - - ctr = 0; - prev_ctr = 0; - for (int i = 0; i < num_msgs; i++) - { - proc = procs[i]; - start = indptr[i]; - end = indptr[i+1]; - for (int j = start; j < end; j++) - { - row = indices[j]; - row_start = rowptr[row]; - row_end = rowptr[row+1]; - size = (row_end - row_start); - RAPtor_MPI_Pack(&size, 1, RAPtor_MPI_INT, send_buffer, bytes, - &ctr, mpi_comm); - RAPtor_MPI_Pack(&(col_indices[row_start]), size, RAPtor_MPI_INT, - send_buffer, bytes, &ctr, mpi_comm); - if (values) - { - pack_values(values, row_start, size, send_buffer, bytes, &ctr, - mpi_comm, block_size); - } - } - RAPtor_MPI_Isend(&(send_buffer[prev_ctr]), ctr - prev_ctr, RAPtor_MPI_PACKED, proc, - key, mpi_comm, &(requests[i])); - prev_ctr = ctr; - } - } - - - void int_recv(int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& off_proc_states, - std::function compare_func, - int* s_recv_ptr, int* n_recv_ptr, const int block_size = 1) - { - cond_recv(key, mpi_comm, off_proc_states, compare_func, s_recv_ptr, - n_recv_ptr, block_size); - } - void double_recv(int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& off_proc_states, - std::function compare_func, - int* s_recv_ptr, int* n_recv_ptr, const int block_size = 1) - { - cond_recv(key, mpi_comm, off_proc_states, compare_func, s_recv_ptr, - n_recv_ptr, block_size); - } - - template - void cond_recv(int key, RAPtor_MPI_Comm 
mpi_comm, - const std::vector& off_proc_states, - std::function compare_func, - int* s_recv_ptr, int* n_recv_ptr, const int block_size = 1) - { - if (num_msgs == 0) - { - *s_recv_ptr = 0; - *n_recv_ptr = 0; - return; - } - - int n_recvs, ctr, prev_ctr; - int proc, start, end, idx; - int size = size_msgs * block_size; - - RAPtor_MPI_Datatype datatype = get_type(); - std::vector& buf = get_buffer(); - if ((int)buf.size() < size) buf.resize(size); - - n_recvs = 0; - ctr = 0; - prev_ctr = 0; - for (int i = 0; i < num_msgs; i++) - { - proc = procs[i]; - start = indptr[i]; - end = indptr[i+1]; - for (int j = start; j < end; j++) - { - idx = indices[j] * block_size; - for (int k = 0; k < block_size; k++) - { - if (compare_func(off_proc_states[idx + k])) - { - ctr += block_size; - break; - } - } - } - if (ctr - prev_ctr) - { - RAPtor_MPI_Irecv(&(buf[prev_ctr]), ctr - prev_ctr, datatype, proc, - key, mpi_comm, &(requests[n_recvs++])); - prev_ctr = ctr; - } - } - - *n_recv_ptr = n_recvs; - *s_recv_ptr = ctr; - } - - std::vector indices; - -}; - -class DuplicateData : public NonContigData -{ -public: - DuplicateData() : NonContigData() - { - } - - DuplicateData(DuplicateData* data) : NonContigData(data) - { - std::copy(data->indptr_T.begin(), data->indptr_T.end(), - std::back_inserter(indptr_T)); - } - - ~DuplicateData() - { - } - - DuplicateData* copy() - { - return new DuplicateData(this); - } - DuplicateData* copy(const std::vector& col_to_new) - { - bool comm_proc, comm_idx; - int proc, start, end; - int idx, new_idx; - int idx_start, idx_end; - - DuplicateData* data = new DuplicateData(); - - data->indptr_T.emplace_back(0); - for (int i = 0; i < num_msgs; i++) - { - comm_proc = false; - proc = procs[i]; - start = indptr[i]; - end = indptr[i+1]; - for (int j = start; j < end; j++) - { - comm_idx = false; - idx_start = indptr_T[j]; - idx_end = indptr_T[j+1]; - for (int k = idx_start; k < idx_end; k++) - { - idx = indices[k]; - new_idx = col_to_new[idx]; - if (new_idx != 
-1) - { - comm_idx = true; - data->indices.emplace_back(new_idx); - } - } - if (comm_idx) - { - comm_proc = true; - data->indptr_T.emplace_back(data->indices.size()); - } - } - if (comm_proc) - { - data->procs.emplace_back(proc); - data->indptr.emplace_back(data->indptr_T.size() - 1); - } - } - data->size_msgs = data->indptr_T.size() - 1; - data->num_msgs = data->procs.size(); - data->finalize(); - - return data; - } - - void int_send(const int* values, int key, RAPtor_MPI_Comm mpi_comm, const int block_size, - std::function init_result_func, - int init_result_func_val) - { - send(values, key, mpi_comm, block_size, init_result_func, - init_result_func_val); - } - void double_send(const double* values, int key, RAPtor_MPI_Comm mpi_comm, const int block_size, - std::function init_result_func, - double init_result_func_val) - { - send(values, key, mpi_comm, block_size, init_result_func, - init_result_func_val); - } - void int_send(const int* values, int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& states, std::function compare_func, - int* n_send_ptr, const int block_size) - { - send(values, key, mpi_comm, states, compare_func, n_send_ptr, block_size); - } - void double_send(const double* values, int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& states, std::function compare_func, - int* n_send_ptr, const int block_size) - { - send(values, key, mpi_comm, states, compare_func, n_send_ptr, block_size); - } - - template - void send(const T* values, int key, RAPtor_MPI_Comm mpi_comm, const int block_size = 1, - std::function init_result_func = &sum_func, - T init_result_func_val = 0) - { - if (num_msgs == 0) return; - - - int start, end; - int proc, idx; - int idx_start, idx_end; - int size = size_msgs * block_size; - int pos; - - RAPtor_MPI_Datatype datatype = get_type(); - std::vector& buf = get_buffer(); - if ((int)buf.size() < size) buf.resize(size); - - std::vector tmp(block_size); - - for (int i = 0; i < num_msgs; i++) - { - proc = procs[i]; - start = 
indptr[i]; - end = indptr[i+1]; - for (int j = start; j < end; j++) - { - idx_start = indptr_T[j]; - idx_end = indptr_T[j+1]; - std::fill(tmp.begin(), tmp.end(), init_result_func_val); - for (int k = idx_start; k < idx_end; k++) - { - idx = indices[k] * block_size; - for (int l = 0; l < block_size; l++) - { - tmp[l] = init_result_func(tmp[l], values[idx+l]); - } - } - pos = j * block_size; - for (int k = 0; k < block_size; k++) - { - buf[pos + k] = tmp[k]; - } - } - RAPtor_MPI_Isend(&(buf[start * block_size]), (end - start) * block_size, - datatype, proc, key, mpi_comm, &(requests[i])); - } - } - - template - void send(const T* values, int key, RAPtor_MPI_Comm mpi_comm, - const std::vector& states, std::function compare_func, - int* n_send_ptr, const int block_size = 1) - { - - } - - void append_val(std::vector& vec, const double val, int block_size) - { - vec.emplace_back(val); - } - void append_val(std::vector& vec, const double* val, int block_size) - { - for (int i = 0; i < block_size; i++) - vec.emplace_back(val[i]); - } - - template - void combine_entries(int j, const int* rowptr, const int* col_indices, - const T& values, int block_size, std::vector& send_indices, - std::vector& send_values, int* size_ptr) - { - int idx_start, idx_end; - int row_start, row_end; - int size, row, idx, ctr; - - idx_start = indptr_T[j]; - idx_end = indptr_T[j+1]; - for (int k = idx_start; k < idx_end; k++) - { - row = indices[k]; - row_start = rowptr[row]; - row_end = rowptr[row+1]; - for (int l = row_start; l < row_end; l++) - { - send_indices.emplace_back(col_indices[l]); - append_val(send_values, values[l], block_size); - } - } - if (send_indices.size()) - { - vec_sort(send_indices, send_values); - size = 1; - - int s_send = send_indices.size(); - for (int k = 1; k < s_send; k++) - { - ctr = k * block_size; - if (send_indices[k] != send_indices[size - 1]) - { - idx = size * block_size; - for (int i = 0; i < block_size; i++) - { - send_values[idx + i] = send_values[ctr + i]; - 
} - send_indices[size++] = send_indices[k]; - } - else - { - idx = (size - 1) * block_size; - for (int i = 0; i < block_size; i++) - { - send_values[idx + i] += send_values[ctr + i]; - } - } - } - } - else size = 0; - - *size_ptr = size; - } - - void combine_entries(int j, const int* rowptr, const int* col_indices, - std::vector& send_indices, int* size_ptr) - { - int idx_start, idx_end; - int row_start, row_end; - int size, row; - - idx_start = indptr_T[j]; - idx_end = indptr_T[j+1]; - for (int k = idx_start; k < idx_end; k++) - { - row = indices[k]; - row_start = rowptr[row]; - row_end = rowptr[row+1]; - for (int l = row_start; l < row_end; l++) - { - send_indices.emplace_back(col_indices[l]); - } - } - if (send_indices.size()) - { - size = 1; - std::sort(send_indices.begin(), send_indices.end()); - int s_send = send_indices.size(); - for (int k = 1; k < s_send; k++) - { - if (send_indices[k] != send_indices[size - 1]) - { - send_indices[size++] = send_indices[k]; - } - } - } - else size = 0; - - *size_ptr = size; - } - - - // TODO -- how to communicate block matrices? 
- // - void send(char* send_buffer, - const int* rowptr, - const int* col_indices, - const double* values, - int key, RAPtor_MPI_Comm mpi_comm, - const int block_size = 1) - { - send_helper(send_buffer, rowptr, col_indices, values, key, mpi_comm, block_size); - } - void send(char* send_buffer, - const int* rowptr, - const int* col_indices, - double const* const* values, - int key, RAPtor_MPI_Comm mpi_comm, - const int block_size = 1) - { - send_helper(send_buffer, rowptr, col_indices, values, key, mpi_comm, block_size); - } - - int get_msg_size(const int* rowptr, const bool has_vals, RAPtor_MPI_Comm mpi_comm, - const int block_size = 1) - { - int num_ints, num_doubles; - int double_bytes, bytes; - - // Calculate message size (upper bound) - num_ints = indptr[num_msgs] - indptr[0]; - num_doubles = 0; - for (std::vector::iterator it = indices.begin(); - it != indices.end(); ++it) - { - num_doubles += (rowptr[*it+1] - rowptr[*it]); - } - num_ints += num_doubles; - RAPtor_MPI_Pack_size(num_ints, RAPtor_MPI_INT, mpi_comm, &bytes); - if (has_vals) - { - RAPtor_MPI_Pack_size(num_doubles * block_size, RAPtor_MPI_DOUBLE, mpi_comm, &double_bytes); - bytes += double_bytes; - } - - return bytes; - } - - template - void send_helper(char* send_buffer, - const int* rowptr, - const int* col_indices, - const T& values, - int key, RAPtor_MPI_Comm mpi_comm, - const int block_size = 1) - { - if (num_msgs == 0) return; - - int start, end, proc; - int ctr, prev_ctr, size; - int bytes; - - // Resize send buffer - bytes = get_msg_size(rowptr, values, mpi_comm, block_size); - - ctr = 0; - prev_ctr = 0; - for (int i = 0; i < num_msgs; i++) - { - proc = procs[i]; - start = indptr[i]; - end = indptr[i+1]; - for (int j = start; j < end; j++) - { - std::vector send_indices; - std::vector send_values; - - if (values) - { - combine_entries(j, rowptr, col_indices, values, block_size, - send_indices, send_values, &size); - } - else - { - combine_entries(j, rowptr, col_indices, send_indices, &size); 
- } - RAPtor_MPI_Pack(&size, 1, RAPtor_MPI_INT, send_buffer, bytes, &ctr, mpi_comm); - RAPtor_MPI_Pack(send_indices.data(), size, RAPtor_MPI_INT, send_buffer, - bytes, &ctr, mpi_comm); - - if (values) - { - pack_values(send_values.data(), 0, size, send_buffer, bytes, &ctr, - mpi_comm, block_size); - } - } - RAPtor_MPI_Isend(&(send_buffer[prev_ctr]), ctr - prev_ctr, RAPtor_MPI_PACKED, proc, - key, mpi_comm, &(requests[i])); - prev_ctr = ctr; - } - } - - std::vector indptr_T; - -}; - -} -#endif - diff --git a/raptor/core/comm_mat.cpp b/raptor/core/comm_mat.cpp deleted file mode 100644 index d1fa40da..00000000 --- a/raptor/core/comm_mat.cpp +++ /dev/null @@ -1,810 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "comm_pkg.hpp" -#include "par_matrix.hpp" - -using namespace raptor; - -// Forward Declarations - -// Helper Methods -template std::vector& create_mat(int n, int m, int b_n, int b_m, - CSRMatrix** mat_ptr); -template CSRMatrix* communication_helper(const int* rowptr, - const int* col_indices, const T& values, - CommData* send_comm, CommData* recv_comm, int key, RAPtor_MPI_Comm mpi_comm, - const int b_rows, const int b_cols, const bool has_vals = true); -template void init_comm_helper(char* send_buffer, - const int* rowptr, const int* col_indices, const T& values, - CommData* send_comm, int key, RAPtor_MPI_Comm mpi_comm, const int b_rows, - const int b_cols); -CSRMatrix* complete_comm_helper(CommData* send_comm, - CommData* recv_comm, int key, RAPtor_MPI_Comm mpi_comm, const int b_rows, - const int b_cols, const bool has_vals = true); - -template CSRMatrix* transpose_recv(CSRMatrix* recv_mat_T, - std::vector& T_vals, NonContigData* send_data, int n); -template CSRMatrix* combine_recvs(CSRMatrix* L_mat, CSRMatrix* R_mat, - std::vector& L_vals, std::vector& R_vals, const int b_rows, - const int b_cols, NonContigData* local_L_recv, NonContigData* local_R_recv, - 
std::vector& row_sizes); -template CSRMatrix* combine_recvs_T(CSRMatrix* L_mat, - CSRMatrix* final_mat, NonContigData* local_L_send, NonContigData* final_send, - std::vector& L_vals, std::vector& final_vals, int n, - int b_rows, int b_cols); - - -// Main Methods -CSRMatrix* CommPkg::communicate(ParCSRMatrix* A, const bool has_vals) -{ - std::vector send_buffer; - init_par_mat_comm(A, send_buffer, has_vals); - return complete_mat_comm(A->on_proc->b_rows, A->on_proc->b_cols, - has_vals); -} -CSRMatrix* CommPkg::communicate(ParBSRMatrix* A, const bool has_vals) -{ - std::vector send_buffer; - init_par_mat_comm(A, send_buffer, has_vals); - return complete_mat_comm(A->on_proc->b_rows, A->on_proc->b_cols, - has_vals); -} -void CommPkg::init_par_mat_comm(ParCSRMatrix* A, std::vector& send_buffer, - const bool has_vals) -{ - int start, end; - int ctr; - int global_col; - - int nnz = A->on_proc->nnz + A->off_proc->nnz; - std::vector rowptr(A->local_num_rows + 1); - std::vector col_indices; - std::vector values; - if (nnz) - { - col_indices.resize(nnz); - if (has_vals) - values.resize(nnz); - } - - ctr = 0; - rowptr[0] = ctr; - for (int i = 0; i < A->local_num_rows; i++) - { - start = A->on_proc->idx1[i]; - end = A->on_proc->idx1[i+1]; - for (int j = start; j < end; j++) - { - global_col = A->on_proc_column_map[A->on_proc->idx2[j]]; - if (has_vals) values[ctr] = A->on_proc->vals[j]; - col_indices[ctr++] = global_col; - } - - start = A->off_proc->idx1[i]; - end = A->off_proc->idx1[i+1]; - for (int j = start; j < end; j++) - { - global_col = A->off_proc_column_map[A->off_proc->idx2[j]]; - if (has_vals) values[ctr] = A->off_proc->vals[j]; - col_indices[ctr++] = global_col; - } - rowptr[i+1] = ctr; - } - return init_mat_comm(send_buffer, rowptr, col_indices, values, - A->on_proc->b_rows, A->on_proc->b_cols, has_vals); -} -void CommPkg::init_par_mat_comm(ParBSRMatrix* A, std::vector& send_buffer, - const bool has_vals) -{ - int start, end; - int ctr; - int global_col; - - int nnz 
= A->on_proc->nnz + A->off_proc->nnz; - std::vector rowptr(A->local_num_rows + 1); - std::vector col_indices; - std::vector values; - if (nnz) - { - col_indices.resize(nnz); - if (has_vals) - values.resize(nnz); - } - - BSRMatrix* A_on = (BSRMatrix*) A->on_proc; - BSRMatrix* A_off = (BSRMatrix*) A->off_proc; - - ctr = 0; - rowptr[0] = ctr; - for (int i = 0; i < A->local_num_rows; i++) - { - start = A->on_proc->idx1[i]; - end = A->on_proc->idx1[i+1]; - for (int j = start; j < end; j++) - { - global_col = A->on_proc_column_map[A->on_proc->idx2[j]]; - if (has_vals) values[ctr] = A->on_proc->copy_val(A_on->block_vals[j]); - col_indices[ctr++] = global_col; - } - - start = A->off_proc->idx1[i]; - end = A->off_proc->idx1[i+1]; - for (int j = start; j < end; j++) - { - global_col = A->off_proc_column_map[A->off_proc->idx2[j]]; - if (has_vals) values[ctr] = A->off_proc->copy_val(A_off->block_vals[j]); - col_indices[ctr++] = global_col; - } - rowptr[i+1] = ctr; - } - return init_mat_comm(send_buffer, rowptr, col_indices, values, - A->on_proc->b_rows, A->on_proc->b_cols, has_vals); -} - -CSRMatrix* ParComm::communicate(const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows, const int b_cols, const bool has_vals) -{ - std::vector send_buffer; - init_mat_comm(send_buffer, rowptr, col_indices, values, b_rows, b_cols, has_vals); - return complete_mat_comm(b_rows, b_cols, has_vals); -} -CSRMatrix* ParComm::communicate(const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows, const int b_cols, const bool has_vals) -{ - std::vector send_buffer; - init_mat_comm(send_buffer, rowptr, col_indices, values, b_rows, b_cols, has_vals); - return complete_mat_comm(b_rows, b_cols, has_vals); -} - -void ParComm::init_mat_comm(std::vector& send_buffer, - const std::vector& rowptr, const std::vector& col_indices, - const std::vector& values, const int b_rows, const int b_cols, - const bool 
has_vals) -{ - int s = send_data->get_msg_size(rowptr.data(), values.data(), mpi_comm, b_rows * b_cols); - send_buffer.resize(s); - init_comm_helper(send_buffer.data(), rowptr.data(), col_indices.data(), values.data(), - send_data, key, mpi_comm, b_rows, b_cols); -} -void ParComm::init_mat_comm(std::vector& send_buffer, - const std::vector& rowptr, const std::vector& col_indices, - const std::vector& values, const int b_rows, const int b_cols, - const bool has_vals) -{ - int s = send_data->get_msg_size(rowptr.data(), values.data(), mpi_comm, b_rows * b_cols); - send_buffer.resize(s); - init_comm_helper(send_buffer.data(), rowptr.data(), col_indices.data(), values.data(), - send_data, key, mpi_comm, b_rows, b_cols); -} - -CSRMatrix* ParComm::complete_mat_comm(const int b_rows, const int b_cols, - const bool has_vals) -{ - CSRMatrix* recv_mat = complete_comm_helper(send_data, recv_data, key, mpi_comm, - b_rows, b_cols, has_vals); - key++; - return recv_mat; -} - - -CSRMatrix* ParComm::communicate_T(const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int n_result_rows, const int b_rows, const int b_cols, const bool has_vals) -{ - std::vector send_buffer; - init_mat_comm_T(send_buffer, rowptr, col_indices, values, b_rows, b_cols, has_vals); - return complete_mat_comm_T(n_result_rows, b_rows, b_cols, has_vals); -} -CSRMatrix* ParComm::communicate_T(const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int n_result_rows, const int b_rows, const int b_cols, const bool has_vals) -{ - std::vector send_buffer; - init_mat_comm_T(send_buffer, rowptr, col_indices, values, b_rows, b_cols, has_vals); - return complete_mat_comm_T(n_result_rows, b_rows, b_cols, has_vals); -} -void ParComm::init_mat_comm_T(std::vector& send_buffer, const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows, const int b_cols, const bool has_vals) -{ - int s = 
recv_data->get_msg_size(rowptr.data(), values.data(), mpi_comm, b_rows * b_cols); - send_buffer.resize(s); - init_comm_helper(send_buffer.data(), rowptr.data(), col_indices.data(), values.data(), - recv_data, key, mpi_comm, b_rows, b_cols); -} -void ParComm::init_mat_comm_T(std::vector& send_buffer, const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows, const int b_cols, const bool has_vals) -{ - int s = recv_data->get_msg_size(rowptr.data(), values.data(), mpi_comm, b_rows * b_cols); - send_buffer.resize(s); - init_comm_helper(send_buffer.data(), rowptr.data(), col_indices.data(), values.data(), - recv_data, key, mpi_comm, b_rows, b_cols); -} -CSRMatrix* ParComm::complete_mat_comm_T(const int n_result_rows, const int b_rows, const int b_cols, const bool has_vals) -{ - CSRMatrix* recv_mat_T = complete_comm_helper(recv_data, send_data, key, mpi_comm, - b_rows, b_cols, has_vals); - - CSRMatrix* recv_mat; - if (b_rows > 1 || b_cols > 1) - { - BSRMatrix* recv_mat_T_bsr = (BSRMatrix*) recv_mat_T; - recv_mat = transpose_recv(recv_mat_T_bsr, recv_mat_T_bsr->block_vals, - send_data, n_result_rows); - } - else - { - recv_mat = transpose_recv(recv_mat_T, recv_mat_T->vals, - send_data, n_result_rows); - } - - delete recv_mat_T; - return recv_mat; -} - - - - - - -CSRMatrix* TAPComm::communicate(const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows, const int b_cols, const bool has_vals) -{ - std::vector send_buffer; - init_mat_comm(send_buffer, rowptr, col_indices, values, b_rows, b_cols, has_vals); - return complete_mat_comm(b_rows, b_cols, has_vals); -} - -CSRMatrix* TAPComm::communicate(const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows, const int b_cols, const bool has_vals) -{ - std::vector send_buffer; - init_mat_comm(send_buffer, rowptr, col_indices, values, b_rows, b_cols, has_vals); - return 
complete_mat_comm(b_rows, b_cols, has_vals); -} -void TAPComm::init_mat_comm(std::vector& send_buffer, const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows, const int b_cols, const bool has_vals) -{ - int block_size = b_rows * b_cols; - int l_bytes = local_L_par_comm->send_data->get_msg_size(rowptr.data(), - values.data(), local_L_par_comm->mpi_comm, block_size); - int g_bytes; - - if (local_S_par_comm) - { - CSRMatrix* S_mat = local_S_par_comm->communicate(rowptr, col_indices, values, - b_rows, b_cols, has_vals); - g_bytes = global_par_comm->send_data->get_msg_size(S_mat->idx1.data(), - S_mat->vals.data(), global_par_comm->mpi_comm, block_size); - send_buffer.resize(l_bytes + g_bytes); - - init_comm_helper(&(send_buffer[0]), S_mat->idx1.data(), - S_mat->idx2.data(), S_mat->vals.data(), global_par_comm->send_data, - global_par_comm->key, global_par_comm->mpi_comm, b_rows, b_cols); - delete S_mat; - } - else - { - g_bytes = global_par_comm->send_data->get_msg_size(rowptr.data(), - values.data(), global_par_comm->mpi_comm, block_size); - send_buffer.resize(l_bytes + g_bytes); - init_comm_helper(&(send_buffer[0]), rowptr.data(), col_indices.data(), - values.data(), global_par_comm->send_data, global_par_comm->key, - global_par_comm->mpi_comm, b_rows, b_cols); - } - - init_comm_helper(&(send_buffer[g_bytes]), rowptr.data(), col_indices.data(), - values.data(), local_L_par_comm->send_data, local_L_par_comm->key, - local_L_par_comm->mpi_comm, b_rows, b_cols); -} - - -void TAPComm::init_mat_comm(std::vector& send_buffer, const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows, const int b_cols, const bool has_vals) -{ - int block_size = b_rows * b_cols; - int l_bytes = local_L_par_comm->send_data->get_msg_size(rowptr.data(), - values.data(), local_L_par_comm->mpi_comm, block_size); - int g_bytes; - - if (local_S_par_comm) - { - BSRMatrix* S_mat = (BSRMatrix*) 
local_S_par_comm->communicate(rowptr, col_indices, values, - b_rows, b_cols, has_vals); - g_bytes = global_par_comm->send_data->get_msg_size(S_mat->idx1.data(), - S_mat->block_vals.data(), global_par_comm->mpi_comm, block_size); - send_buffer.resize(l_bytes + g_bytes); - - init_comm_helper(&(send_buffer[0]), S_mat->idx1.data(), - S_mat->idx2.data(), S_mat->vals.data(), global_par_comm->send_data, - global_par_comm->key, global_par_comm->mpi_comm, b_rows, b_cols); - delete S_mat; - } - else - { - g_bytes = global_par_comm->send_data->get_msg_size(rowptr.data(), - values.data(), global_par_comm->mpi_comm, block_size); - send_buffer.resize(l_bytes + g_bytes); - init_comm_helper(&(send_buffer[0]), rowptr.data(), col_indices.data(), - values.data(), global_par_comm->send_data, global_par_comm->key, - global_par_comm->mpi_comm, b_rows, b_cols); - } - - init_comm_helper(&(send_buffer[g_bytes]), rowptr.data(), col_indices.data(), - values.data(), local_L_par_comm->send_data, local_L_par_comm->key, - local_L_par_comm->mpi_comm, b_rows, b_cols); -} - -CSRMatrix* TAPComm::complete_mat_comm(const int b_rows, const int b_cols, const bool has_vals) -{ - CSRMatrix* G_mat = global_par_comm->complete_mat_comm(b_rows, b_cols, has_vals); - CSRMatrix* L_mat = local_L_par_comm->complete_mat_comm(b_rows, b_cols, has_vals); - - CSRMatrix* R_mat; - CSRMatrix* recv_mat; - if (b_rows > 1 || b_cols > 1) - { - BSRMatrix* G_mat_bsr = (BSRMatrix*) G_mat; - R_mat = local_R_par_comm->communicate(G_mat_bsr->idx1, G_mat_bsr->idx2, - G_mat_bsr->block_vals, b_rows, b_cols, has_vals); - - BSRMatrix* R_mat_bsr = (BSRMatrix*) R_mat; - BSRMatrix* L_mat_bsr = (BSRMatrix*) L_mat; - - // Create recv_mat (combination of L_mat and R_mat) - recv_mat = combine_recvs(L_mat_bsr, R_mat_bsr, - L_mat_bsr->block_vals, R_mat_bsr->block_vals, b_rows, b_cols, - (NonContigData*) local_L_par_comm->recv_data, - (NonContigData*) local_R_par_comm->recv_data, - get_buffer()); - } - else - { - R_mat = 
local_R_par_comm->communicate(G_mat->idx1, G_mat->idx2, - G_mat->vals, b_rows, b_cols, has_vals); - - // Create recv_mat (combination of L_mat and R_mat) - recv_mat = combine_recvs(L_mat, R_mat, - L_mat->vals, R_mat->vals, b_rows, b_cols, - (NonContigData*) local_L_par_comm->recv_data, - (NonContigData*) local_R_par_comm->recv_data, - get_buffer()); - } - delete G_mat; - delete R_mat; - delete L_mat; - - return recv_mat; -} - - -CSRMatrix* TAPComm::communicate_T(const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int n_result_rows, const int b_rows, const int b_cols, const bool has_vals) -{ - std::vector send_buffer; - init_mat_comm_T(send_buffer, rowptr, col_indices, values, b_rows, b_cols, has_vals); - return complete_mat_comm_T(n_result_rows, b_rows, b_cols, has_vals); -} - -CSRMatrix* TAPComm::communicate_T(const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int n_result_rows, const int b_rows, const int b_cols, const bool has_vals) -{ - std::vector send_buffer; - init_mat_comm_T(send_buffer, rowptr, col_indices, values, b_rows, b_cols, has_vals); - return complete_mat_comm_T(n_result_rows, b_rows, b_cols, has_vals); -} -void TAPComm::init_mat_comm_T(std::vector& send_buffer, const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows, const int b_cols, const bool has_vals) -{ - int block_size = b_rows * b_cols; - - // Transpose communication with local_R_par_comm - CSRMatrix* R_mat = communication_helper(rowptr.data(), col_indices.data(), - values.data(), local_R_par_comm->recv_data, - local_R_par_comm->send_data, local_R_par_comm->key, - local_R_par_comm->mpi_comm, b_rows, b_cols, has_vals); - local_R_par_comm->key++; - - // Calculate size of send_buffer for global and local_L - int l_bytes = local_L_par_comm->recv_data->get_msg_size(rowptr.data(), - values.data(), local_L_par_comm->mpi_comm, block_size); - int g_bytes = 
global_par_comm->recv_data->get_msg_size(R_mat->idx1.data(), - R_mat->vals.data(), global_par_comm->mpi_comm, block_size); - send_buffer.resize(l_bytes + g_bytes); - - // Initialize global_par_comm - init_comm_helper(&(send_buffer[0]), R_mat->idx1.data(), R_mat->idx2.data(), - R_mat->vals.data(), global_par_comm->recv_data, global_par_comm->key, - global_par_comm->mpi_comm, b_rows, b_cols); - delete R_mat; - - // Initialize local_L_par_comm - init_comm_helper(&(send_buffer[g_bytes]), rowptr.data(), col_indices.data(), - values.data(), local_L_par_comm->recv_data, - local_L_par_comm->key, local_L_par_comm->mpi_comm, - b_rows, b_cols); -} -void TAPComm::init_mat_comm_T(std::vector& send_buffer, const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows, const int b_cols, const bool has_vals) -{ - int block_size = b_rows * b_cols; - - // Transpose communication with local_R_par_comm - BSRMatrix* R_mat = (BSRMatrix*) communication_helper(rowptr.data(), col_indices.data(), - values.data(), local_R_par_comm->recv_data, - local_R_par_comm->send_data, local_R_par_comm->key, - local_R_par_comm->mpi_comm, b_rows, b_cols, has_vals); - local_R_par_comm->key++; - - // Calculate size of send_buffer for global and local_L - int l_bytes = local_L_par_comm->recv_data->get_msg_size(rowptr.data(), - values.data(), local_L_par_comm->mpi_comm, block_size); - int g_bytes = global_par_comm->recv_data->get_msg_size(R_mat->idx1.data(), - R_mat->block_vals.data(), global_par_comm->mpi_comm, block_size); - send_buffer.resize(l_bytes + g_bytes); - - // Initialize global_par_comm - init_comm_helper(&(send_buffer[0]), R_mat->idx1.data(), R_mat->idx2.data(), - R_mat->block_vals.data(), global_par_comm->recv_data, global_par_comm->key, - global_par_comm->mpi_comm, b_rows, b_cols); - delete R_mat; - - // Initialize local_L_par_comm - init_comm_helper(&(send_buffer[g_bytes]), rowptr.data(), col_indices.data(), - values.data(), 
local_L_par_comm->recv_data, - local_L_par_comm->key, local_L_par_comm->mpi_comm, - b_rows, b_cols); - -} -CSRMatrix* TAPComm::complete_mat_comm_T(const int n_result_rows, const int b_rows, const int b_cols, const bool has_vals) -{ - CSRMatrix* G_mat = complete_comm_helper(global_par_comm->recv_data, - global_par_comm->send_data, global_par_comm->key, - global_par_comm->mpi_comm, b_rows, b_cols, has_vals); - global_par_comm->key++; - - - CSRMatrix* L_mat = complete_comm_helper(local_L_par_comm->recv_data, - local_L_par_comm->send_data, local_L_par_comm->key, - local_L_par_comm->mpi_comm, b_rows, b_cols, has_vals); - local_L_par_comm->key++; - - - CSRMatrix* final_mat; - CSRMatrix* recv_mat; - ParComm* final_comm; - if (b_rows > 1 || b_cols > 1) - { - BSRMatrix* L_mat_bsr = (BSRMatrix*) L_mat; - if (local_S_par_comm) - { - BSRMatrix* G_mat_bsr = (BSRMatrix*) G_mat; - final_mat = communication_helper(G_mat_bsr->idx1.data(), G_mat_bsr->idx2.data(), - G_mat_bsr->block_vals.data(), local_S_par_comm->recv_data, - local_S_par_comm->send_data, local_S_par_comm->key, - local_S_par_comm->mpi_comm, b_rows, b_cols, has_vals); - local_S_par_comm->key++; - delete G_mat; - final_comm = local_S_par_comm; - } - else - { - final_mat = G_mat; - final_comm = global_par_comm; - } - BSRMatrix* final_mat_bsr = (BSRMatrix*) final_mat; - - recv_mat = combine_recvs_T(L_mat_bsr, final_mat_bsr, - local_L_par_comm->send_data, final_comm->send_data, - L_mat_bsr->vals, final_mat_bsr->vals, n_result_rows, b_rows, b_cols); - } - else - { - if (local_S_par_comm) - { - final_mat = communication_helper(G_mat->idx1.data(), G_mat->idx2.data(), - G_mat->vals.data(), local_S_par_comm->recv_data, local_S_par_comm->send_data, - local_S_par_comm->key, local_S_par_comm->mpi_comm, b_rows, b_cols, has_vals); - local_S_par_comm->key++; - delete G_mat; - final_comm = local_S_par_comm; - } - else - { - final_mat = G_mat; - final_comm = global_par_comm; - } - - recv_mat = combine_recvs_T(L_mat, final_mat, - 
local_L_par_comm->send_data, final_comm->send_data, - L_mat->vals, final_mat->vals, n_result_rows, b_rows, b_cols); - } - - - - - delete L_mat; - delete final_mat; - - return recv_mat; -} - - - - - - -// Helper Methods -// Create matrix (either CSR or BSR) -template<> std::vector& create_mat(int n, int m, int b_n, int b_m, - CSRMatrix** mat_ptr) -{ - CSRMatrix* recv_mat = new CSRMatrix(n, m); - *mat_ptr = recv_mat; - return recv_mat->vals; -} -template<> std::vector& create_mat(int n, int m, int b_n, int b_m, - CSRMatrix** mat_ptr) -{ - BSRMatrix* recv_mat = new BSRMatrix(n, m, b_n, b_m); - *mat_ptr = recv_mat; - return recv_mat->block_vals; -} - -template // double* or double** -CSRMatrix* communication_helper(const int* rowptr, - const int* col_indices, const T& values, - CommData* send_comm, CommData* recv_comm, int key, RAPtor_MPI_Comm mpi_comm, - const int b_rows, const int b_cols, const bool has_vals) -{ - std::vector send_buffer; - int s = send_comm->get_msg_size(rowptr, values, mpi_comm, b_rows * b_cols); - send_buffer.resize(s); - init_comm_helper(send_buffer.data(), rowptr, col_indices, values, send_comm, - key, mpi_comm, b_rows, b_cols); - return complete_comm_helper(send_comm, recv_comm, key, mpi_comm, - b_rows, b_cols, has_vals); -} -template // double* or double** -void init_comm_helper(char* send_buffer, const int* rowptr, - const int* col_indices, const T& values, - CommData* send_comm, int key, RAPtor_MPI_Comm mpi_comm, - const int b_rows, const int b_cols) -{ - int block_size = b_rows * b_cols; - if (profile) mat_t -= RAPtor_MPI_Wtime(); - send_comm->send(send_buffer, rowptr, col_indices, values, - key, mpi_comm, block_size); - if (profile) mat_t += RAPtor_MPI_Wtime(); -} -CSRMatrix* complete_comm_helper(CommData* send_comm, CommData* recv_comm, int key, - RAPtor_MPI_Comm mpi_comm, const int b_rows, const int b_cols, const bool has_vals) -{ - CSRMatrix* recv_mat; - - // Form recv_mat - int block_size = b_rows * b_cols; - if (b_rows > 1 || b_cols > 
1) - recv_mat = new BSRMatrix(recv_comm->size_msgs, -1, b_rows, b_cols); - else - recv_mat = new CSRMatrix(recv_comm->size_msgs, -1); - - // Recv contents of recv_mat - if (profile) mat_t -= RAPtor_MPI_Wtime(); - recv_comm->recv(recv_mat, key, mpi_comm, block_size, has_vals); - if (send_comm->num_msgs) - RAPtor_MPI_Waitall(send_comm->num_msgs, send_comm->requests.data(), - RAPtor_MPI_STATUSES_IGNORE); - if (profile) mat_t += RAPtor_MPI_Wtime(); - return recv_mat; -} - - - -template -CSRMatrix* transpose_recv(CSRMatrix* recv_mat_T, std::vector& T_vals, - NonContigData* send_data, int n) -{ - int idx, ptr; - int start, end; - - CSRMatrix* recv_mat; - std::vector& vals = create_mat(n, -1, recv_mat_T->b_rows, - recv_mat_T->b_cols, &recv_mat); - - if (n == 0) return recv_mat; - - std::vector row_sizes(n, 0); - for (int i = 0; i < send_data->size_msgs; i++) - { - idx = send_data->indices[i]; - start = recv_mat_T->idx1[i]; - end = recv_mat_T->idx1[i+1]; - row_sizes[idx] += end - start; - } - recv_mat->idx1[0] = 0; - for (int i = 0; i < n; i++) - { - recv_mat->idx1[i+1] = recv_mat->idx1[i] + row_sizes[i]; - row_sizes[i] = 0; - } - recv_mat->nnz = recv_mat->idx1[n]; - if (recv_mat->nnz) - { - recv_mat->idx2.resize(recv_mat->nnz); - if (T_vals.size()) - vals.resize(recv_mat->nnz); - } - for (int i = 0; i < send_data->size_msgs; i++) - { - idx = send_data->indices[i]; - start = recv_mat_T->idx1[i]; - end = recv_mat_T->idx1[i+1]; - for (int j = start; j < end; j++) - { - ptr = recv_mat->idx1[idx] + row_sizes[idx]++; - recv_mat->idx2[ptr] = recv_mat_T->idx2[j]; - if (recv_mat_T->vals.size()) - vals[ptr] = T_vals[j]; - } - } - return recv_mat; -} - -template -CSRMatrix* combine_recvs(CSRMatrix* L_mat, CSRMatrix* R_mat, - std::vector& L_vals, std::vector& R_vals, - const int b_rows, const int b_cols, - NonContigData* local_L_recv, NonContigData* local_R_recv, - std::vector& row_sizes) -{ - int row; - int start, end; - - CSRMatrix* recv_mat; - std::vector& vals = 
create_mat(L_mat->n_rows + R_mat->n_rows, -1, b_rows, b_cols, - &recv_mat); - recv_mat->nnz = L_mat->nnz + R_mat->nnz; - int ptr; - if (recv_mat->nnz) - { - recv_mat->idx2.resize(recv_mat->nnz); - if (L_vals.size() || R_vals.size()) - vals.resize(recv_mat->nnz); - } - - for (int i = 0; i < R_mat->n_rows; i++) - { - start = R_mat->idx1[i]; - end = R_mat->idx1[i+1]; - row = local_R_recv->indices[i]; - row_sizes[row] = end - start; - } - for (int i = 0; i < L_mat->n_rows; i++) - { - start = L_mat->idx1[i]; - end = L_mat->idx1[i+1]; - row = local_L_recv->indices[i]; - row_sizes[row] = end - start; - } - recv_mat->idx1[0] = 0; - for (int i = 0; i < recv_mat->n_rows; i++) - { - recv_mat->idx1[i+1] = recv_mat->idx1[i] + row_sizes[i]; - row_sizes[i] = 0; - } - for (int i = 0; i < R_mat->n_rows; i++) - { - start = R_mat->idx1[i]; - end = R_mat->idx1[i+1]; - row = local_R_recv->indices[i]; - for (int j = start; j < end; j++) - { - ptr = recv_mat->idx1[row] + row_sizes[row]++; - recv_mat->idx2[ptr] = R_mat->idx2[j]; - if (vals.size()) - vals[ptr] = R_mat->copy_val(R_vals[j]); - } - } - for (int i = 0; i < L_mat->n_rows; i++) - { - start = L_mat->idx1[i]; - end = L_mat->idx1[i+1]; - row = local_L_recv->indices[i]; - for (int j = start; j < end; j++) - { - ptr = recv_mat->idx1[row] + row_sizes[row]++; - recv_mat->idx2[ptr] = L_mat->idx2[j]; - if (vals.size()) - vals[ptr] = L_mat->copy_val(L_vals[j]); - } - } - - return recv_mat; -} - -template -CSRMatrix* combine_recvs_T(CSRMatrix* L_mat, CSRMatrix* final_mat, - NonContigData* local_L_send, NonContigData* final_send, - std::vector& L_vals, std::vector& final_vals, - int n, int b_rows, int b_cols) -{ - int row_start, row_end, row_size; - int row, idx; - - CSRMatrix* recv_mat; - std::vector& vals = create_mat(n, -1, b_rows, b_cols, - &recv_mat); - - std::vector row_sizes(n, 0); - int nnz = L_mat->nnz + final_mat->nnz; - if (nnz) - { - recv_mat->idx2.resize(nnz); - if (L_vals.size() || final_vals.size()) - vals.resize(nnz); - } - 
for (int i = 0; i < final_send->size_msgs; i++) - { - row = final_send->indices[i]; - row_size = final_mat->idx1[i+1] - final_mat->idx1[i]; - row_sizes[row] += row_size; - } - for (int i = 0; i < local_L_send->size_msgs; i++) - { - row = local_L_send->indices[i]; - row_size = L_mat->idx1[i+1] - L_mat->idx1[i]; - row_sizes[row] += row_size; - } - recv_mat->idx1[0] = 0; - for (int i = 0; i < n; i++) - { - recv_mat->idx1[i+1] = recv_mat->idx1[i] + row_sizes[i]; - row_sizes[i] = 0; - } - for (int i = 0; i < final_send->size_msgs; i++) - { - row = final_send->indices[i]; - row_start = final_mat->idx1[i]; - row_end = final_mat->idx1[i+1]; - for (int j = row_start; j < row_end; j++) - { - idx = recv_mat->idx1[row] + row_sizes[row]++; - recv_mat->idx2[idx] = final_mat->idx2[j]; - if (final_vals.size()) - vals[idx] = final_vals[j]; - } - } - for (int i = 0; i < local_L_send->size_msgs; i++) - { - row = local_L_send->indices[i]; - row_start = L_mat->idx1[i]; - row_end = L_mat->idx1[i+1]; - for (int j = row_start; j < row_end; j++) - { - idx = recv_mat->idx1[row] + row_sizes[row]++; - recv_mat->idx2[idx] = L_mat->idx2[j]; - if (L_vals.size()) - vals[idx] = L_vals[j]; - } - } - recv_mat->nnz = recv_mat->idx2.size(); - recv_mat->sort(); - - return recv_mat; -} - - - diff --git a/raptor/core/comm_pkg.cpp b/raptor/core/comm_pkg.cpp deleted file mode 100644 index 5f27de8f..00000000 --- a/raptor/core/comm_pkg.cpp +++ /dev/null @@ -1,203 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "comm_pkg.hpp" -#include "par_matrix.hpp" -#include "utilities.hpp" - -namespace raptor -{ - template<> - std::vector& CommPkg::get_buffer() - { - return get_double_buffer(); - } - template<> - std::vector& CommPkg::get_buffer() - { - return get_int_buffer(); - } - - template<> - std::vector& CommPkg::communicate(const double* values, - const int block_size) - { - init_double_comm(values, block_size); - 
return complete_double_comm(block_size); - } - template<> - std::vector& CommPkg::communicate(const int* values, - const int block_size) - { - init_int_comm(values, block_size); - return complete_int_comm(block_size); - } - - template<> - void CommPkg::init_comm(const double* values, - const int block_size) - { - init_double_comm(values, block_size); - } - template<> - void CommPkg::init_comm(const int* values, const int block_size) - { - init_int_comm(values, block_size); - } - - template<> - std::vector& CommPkg::complete_comm(const int block_size) - { - return complete_double_comm(block_size); - } - template<> - std::vector& CommPkg::complete_comm(const int block_size) - { - return complete_int_comm(block_size); - } - - template<> - void CommPkg::communicate_T(const double* values, - std::vector& result, - const int block_size, - std::function result_func, - std::function init_result_func, - double init_result_func_val) - { - init_double_comm_T(values, block_size, init_result_func, init_result_func_val); - complete_double_comm_T(result, block_size, result_func, init_result_func, init_result_func_val); - } - template<> - void CommPkg::communicate_T(const double* values, - std::vector& result, - const int block_size, - std::function result_func, - std::function init_result_func, - double init_result_func_val) - { - init_double_comm_T(values, block_size, init_result_func, init_result_func_val); - complete_double_comm_T(result, block_size, result_func, init_result_func, init_result_func_val); - } - template<> - void CommPkg::communicate_T(const int* values, - std::vector& result, - const int block_size, - std::function result_func, - std::function init_result_func, - int init_result_func_val) - { - init_int_comm_T(values, block_size, init_result_func, init_result_func_val); - complete_int_comm_T(result, block_size, result_func, init_result_func, init_result_func_val); - } - template<> - void CommPkg::communicate_T(const int* values, - std::vector& result, - const 
int block_size, - std::function result_func, - std::function init_result_func, - int init_result_func_val) - { - init_int_comm_T(values, block_size, init_result_func, init_result_func_val); - complete_int_comm_T(result, block_size, result_func, init_result_func, init_result_func_val); - } - template<> - void CommPkg::communicate_T(const double* values, - const int block_size, - std::function init_result_func, - double init_result_func_val) - { - init_double_comm_T(values, block_size, init_result_func, init_result_func_val); - complete_double_comm_T(block_size, init_result_func, init_result_func_val); - } - template<> - void CommPkg::communicate_T(const int* values, - const int block_size, - std::function init_result_func, - int init_result_func_val) - { - init_int_comm_T(values, block_size, init_result_func, init_result_func_val); - complete_int_comm_T(block_size, init_result_func, init_result_func_val); - } - - template<> - void CommPkg::init_comm_T(const double* values, - const int block_size, - std::function init_result_func, - double init_result_func_val) - { - init_double_comm_T(values, block_size, init_result_func, init_result_func_val); - } - template<> - void CommPkg::init_comm_T(const int* values, - const int block_size, - std::function init_result_func, - int init_result_func_val) - { - init_int_comm_T(values, block_size, init_result_func, init_result_func_val); - } - - template<> - void CommPkg::complete_comm_T(span result, - const int block_size, - std::function result_func, - std::function init_result_func, - double init_result_func_val) - { - complete_double_comm_T(result, block_size, result_func, init_result_func, init_result_func_val); - } - template<> - void CommPkg::complete_comm_T(span result, - const int block_size, - std::function result_func, - std::function init_result_func, - double init_result_func_val) - { - complete_double_comm_T(result, block_size, result_func, init_result_func, init_result_func_val); - } - template<> - void 
CommPkg::complete_comm_T(span result, - const int block_size, - std::function result_func, - std::function init_result_func, - int init_result_func_val) - { - complete_int_comm_T(result, block_size, result_func, init_result_func, init_result_func_val); - } - template<> - void CommPkg::complete_comm_T(span result, - const int block_size, - std::function result_func, - std::function init_result_func, - int init_result_func_val) - { - complete_int_comm_T(result, block_size, result_func, init_result_func, init_result_func_val); - } - template<> - void CommPkg::complete_comm_T(const int block_size, - std::function init_result_func, - double init_result_func_val) - { - complete_double_comm_T(block_size, init_result_func, init_result_func_val); - } - template<> - void CommPkg::complete_comm_T(const int block_size, - std::function init_result_func, - int init_result_func_val) - { - complete_int_comm_T(block_size, init_result_func, init_result_func_val); - } -} - - -using namespace raptor; - -std::vector& CommPkg::communicate(ParVector& v, const int block_size) -{ - init_double_comm(v.local.data(), block_size); - return complete_double_comm(block_size); -} - -void CommPkg::init_comm(ParVector& v, const int block_size) -{ - init_double_comm(v.local.data(), block_size); -} diff --git a/raptor/core/comm_pkg.hpp b/raptor/core/comm_pkg.hpp deleted file mode 100644 index e7d75458..00000000 --- a/raptor/core/comm_pkg.hpp +++ /dev/null @@ -1,1841 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#ifndef RAPTOR_CORE_PARCOMM_HPP -#define RAPTOR_CORE_PARCOMM_HPP - -#include -#include "comm_data.hpp" -#include "matrix.hpp" -#include "partition.hpp" -#include "par_vector.hpp" - -#define STANDARD_PPN 4 -#define STANDARD_PROC_LAYOUT 1 - -/************************************************************** - ***** CommPkg Class: - ************************************************************** - ***** This class 
constructs a parallel communicator, containing - ***** which messages must be sent/recieved for matrix operations - ***** - ***** Methods - ***** ------- - ***** communicate(data_t* values) - ***** Communicates values to processes, based on underlying - ***** communication package - ***** form_col_to_proc(...) - ***** Maps each column in off_proc_column_map to process - ***** on which corresponding values are stored - **************************************************************/ -namespace raptor -{ - class ParCSRMatrix; - class ParBSRMatrix; - - class CommPkg - { - public: - CommPkg(Partition* partition) - { - topology = partition->topology; - topology->num_shared++; - num_shared = 0; - } - - CommPkg(Topology* _topology) - { - topology = _topology; - topology->num_shared++; - num_shared = 0; - } - - virtual ~CommPkg() - { - if (topology) - { - if (topology->num_shared) - { - topology->num_shared--; - } - else - { - delete topology; - } - } - } - - void delete_comm() - { - if (num_shared == 0) - delete this; - else num_shared--; - } - - // Matrix Communication - // TODO -- Block transpose communication - // -- Should b_rows / b_cols be switched? 
- virtual CSRMatrix* communicate(const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows = 1, const int b_cols = 1, const bool has_vals = true) = 0; - virtual CSRMatrix* communicate(const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows = 1, const int b_cols = 1, const bool has_vals = true) = 0; - virtual void init_mat_comm(std::vector& send_buffer, const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows = 1, const int b_cols = 1, const bool has_vals = true) = 0; - virtual void init_mat_comm(std::vector& send_buffer, const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows = 1, const int b_cols = 1, const bool has_vals = true) = 0; - virtual CSRMatrix* complete_mat_comm(const int b_rows = 1, const int b_cols = 1, - const bool has_vals = true) = 0; - - virtual CSRMatrix* communicate_T(const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int n_result_rows, const int b_rows = 1, const int b_cols = 1, - const bool has_vals = true) = 0; - virtual CSRMatrix* communicate_T(const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int n_result_rows, const int b_rows = 1, const int b_cols = 1, - const bool has_vals = true) = 0; - virtual void init_mat_comm_T(std::vector& send_buffer, - const std::vector& rowptr, const std::vector& col_indices, - const std::vector& values, const int b_rows = 1, - const int b_cols = 1, const bool has_vals = true) = 0; - virtual void init_mat_comm_T(std::vector& send_buffer, - const std::vector& rowptr, const std::vector& col_indices, - const std::vector& values, const int b_rows = 1, - const int b_cols = 1, const bool has_vals = true) = 0; - virtual CSRMatrix* complete_mat_comm_T(const int n_result_rows, - const int b_rows = 1, const int b_cols = 1, - const bool 
has_vals = true) = 0; - - std::vector& get_vals(CSRMatrix* A) - { - return A->vals; - } - std::vector get_vals(BSRMatrix* A) - { - return A->block_vals; - } - - CSRMatrix* communicate_sparsity(ParCSRMatrix* A) - { - return communicate(A, false); - } - - CSRMatrix* communicate(ParCSRMatrix* A, const bool has_vals = true); - CSRMatrix* communicate(ParBSRMatrix* A, const bool has_vals = true); - void init_par_mat_comm(ParCSRMatrix* A, std::vector& send_buffer, - const bool has_vals = true); - void init_par_mat_comm(ParBSRMatrix* A, std::vector& send_buffer, - const bool has_vals = true); - - CSRMatrix* communicate(CSRMatrix* A, const int has_vals = true) - { - return communicate(A->idx1, A->idx2, get_vals(A), A->b_rows, A->b_cols, has_vals); - } - CSRMatrix* communicate_T(CSRMatrix* A, const int has_vals = true) - { - return communicate_T(A->idx1, A->idx2, get_vals(A), A->n_rows, A->b_rows, - A->b_cols, has_vals); - } - - // Vector Communication - std::vector& communicate(ParVector& v, const int block_size = 1); - void init_comm(ParVector& v, const int block_size = 1); - - // Standard Communication - template - std::vector& communicate(const std::vector& values, const int block_size = 1) - { - return communicate(values.data(), block_size); - } - template - void init_comm(const std::vector& values, const int block_size = 1) - { - init_comm(values.data(), block_size); - } - template void init_comm(const T* values, const int block_size = 1); - template std::vector& complete_comm(const int block_size = 1); - template std::vector& communicate(const T* values, const int block_size = 1); - virtual void init_double_comm(const double* values, const int block_size) = 0; - virtual void init_int_comm(const int* values, const int block_size) = 0; - virtual std::vector& complete_double_comm(const int block_size) = 0; - virtual std::vector& complete_int_comm(const int block_size) = 0; - - // Transpose Communication - template - void communicate_T(const std::vector& values, 
std::vector& result, - const int block_size = 1, - std::function result_func = &sum_func, - std::function init_result_func = &sum_func, - T init_result_func_val = 0) - { - communicate_T(values.data(), result, block_size, result_func, - init_result_func, init_result_func_val); - } - template - void communicate_T(const std::vector& values, - const int block_size = 1, - std::function init_result_func = &sum_func, - T init_result_func_val = 0) - { - communicate_T(values.data(), block_size, init_result_func, - init_result_func_val); - } - template - void init_comm_T(const std::vector& values, - const int block_size = 1, - std::function init_result_func = &sum_func, - T init_result_func_val = 0) - { - init_comm_T(values.data(), block_size, init_result_func, init_result_func_val); - } - template void init_comm_T(const T* values, - const int block_size = 1, - std::function init_result_func = &sum_func, - T init_result_func_val = 0); - template void complete_comm_T(span result, - const int block_size = 1, - std::function result_func = &sum_func, - std::function init_result_func = &sum_func, - T init_result_func_val = 0); - template void complete_comm_T( - const int block_size = 1, - std::function init_result_func = &sum_func, - T init_result_func_val = 0); - template void communicate_T(const T* values, - std::vector& result, const int block_size = 1, - std::function result_func = &sum_func, - std::function init_result_func = &sum_func, - T init_result_func_val = 0); - template void communicate_T(const T* values, - const int block_size = 1, - std::function init_result_func = &sum_func, - T init_result_func_val = 0); - virtual void init_double_comm_T(const double* values, - const int block_size, - std::function init_result_func = - &sum_func, - double init_result_func_val = 0) = 0; - virtual void init_int_comm_T(const int* values, - const int block_size, - std::function init_result_func = &sum_func, - int init_result_func_val = 0) = 0; - virtual void 
complete_double_comm_T(span result, - const int block_size, - std::function result_func = &sum_func, - std::function init_result_func = - &sum_func, double init_result_func_val = 0) = 0; - virtual void complete_double_comm_T(span result, - const int block_size, - std::function result_func = &sum_func, - std::function init_result_func = - &sum_func, double init_result_func_val = 0) = 0; - virtual void complete_int_comm_T(span result, - const int block_size, - std::function result_func = &sum_func, - std::function init_result_func = &sum_func, - int init_result_func_val = 0) = 0; - virtual void complete_int_comm_T(span result, - const int block_size, - std::function result_func = &sum_func, - std::function init_result_func = &sum_func, - int init_result_func_val = 0) = 0; - virtual void complete_double_comm_T(const int block_size, - std::function init_result_func = - &sum_func, double init_result_func_val = 0) = 0; - virtual void complete_int_comm_T(const int block_size, - std::function init_result_func = &sum_func, - int init_result_func_val = 0) = 0; - - // Helper methods - template std::vector& get_buffer(); - virtual std::vector& get_double_buffer() = 0; - virtual std::vector& get_int_buffer() = 0; - - // Class Variables - Topology* topology; - std::vector buffer; - std::vector int_buffer; - int num_shared; - }; - - - /************************************************************** - ***** ParComm Class - ************************************************************** - ***** This class constructs a standard parallel communicator: - ***** which messages must be sent/recieved for matrix operations - ***** - ***** Attributes - ***** ------------- - ***** num_sends : index_t - ***** Number of messages this process must send during - ***** matrix operations - ***** num_recvs : index_t - ***** Number of messages this process will recv during - ***** matrix operations - ***** size_sends : index_t - ***** Total number of elements this process sends in all - ***** messages 
- ***** size_recvs : index_t - ***** Total number of elements this process recvs from - ***** all messages - ***** send_procs : std::vector - ***** Distant processes messages are to be sent to - ***** send_row_starts : std::vector - ***** Pointer to first position in send_row_indices - ***** that a given process will send. - ***** send_row_indices : std::vector - ***** The indices of values that must be sent to each - ***** process in send_procs - ***** recv_procs : std::vector - ***** Distant processes messages are to be recvd from - ***** recv_col_starts : std::vector - ***** Pointer to first column recvd from each process - ***** in recv_procs - ***** col_to_proc : std::vector - ***** Maps each local column in the off-diagonal block - ***** to the process that holds corresponding data - **************************************************************/ - class ParComm : public CommPkg - { - public: - /************************************************************** - ***** ParComm Class Constructor - ************************************************************** - ***** Initializes an empty ParComm, setting send and recv - ***** sizes to 0 - ***** - ***** Parameters - ***** ------------- - ***** _key : int (optional) - ***** Tag to be used in RAPtor_MPI Communication (default 0) - **************************************************************/ - ParComm(Partition* partition, int _key = 0, - RAPtor_MPI_Comm _comm = RAPtor_MPI_COMM_WORLD, - CommData* r_data = NULL) : CommPkg(partition) - { - mpi_comm = _comm; - key = _key; - send_data = new NonContigData(); - if (r_data) - recv_data = r_data; - else - recv_data = new ContigData(); - } - - ParComm(Topology* topo, int _key = 0, - RAPtor_MPI_Comm _comm = RAPtor_MPI_COMM_WORLD, - CommData* r_data = NULL) : CommPkg(topo) - { - mpi_comm = _comm; - key = _key; - send_data = new NonContigData(); - if (r_data) - recv_data = r_data; - else - recv_data = new ContigData(); - } - - 
/************************************************************** - ***** ParComm Class Constructor - ************************************************************** - ***** Initializes a ParComm object based on the off_proc Matrix - ***** - ***** Parameters - ***** ------------- - ***** off_proc_column_map : std::vector& - ***** Maps local off_proc columns indices to global - ***** _key : int (optional) - ***** Tag to be used in RAPtor_MPI Communication (default 9999) - **************************************************************/ - ParComm(Partition* partition, - const std::vector& off_proc_column_map, - int _key = 9999, - RAPtor_MPI_Comm comm = RAPtor_MPI_COMM_WORLD, - CommData* r_data = NULL) : CommPkg(partition) - { - mpi_comm = comm; - std::vector off_proc_col_to_proc(off_proc_column_map.size()); - partition->form_col_to_proc(off_proc_column_map, off_proc_col_to_proc); - init_par_comm(off_proc_column_map, off_proc_col_to_proc, _key, comm, r_data); - for (int i = 0; i < send_data->size_msgs; i++) - { - send_data->indices[i] -= partition->first_local_col; - } - } - - ParComm(Partition* partition, - const std::vector& off_proc_column_map, - const std::vector& on_proc_column_map, - int _key = 9999, - RAPtor_MPI_Comm comm = RAPtor_MPI_COMM_WORLD, - CommData* r_data = NULL) : CommPkg(partition) - { - mpi_comm = comm; - int idx; - int ctr = 0; - std::vector part_col_to_new; - std::vector off_proc_col_to_proc(off_proc_column_map.size()); - partition->form_col_to_proc(off_proc_column_map, off_proc_col_to_proc); - - init_par_comm(off_proc_column_map, off_proc_col_to_proc, _key, comm, r_data); - for (int i = 0; i < send_data->size_msgs; i++) - { - send_data->indices[i] -= partition->first_local_col; - } - - if (partition->local_num_cols) - { - part_col_to_new.resize(partition->local_num_cols, -1); - } - for (std::vector::const_iterator it = on_proc_column_map.begin(); - it != on_proc_column_map.end(); ++it) - { - part_col_to_new[*it - partition->first_local_col] = ctr++; 
- } - - for (int i = 0; i < send_data->size_msgs; i++) - { - idx = send_data->indices[i]; - send_data->indices[i] = part_col_to_new[idx]; - assert(part_col_to_new[idx] >= 0); - } - - } - - ParComm(Topology* _topology, - const std::vector& off_proc_column_map, - const std::vector& off_proc_col_to_proc, - const std::vector& local_row_map, - int _key = 9999, - RAPtor_MPI_Comm comm = RAPtor_MPI_COMM_WORLD, - CommData* r_data = NULL) : CommPkg(_topology) - { - mpi_comm = comm; - init_par_comm(off_proc_column_map, off_proc_col_to_proc, - _key, comm, r_data); - std::map global_to_local; - for (int i = 0; i < (int)local_row_map.size(); i++) - { - global_to_local[local_row_map[i]] = i; - } - for (int i = 0; i < send_data->size_msgs; i++) - { - send_data->indices[i] = global_to_local[send_data->indices[i]]; - } - - } - - void init_par_comm(const std::vector& off_proc_column_map, - const std::vector& off_proc_col_to_proc, - int _key, RAPtor_MPI_Comm comm, - CommData* r_data = NULL) - { - // Get RAPtor_MPI Information - int rank, num_procs; - RAPtor_MPI_Comm_rank(comm, &rank); - RAPtor_MPI_Comm_size(comm, &num_procs); - - // Initialize class variables - key = _key; - - send_data = new NonContigData(); - - if (r_data) - recv_data = r_data; - else - recv_data = new ContigData(); - - // Declare communication variables - int proc, prev_proc; - int tag = 12345; // TODO -- switch this to key? - int off_proc_num_cols = off_proc_column_map.size(); - - std::vector tmp_send_buffer; - - - // Determine processes columns are received from, - // and adds corresponding messages to recv data. 
- // Assumes columns are partitioned across processes - // in contiguous blocks, and are sorted - if (off_proc_num_cols) - { - prev_proc = off_proc_col_to_proc[0]; - int prev_idx = 0; - for (int i = 1; i < off_proc_num_cols; i++) - { - proc = off_proc_col_to_proc[i]; - if (proc != prev_proc) - { - recv_data->add_msg(prev_proc, i - prev_idx); - prev_proc = proc; - prev_idx = i; - } - } - recv_data->add_msg(prev_proc, off_proc_num_cols - prev_idx); - recv_data->finalize(); - } - - // For each process I recv from, send the global column indices - // for which I must recv corresponding rows - std::vector recv_sizes(num_procs, 0); - for (int i = 0; i < recv_data->num_msgs; i++) - recv_sizes[recv_data->procs[i]] = - recv_data->indptr[i+1] - recv_data->indptr[i]; - RAPtor_MPI_Allreduce(RAPtor_MPI_IN_PLACE, recv_sizes.data(), num_procs, RAPtor_MPI_INT, - RAPtor_MPI_SUM, RAPtor_MPI_COMM_WORLD); - if (profile) vec_t -= RAPtor_MPI_Wtime(); - recv_data->send(off_proc_column_map.data(), tag, comm); - send_data->probe(recv_sizes[rank], tag, comm); - recv_data->waitall(); - if (profile) vec_t += RAPtor_MPI_Wtime(); - } - - ParComm(ParComm* comm) : CommPkg(comm->topology) - { - mpi_comm = comm->mpi_comm; - send_data = comm->send_data->copy(); - recv_data = comm->recv_data->copy(); - key = comm->key; - } - - ParComm(ParComm* comm, const std::vector& off_proc_col_to_new) - : CommPkg(comm->topology) - { - mpi_comm = comm->mpi_comm; - - if (comm == NULL) - { - key = 0; - return; - } - key = comm->key; - - init_off_proc_new(comm, off_proc_col_to_new); - } - - ParComm(ParComm* comm, const std::vector& on_proc_col_to_new, - const std::vector& off_proc_col_to_new) - : CommPkg(comm->topology) - { - mpi_comm = comm->mpi_comm; - int idx, new_idx; - - if (comm == NULL) - { - key = 0; - return; - } - key = comm->key; - - init_off_proc_new(comm, off_proc_col_to_new); - - for (int i = 0; i < send_data->size_msgs; i++) - { - idx = send_data->indices[i]; - new_idx = on_proc_col_to_new[idx]; - if 
(new_idx != -1) - { - send_data->indices[i] = new_idx; - } - } - } - - - void init_off_proc_new(ParComm* comm, const std::vector& off_proc_col_to_new) - { - bool comm_proc; - int proc, start, end; - - std::function compare_func = [](const int a, const int b) - { - if (b >= 0) return b; - else return a; - }; - comm->communicate_T(off_proc_col_to_new, 1, compare_func, -1); - - recv_data = comm->recv_data->copy(off_proc_col_to_new); - - send_data = new NonContigData(); - for (int i = 0; i < comm->send_data->num_msgs; i++) - { - comm_proc = false; - proc = comm->send_data->procs[i]; - start = comm->send_data->indptr[i]; - end = comm->send_data->indptr[i+1]; - for (int j = start; j < end; j++) - { - if (comm->send_data->int_buffer[j] != -1) - { - comm_proc = true; - send_data->indices.emplace_back(comm->send_data->indices[j]); - } - } - if (comm_proc) - { - send_data->procs.emplace_back(proc); - send_data->indptr.emplace_back(send_data->indices.size()); - } - } - send_data->num_msgs = send_data->procs.size(); - send_data->size_msgs = send_data->indices.size(); - send_data->finalize(); - - - } - - /************************************************************** - ***** ParComm Class Destructor - ************************************************************** - ***** - **************************************************************/ - ~ParComm() - { - delete send_data; - delete recv_data; - } - - // Standard Communication - void init_double_comm(const double* values, const int block_size = 1) - { - initialize(values, block_size); - } - void init_int_comm(const int* values, const int block_size = 1) - { - initialize(values); - } - std::vector& complete_double_comm(const int block_size = 1) - { - return complete(block_size); - } - std::vector& complete_int_comm(const int block_size = 1) - { - return complete(block_size); - } - template - std::vector& communicate(const std::vector& values, - const int block_size = 1) - { - return CommPkg::communicate(values.data(), block_size); 
- } - template - std::vector& communicate(const T* values, const int block_size = 1) - { - return CommPkg::communicate(values, block_size); - } - - template - void initialize(const T* values, const int block_size = 1) - { - if (profile) vec_t -= RAPtor_MPI_Wtime(); - send_data->send(values, key, mpi_comm, block_size); - recv_data->recv(key, mpi_comm, block_size); - if (profile) vec_t += RAPtor_MPI_Wtime(); - } - - template - std::vector& complete(const int block_size = 1) - { - if (profile) vec_t -= RAPtor_MPI_Wtime(); - send_data->waitall(); - recv_data->waitall(); - if (profile) vec_t += RAPtor_MPI_Wtime(); - key++; - - // Extract packed data to appropriate buffer - std::vector& buf = recv_data->get_buffer(); - - return buf; - } - - // Transpose Communication - void init_double_comm_T(const double* values, - const int block_size = 1, - std::function init_result_func = - &sum_func, - double init_result_func_val = 0) - { - initialize_T(values, block_size, init_result_func, init_result_func_val); - } - void init_int_comm_T(const int* values, - const int block_size = 1, - std::function init_result_func = - &sum_func, - int init_result_func_val = 0) - { - initialize_T(values, block_size, init_result_func, init_result_func_val); - } - void complete_double_comm_T(span result, - const int block_size = 1, - std::function result_func = &sum_func, - std::function init_result_func = - &sum_func, - double init_result_func_val = 0) - { - complete_T(result, block_size, result_func, init_result_func, init_result_func_val); - } - void complete_double_comm_T(span result, - const int block_size = 1, - std::function result_func = &sum_func, - std::function init_result_func = - &sum_func, - double init_result_func_val = 0) - { - complete_T(result, block_size, result_func, init_result_func, init_result_func_val); - } - void complete_int_comm_T(span result, - const int block_size = 1, - std::function result_func = &sum_func, - std::function init_result_func = &sum_func, - int 
init_result_func_val = 0) - { - complete_T(result, block_size, result_func, init_result_func, init_result_func_val); - } - void complete_int_comm_T(span result, - const int block_size = 1, - std::function result_func = &sum_func, - std::function init_result_func = &sum_func, - int init_result_func_val = 0) - { - complete_T(result, block_size, result_func, init_result_func, init_result_func_val); - } - void complete_double_comm_T(const int block_size = 1, - std::function init_result_func = - &sum_func, - double init_result_func_val = 0) - { - complete_T(block_size, init_result_func, init_result_func_val); - } - void complete_int_comm_T(const int block_size = 1, - std::function init_result_func = &sum_func, - int init_result_func_val = 0) - { - complete_T(block_size, init_result_func, init_result_func_val); - } - template - void communicate_T(const std::vector& values, std::vector& result, - const int block_size = 1, - std::function result_func = &sum_func, - std::function init_result_func = &sum_func, - T init_result_func_val = 0) - { - CommPkg::communicate_T(values.data(), result, block_size, - result_func, init_result_func, init_result_func_val); - } - template - void communicate_T(const T* values, std::vector& result, - const int block_size = 1, - std::function result_func = &sum_func, - std::function init_result_func = &sum_func, - T init_result_func_val = 0) - { - CommPkg::communicate_T(values, result, block_size, - result_func, init_result_func, init_result_func_val); - } - template - void communicate_T(const std::vector& values, - const int block_size = 1, - std::function init_result_func = &sum_func, - T init_result_func_val = 0) - { - CommPkg::communicate_T(values.data(), block_size, init_result_func, - init_result_func_val); - } - template - void communicate_T(const T* values, const int block_size = 1, - std::function init_result_func = &sum_func, - T init_result_func_val = 0) - { - CommPkg::communicate_T(values, block_size, init_result_func, 
init_result_func_val); - } - - template - void initialize_T(const T* values, const int block_size = 1, - std::function init_result_func = &sum_func, - T init_result_func_val = 0) - { - if (profile) vec_t -= RAPtor_MPI_Wtime(); - recv_data->send(values, key, mpi_comm, block_size, init_result_func, init_result_func_val); - send_data->recv(key, mpi_comm, block_size); - if (profile) vec_t += RAPtor_MPI_Wtime(); - } - - template - void complete_T(span result, - const int block_size = 1, - std::function result_func = &sum_func, - std::function init_result_func = &sum_func, - T init_result_func_val = 0) - { - // TODO - dont need to copy into sendbuf first - complete_T(block_size, init_result_func, init_result_func_val); - - int idx, pos; - std::vector& sendbuf = send_data->get_buffer(); - - for (int i = 0; i < send_data->size_msgs; i++) - { - idx = send_data->indices[i] * block_size; - pos = i * block_size; - for (int j = 0; j < block_size; j++) - { - result[idx + j] = result_func(result[idx + j], sendbuf[pos + j]); - } - } - } - - template - void complete_T(const int block_size = 1, - std::function init_result_func = &sum_func, - T init_result_func_val = 0) - { - if (profile) vec_t -= RAPtor_MPI_Wtime(); - send_data->waitall(); - recv_data->waitall(); - if (profile) vec_t += RAPtor_MPI_Wtime(); - key++; - } - - // Conditional communication - template - std::vector& conditional_comm( - const std::vector& vals, - const std::vector& states, - const std::vector& off_proc_states, - std::function compare_func, - const int block_size = 1) - { - int ctr, n_sends, n_recvs; - int tag = 325493; - bool comparison; - - if (profile) vec_t -= RAPtor_MPI_Wtime(); - send_data->send(vals.data(), tag, mpi_comm, states, compare_func, &n_sends, block_size); - recv_data->recv(tag, mpi_comm, off_proc_states, - compare_func, &ctr, &n_recvs, block_size); - - send_data->waitall(n_sends); - recv_data->waitall(n_recvs); - if (profile) vec_t += RAPtor_MPI_Wtime(); - - std::vector& recvbuf = 
recv_data->get_buffer(); - - ctr--; - for (int i = recv_data->size_msgs - 1; i >= 0; i--) - { - int idx = i * block_size; - comparison = false; - for (int j = 0; j < block_size; j++) - { - if (compare_func(off_proc_states[idx+j])) - { - comparison = true; - break; - } - } - if (comparison) - { - for (int j = block_size - 1; j >= 0; j--) - { - recvbuf[idx+j] = recvbuf[ctr--]; - } - } - else - { - for (int j = block_size - 1; j >= 0; j--) - { - recvbuf[idx+j] = 0.0; - } - } - } - - return recvbuf; - } - - template - void conditional_comm_T(const std::vector& vals, - const std::vector& states, - const std::vector& off_proc_states, - std::function compare_func, - std::vector& result, - std::function result_func, - const int block_size = 1) - { - int idx, ctr; - int n_sends, n_recvs; - int tag = 453246; - bool comparison; - - if (profile) vec_t -= RAPtor_MPI_Wtime(); - recv_data->send(vals.data(), tag, mpi_comm, off_proc_states, compare_func, - &n_sends, block_size); - send_data->recv(tag, mpi_comm, states, compare_func, &ctr, &n_recvs, block_size); - - recv_data->waitall(n_sends); - send_data->waitall(n_recvs); - if (profile) vec_t += RAPtor_MPI_Wtime(); - - std::vector& sendbuf = send_data->get_buffer(); - - ctr = 0; - for (int i = 0; i < send_data->size_msgs; i++) - { - idx = send_data->indices[i] * block_size; - comparison = false; - for (int j = 0; j < block_size; j++) - { - if (compare_func(states[idx + j])) - { - comparison = true; - break; - } - } - if (comparison) - { - for (int j = 0; j < block_size; j++) - { - result[idx + j] = result_func(result[idx + j], sendbuf[ctr++]); - } - } - } - } - - - // Matrix Communication - CSRMatrix* communicate(const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows = 1, const int b_cols = 1, const bool has_vals = true); - CSRMatrix* communicate(const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows = 1, const int b_cols = 1, 
const bool has_vals = true); - void init_mat_comm(std::vector& send_buffer, const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows = 1, const int b_cols = 1, const bool has_vals = true); - void init_mat_comm(std::vector& send_buffer, const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows = 1, const int b_cols = 1, const bool has_vals = true); - CSRMatrix* complete_mat_comm(const int b_rows = 1, const int b_cols = 1, - const bool has_vals = true); - - CSRMatrix* communicate_T(const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int n_result_rows, const int b_rows = 1, const int b_cols = 1, - const bool has_vals = true); - CSRMatrix* communicate_T(const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int n_result_rows, const int b_rows = 1, const int b_cols = 1, - const bool has_vals = true); - void init_mat_comm_T(std::vector& send_buffer, - const std::vector& rowptr, const std::vector& col_indices, - const std::vector& values, const int b_rows = 1, - const int b_cols = 1, const bool has_vals = true) ; - void init_mat_comm_T(std::vector& send_buffer, - const std::vector& rowptr, const std::vector& col_indices, - const std::vector& values, const int b_rows = 1, - const int b_cols = 1, const bool has_vals = true) ; - CSRMatrix* complete_mat_comm_T(const int n_result_rows, - const int b_rows = 1, const int b_cols = 1, - const bool has_vals = true) ; - - - CSRMatrix* communicate(ParCSRMatrix* A, const bool has_vals = true) - { - return CommPkg::communicate(A, has_vals); - } - CSRMatrix* communicate(ParBSRMatrix* A, const bool has_vals = true) - { - return CommPkg::communicate(A, has_vals); - } - CSRMatrix* communicate(CSRMatrix* A, const bool has_vals = true) - { - return CommPkg::communicate(A, has_vals); - } - CSRMatrix* communicate_T(CSRMatrix* A, const bool has_vals = true) - { - 
return CommPkg::communicate_T(A, has_vals); - } - - - // Vector Communication - std::vector& communicate(ParVector& v, const int block_size = 1) - { - return CommPkg::communicate(v, block_size); - } - void init_comm(ParVector& v, const int block_size = 1) - { - CommPkg::init_comm(v, block_size); - } - - // Helper Methods - std::vector& get_double_buffer() - { - return recv_data->buffer; - } - std::vector& get_int_buffer() - { - return recv_data->int_buffer; - } - - int key; - NonContigData* send_data; - CommData* recv_data; - RAPtor_MPI_Comm mpi_comm; - }; - - - - /************************************************************** - ***** TAPComm Class - ************************************************************** - ***** This class constructs a topology-aware parallel communicator: - ***** which messages must be sent/recieved for matrix operations, - ***** using topology-aware methods to limit the number and size - ***** of inter-node messages - ***** - ***** Attributes - ***** ------------- - ***** local_S_par_comm : ParComm* - ***** Parallel communication package for sending data that originates - ***** on rank to other processes local to node, before inter-node - ***** communication occurs. - ***** local_R_par_comm : ParComm* - ***** Parallel communication package for redistributing previously - ***** received values (from inter-node communication step) to - ***** processes local to rank which need said values - ***** local_L_par_comm : ParComm* - ***** Parallel communication package for communicating values - ***** that both originate and have a final destination on node - ***** (fully intra-node communication) - ***** global_par_comm : ParComm* - ***** Parallel communication package for sole inter-node step. 
- ***** buffer : Vector - ***** Combination of local_L_par_comm and local_R_par_comm - ***** recv buffers, ordered to match off_proc_column_map - ***** Partition* partition - ***** Partition, holding information about topology - **************************************************************/ - class TAPComm : public CommPkg - { - public: - - TAPComm(Partition* partition, bool form_S = true, ParComm* L_comm = NULL) : CommPkg(partition) - { - if (form_S) - { - local_S_par_comm = new ParComm(partition, 2345, partition->topology->local_comm, - new DuplicateData()); - } - else local_S_par_comm = NULL; - - local_R_par_comm = new ParComm(partition, 3456, partition->topology->local_comm, - new NonContigData()); - global_par_comm = new ParComm(partition, 5678, RAPtor_MPI_COMM_WORLD, - new DuplicateData()); - - if (L_comm) - { - local_L_par_comm = L_comm; - local_L_par_comm->num_shared++; - } - else - { - local_L_par_comm = new ParComm(partition, 4567, partition->topology->local_comm, - new NonContigData()); - } - } - - - /************************************************************** - ***** TAPComm Class Constructor - ************************************************************** - ***** Initializes a TAPComm for a matrix without contiguous - ***** row-wise partitions across processes. Instead, each - ***** process holds a random assortment of rows. 
- ***** - ***** Parameters - ***** ------------- - ***** off_proc_column_map : std::vector& - ***** Maps local off_proc columns indices to global - ***** global_num_cols : int - ***** Number of global columns in matrix - ***** local_num_cols : int - ***** Number of columns local to rank - **************************************************************/ - TAPComm(Partition* partition, - const std::vector& off_proc_column_map, - bool form_S = true, - RAPtor_MPI_Comm comm = RAPtor_MPI_COMM_WORLD) - : CommPkg(partition) - { - if (form_S) - { - init_tap_comm(partition, off_proc_column_map, comm); - } - else - { - init_tap_comm_simple(partition, off_proc_column_map, comm); - } - } - - TAPComm(Partition* partition, - const std::vector& off_proc_column_map, - const std::vector& on_proc_column_map, - bool form_S = true, - RAPtor_MPI_Comm comm = RAPtor_MPI_COMM_WORLD) - : CommPkg(partition) - { - std::vector on_proc_to_new; - int on_proc_num_cols = on_proc_column_map.size(); - if (partition->local_num_cols) - { - on_proc_to_new.resize(partition->local_num_cols); - for (int i = 0; i < on_proc_num_cols; i++) - { - on_proc_to_new[on_proc_column_map[i] - partition->first_local_col] = i; - } - } - - if (form_S) - { - init_tap_comm(partition, off_proc_column_map, comm); - - for (std::vector::iterator it = local_S_par_comm->send_data->indices.begin(); - it != local_S_par_comm->send_data->indices.end(); ++it) - { - *it = on_proc_to_new[*it]; - } - } - else - { - init_tap_comm_simple(partition, off_proc_column_map, comm); - - for (std::vector::iterator it = global_par_comm->send_data->indices.begin(); - it != global_par_comm->send_data->indices.end(); ++it) - { - *it = on_proc_to_new[*it]; - } - } - - for (std::vector::iterator it = local_L_par_comm->send_data->indices.begin(); - it != local_L_par_comm->send_data->indices.end(); ++it) - { - *it = on_proc_to_new[*it]; - } - } - - /************************************************************** - ***** TAPComm Class Constructor - 
************************************************************** - ***** Create topology-aware communication class from - ***** original communication package (which processes rank - ***** communication which, and what is sent to / recv from - ***** each process. - ***** - ***** Parameters - ***** ------------- - ***** orig_comm : ParComm* - ***** Existing standard communication package from which - ***** to form topology-aware communicator - **************************************************************/ - TAPComm(TAPComm* tap_comm) : CommPkg(tap_comm->topology) - { - if (tap_comm->local_S_par_comm) - { - local_S_par_comm = new ParComm(tap_comm->local_S_par_comm); - } - else local_S_par_comm = NULL; - - global_par_comm = new ParComm(tap_comm->global_par_comm); - local_R_par_comm = new ParComm(tap_comm->local_R_par_comm); - local_L_par_comm = new ParComm(tap_comm->local_L_par_comm); - - recv_size = tap_comm->recv_size; - if (recv_size) - { - buffer.resize(recv_size); - int_buffer.resize(recv_size); - } - } - - TAPComm(TAPComm* tap_comm, const std::vector& off_proc_col_to_new, - ParComm* local_L = NULL) : CommPkg(tap_comm->topology) - { - init_off_proc_new(tap_comm, off_proc_col_to_new, local_L); - } - - TAPComm(TAPComm* tap_comm, const std::vector& on_proc_col_to_new, - const std::vector& off_proc_col_to_new, - ParComm* local_L = NULL) : CommPkg(tap_comm->topology) - { - int idx; - - init_off_proc_new(tap_comm, off_proc_col_to_new, local_L); - - if (!local_L) - { - for (int i = 0; i < local_L_par_comm->send_data->size_msgs; i++) - { - idx = local_L_par_comm->send_data->indices[i]; - local_L_par_comm->send_data->indices[i] = on_proc_col_to_new[idx]; - } - } - - if (local_S_par_comm) - { - for (int i = 0; i < local_S_par_comm->send_data->size_msgs; i++) - { - idx = local_S_par_comm->send_data->indices[i]; - local_S_par_comm->send_data->indices[i] = on_proc_col_to_new[idx]; - } - } - else - { - for (int i = 0; i < global_par_comm->send_data->size_msgs; i++) - { - idx = 
global_par_comm->send_data->indices[i]; - global_par_comm->send_data->indices[i] = on_proc_col_to_new[idx]; - } - } - } - - - void init_off_proc_new(TAPComm* tap_comm, const std::vector& off_proc_col_to_new, - ParComm* local_L = NULL) - { - int idx, ctr; - int start, end; - - DuplicateData* global_recv = (DuplicateData*) tap_comm->global_par_comm->recv_data; - - if (local_L) - { - local_L_par_comm = local_L; - local_L_par_comm->num_shared++; - } - else - { - local_L_par_comm = new ParComm(tap_comm->local_L_par_comm, off_proc_col_to_new); - } - local_R_par_comm = new ParComm(tap_comm->local_R_par_comm, off_proc_col_to_new); - - // Create global par comm / update R send indices - std::vector& local_R_int_buffer = - tap_comm->local_R_par_comm->send_data->get_buffer(); - std::vector& global_int_buffer = - tap_comm->global_par_comm->send_data->get_buffer(); - - std::vector G_to_new(tap_comm->global_par_comm->recv_data->size_msgs, -1); - ctr = 0; - for (int i = 0; i < global_recv->size_msgs; i++) - { - start = global_recv->indptr_T[i]; - end = global_recv->indptr_T[i+1]; - for (int j = start; j < end; j++) - { - idx = global_recv->indices[j]; - if (local_R_int_buffer[idx] != -1) - { - G_to_new[i] = ctr++; - break; - } - } - } - for (std::vector::iterator it = local_R_par_comm->send_data->indices.begin(); - it != local_R_par_comm->send_data->indices.end(); ++it) - { - *it = G_to_new[*it]; - } - idx = 0; - for (std::vector::iterator it = local_R_int_buffer.begin(); - it != local_R_int_buffer.end(); ++it) - { - if (*it != -1) *it = idx++; - } - - global_par_comm = new ParComm(tap_comm->global_par_comm, - local_R_int_buffer); - - - // create local S / update global send indices - if (tap_comm->local_S_par_comm) - { - DuplicateData* local_S_recv = (DuplicateData*) tap_comm->local_S_par_comm->recv_data; - std::vector S_to_new(tap_comm->local_S_par_comm->recv_data->size_msgs, -1); - ctr = 0; - for (int i = 0; i < local_S_recv->size_msgs; i++) - { - start = 
local_S_recv->indptr_T[i]; - end = local_S_recv->indptr_T[i+1]; - for (int j = start; j < end; j++) - { - idx = local_S_recv->indices[j]; - if (global_int_buffer[idx] != -1) - { - S_to_new[i] = ctr++; - break; - } - } - } - for (std::vector::iterator it = global_par_comm->send_data->indices.begin(); - it != global_par_comm->send_data->indices.end(); ++it) - { - *it = S_to_new[*it]; - } - idx = 0; - for (std::vector::iterator it = global_int_buffer.begin(); - it != global_int_buffer.end(); ++it) - { - if (*it != -1) *it = idx++; - } - - local_S_par_comm = new ParComm(tap_comm->local_S_par_comm, - global_int_buffer); - } - else local_S_par_comm = NULL; - - // Determine size of final recvs (should be equal to - // number of off_proc cols) - recv_size = local_R_par_comm->recv_data->size_msgs + - local_L_par_comm->recv_data->size_msgs; - if (recv_size) - { - // Want a single recv buffer local_R and local_L par_comms - buffer.resize(recv_size); - int_buffer.resize(recv_size); - } - } - - /************************************************************** - ***** ParComm Class Destructor - ************************************************************** - ***** - **************************************************************/ - ~TAPComm() - { - if (global_par_comm) - global_par_comm->delete_comm(); - if (local_S_par_comm) - local_S_par_comm->delete_comm(); - if (local_R_par_comm) - local_R_par_comm->delete_comm(); - if (local_L_par_comm) - local_L_par_comm->delete_comm(); - } - - void init_tap_comm(Partition* partition, - const std::vector& off_proc_column_map, - RAPtor_MPI_Comm comm) - { - // Get RAPtor_MPI Information - int rank, num_procs; - RAPtor_MPI_Comm_rank(comm, &rank); - RAPtor_MPI_Comm_size(comm, &num_procs); - - // Initialize class variables - local_S_par_comm = new ParComm(partition, 2345, partition->topology->local_comm, - new DuplicateData()); - local_R_par_comm = new ParComm(partition, 3456, partition->topology->local_comm, - new NonContigData()); - 
local_L_par_comm = new ParComm(partition, 4567, partition->topology->local_comm, - new NonContigData()); - global_par_comm = new ParComm(partition, 5678, comm, new DuplicateData()); - - // Initialize Variables - std::vector off_proc_col_to_proc; - std::vector on_node_column_map; - std::vector on_node_col_to_proc; - std::vector off_node_column_map; - std::vector off_node_col_to_node; - std::vector on_node_to_off_proc; - std::vector off_node_to_off_proc; - std::vector recv_nodes; - std::vector orig_procs; - std::vector node_to_local_proc; - - // Find process on which vector value associated with each column is - // stored - partition->form_col_to_proc(off_proc_column_map, off_proc_col_to_proc); - - // Partition off_proc cols into on_node and off_node - split_off_proc_cols(off_proc_column_map, off_proc_col_to_proc, - on_node_column_map, on_node_col_to_proc, on_node_to_off_proc, - off_node_column_map, off_node_col_to_node, off_node_to_off_proc); - - // Gather all nodes with which any local process must communication - form_local_R_par_comm(off_node_column_map, off_node_col_to_node, - orig_procs); - - // Find global processes with which rank communications - form_global_par_comm(orig_procs); - - // Form local_S_par_comm: initial distribution of values among local - // processes, before inter-node communication - form_local_S_par_comm(orig_procs); - - // Adjust send indices (currently global vector indices) to be index - // of global vector value from previous recv - adjust_send_indices(partition->first_local_col); - - // Form local_L_par_comm: fully local communication (origin and - // destination processes both local to node) - form_local_L_par_comm(on_node_column_map, on_node_col_to_proc, - partition->first_local_col); - - // Determine size of final recvs (should be equal to - // number of off_proc cols) - update_recv(on_node_to_off_proc, off_node_to_off_proc); - } - - void init_tap_comm_simple(Partition* partition, - const std::vector& off_proc_column_map, - 
RAPtor_MPI_Comm comm) - { - // Get RAPtor_MPI Information - int rank, num_procs; - RAPtor_MPI_Comm_rank(comm, &rank); - RAPtor_MPI_Comm_size(comm, &num_procs); - - // Initialize class variables - local_S_par_comm = NULL; - local_R_par_comm = new ParComm(partition, 3456, partition->topology->local_comm, - new NonContigData()); - local_L_par_comm = new ParComm(partition, 4567, partition->topology->local_comm, - new NonContigData()); - global_par_comm = new ParComm(partition, 5678, comm, new DuplicateData()); - - // Initialize Variables - std::vector off_proc_col_to_proc; - std::vector on_node_column_map; - std::vector on_node_col_to_proc; - std::vector off_node_column_map; - std::vector off_node_col_to_proc; - std::vector on_node_to_off_proc; - std::vector off_node_to_off_proc; - - // Find process on which vector value associated with each column is - // stored - partition->form_col_to_proc(off_proc_column_map, off_proc_col_to_proc); - - // Partition off_proc cols into on_node and off_node - split_off_proc_cols(off_proc_column_map, off_proc_col_to_proc, - on_node_column_map, on_node_col_to_proc, on_node_to_off_proc, - off_node_column_map, off_node_col_to_proc, off_node_to_off_proc); - - // Form local recv communicator. Will recv from local rank - // corresponding to global rank on which data originates. E.g. if - // data is on rank r = (p, n), and my rank is s = (q, m), I will - // recv data from (p, m). - form_simple_R_par_comm(off_node_column_map, off_node_col_to_proc); - - // Form global par comm.. 
Will recv from proc on which data - // originates - form_simple_global_comm(off_node_col_to_proc); - - // Adjust send indices (currently global vector indices) to be - // index of global vector value from previous recv (only updating - // local_R to match position in global) - adjust_send_indices(partition->first_local_col); - - // Form local_L_par_comm: fully local communication (origin and - // destination processes both local to node) - form_local_L_par_comm(on_node_column_map, on_node_col_to_proc, - partition->first_local_col); - - // Determine size of final recvs (should be equal to - // number of off_proc cols) - update_recv(on_node_to_off_proc, off_node_to_off_proc); - - } - - // Helper methods for forming TAPComm: - void split_off_proc_cols(const std::vector& off_proc_column_map, - const std::vector& off_proc_col_to_proc, - std::vector& on_node_column_map, - std::vector& on_node_col_to_proc, - std::vector& on_node_to_off_proc, - std::vector& off_node_column_map, - std::vector& off_node_col_to_node, - std::vector& off_node_to_off_proc); - void form_local_R_par_comm(const std::vector& off_node_column_map, - const std::vector& off_node_col_to_node, - std::vector& orig_procs); - void form_global_par_comm(std::vector& orig_procs); - void form_local_S_par_comm(std::vector& orig_procs); - void adjust_send_indices(const int first_local_col); - void form_local_L_par_comm(const std::vector& on_node_column_map, - const std::vector& on_node_col_to_proc, - const int first_local_col); - void form_simple_R_par_comm(std::vector& off_node_column_map, - std::vector& off_node_col_to_proc); - void form_simple_global_comm(std::vector& off_node_col_to_proc); - void update_recv(const std::vector& on_node_to_off_proc, - const std::vector& off_node_to_off_proc, bool update_L = true); - - // Class Methods - void init_double_comm(const double* values, const int block_size) - { - initialize(values, block_size); - } - void init_int_comm(const int* values, const int block_size) - { - 
initialize(values, block_size); - } - std::vector& complete_double_comm(const int block_size) - { - return complete(block_size); - } - std::vector& complete_int_comm(const int block_size) - { - return complete(block_size); - } - - template - std::vector& communicate(const std::vector& values, - const int block_size = 1) - { - return CommPkg::communicate(values.data(), block_size); - } - template - std::vector& communicate(const T* values, - const int block_size = 1) - { - return CommPkg::communicate(values, block_size); - } - - template - void initialize(const T* values, const int block_size = 1) - { - // Messages with origin and final destination on node - local_L_par_comm->communicate(values, block_size); - - if (local_S_par_comm) - { - // Initial redistribution among node - std::vector& S_vals = local_S_par_comm->communicate(values, block_size); - - // Begin inter-node communication - global_par_comm->initialize(S_vals.data(), block_size); - } - else - { - global_par_comm->initialize(values, block_size); - } - } - - template - std::vector& complete(const int block_size = 1) - { - // Complete inter-node communication - std::vector& G_vals = global_par_comm->complete(block_size); - - // Redistributing recvd inter-node values - local_R_par_comm->communicate(G_vals.data(), block_size); - - std::vector& recvbuf = get_buffer(); - - std::vector& R_recvbuf = local_R_par_comm->recv_data->get_buffer(); - std::vector& L_recvbuf = local_L_par_comm->recv_data->get_buffer(); - - if ((int)recvbuf.size() < recv_size * block_size) - recvbuf.resize(recv_size * block_size); - - // Add values from L_recv and R_recv to appropriate positions in - // Vector recv - int idx, pos; - int R_recv_size = local_R_par_comm->recv_data->size_msgs; - int L_recv_size = local_L_par_comm->recv_data->size_msgs; - NonContigData* local_R_recv = (NonContigData*) local_R_par_comm->recv_data; - NonContigData* local_L_recv = (NonContigData*) local_L_par_comm->recv_data; - for (int i = 0; i < R_recv_size; 
i++) - { - pos = i * block_size; - idx = local_R_recv->indices[i] * block_size; - for (int j = 0; j < block_size; j++) - { - recvbuf[idx + j] = R_recvbuf[pos + j]; - } - } - - for (int i = 0; i < L_recv_size; i++) - { - pos = i * block_size; - idx = local_L_recv->indices[i] * block_size; - for (int j = 0; j < block_size; j++) - { - recvbuf[idx + j] = L_recvbuf[pos + j]; - } - } - - return recvbuf; - } - - - // Transpose Communication - void init_double_comm_T(const double* values, - const int block_size, - std::function init_result_func = - &sum_func, - double init_result_func_val = 0) - { - initialize_T(values, block_size, init_result_func, init_result_func_val); - } - void init_int_comm_T(const int* values, - const int block_size, - std::function init_result_func = &sum_func, - int init_result_func_val = 0) - { - initialize_T(values, block_size, init_result_func, init_result_func_val); - } - void complete_double_comm_T(span result, - const int block_size, - std::function result_func = &sum_func, - std::function init_result_func = - &sum_func, - double init_result_func_val = 0) - { - complete_T(result, block_size, result_func, init_result_func, init_result_func_val); - } - void complete_double_comm_T(span result, - const int block_size, - std::function result_func = &sum_func, - std::function init_result_func = - &sum_func, - double init_result_func_val = 0) - { - complete_T(result, block_size, result_func, init_result_func, init_result_func_val); - } - void complete_int_comm_T(span result, - const int block_size, - std::function result_func = &sum_func, - std::function init_result_func = &sum_func, - int init_result_func_val = 0) - { - complete_T(result, block_size, result_func, init_result_func, init_result_func_val); - } - void complete_int_comm_T(span result, - const int block_size, - std::function result_func = &sum_func, - std::function init_result_func = &sum_func, - int init_result_func_val = 0) - { - complete_T(result, block_size, result_func, 
init_result_func, init_result_func_val); - } - - void complete_double_comm_T(const int block_size, - std::function init_result_func = - &sum_func, - double init_result_func_val = 0) - { - complete_T(block_size, init_result_func, init_result_func_val); - } - void complete_int_comm_T(const int block_size, - std::function init_result_func = - &sum_func, - int init_result_func_val = 0) - { - complete_T(block_size, init_result_func, init_result_func_val); - } - - template - void communicate_T(const std::vector& values, std::vector& result, - const int block_size = 1, - std::function result_func = &sum_func, - std::function init_result_func = &sum_func, - T init_result_func_val = 0) - { - CommPkg::communicate_T(values.data(), result, block_size, result_func, init_result_func, - init_result_func_val); - } - template - void communicate_T(const T* values, std::vector& result, - const int block_size = 1, - std::function result_func = &sum_func, - std::function init_result_func = &sum_func, - T init_result_func_val = 0) - { - CommPkg::communicate_T(values, result, block_size, result_func, init_result_func, - init_result_func_val); - } - template - void communicate_T(const std::vector& values, - const int block_size = 1, - std::function init_result_func = &sum_func, - T init_result_func_val = 0) - { - CommPkg::communicate_T(values.data(), block_size, init_result_func, init_result_func_val); - } - template - void communicate_T(const T* values, const int block_size = 1, - std::function init_result_func = &sum_func, - T init_result_func_val = 0) - { - CommPkg::communicate_T(values, block_size, init_result_func, init_result_func_val); - } - - template - void initialize_T(const T* values, const int block_size = 1, - std::function init_result_func = &sum_func, - T init_result_func_val = 0) - { - // Messages with origin and final destination on node - local_L_par_comm->communicate_T(values, block_size, init_result_func, init_result_func_val); - - // Initial redistribution among node 
- local_R_par_comm->communicate_T(values, block_size, init_result_func, init_result_func_val); - - // Begin inter-node communication - std::vector& R_sendbuf = local_R_par_comm->send_data->get_buffer(); - global_par_comm->init_comm_T(R_sendbuf, block_size, init_result_func, init_result_func_val); - } - - template - void complete_T(span result, const int block_size = 1, - std::function result_func = &sum_func, - std::function init_result_func = &sum_func, - T init_result_func_val = 0) - { - complete_T(block_size, init_result_func, init_result_func_val); - int idx, pos; - std::vector& L_sendbuf = local_L_par_comm->send_data->get_buffer(); - - for (int i = 0; i < local_L_par_comm->send_data->size_msgs; i++) - { - idx = local_L_par_comm->send_data->indices[i] * block_size; - pos = i * block_size; - for (int j = 0; j < block_size; j++) - { - result[idx + j] = result_func(result[idx + j], L_sendbuf[pos + j]); - } - } - - if (local_S_par_comm) - { - std::vector& S_sendbuf = local_S_par_comm->send_data->get_buffer(); - for (int i = 0; i < local_S_par_comm->send_data->size_msgs; i++) - { - idx = local_S_par_comm->send_data->indices[i] * block_size; - pos = i * block_size; - for (int j = 0; j < block_size; j++) - { - result[idx + j] = result_func(result[idx + j], S_sendbuf[pos + j]); - } - } - } - else - { - std::vector& G_sendbuf = global_par_comm->send_data->get_buffer(); - for (int i = 0; i < global_par_comm->send_data->size_msgs; i++) - { - idx = global_par_comm->send_data->indices[i] * block_size; - pos = i * block_size; - for (int j = 0; j < block_size; j++) - { - result[idx + j] = result_func(result[idx + j], G_sendbuf[pos + j]); - } - } - } - } - template - void complete_T(const int block_size = 1, - std::function init_result_func = &sum_func, - T init_result_func_val = 0) - { - // Complete inter-node communication - global_par_comm->complete_comm_T(block_size, init_result_func, init_result_func_val); - - if (local_S_par_comm) - { - std::vector& G_sendbuf = 
global_par_comm->send_data->get_buffer(); - local_S_par_comm->communicate_T(G_sendbuf, block_size, init_result_func, - init_result_func_val); - } - } - - - // Matrix Communication - CSRMatrix* communicate(const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows = 1, const int b_cols = 1, const bool has_vals = true); - CSRMatrix* communicate(const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows = 1, const int b_cols = 1, const bool has_vals = true); - void init_mat_comm(std::vector& send_buffer, const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows = 1, const int b_cols = 1, const bool has_vals = true); - void init_mat_comm(std::vector& send_buffer, const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int b_rows = 1, const int b_cols = 1, const bool has_vals = true); - CSRMatrix* complete_mat_comm(const int b_rows = 1, const int b_cols = 1, - const bool has_vals = true); - - CSRMatrix* communicate_T(const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int n_result_rows, const int b_rows = 1, const int b_cols = 1, - const bool has_vals = true); - CSRMatrix* communicate_T(const std::vector& rowptr, - const std::vector& col_indices, const std::vector& values, - const int n_result_rows, const int b_rows = 1, const int b_cols = 1, - const bool has_vals = true); - void init_mat_comm_T(std::vector& send_buffer, - const std::vector& rowptr, const std::vector& col_indices, - const std::vector& values, const int b_rows = 1, - const int b_cols = 1, const bool has_vals = true) ; - void init_mat_comm_T(std::vector& send_buffer, - const std::vector& rowptr, const std::vector& col_indices, - const std::vector& values, const int b_rows = 1, - const int b_cols = 1, const bool has_vals = true) ; - CSRMatrix* complete_mat_comm_T(const int 
n_result_rows, - const int b_rows = 1, const int b_cols = 1, - const bool has_vals = true); - - CSRMatrix* communicate(ParCSRMatrix* A, const bool has_vals = true) - { - return CommPkg::communicate(A, has_vals); - } - CSRMatrix* communicate(ParBSRMatrix* A, const bool has_vals = true) - { - return CommPkg::communicate(A, has_vals); - } - CSRMatrix* communicate(CSRMatrix* A, const bool has_vals = true) - { - return CommPkg::communicate(A, has_vals); - } - CSRMatrix* communicate_T(CSRMatrix* A, const bool has_vals = true) - { - return CommPkg::communicate_T(A, has_vals); - } - - // Vector Communication - std::vector& communicate(ParVector& v, - const int block_size = 1) - { - return CommPkg::communicate(v, block_size); - } - - void init_comm(ParVector& v, const int block_size = 1) - { - CommPkg::init_comm(v, block_size); - } - - // Helper Methods - std::vector& get_double_buffer() - { - return buffer; - } - std::vector& get_int_buffer() - { - return int_buffer; - } - - // Class Attributes - int recv_size; - ParComm* local_S_par_comm; - ParComm* local_R_par_comm; - ParComm* local_L_par_comm; - ParComm* global_par_comm; - }; -} -#endif diff --git a/raptor/core/matrix.cpp b/raptor/core/matrix.cpp deleted file mode 100644 index c9eb788d..00000000 --- a/raptor/core/matrix.cpp +++ /dev/null @@ -1,1438 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "matrix.hpp" -#include "utilities.hpp" - -using namespace raptor; - -/************************************************************** -***** Matrix Print -************************************************************** -***** Print the nonzeros in the matrix, as well as the row -***** and column according to each nonzero -**************************************************************/ -template -void print_helper(const COOMatrix* A, const std::vector& vals) -{ - int row, col; - - for (int i = 0; i < A->nnz; i++) - { - row = A->idx1[i]; 
- col = A->idx2[i]; - A->val_print(row, col, vals[i]); - } -} -template -void print_helper(const CSRMatrix* A, const std::vector& vals) -{ - int col, start, end; - - for (int row = 0; row < A->n_rows; row++) - { - start = A->idx1[row]; - end = A->idx1[row+1]; - for (int j = start; j < end; j++) - { - col = A->idx2[j]; - A->val_print(row, col, vals[j]); - } - } -} -template -void print_helper(const CSCMatrix* A, const std::vector& vals) -{ - int row, start, end; - - for (int col = 0; col < A->n_cols; col++) - { - start = A->idx1[col]; - end = A->idx1[col+1]; - for (int j = start; j < end; j++) - { - row = A->idx2[j]; - A->val_print(row, col, vals[j]); - } - } -} -template -void bcoo_print_helper(const BCOOMatrix* A, const std::vector& vals) -{ - int row, col; - - for (int i = 0; i < A->nnz; i++) - { - row = A->idx1[i]; - col = A->idx2[i]; - A->val_print(row, col, vals[i]); - } -} -template -void bsr_print_helper(const BSRMatrix* A, const std::vector& vals) -{ - int col, start, end; - - for (int row = 0; row < A->n_rows; row++) - { - start = A->idx1[row]; - end = A->idx1[row+1]; - for (int j = start; j < end; j++) - { - col = A->idx2[j]; - A->val_print(row, col, vals[j]); - } - } -} -template -void bsc_print_helper(const BSCMatrix* A, const std::vector& vals) -{ - int row, start, end; - - for (int col = 0; col < A->n_cols; col++) - { - start = A->idx1[col]; - end = A->idx1[col+1]; - for (int j = start; j < end; j++) - { - row = A->idx2[j]; - A->val_print(row, col, vals[j]); - } - } -} -void COOMatrix::print() -{ - print_helper(this, vals); -} -void CSRMatrix::print() -{ - print_helper(this, vals); -} -void CSCMatrix::print() -{ - print_helper(this, vals); -} -void BCOOMatrix::print() -{ - bcoo_print_helper(this, vals); -} -void BSRMatrix::print() -{ - bsr_print_helper(this, vals); -} -void BSCMatrix::print() -{ - bsc_print_helper(this, vals); -} - -/************************************************************** -***** Matrix Transpose 
-************************************************************** -***** Transpose the matrix, reversing rows and columns -***** Retain matrix type, and block structure if applicable -**************************************************************/ -COOMatrix* COOMatrix::transpose() -{ - COOMatrix* T = new COOMatrix(n_rows, n_cols, idx2, idx1, vals); - return T; -} - -BCOOMatrix* BCOOMatrix::transpose() -{ - BCOOMatrix* T = new BCOOMatrix(b_rows, b_cols, n_rows, n_cols, idx2, idx1, block_vals); - return T; -} - -CSRMatrix* CSRMatrix::transpose() -{ - CSCMatrix* T_csc = new CSCMatrix(n_rows, n_cols, idx1, idx2, vals); - CSRMatrix* T = T_csc->to_CSR(); - delete T_csc; - return T; -} - -BSRMatrix* BSRMatrix::transpose() -{ - BSCMatrix* T_bsc = new BSCMatrix(b_rows, b_cols, n_rows, n_cols, idx1, idx2, block_vals); - BSRMatrix* T = (BSRMatrix*) T_bsc->to_CSR(); - delete T_bsc; - return T; -} - -CSCMatrix* CSCMatrix::transpose() -{ - CSRMatrix* T_csr = new CSRMatrix(n_rows, n_cols, idx1, idx2, vals); - CSCMatrix* T = T_csr->to_CSC(); - delete T_csr; - return T; -} -BSCMatrix* BSCMatrix::transpose() -{ - BSRMatrix* T_bsr = new BSRMatrix(b_rows, b_cols, n_rows, n_cols, idx1, idx2, block_vals); - BSCMatrix* T = (BSCMatrix*) T_bsr->to_CSC(); - delete T_bsr; - return T; -} - - -/************************************************************** -***** Matrix Resize -************************************************************** -***** Set the matrix dimensions to those passed as parameters -***** -***** Parameters -***** ------------- -***** _nrows : int -***** Number of rows in matrix -***** _ncols : int -***** Number of cols in matrix -**************************************************************/ -void Matrix::resize(int _n_rows, int _n_cols) -{ - n_rows = _n_rows; - n_cols = _n_cols; -} - -/************************************************************** -***** Matrix Copy -************************************************************** -***** Copy matrix between any subset of 
matrix types -***** -***** Parameters -***** ------------- -***** Matrix* A : original matrix to copy (of some type) -**************************************************************/ -template -void COO_to_COO(const COOMatrix* A, COOMatrix* B, std::vector& A_vals, - std::vector& B_vals) -{ - B->n_rows = A->n_rows; - B->n_cols = A->n_cols; - B->nnz = A->nnz; - - B->idx1.clear(); - B->idx2.clear(); - B_vals.clear(); - - B->idx1.reserve(A->nnz); - B->idx2.reserve(A->nnz); - B_vals.reserve(A->nnz); - for (int i = 0; i < A->nnz; i++) - { - B->idx1.emplace_back(A->idx1[i]); - B->idx2.emplace_back(A->idx2[i]); - B_vals.emplace_back(B->copy_val(A_vals[i])); - } -} -template -void CSR_to_COO(const CSRMatrix* A, COOMatrix* B, std::vector& A_vals, - std::vector& B_vals) -{ - B->n_rows = A->n_rows; - B->n_cols = A->n_cols; - B->nnz = A->nnz; - - B->idx1.clear(); - B->idx2.clear(); - B_vals.clear(); - - B->idx1.reserve(A->nnz); - B->idx2.reserve(A->nnz); - B_vals.reserve(A->nnz); - for (int i = 0; i < A->n_rows; i++) - { - int row_start = A->idx1[i]; - int row_end = A->idx1[i+1]; - for (int j = row_start; j < row_end; j++) - { - B->idx1.emplace_back(i); - B->idx2.emplace_back(A->idx2[j]); - B_vals.emplace_back(B->copy_val(A_vals[j])); - } - } -} -template -void CSC_to_COO(const CSCMatrix* A, COOMatrix* B, std::vector& A_vals, - std::vector& B_vals) -{ - B->n_rows = A->n_rows; - B->n_cols = A->n_cols; - B->nnz = A->nnz; - - B->idx1.clear(); - B->idx2.clear(); - B_vals.clear(); - - B->idx1.reserve(A->nnz); - B->idx2.reserve(A->nnz); - B_vals.reserve(A->nnz); - for (int i = 0; i < A->n_cols; i++) - { - int col_start = A->idx1[i]; - int col_end = A->idx1[i+1]; - for (int j = col_start; j < col_end; j++) - { - B->idx1.emplace_back(A->idx2[j]); - B->idx2.emplace_back(i); - B_vals.emplace_back(B->copy_val(A_vals[j])); - } - } - -} -template -void COO_to_CSR(const COOMatrix* A, CSRMatrix* B, std::vector& A_vals, - std::vector& B_vals) -{ - B->n_rows = A->n_rows; - B->n_cols = A->n_cols; 
- B->nnz = A->nnz; - - B->idx1.resize(B->n_rows + 1); - std::fill(B->idx1.begin(), B->idx1.end(), 0); - if (B->nnz) - { - B->idx2.resize(B->nnz); - if (A->data_size()) - B_vals.resize(B->nnz); - } - - // Calculate indptr - for (int i = 0; i < B->nnz; i++) - { - int row = A->idx1[i]; - B->idx1[row+1]++; - } - for (int i = 0; i < B->n_rows; i++) - { - B->idx1[i+1] += B->idx1[i]; - } - - // Add indices and data - std::vector ctr; - if (B->n_rows) - { - ctr.resize(B->n_rows, 0); - } - for (int i = 0; i < B->nnz; i++) - { - int row = A->idx1[i]; - int col = A->idx2[i]; - int index = B->idx1[row] + ctr[row]++; - B->idx2[index] = col; - if (A->data_size()) // Checking that matrix has values (not S) - { - B_vals[index] = B->copy_val(A_vals[i]); - } - } - -} -template -void CSR_to_CSR(const CSRMatrix* A, CSRMatrix* B, std::vector& A_vals, - std::vector& B_vals) -{ - B->n_rows = A->n_rows; - B->n_cols = A->n_cols; - B->nnz = A->nnz; - - B->idx1.resize(A->n_rows + 1); - B->idx2.resize(A->nnz); - B_vals.resize(A->nnz); - - B->idx1[0] = 0; - for (int i = 0; i < A->n_rows; i++) - { - B->idx1[i+1] = A->idx1[i+1]; - int row_start = B->idx1[i]; - int row_end = B->idx1[i+1]; - for (int j = row_start; j < row_end; j++) - { - B->idx2[j] = A->idx2[j]; - B_vals[j] = B->copy_val(A_vals[j]); - } - } - -} -template -void BSR_to_CSR(const BSRMatrix* A, CSRMatrix* B, std::vector& A_vals, - std::vector& B_vals) -{ - B->n_rows = A->n_rows * A->b_rows; - B->n_cols = A->n_cols * A->b_cols; - - B->idx1.resize(B->n_rows + 1); - B->idx2.reserve(A->nnz); - B->vals.reserve(A->nnz); - - T val; - int col; - B->idx1[0] = 0; - for (int i = 0; i < A->n_rows; i++) - { - int row_start = A->idx1[i]; - int row_end = A->idx1[i+1]; - for (int br = 0; br < A->b_rows; br++) - { - for (int j = row_start; j < row_end; j++) - { - for (int bc = 0; bc < A->b_cols; bc++) - { - val = A_vals[j][br*A->b_cols + bc]; - if (fabs(val) > zero_tol) - { - col = A->idx2[j]; - B->vals.emplace_back(val); - 
B->idx2.emplace_back(col*A->b_cols + bc); - } - } - } - B->idx1[i*A->b_rows + br+1] = B->idx2.size(); - } - } - B->nnz = B->vals.size(); - -} -template -void CSC_to_CSR(const CSCMatrix* A, CSRMatrix* B, std::vector& A_vals, - std::vector& B_vals) -{ - B->n_rows = A->n_rows; - B->n_cols = A->n_cols; - B->nnz = A->nnz; - - B->idx1.clear(); - B->idx2.clear(); - B_vals.clear(); - - // Resize vectors to appropriate dimensions - B->idx1.resize(A->n_rows + 1); - B->idx2.resize(A->nnz); - if (A->data_size()) - B_vals.resize(A->nnz); - - // Create indptr, summing number times row appears in CSC - for (int i = 0; i <= A->n_rows; i++) B->idx1[i] = 0; - for (int i = 0; i < A->nnz; i++) - { - B->idx1[A->idx2[i] + 1]++; - } - for (int i = 1; i <= A->n_rows; i++) - { - B->idx1[i] += B->idx1[i-1]; - } - - // Add values to indices and data - std::vector ctr(B->n_rows, 0); - for (int i = 0; i < A->n_cols; i++) - { - int col_start = A->idx1[i]; - int col_end = A->idx1[i+1]; - for (int j = col_start; j < col_end; j++) - { - int row = A->idx2[j]; - int idx = B->idx1[row] + ctr[row]++; - B->idx2[idx] = i; - if (A->data_size()) - { - B_vals[idx] = B->copy_val(A_vals[j]); - } - } - } - -} -template -void COO_to_CSC(const COOMatrix* A, CSCMatrix* B, std::vector& A_vals, - std::vector& B_vals) -{ - B->n_rows = A->n_rows; - B->n_cols = A->n_cols; - B->nnz = A->nnz; - - B->idx1.resize(B->n_cols + 1); - std::fill(B->idx1.begin(), B->idx1.end(), 0); - if (B->nnz) - { - B->idx2.resize(B->nnz); - if (A->data_size()) - B_vals.resize(B->nnz); - } - - // Calculate indptr - for (int i = 0; i < B->nnz; i++) - { - int col = A->idx1[i]; - B->idx1[col+1]++; - } - for (int i = 0; i < B->n_cols; i++) - { - B->idx1[i+1] += B->idx1[i]; - } - - // Add indices and data - std::vector ctr; - if (B->n_cols) - { - ctr.resize(B->n_cols, 0); - } - for (int i = 0; i < B->nnz; i++) - { - int col = A->idx1[i]; - int row = A->idx2[i]; - int index = B->idx1[col] + ctr[col]++; - B->idx2[index] = row; - if (A->data_size()) 
// Checking that matrix has values (not S) - { - B_vals[index] = B->copy_val(A_vals[i]); - } - } - -} -template -void CSR_to_CSC(const CSRMatrix* A, CSCMatrix* B, std::vector& A_vals, - std::vector& B_vals) -{ - B->n_rows = A->n_rows; - B->n_cols = A->n_cols; - B->nnz = A->nnz; - - B->idx1.clear(); - B->idx2.clear(); - B_vals.clear(); - - // Resize vectors to appropriate dimensions - B->idx1.resize(A->n_cols + 1); - B->idx2.resize(A->nnz); - if (A->data_size()) - B_vals.resize(A->nnz); - - // Create indptr, summing number times row appears in CSC - for (int i = 0; i <= A->n_cols; i++) B->idx1[i] = 0; - for (int i = 0; i < A->nnz; i++) - { - B->idx1[A->idx2[i] + 1]++; - } - for (int i = 1; i <= A->n_cols; i++) - { - B->idx1[i] += B->idx1[i-1]; - } - - // Add values to indices and data - std::vector ctr(B->n_cols, 0); - for (int i = 0; i < A->n_rows; i++) - { - int row_start = A->idx1[i]; - int row_end = A->idx1[i+1]; - for (int j = row_start; j < row_end; j++) - { - int col = A->idx2[j]; - int idx = B->idx1[col] + ctr[col]++; - B->idx2[idx] = i; - if (A->data_size()) - { - B_vals[idx] = B->copy_val(A_vals[j]); - } - } - } - -} -template -void CSC_to_CSC(const CSCMatrix* A, CSCMatrix* B, std::vector& A_vals, - std::vector& B_vals) -{ - B->n_rows = A->n_rows; - B->n_cols = A->n_cols; - B->nnz = A->nnz; - - B->idx1.resize(A->n_cols + 1); - B->idx2.resize(A->nnz); - B->vals.resize(A->nnz); - - B->idx1[0] = 0; - for (int i = 0; i < A->n_cols; i++) - { - int col_start = A->idx1[i]; - int col_end = A->idx1[i+1]; - B->idx1[i+1] = col_end; - for (int j = col_start; j < col_end; j++) - { - B->idx2[j] = A->idx2[j]; - B_vals[j] = B->copy_val(A_vals[j]); - } - } -} - - -/************************************************************** -***** Matrix Sort -************************************************************** -***** Sorts the sparse matrix by row and column -**************************************************************/ -template -void sort_helper(COOMatrix* A, 
std::vector& vals) -{ - if (A->sorted || A->nnz == 0) - { - A->sorted = true; - return; - } - - vec_sort(A->idx1, A->idx2, vals); - - A->sorted = true; - A->diag_first = false; - -} - -template -void sort_helper(CSRMatrix* A, std::vector& vals) -{ - int start, end, row_size; - - if (A->sorted || A->nnz == 0) - { - A->sorted = true; - return; - } - - // Sort the columns of each row (and data accordingly) - for (int row = 0; row < A->n_rows; row++) - { - start = A->idx1[row]; - end = A->idx1[row+1]; - row_size = end - start; - if (row_size == 0) - { - continue; - } - - if (A->data_size()) - vec_sort(A->idx2, vals, start, end); - else - std::sort(A->idx2.begin() + start, A->idx2.begin() + end); - } - - A->sorted = true; - A->diag_first = false; -} - -template -void sort_helper(CSCMatrix* A, std::vector& vals) -{ - int start, end, col_size; - - if (A->sorted || A->nnz == 0) - { - A->sorted = true; - return; - } - - // Sort the columns of each col (and data accordingly) and remove - // duplicates (summing values together) - for (int col = 0; col < A->n_cols; col++) - { - start = A->idx1[col]; - end = A->idx1[col+1]; - col_size = end - start; - if (col_size == 0) - { - continue; - } - - if (A->data_size()) - vec_sort(A->idx2, vals, start, end); - else - std::sort(A->idx2.begin() + start, A->idx2.begin() + end); - } - - A->sorted = true; - A->diag_first = false; -} - -void COOMatrix::sort() -{ - sort_helper(this, vals); -} -void BCOOMatrix::sort() -{ - sort_helper(this, block_vals); -} -void CSRMatrix::sort() -{ - sort_helper(this, vals); -} -void BSRMatrix::sort() -{ - sort_helper(this, block_vals); -} -void CSCMatrix::sort() -{ - sort_helper(this, vals); -} -void BSCMatrix::sort() -{ - sort_helper(this, block_vals); -} - - -/************************************************************** -***** Matrix Move Diagonal -************************************************************** -***** Moves the diagonal element to the front of each row -***** If matrix is not sorted, 
sorts before moving -**************************************************************/ -template -void move_diag_helper(COOMatrix* A, std::vector& vals) -{ - if (A->diag_first || A->nnz == 0) - { - return; - } - - if (!A->sorted) - { - A->sort(); - } - - int row_start, prev_row; - int row, col; - - // Move diagonal entry to first in row - row_start = 0; - prev_row = 0; - for (int i = 0; i < A->nnz; i++) - { - row = A->idx1[i]; - col = A->idx2[i]; - if (row != prev_row) - { - prev_row = row; - row_start = i; - } - else if (row == col) - { - auto tmp = vals[i]; - for (int j = i; j > row_start; j--) - { - A->idx2[j] = A->idx2[j-1]; - vals[j] = vals[j-1]; - } - A->idx2[row_start] = row; - vals[row_start] = tmp; - } - } - - A->diag_first = true; -} - -template -void move_diag_helper(CSRMatrix* A, std::vector& vals) -{ - int start, end; - int col; - - if (A->diag_first || A->nnz == 0) - { - return; - } - - // Move diagonal values to beginning of each row - if (A->data_size()) - { - for (int i = 0; i < A->n_rows; i++) - { - start = A->idx1[i]; - end = A->idx1[i+1]; - for (int j = start; j < end; j++) - { - col = A->idx2[j]; - if (col == i) - { - auto tmp = vals[j]; - for (int k = j; k > start; k--) - { - A->idx2[k] = A->idx2[k-1]; - vals[k] = vals[k-1]; - } - A->idx2[start] = i; - vals[start] = tmp; - break; - } - } - } - } - else - { - for (int i = 0; i < A->n_rows; i++) - { - start = A->idx1[i]; - end = A->idx1[i+1]; - for (int j = start; j < end; j++) - { - col = A->idx2[j]; - if (col == i) - { - for (int k = j; k > start; k--) - { - A->idx2[k] = A->idx2[k-1]; - } - A->idx2[start] = i; - break; - } - } - } - } - A->diag_first = true; -} - -template -void move_diag_helper(CSCMatrix* A, std::vector& vals) -{ - int start, end; - int row; - - if (A->diag_first || A->nnz == 0) - { - return; - } - - // Move diagonal values to beginning of each row - if (A->data_size()) - { - for (int i = 0; i < A->n_cols; i++) - { - start = A->idx1[i]; - end = A->idx1[i+1]; - for (int j = 
start; j < end; j++) - { - row = A->idx2[j]; - if (row == i) - { - auto tmp = vals[j]; - for (int k = j; k > start; k--) - { - A->idx2[k] = A->idx2[k-1]; - vals[k] = vals[k-1]; - } - A->idx2[start] = i; - vals[start] = tmp; - break; - } - } - } - } - else - { - for (int i = 0; i < A->n_cols; i++) - { - start = A->idx1[i]; - end = A->idx1[i+1]; - for (int j = start; j < end; j++) - { - row = A->idx2[j]; - if (row == i) - { - for (int k = j; k > start; k--) - { - A->idx2[k] = A->idx2[k-1]; - } - A->idx2[start] = i; - break; - } - } - } - } - A->diag_first = true; -} - -void COOMatrix::move_diag() -{ - move_diag_helper(this, vals); -} -void BCOOMatrix::move_diag() -{ - move_diag_helper(this, block_vals); -} -void CSRMatrix::move_diag() -{ - move_diag_helper(this, vals); -} -void BSRMatrix::move_diag() -{ - move_diag_helper(this, block_vals); -} -void CSCMatrix::move_diag() -{ - move_diag_helper(this, vals); -} -void BSCMatrix::move_diag() -{ - move_diag_helper(this, block_vals); -} - -/************************************************************** -***** Matrix Removes Duplicates -************************************************************** -***** Goes through each sorted row, and removes duplicate -***** entries, summing associated values -**************************************************************/ -template -void remove_duplicates_helper(COOMatrix* A, std::vector& vals) -{ - if (!A->sorted) - { - A->sort(); - A->diag_first = false; - } - - int prev_row, prev_col, ctr; - int row, col; - - // Remove duplicates (sum together) - prev_row = A->idx1[0]; - prev_col = A->idx2[0]; - ctr = 1; - for (int i = 1; i < A->nnz; i++) - { - row = A->idx1[i]; - col = A->idx2[i]; - if (row == prev_row && col == prev_col) - { - A->append_vals(&vals[ctr - 1], &vals[i]); - } - else - { - if (ctr != i) - { - A->idx1[ctr] = row; - A->idx2[ctr] = col; - vals[ctr] = vals[i]; - } - ctr++; - - prev_row = row; - prev_col = col; - } - } - - A->nnz = ctr; -} - -template -void 
remove_duplicates_helper(CSRMatrix* A, std::vector& vals) -{ - int orig_start, orig_end; - int new_start; - int col, prev_col; - int ctr, row_size; - - if (!A->sorted) - { - A->sort(); - A->diag_first = false; - } - - orig_start = A->idx1[0]; - for (int row = 0; row < A->n_rows; row++) - { - new_start = A->idx1[row]; - orig_end = A->idx1[row+1]; - row_size = orig_end - orig_start; - if (row_size == 0) - { - orig_start = orig_end; - A->idx1[row+1] = A->idx1[row]; - continue; - } - - // Remove Duplicates - col = A->idx2[orig_start]; - A->idx2[new_start] = col; - vals[new_start] = vals[orig_start]; - prev_col = col; - ctr = 1; - for (int j = orig_start + 1; j < orig_end; j++) - { - col = A->idx2[j]; - if (col == prev_col) - { - A->append_vals(&vals[ctr - 1 + new_start], &vals[j]); - } - else - { - if (A->abs_val(vals[ctr - 1 + new_start]) < zero_tol) - { - ctr--; - } - - A->idx2[ctr + new_start] = col; - vals[ctr + new_start] = vals[j]; - ctr++; - prev_col = col; - } - } - if (A->abs_val(vals[ctr - 1 + new_start]) < zero_tol) - { - ctr--; - } - - orig_start = orig_end; - A->idx1[row+1] = A->idx1[row] + ctr; - } - A->nnz = A->idx1[A->n_rows]; - A->idx2.resize(A->nnz); - vals.resize(A->nnz); -} - -template -void remove_duplicates_helper(CSCMatrix* A, std::vector& vals) -{ - int orig_start, orig_end; - int new_start; - int row, prev_row; - int ctr, col_size; - - if (!A->sorted) - { - A->sort(); - A->diag_first = false; - } - - orig_start = A->idx1[0]; - for (int col = 0; col < A->n_cols; col++) - { - new_start = A->idx1[col]; - orig_end = A->idx1[col+1]; - col_size = orig_end - orig_start; - if (col_size == 0) - { - orig_start = orig_end; - A->idx1[col+1] = A->idx1[col]; - continue; - } - - // Remove Duplicates - row = A->idx2[orig_start]; - A->idx2[new_start] = row; - vals[new_start] = vals[orig_start]; - prev_row = row; - ctr = 1; - for (int j = orig_start + 1; j < orig_end; j++) - { - row = A->idx2[j]; - if (row == prev_row) - { - A->append_vals(&vals[ctr - 1 + 
new_start], &vals[j]); - } - else - { - if (A->abs_val(vals[ctr - 1 + new_start]) < zero_tol) - { - ctr--; - } - - A->idx2[ctr + new_start] = row; - vals[ctr + new_start] = vals[j]; - ctr++; - prev_row = row; - } - } - if (A->abs_val(vals[ctr - 1 + new_start]) < zero_tol) - { - ctr--; - } - - orig_start = orig_end; - A->idx1[col+1] = A->idx1[col] + ctr; - } - A->nnz = A->idx1[A->n_cols]; - A->idx2.resize(A->nnz); - vals.resize(A->nnz); -} - -void COOMatrix::remove_duplicates() -{ - remove_duplicates_helper(this, vals); -} -void BCOOMatrix::remove_duplicates() -{ - remove_duplicates_helper(this, block_vals); -} -void CSRMatrix::remove_duplicates() -{ - remove_duplicates_helper(this, vals); -} -void BSRMatrix::remove_duplicates() -{ - remove_duplicates_helper(this, block_vals); -} -void CSCMatrix::remove_duplicates() -{ - remove_duplicates_helper(this, vals); -} -void BSCMatrix::remove_duplicates() -{ - remove_duplicates_helper(this, block_vals); -} - -/************************************************************** -***** Matrix Convert -************************************************************** -***** Convert from one type of matrix to another -***** No copies if matrix type remains the same -***** If blocked matrix, converts to block matrix -**************************************************************/ -COOMatrix* COOMatrix::to_COO() -{ - return this; -} -COOMatrix* COOMatrix::to_BCOO() -{ - return this->to_COO(); -} -COOMatrix* BCOOMatrix::to_COO() -{ - return this->to_BCOO(); -} -COOMatrix* BCOOMatrix::to_BCOO() -{ - return this; -} -CSRMatrix* COOMatrix::to_CSR() -{ - CSRMatrix* A = new CSRMatrix(); - COO_to_CSR(this, A, vals, A->vals); - return A; -} -CSRMatrix* COOMatrix::to_BSR() -{ - return this->to_CSR(); -} -CSRMatrix* BCOOMatrix::to_CSR() -{ - return this->to_BSR(); -} -CSRMatrix* BCOOMatrix::to_BSR() -{ - BSRMatrix* A = new BSRMatrix(); - A->b_rows = b_rows; - A->b_cols = b_cols; - A->b_size = b_size; - COO_to_CSR(this, A, block_vals, 
A->block_vals); - return A; -} -CSCMatrix* COOMatrix::to_CSC() -{ - CSCMatrix* A = new CSCMatrix(); - COO_to_CSC(this, A, vals, A->vals); - return A; -} -CSCMatrix* COOMatrix::to_BSC() -{ - return this->to_CSC(); -} -CSCMatrix* BCOOMatrix::to_CSC() -{ - return this->to_BSC(); -} -CSCMatrix* BCOOMatrix::to_BSC() -{ - BSCMatrix* A = new BSCMatrix(); - A->b_rows = b_rows; - A->b_cols = b_cols; - A->b_size = b_size; - COO_to_CSC(this, A, block_vals, A->block_vals); - return A; -} - -COOMatrix* CSRMatrix::to_COO() -{ - COOMatrix* A = new COOMatrix(); - CSR_to_COO(this, A, vals, A->vals); - return A; -} -COOMatrix* CSRMatrix::to_BCOO() -{ - return this->to_COO(); -} -COOMatrix* BSRMatrix::to_COO() -{ - return this->to_BCOO(); -} -COOMatrix* BSRMatrix::to_BCOO() -{ - BCOOMatrix* A = new BCOOMatrix(); - A->b_rows = b_rows; - A->b_cols = b_cols; - A->b_size = b_size; - CSR_to_COO(this, A, block_vals, A->block_vals); - return A; -} -CSRMatrix* CSRMatrix::to_CSR() -{ - return this; -} -CSRMatrix* CSRMatrix::to_BSR() -{ - return this->to_CSR(); -} -CSRMatrix* BSRMatrix::to_CSR() -{ - CSRMatrix* A = new CSRMatrix(); - BSR_to_CSR(this, A, block_vals, A->vals); - return A; -} -CSRMatrix* BSRMatrix::to_BSR() -{ - return this; -} -CSCMatrix* CSRMatrix::to_CSC() -{ - CSCMatrix* A = new CSCMatrix(); - CSR_to_CSC(this, A, vals, A->vals); - return A; -} -CSCMatrix* CSRMatrix::to_BSC() -{ - return this->to_CSC(); -} -CSCMatrix* BSRMatrix::to_CSC() -{ - return this->to_BSC(); -} -CSCMatrix* BSRMatrix::to_BSC() -{ - BSCMatrix* A = new BSCMatrix(); - A->b_rows = b_rows; - A->b_cols = b_cols; - A->b_size = b_size; - CSR_to_CSC(this, A, block_vals, A->block_vals); - return A; -} - -COOMatrix* CSCMatrix::to_COO() -{ - COOMatrix* A = new COOMatrix(); - CSC_to_COO(this, A, vals, A->vals); - return A; -} -COOMatrix* CSCMatrix::to_BCOO() -{ - return this->to_COO(); -} -COOMatrix* BSCMatrix::to_COO() -{ - return this->to_BCOO(); -} -COOMatrix* BSCMatrix::to_BCOO() -{ - BCOOMatrix* A = new 
BCOOMatrix(); - A->b_rows = b_rows; - A->b_cols = b_cols; - A->b_size = b_size; - CSC_to_COO(this, A, block_vals, A->block_vals); - return A; -} -CSRMatrix* CSCMatrix::to_CSR() -{ - CSRMatrix* A = new CSRMatrix(); - CSC_to_CSR(this, A, vals, A->vals); - return A; -} -CSRMatrix* CSCMatrix::to_BSR() -{ - return this->to_CSR(); -} -CSRMatrix* BSCMatrix::to_CSR() -{ - return this->to_BSR(); -} -CSRMatrix* BSCMatrix::to_BSR() -{ - BSRMatrix* A = new BSRMatrix(); - A->b_rows = b_rows; - A->b_cols = b_cols; - A->b_size = b_size; - CSC_to_CSR(this, A, block_vals, A->block_vals); - return A; -} -CSCMatrix* CSCMatrix::to_CSC() -{ - return this; -} -CSCMatrix* CSCMatrix::to_BSC() -{ - return this->to_CSC(); -} -CSCMatrix* BSCMatrix::to_CSC() -{ - return this->to_BSC(); -} -CSCMatrix* BSCMatrix::to_BSC() -{ - return this; -} - -/************************************************************** -***** Matrix Copy -**************************************************************/ -COOMatrix* COOMatrix::copy() -{ - COOMatrix* A = new COOMatrix(); - COO_to_COO(this, A, vals, A->vals); - return A; -} -BCOOMatrix* BCOOMatrix::copy() -{ - BCOOMatrix* A = new BCOOMatrix(); - A->b_rows = b_rows; - A->b_cols = b_cols; - A->b_size = b_size; - COO_to_COO(this, A, block_vals, A->block_vals); - return A; -} -CSRMatrix* CSRMatrix::copy() -{ - CSRMatrix* A = new CSRMatrix(); - CSR_to_CSR(this, A, vals, A->vals); - return A; -} -BSRMatrix* BSRMatrix::copy() -{ - BSRMatrix* A = new BSRMatrix(); - A->b_rows = b_rows; - A->b_cols = b_cols; - A->b_size = b_size; - CSR_to_CSR(this, A, block_vals, A->block_vals); - return A; -} -CSCMatrix* CSCMatrix::copy() -{ - CSCMatrix* A = new CSCMatrix(); - CSC_to_CSC(this, A, vals, A->vals); - return A; -} -BSCMatrix* BSCMatrix::copy() -{ - BSCMatrix* A = new BSCMatrix(); - A->b_rows = b_rows; - A->b_cols = b_cols; - A->b_size = b_size; - CSC_to_CSC(this, A, block_vals, A->block_vals); - return A; -} - 
-/************************************************************** -***** Matrix Block Removal -************************************************************** -***** Determines which columns were kept after removing -***** block structure from matrices -**************************************************************/ -void COOMatrix::block_removal_col_check(bool* col_check) -{ - for (int i = 0; i < n_cols * b_cols; i++) - { - col_check[i] = true; - } -} -void BCOOMatrix::block_removal_col_check(bool* col_check) -{ - for (int i = 0; i < n_cols * b_cols; i++) - { - col_check[i] = false; - } - - int idx, first_col; - double* block_val; - for (int i = 0; i < nnz; i++) - { - block_val = block_vals[i]; - for (int row = 0; row < b_rows; row++) - { - idx = row * b_cols; - first_col = idx2[i]*b_cols; - for (int col = 0; col < b_cols; col++) - { - if(fabs(block_val[idx + col]) > zero_tol) - { - col_check[first_col + col] = true; - } - } - } - } -} - -void CSCMatrix::block_removal_col_check(bool* col_check) -{ - for (int i = 0; i < n_cols * b_cols; i++) - { - col_check[i] = true; - } -} -void BSCMatrix::block_removal_col_check(bool* col_check) -{ - for (int i = 0; i < n_cols * b_cols; i++) - { - col_check[i] = false; - } - - int start, end, idx; - double* block_val; - for (int j = 0; j < n_cols; j++) - { - start = idx1[j]; - end = idx1[j+1]; - for (int row = 0; row < b_rows; row++) - { - idx = row * b_cols; - for (int i = start; i < end; i++) - { - block_val = block_vals[i]; - for (int col = 0; col < b_cols; col++) - { - if(fabs(block_val[idx + col]) > zero_tol) - { - col_check[j + col] = true; - } - } - } - } - } -} - -void CSRMatrix::block_removal_col_check(bool* col_check) -{ - for (int i = 0; i < n_cols * b_cols; i++) - { - col_check[i] = true; - } -} -void BSRMatrix::block_removal_col_check(bool* col_check) -{ - for (int i = 0; i < n_cols * b_cols; i++) - { - col_check[i] = false; - } - - int start, end, idx, first_col; - double* block_val; - for (int i = 0; i < n_rows; 
i++) - { - start = idx1[i]; - end = idx1[i+1]; - for (int row = 0; row < b_rows; row++) - { - idx = row * b_cols; - for (int j = start; j < end; j++) - { - first_col = idx2[j]*b_cols; - block_val = block_vals[j]; - for (int col = 0; col < b_cols; col++) - { - if(fabs(block_val[idx + col]) > zero_tol) - { - col_check[first_col + col] = true; - } - } - } - } - } -} diff --git a/raptor/core/matrix.hpp b/raptor/core/matrix.hpp deleted file mode 100644 index 40df47d7..00000000 --- a/raptor/core/matrix.hpp +++ /dev/null @@ -1,1353 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#ifndef RAPTOR_CORE_MATRIX_HPP -#define RAPTOR_CORE_MATRIX_HPP - -#include "types.hpp" -#include "vector.hpp" - -/************************************************************** - ***** Matrix Base Class - ************************************************************** - ***** This class constructs a sparse matrix, supporting simple linear - ***** algebra operations. - ***** - ***** Attributes - ***** ------------- - ***** n_rows : int - ***** Number of rows - ***** n_cols : int - ***** Number of columns - ***** nnz : int - ***** Number of nonzeros - ***** idx1 : std::vector - ***** List of position indices, specific to type of matrix - ***** idx2 : std::vector - ***** List of position indices, specific to type of matrix - ***** vals : std::vector - ***** List of values in matrix - ***** - ***** Methods - ***** ------- - ***** resize(int n_rows, int n_cols) - ***** Resizes dimension of matrix to passed parameters - ***** mult(Vector* x, Vector* b) - ***** Sparse matrix-vector multiplication b = A * x - ***** residual(Vector* x, Vector* b, Vector* r) - ***** Calculates the residual r = b - A * x - ***** - ***** Virtual Methods - ***** ------- - ***** format() - ***** Returns the format of the sparse matrix (COO, CSR, CSC) - ***** sort() - ***** Sorts the matrix by position. 
Whether row-wise or - ***** column-wise depends on matrix format. - ***** add_value(int row, int col, double val) - ***** Adds val to position (row, col) - ***** TODO -- make sure this is working for CSR/CSC - **************************************************************/ -namespace raptor -{ - // Forward Declaration of classes so objects can be used - class COOMatrix; - class CSRMatrix; - class CSCMatrix; - class Matrix - { - - public: - - /************************************************************** - ***** Matrix Base Class Constructor - ************************************************************** - ***** Sets matrix dimensions, and sets nnz to 0 - ***** - ***** Parameters - ***** ------------- - ***** _nrows : int - ***** Number of rows in matrix - ***** _ncols : int - ***** Number of cols in matrix - **************************************************************/ - Matrix(int _nrows, int _ncols) - { - n_rows = _nrows; - n_cols = _ncols; - nnz = 0; - sorted = false; - diag_first = false; - b_rows = 1; - b_cols = 1; - b_size = 1; - } - - /************************************************************** - ***** Matrix Base Class Constructor - ************************************************************** - ***** Sets matrix dimensions and nnz based on Matrix* A - ***** - ***** Parameters - ***** ------------- - ***** A : Matrix* - ***** Matrix to be copied - **************************************************************/ - Matrix() - { - n_rows = 0; - n_cols = 0; - nnz = 0; - sorted = false; - diag_first = false; - b_rows = 1; - b_cols = 1; - b_size = 1; - } - - virtual ~Matrix(){} - - template - void init_from_lists(std::vector& _idx1, std::vector& _idx2, - std::vector& data) - { - nnz = data.size(); - resize_data(nnz); - - T* val_list = (T*) get_data(); - - std::copy(_idx1.begin(), _idx1.end(), std::back_inserter(idx1)); - std::copy(_idx2.begin(), _idx2.end(), std::back_inserter(idx2)); - - for (int i = 0; i < nnz; i++) - { - val_list[i] = copy_val(data[i]); 
- } - } - - // Virtual Methods - virtual format_t format() = 0; - virtual void sort() = 0; - virtual void move_diag() = 0; - virtual void remove_duplicates() = 0; - virtual void print() = 0; - virtual CSRMatrix* to_CSR() = 0; - virtual CSCMatrix* to_CSC() = 0; - virtual COOMatrix* to_COO() = 0; - virtual CSRMatrix* to_BSR() = 0; - virtual CSCMatrix* to_BSC() = 0; - virtual COOMatrix* to_BCOO() = 0; - virtual void block_removal_col_check(bool* col_check) = 0; - virtual Matrix* copy() = 0; - - virtual void spmv(const double* x, double* b) const = 0; - virtual void spmv_append(const double* x, double* b) const = 0; - virtual void spmv_append_T(const double* x, double* b) const = 0; - virtual void spmv_append_neg(const double* x, double* b) const = 0; - virtual void spmv_append_neg_T(const double* x, double* b) const = 0; - virtual void spmv_residual(const double* x, const double* b, double* r) const = 0; - - virtual CSRMatrix* spgemm(CSRMatrix* B, int* B_to_C = NULL) = 0; - virtual CSRMatrix* spgemm_T(CSCMatrix* A, int* C_map = NULL) = 0; - virtual Matrix* transpose() = 0; - - double* get_values(Vector& x) const - { - return x.values.data(); - } - template T* get_values(std::vector& x) const - { - return x.data(); - } - template T* get_values(T* x) const - { - return x; - } - - // Method for printing the value at one position - // (either single or block value) - void val_print(int row, int col, double val) const - { - printf("A[%d][%d] = %e\n", row, col, val); - } - void val_print(int row, int col, double* val) const - { - for (int i = 0; i < b_rows; i++) - { - for (int j = 0; j < b_cols; j++) - { - printf("A[%d][%d], BlockPos[%d][%d] = %e\n", row, col, i, j, val[i*b_cols+j]); - } - } - } - - double copy_val(double val) const - { - return val; - } - double* copy_val(double* val) const - { - double* new_val = new double[b_size]; - for (int i = 0; i < b_size; i++) - { - new_val[i] = val[i]; - } - return new_val; - } - - // Method for finding the absolute value of - // 
either a single or block value - double abs_val(double val) const - { - return fabs(val); - } - double abs_val(double* val) const - { - double sum = 0; - for (int i = 0; i < b_size; i++) - { - sum += fabs(val[i]); - } - return sum; - } - - // Methods for appending two values - // (either single or block values) - void append_vals(double* val, double* addl_val) const - { - *val += *addl_val; - } - void append_vals(double** val, double** addl_val) const - { - for (int i = 0; i < b_size; i++) - { - *val[i] += *addl_val[i]; - } - delete[] *addl_val; - } - void mult_vals(double val, double addl_val, double* sum, - int nr, int nc0, int n_inner) const - { - *sum += (val * addl_val); - } - void mult_vals(double* val, double* addl_val, double** sum, - int nr, int nc, int n_inner) const - { - for (int i = 0; i < nr; i++) // Go through b_rows of A - { - for (int j = 0; j < nc; j++) // Go through b_cols of B - { - double s = 0; - for (int k = 0; k < n_inner; k++) // Go through b_cols of A (== b_rows of B) - { - s += val[i*n_inner + k] * addl_val[k*n_inner + j]; - } - (*sum)[i*nc + j] += s; - } - } - } - void mult_T_vals(double val, double addl_val, double* sum, - int nr, int nc, int n_inner) const - { - *sum += (val * addl_val); - } - void mult_T_vals(double* val, double* addl_val, double** sum, - int nr, int nc, int n_inner) const - { - for (int i = 0; i < nr; i++) // Go through b_rows of A - { - for (int j = 0; j < nc; j++) // Go through b_cols of B - { - double s = 0; - for (int k = 0; k < n_inner; k++) // Go through b_cols of A (== b_rows of B) - { - s += val[k*n_inner + i] * addl_val[k*n_inner + j]; - } - (*sum)[i*nc + j] += s; - } - } - } - - - void append(int _idx1, int _idx2, double* b, const double* x, const double val) const - { - b[_idx1] += val*x[_idx2]; - } - void append_T(int _idx1, int _idx2, double* b, const double* x, const double val) const - { - b[_idx2] += val*x[_idx1]; - } - void append_neg(int _idx1, int _idx2, double* b, const double* x, const double 
val) const - { - b[_idx1] -= val*x[_idx2]; - } - void append_neg_T(int _idx1, int _idx2, double* b, const double* x, const double val) const - { - b[_idx2] -= val*x[_idx1]; - } - void append(int _idx1, int _idx2, double* b, const double* x, const double* val) const - { - int first_row = _idx1*b_rows; - int first_col = _idx2*b_cols; - for (int row = 0; row < b_rows; row++) - { - for (int col = 0; col < b_cols; col++) - { - b[first_row + row] += (val[row * b_cols + col] * x[first_col + col]); - } - } - } - void append_T(int _idx1, int _idx2, double* b, const double* x, const double* val) const - { - int first_row = _idx1*b_rows; - int first_col = _idx2*b_cols; - - for (int row = 0; row < b_rows; row++) - { - double x_val = x[first_row + row]; - for (int col = 0; col < b_cols; col++) - { - b[first_col + col] += (val[row * b_cols + col] * x_val); - } - } - } - void append_neg(int _idx1, int _idx2, double* b, const double* x, const double* val) const - { - int first_row = _idx1*b_rows; - int first_col = _idx2*b_cols; - for (int row = 0; row < b_rows; row++) - { - for (int col = 0; col < b_cols; col++) - { - b[first_row + row] -= (val[row * b_cols + col] * x[first_col + col]); - } - } - } - void append_neg_T(int _idx1, int _idx2, double* b, const double* x, const double* val) const - { - int first_row = _idx1*b_rows; - int first_col = _idx1*b_cols; - for (int row = 0; row < b_rows; row++) - { - for (int col = 0; col < b_cols; col++) - { - b[first_col + col] -= (val[row * b_cols + col] * x[first_row + row]); - } - } - } - - template void mult(T& x, U& b) const - { - spmv(get_values(x), get_values(b)); - } - template void mult_T(T& x, U& b) const - { - int cols = n_cols * b_cols; - for (int i = 0; i < cols; i++) - b[i] = 0.0; - spmv_append_T(get_values(x), get_values(b)); - } - template void mult_append(T& x, U& b) const - { - spmv_append(get_values(x), get_values(b)); - } - template void mult_append_T(T& x, U& b) const - { - spmv_append_T(get_values(x), get_values(b)); - 
} - template void mult_append_neg(T& x, U& b) const - { - spmv_append_neg(get_values(x), get_values(b)); - } - template void mult_append_neg_T(T& x, U& b) const - { - spmv_append_neg_T(get_values(x), get_values(b)); - } - template void residual(T& x, U& b, V& r) const - { - spmv_residual(get_values(x), get_values(b), get_values(r)); - } - - CSRMatrix* mult(CSRMatrix* B, int* B_to_C = NULL); - CSRMatrix* mult(CSCMatrix* B, int* B_to_C = NULL); - CSRMatrix* mult(COOMatrix* B, int* B_to_C = NULL); - CSRMatrix* mult_T(CSCMatrix* A, int* C_map = NULL); - CSRMatrix* mult_T(CSRMatrix* A, int* C_map = NULL); - CSRMatrix* mult_T(COOMatrix* A, int* C_map = NULL); - - virtual void add_value(int row, int col, double value) = 0; - virtual void add_value(int row, int col, double* value) = 0; - - Matrix* add(CSRMatrix* A, bool remove_dup = true); - void add_append(CSRMatrix* A, CSRMatrix* C, bool remove_dup = true); - Matrix* subtract(CSRMatrix* A); - - void resize(int _n_rows, int _n_cols); - - virtual void resize_data(int size) = 0; - virtual void* get_data() = 0; - virtual int data_size() const = 0; - virtual void reserve_size(int size) = 0; - virtual double get_val(const int j, const int k) = 0; - - std::vector idx1; - std::vector idx2; - std::vector vals; - - int b_rows; - int b_cols; - int b_size; - - int n_rows; - int n_cols; - int nnz; - - bool sorted; - bool diag_first; - - }; - - -/************************************************************** - ***** COOMatrix Class (Inherits from Matrix Base Class) - ************************************************************** - ***** This class constructs a sparse matrix in COO format. - ***** - ***** Methods - ***** ------- - ***** format() - ***** Returns the format of the sparse matrix (COO) - ***** sort() - ***** Sorts the matrix by row, and by column within each row. 
- ***** add_value(int row, int col, double val) - ***** Adds val to position (row, col) - ***** rows() - ***** Returns std::vector& containing the rows corresponding - ***** to each nonzero - ***** cols() - ***** Returns std::vector& containing the cols corresponding - ***** to each nonzero - ***** data() - ***** Returns std::vector& containing the nonzero values - **************************************************************/ - class COOMatrix : public Matrix - { - - public: - - /************************************************************** - ***** COOMatrix Class Constructor - ************************************************************** - ***** Initializes an empty COOMatrix - ***** - ***** Parameters - ***** ------------- - ***** _nrows : int - ***** Number of rows in Matrix - ***** _ncols : int - ***** Number of columns in Matrix - ***** nnz_per_row : int - ***** Prediction of (approximately) number of nonzeros - ***** per row, used in reserving space - **************************************************************/ - COOMatrix(int _nrows, int _ncols, int nnz_per_row = 1) : Matrix(_nrows, _ncols) - { - int _nnz = nnz_per_row * _nrows; - if (_nnz) - { - idx1.reserve(_nnz); - idx2.reserve(_nnz); - vals.reserve(_nnz); - } - } - - COOMatrix(int _nrows, int _ncols, double* _data) : Matrix(_nrows, _ncols) - { - init_from_dense(_data); - } - - COOMatrix(int _nrows, int _ncols, std::vector& rows, std::vector& cols, - std::vector& data) : Matrix(_nrows, _ncols) - { - init_from_lists(rows, cols, data); - } - - COOMatrix() - { - } - - ~COOMatrix() - { - - } - - template - void init_from_dense(T* _data) - { - nnz = 0; - int nnz_dense = n_rows*n_cols; - - if (nnz_dense) - { - idx1.resize(nnz_dense); - idx2.resize(nnz_dense); - resize_data(nnz_dense); - } - - T* val_list = (T*) get_data(); - - for (int i = 0; i < n_rows; i++) - { - for (int j = 0; j < n_cols; j++) - { - int pos = i * n_cols + j; - if (abs_val(_data[pos]) > zero_tol) - { - idx1[nnz] = i; - idx2[nnz] = j; 
- val_list[nnz] = copy_val(_data[pos]); - nnz++; - } - } - } - } - - COOMatrix* transpose(); - - void print(); - - void sort(); - void move_diag(); - void remove_duplicates(); - - void spmv(const double* x, double* b) const; - void spmv_append(const double* x, double* b) const; - void spmv_append_T(const double* x, double* b) const; - void spmv_append_neg(const double* x, double* b) const; - void spmv_append_neg_T(const double* x, double* b) const; - void spmv_residual(const double* x, const double* b, double* r) const; - - CSRMatrix* spgemm(CSRMatrix* B, int* B_to_C = NULL); - CSRMatrix* spgemm_T(CSCMatrix* A, int* C_map = NULL); - - COOMatrix* to_COO(); - CSRMatrix* to_CSR(); - CSCMatrix* to_CSC(); - CSRMatrix* to_BSR(); - CSCMatrix* to_BSC(); - COOMatrix* to_BCOO(); - - void block_removal_col_check(bool* col_check); - - COOMatrix* copy(); - - void add_value(int row, int col, double value) - { - if (fabs(value) > zero_tol) - { - idx1.emplace_back(row); - idx2.emplace_back(col); - vals.emplace_back(value); - nnz++; - } - } - - void add_value(int row, int col, double* value) - { - idx1.emplace_back(row); - idx2.emplace_back(col); - vals.emplace_back(*value); - nnz++; - } - - format_t format() - { - return COO; - } - - void* get_data() - { - return vals.data(); - } - int data_size() const - { - return vals.size(); - } - - void resize_data(int size) - { - vals.resize(size); - } - - void reserve_size(int size) - { - idx1.reserve(size); - idx2.reserve(size); - vals.reserve(size); - } - - double get_val(const int j, const int k) - { - return vals[j]; - } -}; - - -/************************************************************** - ***** CSRMatrix Class (Inherits from Matrix Base Class) - ************************************************************** - ***** This class constructs a sparse matrix in CSR format. - ***** - ***** Methods - ***** ------- - ***** format() - ***** Returns the format of the sparse matrix (CSR) - ***** sort() - ***** Sorts the matrix. 
Already in row-wise order, but sorts - ***** the columns in each row. - ***** add_value(int row, int col, double val) - ***** TODO -- add this functionality - ***** indptr() - ***** Returns std::vector& row pointer. The ith element points to - ***** the index of indices() corresponding to the first column to lie on - ***** row i. - ***** indices() - ***** Returns std::vector& containing the cols corresponding - ***** to each nonzero - ***** data() - ***** Returns std::vector& containing the nonzero values - **************************************************************/ - class CSRMatrix : public Matrix - { - - public: - - /************************************************************** - ***** CSRMatrix Class Constructor - ************************************************************** - ***** Initializes an empty CSRMatrix - ***** - ***** Parameters - ***** ------------- - ***** _nrows : int - ***** Number of rows in Matrix - ***** _ncols : int - ***** Number of columns in Matrix - ***** nnz_per_row : int - ***** Prediction of (approximately) number of nonzeros - ***** per row, used in reserving space - **************************************************************/ - CSRMatrix(int _nrows, int _ncols, int _nnz = 0): Matrix(_nrows, _ncols) - { - idx1.resize(_nrows + 1); - if (_nnz) - { - idx2.reserve(_nnz); - vals.reserve(_nnz); - } - } - - CSRMatrix(int _nrows, int _ncols, double* _data) : Matrix(_nrows, _ncols) - { - init_from_dense(_data); - } - - CSRMatrix(int _nrows, int _ncols, std::vector& rowptr, - std::vector& cols, std::vector& data) : Matrix(_nrows, _ncols) - { - init_from_lists(rowptr, cols, data); - } - - CSRMatrix() - { - } - - ~CSRMatrix() - { - - } - - template - void init_from_dense(T* _data) - { - int nnz_dense = n_rows*n_cols; - idx1.resize(n_rows + 1); - if (nnz_dense) - { - idx2.resize(nnz_dense); - resize_data(nnz_dense); - } - - T* val_list = (T*) get_data(); - - idx1[0] = 0; - for (int i = 0; i < n_rows; i++) - { - for (int j = 0; j < n_cols; 
j++) - { - int pos = i * n_cols + j; - if (abs_val(_data[pos])) - { - idx2[nnz] = j; - val_list[nnz] = copy_val(_data[pos]); - nnz++; - } - } - idx1[i+1] = nnz; - } - } - - CSRMatrix* transpose(); - - void print(); - - void sort(); - void move_diag(); - void remove_duplicates(); - - void spmv(const double* x, double* b) const; - void spmv_append(const double* x, double* b) const; - void spmv_append_T(const double* x, double* b) const; - void spmv_append_neg(const double* x, double* b) const; - void spmv_append_neg_T(const double* x, double* b) const; - void spmv_residual(const double* x, const double* b, double* r) const; - - CSRMatrix* spgemm(CSRMatrix* B, int* B_to_C = NULL); - CSRMatrix* spgemm_T(CSCMatrix* A, int* C_map = NULL); - - CSRMatrix* add(CSRMatrix* A, bool remove_dup = true); - void add_append(CSRMatrix* A, CSRMatrix* C, bool remove_dup = true); - CSRMatrix* subtract(CSRMatrix* A); - - CSRMatrix* strength(strength_t strength_type = Classical, - double theta = 0.0, int num_variables = 1, int* variables = NULL); - CSRMatrix* aggregate(); - CSRMatrix* fit_candidates(data_t* B, data_t* R, int num_candidates, - double tol = 1e-10); - - COOMatrix* to_COO(); - CSRMatrix* to_CSR(); - CSCMatrix* to_CSC(); - CSRMatrix* to_BSR(); - CSCMatrix* to_BSC(); - COOMatrix* to_BCOO(); - - void block_removal_col_check(bool* col_check); - - CSRMatrix* copy(); - - format_t format() - { - return CSR; - } - - void add_value(int row, int col, double value) - { - if (fabs(value) > zero_tol) - { - idx2.emplace_back(col); - vals.emplace_back(value); - nnz++; - } - } - void add_value(int row, int col, double* value) - { - idx2.emplace_back(col); - vals.emplace_back(*value); - nnz++; - } - - void* get_data() - { - return vals.data(); - } - int data_size() const - { - return vals.size(); - } - void resize_data(int size) - { - vals.resize(size); - } - void reserve_size(int size) - { - idx2.reserve(size); - vals.reserve(size); - } - - double get_val(const int j, const int k) - { - 
return vals[j]; - } - -}; - -/************************************************************** - ***** CSCMatrix Class (Inherits from Matrix Base Class) - ************************************************************** - ***** This class constructs a sparse matrix in CSC format. - ***** - ***** Methods - ***** ------- - ***** format() - ***** Returns the format of the sparse matrix (CSC) - ***** sort() - ***** Sorts the matrix. Already in col-wise order, but sorts - ***** the rows in each column. - ***** add_value(int row, int col, double val) - ***** TODO -- add this functionality - ***** indptr() - ***** Returns std::vector& column pointer. The ith element points to - ***** the index of indices() corresponding to the first row to lie on - ***** column i. - ***** indices() - ***** Returns std::vector& containing the rows corresponding - ***** to each nonzero - ***** data() - ***** Returns std::vector& containing the nonzero values - **************************************************************/ - class CSCMatrix : public Matrix - { - - public: - - CSCMatrix(int _nrows, int _ncols, int _nnz = 0): Matrix(_nrows, _ncols) - { - idx1.resize(_ncols + 1); - if (_nnz) - { - idx2.reserve(_nnz); - vals.reserve(_nnz); - } - nnz = _nnz; - } - - CSCMatrix(int _nrows, int _ncols, double* _data) : Matrix(_nrows, _ncols) - { - init_from_dense(_data); - } - - CSCMatrix(int _nrows, int _ncols, std::vector& colptr, - std::vector& rows, std::vector& data) : Matrix(_nrows, _ncols) - { - init_from_lists(colptr, rows, data); - } - - CSCMatrix() - { - } - - ~CSCMatrix() - { - - } - - template - void init_from_dense(T* _data) - { - int nnz_dense = n_rows*n_cols; - - idx1.resize(n_cols + 1); - if (nnz_dense) - { - idx2.resize(nnz_dense); - resize_data(nnz_dense); - } - - T* val_list = (T*) get_data(); - - idx1[0] = 0; - for (int i = 0; i < n_cols; i++) - { - for (int j = 0; j < n_rows; j++) - { - int pos = i * n_cols + j; - if (abs_val(_data[pos]) > zero_tol) - { - idx2[nnz] = j; - 
val_list[nnz] = copy_val(_data[pos]); - nnz++; - } - } - idx1[i+1] = nnz; - } - - } - - CSCMatrix* transpose(); - void print(); - - void sort(); - void move_diag(); - void remove_duplicates(); - - void spmv(const double* x, double* b) const; - void spmv_append(const double* x, double* b) const; - void spmv_append_T(const double* x, double* b) const; - void spmv_append_neg(const double* x, double* b) const; - void spmv_append_neg_T(const double* x, double* b) const; - void spmv_residual(const double* x, const double* b, double* r) const; - - - CSRMatrix* spgemm(CSRMatrix* B, int* B_to_C = NULL); - CSRMatrix* spgemm_T(CSCMatrix* A, int* C_map = NULL); - - void jacobi(Vector& x, Vector& b, Vector& tmp, double omega = .667); - - COOMatrix* to_COO(); - CSRMatrix* to_CSR(); - CSCMatrix* to_CSC(); - CSRMatrix* to_BSR(); - CSCMatrix* to_BSC(); - COOMatrix* to_BCOO(); - - void block_removal_col_check(bool* col_check); - - CSCMatrix* copy(); - - format_t format() - { - return CSC; - } - - void add_value(int row, int col, double value) - { - if (fabs(value) > zero_tol) - { - idx2.emplace_back(row); - vals.emplace_back(value); - nnz++; - } - } - void add_value(int row, int col, double* value) - { - idx2.emplace_back(row); - vals.emplace_back(*value); - nnz++; - } - - void* get_data() - { - return vals.data(); - } - int data_size() const - { - return vals.size(); - } - void resize_data(int size) - { - vals.resize(size); - } - void reserve_size(int size) - { - idx2.reserve(size); - vals.reserve(size); - } - - double get_val(const int j, const int k) - { - return vals[j]; - } - - }; - - - - - -// Forward Declaration of Blocked Classes -class BCOOMatrix; -class BSRMatrix; -class BSCMatrix; - -class BSRMatrix : public CSRMatrix -{ - public: - BSRMatrix(int num_block_rows, int num_block_cols, int block_row_size, - int block_col_size, int _nnz = 1) - : CSRMatrix(num_block_rows, num_block_cols, 0) - { - b_rows = block_row_size; - b_cols = block_col_size; - b_size = b_rows * b_cols; - 
} - - BSRMatrix(int num_block_rows, int num_block_cols, - int block_row_size, int block_col_size, double** data) - : CSRMatrix(num_block_rows, num_block_cols, 0) - { - b_rows = block_row_size; - b_cols = block_col_size; - b_size = b_rows * b_cols; - - init_from_dense(data); - } - - - BSRMatrix(int num_block_rows, int num_block_cols, - int block_row_size, int block_col_size, std::vector& rowptr, - std::vector& cols, std::vector& data) - : CSRMatrix(num_block_rows, num_block_cols, 0) - { - b_rows = block_row_size; - b_cols = block_col_size; - b_size = b_rows * b_cols; - - init_from_lists(rowptr, cols, data); - } - - BSRMatrix(CSRMatrix* A, int block_row_size, int block_col_size) : CSRMatrix(A->n_rows / block_row_size, A->n_cols / block_col_size, 0) - { - b_rows = block_row_size; - b_cols = block_col_size; - b_size = b_rows * b_cols; - - // Convert CSR to BSR - std::vector idx(A->n_cols, -1); - for (int bsr_row = 0; bsr_row < n_rows; bsr_row++) - { - for (int block = 0; block < b_rows; block++) - { - int csr_row = bsr_row*b_rows+block; - - for (int j = A->idx1[csr_row]; j < A->idx1[csr_row+1]; j++) - { - int csr_col = A->idx2[j]; - int bsr_col = csr_col / b_rows; - if (idx[bsr_col] == -1) - { - idx[bsr_col] = idx2.size(); - idx2.push_back(bsr_col); - block_vals.push_back(new double[b_size]()); - } - int idx_row = csr_row % b_rows; - int idx_col = csr_col % b_cols; - block_vals[idx[bsr_col]][idx_row*b_rows + idx_col] = A->vals[j]; - } - } - idx1[bsr_row+1] = idx2.size(); - - // Reset IDX array for next BSR row - for (int j = idx1[bsr_row]; j < idx1[bsr_row+1]; j++) - idx[idx2[j]] = -1; - } - } - - - BSRMatrix() : CSRMatrix() - { - b_rows = 1; - b_cols = 1; - b_size = 1; - } - - ~BSRMatrix() - { - for (std::vector::iterator it = block_vals.begin(); - it != block_vals.end(); ++it) - delete[] *it; - } - - BSRMatrix* transpose(); - void sort(); - void remove_duplicates(); - void move_diag(); - - COOMatrix* to_COO(); - CSRMatrix* to_CSR(); - CSCMatrix* to_CSC(); - 
CSRMatrix* to_BSR(); - CSCMatrix* to_BSC(); - COOMatrix* to_BCOO(); - - void block_removal_col_check(bool* col_check); - - void print(); - BSRMatrix* copy(); - - BSRMatrix* spgemm(CSRMatrix* B, int* B_to_C = NULL); - BSRMatrix* spgemm_T(CSCMatrix* A, int* C_map = NULL); - - void spmv(const double* x, double* b) const; - void spmv_append(const double* x, double* b) const; - void spmv_append_T(const double* x, double* b) const; - void spmv_append_neg(const double* x, double* b) const; - void spmv_append_neg_T(const double* x, double* b) const; - void spmv_residual(const double* x, const double* b, double* r) const; - - format_t format() - { - return BSR; - } - - void add_value(int row, int col, double* value) - { - idx2.emplace_back(col); - block_vals.emplace_back(copy_val(value)); - nnz++; - } - - void* get_data() - { - return block_vals.data(); - } - int data_size() const - { - return block_vals.size(); - } - void resize_data(int size) - { - block_vals.resize(size); - } - void reserve_size(int size) - { - idx2.reserve(size); - block_vals.reserve(size); - } - - double get_val(const int j, const int k) - { - return block_vals[j][k]; - } - - std::vector block_vals; -}; - -class BCOOMatrix : public COOMatrix -{ - public: - BCOOMatrix(int num_block_rows, int num_block_cols, int block_row_size, - int block_col_size, int nnz_per_block_row = 1) - : COOMatrix(num_block_rows, num_block_cols, 0) - { - b_rows = block_row_size; - b_cols = block_col_size; - b_size = b_rows * b_cols; - } - - BCOOMatrix(int num_block_rows, int num_block_cols, - int block_row_size, int block_col_size, double** values) - : COOMatrix(num_block_rows, num_block_cols, 0) - { - b_rows = block_row_size; - b_cols = block_col_size; - b_size = b_rows * b_cols; - - init_from_dense(values); - } - - BCOOMatrix(int num_block_rows, int num_block_cols, - int block_row_size, int block_col_size, - std::vector& rows, std::vector& cols, - std::vector& data) - : COOMatrix(num_block_rows, num_block_cols, 0) - { - b_rows 
= block_row_size; - b_cols = block_col_size; - b_size = b_rows * b_cols; - - init_from_lists(rows, cols, data); - } - - BCOOMatrix() : COOMatrix() - { - b_rows = 1; - b_cols = 1; - b_size = 1; - } - - ~BCOOMatrix() - { - for (std::vector::iterator it = block_vals.begin(); - it != block_vals.end(); ++it) - delete[] *it; - } - - BCOOMatrix* transpose(); - void sort(); - void remove_duplicates(); - void move_diag(); - - void print(); - BCOOMatrix* copy(); - COOMatrix* to_COO(); - CSRMatrix* to_CSR(); - CSCMatrix* to_CSC(); - CSRMatrix* to_BSR(); - CSCMatrix* to_BSC(); - COOMatrix* to_BCOO(); - - void block_removal_col_check(bool* col_check); - - BSRMatrix* spgemm(CSRMatrix* B, int* B_to_C = NULL); - BSRMatrix* spgemm_T(CSCMatrix* A, int* C_map = NULL); - - void spmv(const double* x, double* b) const; - void spmv_append(const double* x, double* b) const; - void spmv_append_T(const double* x, double* b) const; - void spmv_append_neg(const double* x, double* b) const; - void spmv_append_neg_T(const double* x, double* b) const; - void spmv_residual(const double* x, const double* b, double* r) const; - - void add_value(int row, int col, double* values) - { - idx1.emplace_back(row); - idx2.emplace_back(col); - block_vals.emplace_back(copy_val(values)); - nnz++; - } - - format_t format() - { - return BCOO; - } - - void* get_data() - { - return block_vals.data(); - } - int data_size() const - { - return block_vals.size(); - } - void resize_data(int size) - { - block_vals.resize(size); - } - void reserve_size(int size) - { - idx1.reserve(size); - idx2.reserve(size); - block_vals.reserve(size); - } - - double get_val(const int j, const int k) - { - return block_vals[j][k]; - } - - std::vector block_vals; -}; - -// Blocks are still stored row-wise in BSC matrix... 
-class BSCMatrix : public CSCMatrix -{ - public: - BSCMatrix(int num_block_rows, int num_block_cols, int block_row_size, - int block_col_size, int _nnz = 1) - : CSCMatrix(num_block_rows, num_block_cols, 0) - { - b_rows = block_row_size; - b_cols = block_col_size; - b_size = b_rows * b_cols; - } - - BSCMatrix(int num_block_rows, int num_block_cols, - int block_row_size, int block_col_size, double** data) - : CSCMatrix(num_block_rows, num_block_cols, 0) - { - b_rows = block_row_size; - b_cols = block_col_size; - b_size = b_rows * b_cols; - - init_from_dense(data); - } - - - BSCMatrix(int num_block_rows, int num_block_cols, - int block_row_size, int block_col_size, std::vector& colptr, - std::vector& rows, std::vector& data) - : CSCMatrix(num_block_rows, num_block_cols, 0) - { - b_rows = block_row_size; - b_cols = block_col_size; - b_size = b_rows * b_cols; - - init_from_lists(colptr, rows, data); - } - - - BSCMatrix() : CSCMatrix() - { - b_rows = 1; - b_cols = 1; - b_size = 1; - } - - ~BSCMatrix() - { - for (std::vector::iterator it = block_vals.begin(); - it != block_vals.end(); ++it) - delete[] *it; - } - - BSCMatrix* transpose(); - void sort(); - void remove_duplicates(); - void move_diag(); - - COOMatrix* to_COO(); - CSRMatrix* to_CSR(); - CSCMatrix* to_CSC(); - CSRMatrix* to_BSR(); - CSCMatrix* to_BSC(); - COOMatrix* to_BCOO(); - - void block_removal_col_check(bool* col_check); - - void print(); - BSCMatrix* copy(); - - BSRMatrix* spgemm(CSRMatrix* B, int* B_to_C = NULL); - BSRMatrix* spgemm_T(CSCMatrix* A, int* C_map = NULL); - - void spmv(const double* x, double* b) const; - void spmv_append(const double* x, double* b) const; - void spmv_append_T(const double* x, double* b) const; - void spmv_append_neg(const double* x, double* b) const; - void spmv_append_neg_T(const double* x, double* b) const; - void spmv_residual(const double* x, const double* b, double* r) const; - - format_t format() - { - return BSC; - } - - void add_value(int row, int col, double* 
value) - { - idx2.emplace_back(row); - block_vals.emplace_back(copy_val(value)); - nnz++; - } - - void* get_data() - { - return block_vals.data(); - } - void resize_data(int size) - { - block_vals.resize(size); - } - int data_size() const - { - return block_vals.size(); - } - void reserve_size(int size) - { - idx2.reserve(size); - block_vals.reserve(size); - } - - double get_val(const int j, const int k) - { - return block_vals[j][k]; - } - - std::vector block_vals; -}; - - - -} - -#endif - diff --git a/raptor/core/mpi_types.cpp b/raptor/core/mpi_types.cpp deleted file mode 100644 index bd3c935c..00000000 --- a/raptor/core/mpi_types.cpp +++ /dev/null @@ -1,331 +0,0 @@ -bool profile = false; -double collective_t = 0.0; -double p2p_t = 0.0; -double* current_t; -double mat_t = 0.0; -double vec_t = 0.0; -double total_t = 0.0; -double new_comm_t = 0.0; - -#include -#include "mpi_types.hpp" - -void init_profile() -{ - profile = true; - reset_profile(); -} -void reset_profile() -{ - collective_t = 0.0; - p2p_t = 0.0; - mat_t = 0.0; - vec_t = 0.0; - new_comm_t = 0.0; - if (profile) total_t = -MPI_Wtime(); - else total_t = 0.0; -} -void finalize_profile() -{ - profile = false; - total_t += MPI_Wtime(); -} -void average_profile(int n_iter) -{ - total_t /= n_iter; - collective_t /= n_iter; - p2p_t /= n_iter; - vec_t /= n_iter; - mat_t /= n_iter; - new_comm_t /= n_iter; -} -void print_profile(const char* string) -{ - int rank; - double t0; - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - - MPI_Allreduce(&total_t, &t0, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); - if (rank == 0) printf("%s Total Time: %e\n", string, t0); - if (fabs(t0 - total_t) > zero_tol) - reset_profile(); - MPI_Reduce(&collective_t, &t0, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - if (rank == 0 && t0 > 0) printf("%s Collective Comm Time: %e\n", string, t0); - MPI_Reduce(&p2p_t, &t0, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - if (rank == 0 && t0 > 0) printf("%s P2P Comm Time: %e\n", string, t0); - 
MPI_Reduce(&vec_t, &t0, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - if (rank == 0 && t0 > 0) printf("%s Vec Comm Time: %e\n", string, t0); - MPI_Reduce(&mat_t, &t0, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - if (rank == 0 && t0 > 0) printf("%s Mat Comm Time: %e\n", string, t0); -} - - -// Collective Methods -int RAPtor_MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, - RAPtor_MPI_Datatype datatype, RAPtor_MPI_Op op, RAPtor_MPI_Comm comm) -{ - if (profile) collective_t -= RAPtor_MPI_Wtime(); - int val = MPI_Allreduce(sendbuf, recvbuf, count, datatype, op, comm); - if (profile) collective_t += RAPtor_MPI_Wtime(); - return val; -} -int RAPtor_MPI_Reduce(const void *sendbuf, void *recvbuf, int count, - RAPtor_MPI_Datatype datatype, RAPtor_MPI_Op op, int root, RAPtor_MPI_Comm comm) -{ - if (profile) collective_t -= RAPtor_MPI_Wtime(); - int val = MPI_Reduce(sendbuf, recvbuf, count, datatype, op, root, comm); - if (profile) collective_t += RAPtor_MPI_Wtime(); - return val; -} -int RAPtor_MPI_Gather(const void *sendbuf, int sendcount, RAPtor_MPI_Datatype sendtype, - void *recvbuf, int recvcount, RAPtor_MPI_Datatype recvtype, int root, RAPtor_MPI_Comm comm) -{ - if (profile) collective_t -= RAPtor_MPI_Wtime(); - int val = MPI_Gather(sendbuf, sendcount, sendtype, recvbuf, recvcount, - recvtype, root, comm); - if (profile) collective_t += RAPtor_MPI_Wtime(); - return val; -} -int RAPtor_MPI_Allgather(const void* sendbuf, int sendcount, RAPtor_MPI_Datatype sendtype, - void *recvbuf, int recvcount, RAPtor_MPI_Datatype recvtype, RAPtor_MPI_Comm comm) -{ - if (profile) collective_t -= RAPtor_MPI_Wtime(); - int val = MPI_Allgather(sendbuf, sendcount, sendtype, recvbuf, - recvcount, recvtype, comm); - if (profile) collective_t += RAPtor_MPI_Wtime(); - return val; -} -int RAPtor_MPI_Allgatherv(const void* sendbuf, int sendcount, RAPtor_MPI_Datatype sendtype, - void *recvbuf, const int *recvcounts, const int* displs, - RAPtor_MPI_Datatype recvtype, RAPtor_MPI_Comm 
comm) -{ - if (profile) collective_t -= RAPtor_MPI_Wtime(); - int val = MPI_Allgatherv(sendbuf, sendcount, sendtype, recvbuf, recvcounts, - displs, recvtype, comm); - if (profile) collective_t += RAPtor_MPI_Wtime(); - return val; -} -int RAPtor_MPI_Iallreduce(const void *sendbuf, void *recvbuf, int count, - RAPtor_MPI_Datatype datatype, RAPtor_MPI_Op op, RAPtor_MPI_Comm comm, RAPtor_MPI_Request* request) -{ - if (profile) collective_t -= RAPtor_MPI_Wtime(); - int val = MPI_Iallreduce(sendbuf, recvbuf, count, datatype, op, comm, request); - if (profile) collective_t += RAPtor_MPI_Wtime(); - if (profile) current_t = &collective_t; - return val; -} -int RAPtor_MPI_Bcast(void *buffer, int count, RAPtor_MPI_Datatype datatype, - int root, RAPtor_MPI_Comm comm) -{ - if (profile) collective_t -= RAPtor_MPI_Wtime(); - int val = MPI_Bcast(buffer, count, datatype, root, comm); - if (profile) collective_t += RAPtor_MPI_Wtime(); - return val; -} -int RAPtor_MPI_Ibarrier(RAPtor_MPI_Comm comm, RAPtor_MPI_Request *request) -{ - if (profile) collective_t -= RAPtor_MPI_Wtime(); - int val = MPI_Ibarrier(comm, request); - if (profile) collective_t += RAPtor_MPI_Wtime(); - if (profile) current_t = &collective_t; - return val; -} -int RAPtor_MPI_Barrier(RAPtor_MPI_Comm comm) -{ - if (profile) collective_t -= RAPtor_MPI_Wtime(); - int val = MPI_Barrier(comm); - if (profile) collective_t += RAPtor_MPI_Wtime(); - return val; -} - - - -// Point-to-Point Methods -int RAPtor_MPI_Send(const void *buf, int count, RAPtor_MPI_Datatype datatype, int dest, - int tag, RAPtor_MPI_Comm comm) -{ - if (profile) p2p_t -= RAPtor_MPI_Wtime(); - int val = MPI_Send(buf, count, datatype, dest, tag, comm); - if (profile) p2p_t += RAPtor_MPI_Wtime(); - return val; -} -int RAPtor_MPI_Isend(const void *buf, int count, RAPtor_MPI_Datatype datatype, int dest, int tag, - RAPtor_MPI_Comm comm, RAPtor_MPI_Request * request) -{ - if (profile) p2p_t -= RAPtor_MPI_Wtime(); - int val = MPI_Isend(buf, count, datatype, 
dest, tag, comm, request); - if (profile) p2p_t += RAPtor_MPI_Wtime(); - if (profile) current_t = &p2p_t; - return val; -} -int RAPtor_MPI_Issend(const void *buf, int count, RAPtor_MPI_Datatype datatype, int dest, int tag, - RAPtor_MPI_Comm comm, RAPtor_MPI_Request * request) -{ - if (profile) p2p_t -= RAPtor_MPI_Wtime(); - int val = MPI_Issend(buf, count, datatype, dest, tag, comm, request); - if (profile) p2p_t += RAPtor_MPI_Wtime(); - if (profile) current_t = &p2p_t; - return val; -} -int RAPtor_MPI_Recv(void *buf, int count, RAPtor_MPI_Datatype datatype, int source, int tag, - RAPtor_MPI_Comm comm, RAPtor_MPI_Status * status) -{ - if (profile) p2p_t -= RAPtor_MPI_Wtime(); - int val = MPI_Recv(buf, count, datatype, source, tag, comm, status); - if (profile) p2p_t += RAPtor_MPI_Wtime(); - return val; -} -int RAPtor_MPI_Irecv(void *buf, int count, RAPtor_MPI_Datatype datatype, int source, - int tag, RAPtor_MPI_Comm comm, RAPtor_MPI_Request * request) -{ - if (profile) p2p_t -= RAPtor_MPI_Wtime(); - int val = MPI_Irecv(buf, count, datatype, source, tag, comm, request); - if (profile) p2p_t += RAPtor_MPI_Wtime(); - if (profile) current_t = &p2p_t; - return val; -} -int RAPtor_MPI_Probe(int source, int tag, RAPtor_MPI_Comm comm, RAPtor_MPI_Status* status) -{ - if (profile) p2p_t -= RAPtor_MPI_Wtime(); - int val = MPI_Probe(source, tag, comm, status); - if (profile) p2p_t += RAPtor_MPI_Wtime(); - return val; -} -int RAPtor_MPI_Iprobe(int source, int tag, RAPtor_MPI_Comm comm, - int *flag, RAPtor_MPI_Status *status) -{ - if (profile) p2p_t -= RAPtor_MPI_Wtime(); - int val = MPI_Iprobe(source, tag, comm, flag, status); - if (profile) p2p_t += RAPtor_MPI_Wtime(); - if (profile) current_t = &p2p_t; - return val; -} - - - -// Waiting for completion -int RAPtor_MPI_Wait(RAPtor_MPI_Request *request, RAPtor_MPI_Status *status) -{ - if (profile) *current_t -= RAPtor_MPI_Wtime(); - int val = MPI_Wait(request, status); - if (profile) *current_t += RAPtor_MPI_Wtime(); - return 
val; -} -int RAPtor_MPI_Waitall(int count, RAPtor_MPI_Request array_of_requests[], RAPtor_MPI_Status array_of_statuses[]) -{ - if (profile) *current_t -= RAPtor_MPI_Wtime(); - int val = MPI_Waitall(count, array_of_requests, array_of_statuses); - if (profile) *current_t += RAPtor_MPI_Wtime(); - return val; -} -int RAPtor_MPI_Test(MPI_Request *request, int *flag, MPI_Status *status) -{ - if (profile) *current_t -= RAPtor_MPI_Wtime(); - int val = MPI_Test(request, flag, status); - if (profile) *current_t += RAPtor_MPI_Wtime(); - return val; -} -int RAPtor_MPI_Testall(int count, MPI_Request array_of_requests[], - int* flag, MPI_Status array_of_statuses[]) -{ - if (profile) *current_t -= RAPtor_MPI_Wtime(); - int val = MPI_Testall(count, array_of_requests, flag, array_of_statuses); - if (profile) *current_t += RAPtor_MPI_Wtime(); - return val; -} - - -// Packing/Unpacking Data -int RAPtor_MPI_Pack(const void *inbuf, int incount, - RAPtor_MPI_Datatype datatype, void *outbuf, int outside, int *position, - RAPtor_MPI_Comm comm) -{ - return MPI_Pack(inbuf, incount, datatype, outbuf, outside, position, comm); -} -int RAPtor_MPI_Unpack(const void *inbuf, int insize, int *position, - void *outbuf, int outcount, RAPtor_MPI_Datatype datatype, RAPtor_MPI_Comm comm) -{ - return MPI_Unpack(inbuf, insize, position, outbuf, outcount, datatype, comm); -} -int RAPtor_MPI_Pack_size(int incount, RAPtor_MPI_Datatype datatype, - RAPtor_MPI_Comm comm, int *size) -{ - return MPI_Pack_size(incount, datatype, comm, size); -} - - -// Other utilities (no communication) -double RAPtor_MPI_Wtime() -{ - return MPI_Wtime(); -} -int RAPtor_MPI_Get_count(const RAPtor_MPI_Status *status, - RAPtor_MPI_Datatype datatype, int *count) -{ - return MPI_Get_count(status, datatype, count); -} -int RAPtor_MPI_Comm_rank(RAPtor_MPI_Comm comm, int* rank) -{ - return MPI_Comm_rank(comm, rank); -} -int RAPtor_MPI_Comm_size(RAPtor_MPI_Comm comm, int* size) -{ - return MPI_Comm_size(comm, size); -} - - - -// Creating 
New Communicator -int RAPtor_MPI_Comm_split(RAPtor_MPI_Comm comm, int color, int key, - RAPtor_MPI_Comm* new_comm) -{ - if (profile) new_comm_t -= RAPtor_MPI_Wtime(); - int val = MPI_Comm_split(comm, color, key, new_comm); - if (profile) new_comm_t += RAPtor_MPI_Wtime(); - return val; -} -int RAPtor_MPI_Comm_group(RAPtor_MPI_Comm comm, RAPtor_MPI_Group *group) -{ - if (profile) new_comm_t -= RAPtor_MPI_Wtime(); - int val = MPI_Comm_group(comm, group); - if (profile) new_comm_t += RAPtor_MPI_Wtime(); - return val; -} -int RAPtor_MPI_Comm_create_group(RAPtor_MPI_Comm comm, RAPtor_MPI_Group group, - int tag, RAPtor_MPI_Comm* newcomm) -{ - if (profile) new_comm_t -= RAPtor_MPI_Wtime(); - int val = MPI_Comm_create_group(comm, group, tag, newcomm); - if (profile) new_comm_t += RAPtor_MPI_Wtime(); - return val; -} -int RAPtor_MPI_Group_incl(RAPtor_MPI_Group group, int n, const int ranks[], - RAPtor_MPI_Group *newgroup) -{ - if (profile) new_comm_t -= RAPtor_MPI_Wtime(); - int val = MPI_Group_incl(group, n, ranks, newgroup); - if (profile) new_comm_t += RAPtor_MPI_Wtime(); - return val; -} -int RAPtor_MPI_Comm_free(RAPtor_MPI_Comm *comm) -{ - if (profile) new_comm_t -= RAPtor_MPI_Wtime(); - int val = MPI_Comm_free(comm); - if (profile) new_comm_t += RAPtor_MPI_Wtime(); - return val; -} -int RAPtor_MPI_Group_free(RAPtor_MPI_Group* group) -{ - if (profile) new_comm_t -= RAPtor_MPI_Wtime(); - int val = MPI_Group_free(group); - if (profile) new_comm_t += RAPtor_MPI_Wtime(); - return val; -} -int RAPtor_MPI_Comm_dup(MPI_Comm comm, MPI_Comm* new_comm) -{ - if (profile) new_comm_t -= RAPtor_MPI_Wtime(); - int val = MPI_Comm_dup(comm, new_comm); - if (profile) new_comm_t += RAPtor_MPI_Wtime(); - return val; -} - diff --git a/raptor/core/mpi_types.hpp b/raptor/core/mpi_types.hpp deleted file mode 100644 index 8fbcdad1..00000000 --- a/raptor/core/mpi_types.hpp +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, 
http://opensource.org/licenses/BSD-2-Clause -#ifndef RAPTOR_CORE_MPI_TYPES_HPP_ -#define RAPTOR_CORE_MPI_TYPES_HPP_ - -#include "types.hpp" -#include - -// Global Timing Variables -extern bool profile; -extern double collective_t; -extern double p2p_t; -extern double* current_t; -extern double mat_t; -extern double vec_t; -extern double total_t; - -extern void init_profile(); -extern void reset_profile(); -extern void finalize_profile(); -extern void print_profile(const char* string); -extern void average_profile(int n_iter); - -#define RAPtor_MPI_COMM_WORLD MPI_COMM_WORLD -#define RAPtor_MPI_COMM_NULL MPI_COMM_NULL - -#define RAPtor_MPI_Comm MPI_Comm -#define RAPtor_MPI_Group MPI_Group -#define RAPtor_MPI_Datatype MPI_Datatype -#define RAPtor_MPI_Request MPI_Request -#define RAPtor_MPI_Status MPI_Status -#define RAPtor_MPI_Op MPI_Op - -#define RAPtor_MPI_INT MPI_INT -#define RAPtor_MPI_DOUBLE MPI_DOUBLE -#define RAPtor_MPI_DOUBLE_INT MPI_DOUBLE_INT -#define RAPtor_MPI_LONG MPI_LONG -#define RAPtor_MPI_PACKED MPI_PACKED - -#define RAPtor_MPI_STATUS_IGNORE MPI_STATUS_IGNORE -#define RAPtor_MPI_STATUSES_IGNORE MPI_STATUSES_IGNORE - -#define RAPtor_MPI_SOURCE MPI_SOURCE -#define RAPtor_MPI_ANY_SOURCE MPI_ANY_SOURCE - -#define RAPtor_MPI_IN_PLACE MPI_IN_PLACE -#define RAPtor_MPI_SUM MPI_SUM -#define RAPtor_MPI_MAX MPI_MAX -#define RAPtor_MPI_BOR MPI_BOR - - -// MPI Information -extern int RAPtor_MPI_Comm_rank(RAPtor_MPI_Comm comm, int *rank); -extern int RAPtor_MPI_Comm_size(RAPtor_MPI_Comm comm, int *size); - -// Collective Operations -extern int RAPtor_MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, - RAPtor_MPI_Datatype datatype, RAPtor_MPI_Op op, RAPtor_MPI_Comm comm); -extern int RAPtor_MPI_Reduce(const void *sendbuf, void *recvbuf, int count, - RAPtor_MPI_Datatype datatype, RAPtor_MPI_Op op, int root, - RAPtor_MPI_Comm comm); -extern int RAPtor_MPI_Gather(const void *sendbuf, int sendcount, - RAPtor_MPI_Datatype sendtype, void *recvbuf, int 
recvcount, - RAPtor_MPI_Datatype recvtype, int root, RAPtor_MPI_Comm comm); -extern int RAPtor_MPI_Allgather(const void* sendbuf, int sendcount, - RAPtor_MPI_Datatype sendtype, void *recvbuf, int recvcount, - RAPtor_MPI_Datatype recvtype, RAPtor_MPI_Comm comm); -extern int RAPtor_MPI_Allgatherv(const void* sendbuf, int sendcount, - RAPtor_MPI_Datatype sendtype, void *recvbuf, const int *recvcounts, - const int* displs, RAPtor_MPI_Datatype recvtype, RAPtor_MPI_Comm comm); -extern int RAPtor_MPI_Iallreduce(const void *sendbuf, void *recvbuf, int count, - RAPtor_MPI_Datatype datatype, RAPtor_MPI_Op op, RAPtor_MPI_Comm comm, - RAPtor_MPI_Request* request); -extern int RAPtor_MPI_Ibarrier(RAPtor_MPI_Comm comm, - RAPtor_MPI_Request *request); -extern int RAPtor_MPI_Barrier(RAPtor_MPI_Comm comm); -extern int RAPtor_MPI_Bcast(void *buffer, int count, RAPtor_MPI_Datatype datatype, - int root, RAPtor_MPI_Comm comm); - -// Point-to-Point Operations -extern int RAPtor_MPI_Send(const void *buf, int count, - RAPtor_MPI_Datatype datatype, int dest, int tag, RAPtor_MPI_Comm comm); -extern int RAPtor_MPI_Isend(const void *buf, int count, - RAPtor_MPI_Datatype datatype, int dest, int tag, RAPtor_MPI_Comm comm, - RAPtor_MPI_Request * request); -extern int RAPtor_MPI_Issend(const void *buf, int count, - RAPtor_MPI_Datatype datatype, int dest, int tag, RAPtor_MPI_Comm comm, - RAPtor_MPI_Request * request); -extern int RAPtor_MPI_Recv(void *buf, int count, RAPtor_MPI_Datatype datatype, - int source, int tag, RAPtor_MPI_Comm comm, RAPtor_MPI_Status * status); -extern int RAPtor_MPI_Irecv(void *buf, int count, RAPtor_MPI_Datatype datatype, - int source, int tag, RAPtor_MPI_Comm comm, RAPtor_MPI_Request * request); - -// Waiting for data -extern int RAPtor_MPI_Wait(RAPtor_MPI_Request *request, - RAPtor_MPI_Status *status); -extern int RAPtor_MPI_Waitall(int count, RAPtor_MPI_Request array_of_requests[], - RAPtor_MPI_Status array_of_statuses[]); -extern int RAPtor_MPI_Probe(int source, int 
tag, RAPtor_MPI_Comm comm, - RAPtor_MPI_Status* status); -extern int RAPtor_MPI_Iprobe(int source, int tag, RAPtor_MPI_Comm comm, - int *flag, RAPtor_MPI_Status *status); -extern int RAPtor_MPI_Test(MPI_Request *request, int *flag, MPI_Status *status); -extern int RAPtor_MPI_Testall(int count, MPI_Request array_of_requests[], - int* flag, MPI_Status array_of_statuses[]); - -// Packing Data -extern int RAPtor_MPI_Pack(const void *inbuf, int incount, - RAPtor_MPI_Datatype datatype, void *outbuf, int outside, int *position, - RAPtor_MPI_Comm comm); -extern int RAPtor_MPI_Unpack(const void *inbuf, int insize, int *position, - void *outbuf, int outcount, RAPtor_MPI_Datatype datatype, RAPtor_MPI_Comm comm); -extern int RAPtor_MPI_Get_count(const RAPtor_MPI_Status *status, - RAPtor_MPI_Datatype datatype, int *count); -extern int RAPtor_MPI_Pack_size(int incount, RAPtor_MPI_Datatype datatype, - RAPtor_MPI_Comm comm, int *size); - -// Timing Data -extern double RAPtor_MPI_Wtime(); - -// Creating Communicators -extern int RAPtor_MPI_Comm_free(RAPtor_MPI_Comm *comm); -extern int RAPtor_MPI_Comm_split(RAPtor_MPI_Comm comm, int color, int key, - RAPtor_MPI_Comm* new_comm); -extern int RAPtor_MPI_Comm_group(RAPtor_MPI_Comm comm, RAPtor_MPI_Group *group); -extern int RAPtor_MPI_Comm_create_group(RAPtor_MPI_Comm comm, RAPtor_MPI_Group group, - int tag, RAPtor_MPI_Comm* newcomm); -extern int RAPtor_MPI_Group_incl(RAPtor_MPI_Group group, int n, const int ranks[], - RAPtor_MPI_Group *newgroup); -extern int RAPtor_MPI_Group_free(RAPtor_MPI_Group* group); -extern int RAPtor_MPI_Comm_dup(MPI_Comm comm, MPI_Comm* new_comm); - -#endif diff --git a/raptor/core/par_matrix.cpp b/raptor/core/par_matrix.cpp deleted file mode 100644 index edb1e611..00000000 --- a/raptor/core/par_matrix.cpp +++ /dev/null @@ -1,1116 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#include "par_matrix.hpp" - -using namespace 
raptor; - -// Declare private methods -void bsr_to_csr_copy_helper(ParBSRMatrix* A, ParCSRMatrix* B); - -/************************************************************** -***** ParMatrix Add Value -************************************************************** -***** Adds a value to the local portion of the parallel matrix, -***** determining whether it should be added to diagonal or -***** off-diagonal block. -***** -***** Parameters -***** ------------- -***** row : index_t -***** Local row of value -***** global_col : index_t -***** Global column of value -***** value : data_t -***** Value to be added to parallel matrix -**************************************************************/ -void ParMatrix::add_value( - int row, - index_t global_col, - data_t value) -{ - if (global_col >= partition->first_local_col - && global_col <= partition->last_local_col) - { - on_proc->add_value(row, global_col - partition->first_local_col, value); - } - else - { - off_proc->add_value(row, global_col, value); - } -} - -/************************************************************** -***** ParMatrix Add Global Value -************************************************************** -***** Adds a value to the local portion of the parallel matrix, -***** determining whether it should be added to diagonal or -***** off-diagonal block. 
-***** -***** Parameters -***** ------------- -***** global_row : index_t -***** Global row of value -***** global_col : index_t -***** Global column of value -***** value : data_t -***** Value to be added to parallel matrix -**************************************************************/ -void ParMatrix::add_global_value( - index_t global_row, - index_t global_col, - data_t value) -{ - add_value(global_row - partition->first_local_row, global_col, value); -} - -/************************************************************** -***** ParMatrix Finalize -************************************************************** -***** Finalizes the diagonal and off-diagonal matrices. Sorts -***** the local_to_global indices, and creates the parallel -***** communicator -***** -***** Parameters -***** ------------- -***** create_comm : bool (optional) -***** Boolean for whether parallel communicator should be -***** created (default is true) -**************************************************************/ -void ParMatrix::condense_off_proc() -{ - if (off_proc->nnz == 0) - { - return; - } - - int prev_col = -1; - - std::map orig_to_new; - - std::copy(off_proc->idx2.begin(), off_proc->idx2.end(), - std::back_inserter(off_proc_column_map)); - std::sort(off_proc_column_map.begin(), off_proc_column_map.end()); - - off_proc_num_cols = 0; - for (std::vector::iterator it = off_proc_column_map.begin(); - it != off_proc_column_map.end(); ++it) - { - if (*it != prev_col) - { - orig_to_new[*it] = off_proc_num_cols; - off_proc_column_map[off_proc_num_cols++] = *it; - prev_col = *it; - } - } - off_proc_column_map.resize(off_proc_num_cols); - - for (std::vector::iterator it = off_proc->idx2.begin(); - it != off_proc->idx2.end(); ++it) - { - *it = orig_to_new[*it]; - } -} - -void ParMatrix::finalize(bool create_comm) -{ - on_proc->sort(); - on_proc->remove_duplicates(); - off_proc->sort(); - off_proc->remove_duplicates(); - - int rank, num_procs; - RAPtor_MPI_Comm_size(RAPtor_MPI_COMM_WORLD, 
&num_procs); - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - - // Assume nonzeros in each on_proc column - if (on_proc_num_cols > (int)on_proc_column_map.size()) - { - on_proc_column_map.resize(on_proc_num_cols); - for (int i = 0; i < on_proc_num_cols; i++) - { - on_proc_column_map[i] = i + partition->first_local_col; - } - } - - if (local_num_rows > (int)local_row_map.size()) - { - local_row_map.resize(local_num_rows); - for (int i = 0; i < local_num_rows; i++) - { - local_row_map[i] = i + partition->first_local_row; - } - } - - // Condense columns in off_proc, storing global - // columns as 0-num_cols, and store mapping - if (off_proc->nnz) - { - condense_off_proc(); - } - else - { - off_proc_num_cols = 0; - } - off_proc->resize(local_num_rows, off_proc_num_cols); - local_nnz = on_proc->nnz + off_proc->nnz; - - if (create_comm){ - comm = new ParComm(partition, off_proc_column_map); - } - else - comm = new ParComm(partition); -} - -int* ParMatrix::map_partition_to_local() -{ - int* on_proc_partition_to_col = new int[partition->local_num_cols+1]; - for (int i = 0; i < partition->local_num_cols+1; i++) on_proc_partition_to_col[i] = -1; - for (int i = 0; i < on_proc_num_cols; i++) - { - on_proc_partition_to_col[on_proc_column_map[i] - partition->first_local_col] = i; - } - - return on_proc_partition_to_col; -} - - -/************************************************************** -***** ParBSRMatrix to ParCSRMatrix Convert -**************************************************************/ -void bsr_to_csr_copy_helper(ParBSRMatrix* A, ParCSRMatrix* B) -{ - if (B->on_proc) - { - delete B->on_proc; - } - if (B->off_proc) - { - delete B->off_proc; - } - - // Convert on and off proc to CSR - B->on_proc = A->on_proc->to_CSR(); - B->off_proc = A->off_proc->to_CSR(); - - B->local_nnz = B->on_proc->nnz + B->off_proc->nnz; - B->global_num_rows = A->global_num_rows * A->on_proc->b_rows; - B->global_num_cols = A->global_num_cols * A->on_proc->b_cols; - - B->on_proc_num_cols 
= B->on_proc->n_cols; - B->off_proc_num_cols = B->off_proc->n_cols; - - // Updated partition - B->partition = new Partition(B->global_num_rows, B->global_num_cols, - B->on_proc->n_rows, B->on_proc->n_cols, - A->partition->first_local_row * A->on_proc->b_rows, - A->partition->first_local_col * A->on_proc->b_cols); - B->local_num_rows = B->partition->local_num_rows; - - // Updated column and row maps - - B->finalize(false); - - // Determine which cols of blocks are non-zero - bool* off_proc_nz_cols = new bool[A->off_proc_num_cols * A->off_proc->b_cols]; - A->off_proc->block_removal_col_check(off_proc_nz_cols); - - // Update off_proc_column_map - int first_col; - int off_proc_map_indx = 0; - for (int i = 0; i < A->off_proc_num_cols; i++) - { - first_col = A->off_proc_column_map[i] * A->off_proc->b_cols; - for (int j = 0; j < A->off_proc->b_cols; j++) - { - if (off_proc_nz_cols[i*A->off_proc->b_cols + j]) - { - B->off_proc_column_map[off_proc_map_indx] = first_col + j; - off_proc_map_indx++; - } - } - } - - // Updated how communicators are created - if (A->comm) - { - B->comm = new ParComm(B->partition, B->off_proc_column_map, B->on_proc_column_map); - } - else - { - B->comm = NULL; - } - - if (A->tap_comm) - { - B->tap_comm = new TAPComm(B->partition, B->off_proc_column_map, B->on_proc_column_map); - } - else - { - B->tap_comm = NULL; - } - - if (A->tap_mat_comm) - { - B->tap_mat_comm = new TAPComm(B->partition, B->off_proc_column_map, B->on_proc_column_map); - } - else - { - B->tap_mat_comm = NULL; - } - - delete[] off_proc_nz_cols; -} - - - -/************************************************************** -***** ParMatrix Convert -************************************************************** -***** Convert from one type of parmatrix to another -***** No copies if parmatrix type remains the same -***** If blocked parmatrix, converts to block matrix -**************************************************************/ -ParCOOMatrix* ParCOOMatrix::to_ParCOO() -{ - return 
this; -} -ParCOOMatrix* ParCOOMatrix::to_ParBCOO() -{ - return this->to_ParCOO(); -} -ParCOOMatrix* ParBCOOMatrix::to_ParCOO() -{ - return this->to_ParBCOO(); -} -ParCOOMatrix* ParBCOOMatrix::to_ParBCOO() -{ - return this; -} -ParCSRMatrix* ParCOOMatrix::to_ParCSR() -{ - ParCSRMatrix* A = new ParCSRMatrix(); - A->copy_helper(this); - return A; -} -ParCSRMatrix* ParCOOMatrix::to_ParBSR() -{ - return this->to_ParCSR(); -} -ParCSRMatrix* ParBCOOMatrix::to_ParCSR() -{ - return this->to_ParBSR(); -} -ParCSRMatrix* ParBCOOMatrix::to_ParBSR() -{ - ParBSRMatrix* A = new ParBSRMatrix(); - A->copy_helper(this); - return A; -} -ParCSCMatrix* ParCOOMatrix::to_ParCSC() -{ - ParCSCMatrix* A = new ParCSCMatrix(); - A->copy_helper(this); - return A; -} -ParCSCMatrix* ParCOOMatrix::to_ParBSC() -{ - return this->to_ParCSC(); -} -ParCSCMatrix* ParBCOOMatrix::to_ParCSC() -{ - return this->to_ParBSC(); -} -ParCSCMatrix* ParBCOOMatrix::to_ParBSC() -{ - ParBSCMatrix* A = new ParBSCMatrix(); - A->copy_helper(this); - return A; -} - -ParCOOMatrix* ParCSRMatrix::to_ParCOO() -{ - ParCOOMatrix* A = new ParCOOMatrix(); - A->copy_helper(this); - return A; -} -ParCOOMatrix* ParCSRMatrix::to_ParBCOO() -{ - return this->to_ParCOO(); -} -ParCOOMatrix* ParBSRMatrix::to_ParCOO() -{ - return this->to_ParBCOO(); -} -ParCOOMatrix* ParBSRMatrix::to_ParBCOO() -{ - ParBCOOMatrix* A = new ParBCOOMatrix(); - A->copy_helper(this); - return A; -} -ParCSRMatrix* ParCSRMatrix::to_ParCSR() -{ - return this; -} -ParCSRMatrix* ParCSRMatrix::to_ParBSR() -{ - return this->to_ParCSR(); -} -ParCSRMatrix* ParBSRMatrix::to_ParCSR() -{ - ParCSRMatrix* A = new ParCSRMatrix(); - bsr_to_csr_copy_helper(this, A); - return A; -} -ParCSRMatrix* ParBSRMatrix::to_ParBSR() -{ - return this; -} -ParCSCMatrix* ParCSRMatrix::to_ParCSC() -{ - ParCSCMatrix* A = new ParCSCMatrix(); - A->copy_helper(this); - return A; -} -ParCSCMatrix* ParCSRMatrix::to_ParBSC() -{ - return this->to_ParCSC(); -} -ParCSCMatrix* ParBSRMatrix::to_ParCSC() -{ 
- return this->to_ParBSC(); -} -ParCSCMatrix* ParBSRMatrix::to_ParBSC() -{ - ParBSCMatrix* A = new ParBSCMatrix(); - A->copy_helper(this); - return A; -} - -ParCOOMatrix* ParCSCMatrix::to_ParCOO() -{ - ParCOOMatrix* A = new ParCOOMatrix(); - A->copy_helper(this); - return A; -} -ParCOOMatrix* ParCSCMatrix::to_ParBCOO() -{ - return this->to_ParCOO(); -} -ParCOOMatrix* ParBSCMatrix::to_ParCOO() -{ - return this->to_ParBCOO(); -} -ParCOOMatrix* ParBSCMatrix::to_ParBCOO() -{ - ParBCOOMatrix* A = new ParBCOOMatrix(); - A->copy_helper(this); - return A; -} -ParCSRMatrix* ParCSCMatrix::to_ParCSR() -{ - ParCSRMatrix* A = new ParCSRMatrix(); - A->copy_helper(this); - return A; -} -ParCSRMatrix* ParCSCMatrix::to_ParBSR() -{ - return this->to_ParCSR(); -} -ParCSRMatrix* ParBSCMatrix::to_ParCSR() -{ - return this->to_ParBSR(); -} -ParCSRMatrix* ParBSCMatrix::to_ParBSR() -{ - ParBSRMatrix* A = new ParBSRMatrix(); - A->copy_helper(this); - return A; -} -ParCSCMatrix* ParCSCMatrix::to_ParCSC() -{ - return this; -} -ParCSCMatrix* ParCSCMatrix::to_ParBSC() -{ - return this->to_ParCSC(); -} -ParCSCMatrix* ParBSCMatrix::to_ParCSC() -{ - return this->to_ParBSC(); -} -ParCSCMatrix* ParBSCMatrix::to_ParBSC() -{ - return this; -} - - -void ParCSRMatrix::copy_structure(ParBSRMatrix* A) -{ - on_proc->idx1.clear(); - on_proc->idx2.clear(); - off_proc->idx1.clear(); - off_proc->idx2.clear(); - - std::copy(A->on_proc->idx1.begin(), A->on_proc->idx1.end(), - std::back_inserter(on_proc->idx1)); - std::copy(A->on_proc->idx2.begin(), A->on_proc->idx2.end(), - std::back_inserter(on_proc->idx2)); - - std::copy(A->off_proc->idx1.begin(), A->off_proc->idx1.end(), - std::back_inserter(off_proc->idx1)); - std::copy(A->off_proc->idx2.begin(), A->off_proc->idx2.end(), - std::back_inserter(off_proc->idx2)); - - on_proc->n_rows = A->on_proc->n_rows; - on_proc->n_cols = A->on_proc->n_cols; - on_proc->nnz = A->on_proc->nnz; - - off_proc->n_rows = A->off_proc->n_rows; - off_proc->n_cols = A->off_proc->n_cols; 
- off_proc->nnz = A->off_proc->nnz; - - ParMatrix::copy_helper(A); -} - - -void ParMatrix::default_copy_helper(ParMatrix* A) -{ - partition = A->partition; - partition->num_shared++; - - local_nnz = A->local_nnz; - local_num_rows = A->local_num_rows; - global_num_rows = A->global_num_rows; - global_num_cols = A->global_num_cols; - - std::copy(A->off_proc_column_map.begin(), A->off_proc_column_map.end(), - std::back_inserter(off_proc_column_map)); - std::copy(A->on_proc_column_map.begin(), A->on_proc_column_map.end(), - std::back_inserter(on_proc_column_map)); - std::copy(A->local_row_map.begin(), A->local_row_map.end(), - std::back_inserter(local_row_map)); - - off_proc_num_cols = off_proc_column_map.size(); - on_proc_num_cols = on_proc_column_map.size(); - - if (A->comm) - { - comm = A->comm; - comm->num_shared++; - } - else - { - comm = NULL; - } - - if (A->tap_comm) - { - tap_comm = A->tap_comm; - tap_comm->num_shared++; - } - else - { - tap_comm = NULL; - } - - if (A->tap_mat_comm) - { - tap_mat_comm = A->tap_mat_comm; - tap_mat_comm->num_shared++; - } - else - { - tap_mat_comm = NULL; - } -} - -void ParMatrix::copy_helper(ParCOOMatrix* A) -{ - default_copy_helper(A); -} -void ParMatrix::copy_helper(ParCSRMatrix* A) -{ - default_copy_helper(A); -} -void ParMatrix::copy_helper(ParCSCMatrix* A) -{ - default_copy_helper(A); -} - - -void ParCOOMatrix::copy_helper(ParCOOMatrix* A) -{ - if (on_proc) - { - delete on_proc; - } - if (off_proc) - { - delete off_proc; - } - - on_proc = A->on_proc->copy(); - off_proc = A->off_proc->copy(); - - ParMatrix::copy_helper(A); -} - -void ParCOOMatrix::copy_helper(ParCSRMatrix* A) -{ - if (on_proc) - { - delete on_proc; - } - if (off_proc) - { - delete off_proc; - } - - on_proc = A->on_proc->to_COO(); - off_proc = A->off_proc->to_COO(); - - ParMatrix::copy_helper(A); -} - -void ParCOOMatrix::copy_helper(ParCSCMatrix* A) -{ - if (on_proc) - { - delete on_proc; - } - if (off_proc) - { - delete off_proc; - } - - on_proc = 
A->on_proc->to_COO(); - off_proc = A->off_proc->to_COO(); - - ParMatrix::copy_helper(A); -} - -void ParCSRMatrix::copy_helper(ParCSRMatrix* A) -{ - if (on_proc) - { - delete on_proc; - } - if (off_proc) - { - delete off_proc; - } - - on_proc = A->on_proc->copy(); - off_proc = A->off_proc->copy(); - - ParMatrix::copy_helper(A); -} - -void ParCSRMatrix::copy_helper(ParCSCMatrix* A) -{ - if (on_proc) - { - delete on_proc; - } - if (off_proc) - { - delete off_proc; - } - - on_proc = A->on_proc->to_CSR(); - off_proc = A->off_proc->to_CSR(); - - ParMatrix::copy_helper(A); -} - -void ParCSRMatrix::copy_helper(ParCOOMatrix* A) -{ - if (on_proc) - { - delete on_proc; - } - if (off_proc) - { - delete off_proc; - } - - on_proc = A->on_proc->to_CSR(); - off_proc = A->off_proc->to_CSR(); - - ParMatrix::copy_helper(A); -} - -void ParCSCMatrix::copy_helper(ParCSRMatrix* A) -{ - if (on_proc) - { - delete on_proc; - } - if (off_proc) - { - delete off_proc; - } - - on_proc = A->on_proc->to_CSC(); - off_proc = A->off_proc->to_CSC(); - - ParMatrix::copy_helper(A); -} - -void ParCSCMatrix::copy_helper(ParCSCMatrix* A) -{ - if (on_proc) - { - delete on_proc; - } - if (off_proc) - { - delete off_proc; - } - - on_proc = A->on_proc->copy(); - off_proc = A->off_proc->copy(); - - ParMatrix::copy_helper(A); -} - -void ParCSCMatrix::copy_helper(ParCOOMatrix* A) -{ - if (on_proc) - { - delete on_proc; - } - if (off_proc) - { - delete off_proc; - } - - on_proc = A->on_proc->to_CSC(); - off_proc = A->off_proc->to_CSC(); - - ParMatrix::copy_helper(A); -} - -// Main transpose -ParCSRMatrix* ParCSRMatrix::transpose() -{ - int start, end; - int proc; - int col, col_start, col_end; - int ctr, prev_ctr, size, bytes; - int col_count, count; - int col_size; - int idx, row; - RAPtor_MPI_Status recv_status; - - Partition* part_T; - Matrix* on_proc_T; - Matrix* off_proc_T; - CSCMatrix* send_mat; - CSCMatrix* recv_mat; - ParCSRMatrix* T = NULL; - - std::vector send_buffer; - std::vector recv_buffer; - - // 
Transpose partition - part_T = partition->transpose(); - - // Transpose local (on_proc) matrix - on_proc_T = on_proc->transpose(); - - // Allocate vectors for sending off_proc matrix - send_mat = off_proc->to_CSC(); - recv_mat = new CSCMatrix(local_num_rows, comm->send_data->size_msgs); - - int int_size, dbl_size; - MPI_Pack_size(1, RAPtor_MPI_INT, comm->mpi_comm, &int_size); - MPI_Pack_size(1, RAPtor_MPI_DOUBLE, comm->mpi_comm, &dbl_size); - - bytes = 0; - for (int i = 0; i < comm->recv_data->num_msgs; i++) - { - start = comm->recv_data->indptr[i]; - end = comm->recv_data->indptr[i+1]; - for (col = start; col < end; col++) - { - col_start = send_mat->idx1[col]; - col_end = send_mat->idx1[col+1]; - col_size = col_end - col_start; - bytes += col_size * (int_size + dbl_size) + int_size; - } - } - send_buffer.resize(bytes); - std::vector send_ptr(comm->recv_data->num_msgs+1); - - // Add off_proc cols of matrix to send buffer - ctr = 0; - prev_ctr = 0; - for (int i = 0; i < comm->recv_data->num_msgs; i++) - { - proc = comm->recv_data->procs[i]; - start = comm->recv_data->indptr[i]; - end = comm->recv_data->indptr[i+1]; - for (col = start; col < end; col++) - { - col_start = send_mat->idx1[col]; - col_end = send_mat->idx1[col+1]; - col_size = col_end - col_start; - RAPtor_MPI_Pack(&col_size, 1, RAPtor_MPI_INT, send_buffer.data(), bytes, &ctr, comm->mpi_comm); - for (int k = col_start; k < col_end; k++) - { - RAPtor_MPI_Pack(&(local_row_map[send_mat->idx2[k]]), 1, RAPtor_MPI_INT, - send_buffer.data(), bytes, &ctr, comm->mpi_comm); - } - RAPtor_MPI_Pack(&(send_mat->vals[col_start]), col_end - col_start, RAPtor_MPI_DOUBLE, - send_buffer.data(), bytes, &ctr, comm->mpi_comm); - } - - RAPtor_MPI_Isend(&(send_buffer[prev_ctr]), ctr - prev_ctr, RAPtor_MPI_PACKED, proc, - comm->key, comm->mpi_comm, &(comm->recv_data->requests[i])); - prev_ctr = ctr; - } - - col_count = 0; - recv_mat->idx1[0] = 0; - recv_mat->nnz = 0; - for (int i = 0; i < comm->send_data->num_msgs; i++) - { - 
proc = comm->send_data->procs[i]; - start = comm->send_data->indptr[i]; - end = comm->send_data->indptr[i+1]; - size = end - start; - RAPtor_MPI_Probe(proc, comm->key, comm->mpi_comm, &recv_status); - RAPtor_MPI_Get_count(&recv_status, RAPtor_MPI_PACKED, &count); - if (count > (int)recv_buffer.size()) - { - recv_buffer.resize(count); - } - RAPtor_MPI_Recv(&(recv_buffer[0]), count, RAPtor_MPI_PACKED, proc, - comm->key, comm->mpi_comm, &recv_status); - ctr = 0; - for (int j = 0; j < size; j++) - { - RAPtor_MPI_Unpack(recv_buffer.data(), count, &ctr, &col_size, 1, RAPtor_MPI_INT, comm->mpi_comm); - recv_mat->idx2.resize(recv_mat->nnz + col_size); - recv_mat->vals.resize(recv_mat->nnz + col_size); - RAPtor_MPI_Unpack(recv_buffer.data(), count, &ctr, &(recv_mat->idx2[recv_mat->nnz]), col_size, - RAPtor_MPI_INT, comm->mpi_comm); - RAPtor_MPI_Unpack(recv_buffer.data(), count, &ctr, &(recv_mat->vals[recv_mat->nnz]), col_size, - RAPtor_MPI_DOUBLE, comm->mpi_comm); - recv_mat->nnz += col_size; - recv_mat->idx1[col_count+1] = recv_mat->nnz; - col_count++; - } - } - recv_mat->nnz = recv_mat->idx2.size(); - RAPtor_MPI_Waitall(comm->recv_data->num_msgs, comm->recv_data->requests.data(), RAPtor_MPI_STATUSES_IGNORE); - - off_proc_T = new CSRMatrix(on_proc_num_cols, -1); - std::vector off_T_sizes(on_proc_num_cols, 0); - for (int i = 0; i < comm->send_data->size_msgs; i++) - { - row = comm->send_data->indices[i]; - start = recv_mat->idx1[i]; - end = recv_mat->idx1[i+1]; - off_T_sizes[row] += (end - start); - } - off_proc_T->idx1[0] = 0; - for (int i = 0; i < off_proc_T->n_rows; i++) - { - off_proc_T->idx1[i+1] = off_proc_T->idx1[i] + off_T_sizes[i]; - off_T_sizes[i] = 0; - } - off_proc_T->nnz = off_proc_T->idx1[off_proc_T->n_rows]; - off_proc_T->idx2.resize(off_proc_T->nnz); - off_proc_T->vals.resize(off_proc_T->nnz); - for (int i = 0; i < comm->send_data->size_msgs; i++) - { - row = comm->send_data->indices[i]; - start = recv_mat->idx1[i]; - end = recv_mat->idx1[i+1]; - for (int j 
= start; j < end; j++) - { - idx = off_proc_T->idx1[row] + off_T_sizes[row]++; - off_proc_T->idx2[idx] = recv_mat->idx2[j]; - off_proc_T->vals[idx] = recv_mat->vals[j]; - } - } - - T = new ParCSRMatrix(part_T, on_proc_T, off_proc_T); - - delete send_mat; - delete recv_mat; - - return T; -} - -ParCOOMatrix* ParCOOMatrix::transpose() -{ - ParCSRMatrix* A_csr = to_ParCSR(); - ParCSRMatrix* AT_csr = A_csr->transpose(); - delete A_csr; - - ParCOOMatrix* AT = AT_csr->to_ParCOO(); - delete AT_csr; - - return AT; -} -ParCSCMatrix* ParCSCMatrix::transpose() -{ - // TODO -- Shouldn't have to convert first - ParCSRMatrix* A_csr = to_ParCSR(); - ParCSRMatrix* AT_csr = A_csr->transpose(); - delete A_csr; - - ParCSCMatrix* AT = AT_csr->to_ParCSC(); - delete AT_csr; - - return AT; -} - -// Assumes block_row_size and block_col_size evenly divide local row/col sizes -ParBSRMatrix* ParCSRMatrix::to_ParBSR(const int block_row_size, const int block_col_size) -{ - int start, end, col; - int prev_row, prev_col; - int block_row, block_col; - int block_pos, col_pos; - int global_col, pos; - double val; - - int global_block_rows = global_num_rows / block_row_size; - int global_block_cols = global_num_cols / block_col_size; - ParBSRMatrix* A = new ParBSRMatrix(global_block_rows, global_block_cols, - block_row_size, block_col_size); - - // Get local to global mappings for block matrix - prev_row = -1; - for (std::vector::iterator it = local_row_map.begin(); - it != local_row_map.end(); ++it) - { - block_row = *it / block_row_size; - if (block_row != prev_row) - { - A->local_row_map.emplace_back(block_row); - prev_row = block_row; - } - } - if (global_num_rows == global_num_cols) - { - A->on_proc_column_map = A->get_local_row_map(); - } - else - { - prev_col = -1; - for (std::vector::iterator it = on_proc_column_map.begin(); - it != on_proc_column_map.end(); ++it) - { - block_col = *it / block_col_size; - if (block_col != prev_col) - { - A->on_proc_column_map.emplace_back(block_row); - 
prev_col = block_row; - } - } - } - - prev_col = -1; - std::map global_to_block_local; - for (std::vector::iterator it = off_proc_column_map.begin(); - it != off_proc_column_map.end(); ++it) - { - block_col = *it / block_col_size; - if (block_col != prev_col) - { - global_to_block_local[block_col] = A->off_proc_column_map.size(); - A->off_proc_column_map.emplace_back(block_col); - prev_col = block_col; - } - } - A->local_num_rows = A->local_row_map.size(); - A->on_proc_num_cols = A->local_num_rows; - A->off_proc_num_cols = A->off_proc_column_map.size(); - A->off_proc->n_cols = A->off_proc_num_cols; - - BSRMatrix* A_on_proc = (BSRMatrix*) A->on_proc; - BSRMatrix* A_off_proc = (BSRMatrix*) A->off_proc; - - A_on_proc->idx1[0] = 0; - A_off_proc->idx1[0] = 0; - for (int i = 0; i < local_num_rows; i += block_row_size) - { - std::vector on_proc_pos(A->on_proc_num_cols, -1); - std::vector off_proc_pos(A->off_proc_num_cols, -1); - for (int row_pos = 0; row_pos < block_row_size; row_pos++) - { - start = on_proc->idx1[i+row_pos]; - end = on_proc->idx1[i+row_pos+1]; - for (int k = start; k < end; k++) - { - col = on_proc->idx2[k]; - block_col = col / block_col_size; - if (on_proc_pos[block_col] == -1) - { - on_proc_pos[block_col] = A_on_proc->idx2.size(); - A_on_proc->idx2.emplace_back(block_col); - A_on_proc->block_vals.emplace_back( - new double[A_on_proc->b_size]()); - } - val = on_proc->vals[k]; - pos = on_proc_pos[block_col]; - col_pos = col % block_col_size; - block_pos = row_pos * block_col_size + col_pos; - A_on_proc->block_vals[pos][block_pos] = val; - } - - start = off_proc->idx1[i+row_pos]; - end = off_proc->idx1[i+row_pos+1]; - for (int k = start; k < end; k++) - { - col = off_proc->idx2[k]; - global_col = off_proc_column_map[col]; - block_col = global_to_block_local[global_col / block_col_size]; - if (off_proc_pos[block_col] == -1) - { - off_proc_pos[block_col] = A_off_proc->idx2.size(); - A_off_proc->idx2.emplace_back(block_col); - 
A_off_proc->block_vals.emplace_back( - new double[A_off_proc->b_size]()); - } - val = off_proc->vals[k]; - pos = off_proc_pos[block_col]; - col_pos = global_col % block_col_size; - block_pos = row_pos * block_col_size + col_pos; - A_off_proc->block_vals[pos][block_pos] = val; - } - } - A_on_proc->idx1[i/block_row_size + 1] = A_on_proc->idx2.size(); - A_off_proc->idx1[i/block_row_size + 1] = A_off_proc->idx2.size(); - } - A_on_proc->nnz = A_on_proc->idx2.size(); - A_off_proc->nnz = A_off_proc->idx2.size(); - - A->comm = new ParComm(A->partition, A->off_proc_column_map); - - return A; -} - -void ParMatrix::init_tap_communicators(RAPtor_MPI_Comm mpi_comm) -{ - /********************************* - * Initialize - * *******************************/ - // Get RAPtor_MPI Information - int rank, num_procs; - RAPtor_MPI_Comm_rank(mpi_comm, &rank); - RAPtor_MPI_Comm_size(mpi_comm, &num_procs); - - // Initialize standard tap_comm - tap_comm = new TAPComm(partition, true); - - // Initialize Variables - std::vector off_proc_col_to_proc; - std::vector on_node_column_map; - std::vector on_node_col_to_proc; - std::vector off_node_column_map; - std::vector off_node_col_to_proc; - std::vector on_node_to_off_proc; - std::vector off_node_to_off_proc; - std::vector recv_nodes; - std::vector orig_procs; - std::vector node_to_local_proc; - std::vector on_proc_to_new; - int on_proc_nc = on_proc_column_map.size(); - if (partition->local_num_cols) - { - on_proc_to_new.resize(partition->local_num_cols); - for (int i = 0; i < on_proc_nc; i++) - { - on_proc_to_new[on_proc_column_map[i] - partition->first_local_col] = i; - } - } - - /********************************* - * Split columns by processes, - * on-node, and off-node - * *******************************/ - // Find process on which vector value associated with each column is - // stored - partition->form_col_to_proc(off_proc_column_map, off_proc_col_to_proc); - - // Partition off_proc cols into on_node and off_node - 
tap_comm->split_off_proc_cols(off_proc_column_map, off_proc_col_to_proc, - on_node_column_map, on_node_col_to_proc, on_node_to_off_proc, - off_node_column_map, off_node_col_to_proc, off_node_to_off_proc); - - // Form local_L_par_comm: fully local communication (origin and - // destination processes both local to node) - tap_comm->form_local_L_par_comm(on_node_column_map, on_node_col_to_proc, - partition->first_local_col); - for (std::vector::iterator it = tap_comm->local_L_par_comm->send_data->indices.begin(); - it != tap_comm->local_L_par_comm->send_data->indices.end(); ++it) - { - *it = on_proc_to_new[*it]; - } - - - /********************************* - * Form standard 3-step - * node-aware communicator - * *******************************/ - // Gather all nodes with which any local process must communication - tap_comm->form_local_R_par_comm(off_node_column_map, off_node_col_to_proc, - orig_procs); - - // Find global processes with which rank communications - tap_comm->form_global_par_comm(orig_procs); - - // Form local_S_par_comm: initial distribution of values among local - // processes, before inter-node communication - tap_comm->form_local_S_par_comm(orig_procs); - - // Adjust send indices (currently global vector indices) to be index - // of global vector value from previous recv - tap_comm->adjust_send_indices(partition->first_local_col); - - - tap_comm->update_recv(on_node_to_off_proc, off_node_to_off_proc); - for (std::vector::iterator it = tap_comm->local_S_par_comm->send_data->indices.begin(); - it != tap_comm->local_S_par_comm->send_data->indices.end(); ++it) - { - *it = on_proc_to_new[*it]; - } - - - /********************************* - * Form simple 2-step - * node-aware communicator - * *******************************/ - // Create simple (2-step) TAPComm for matrix communication - // Copy local_L_par_comm from 3-step tap_comm - tap_mat_comm = new TAPComm(partition, false, tap_comm->local_L_par_comm); - - // Form local recv communicator. 
Will recv from local rank - // corresponding to global rank on which data originates. E.g. if - // data is on rank r = (p, n), and my rank is s = (q, m), I will - // recv data from (p, m). - tap_mat_comm->form_simple_R_par_comm(off_node_column_map, off_node_col_to_proc); - - // Form global par comm.. Will recv from proc on which data - // originates - tap_mat_comm->form_simple_global_comm(off_node_col_to_proc); - - // Adjust send indices (currently global vector indices) to be - // index of global vector value from previous recv (only updating - // local_R to match position in global) - tap_mat_comm->adjust_send_indices(partition->first_local_col); - - tap_mat_comm->update_recv(on_node_to_off_proc, off_node_to_off_proc, false); - - for (std::vector::iterator it = - tap_mat_comm->global_par_comm->send_data->indices.begin(); - it != tap_mat_comm->global_par_comm->send_data->indices.end(); ++it) - { - *it = on_proc_to_new[*it]; - } -} diff --git a/raptor/core/par_matrix.hpp b/raptor/core/par_matrix.hpp deleted file mode 100644 index 20326145..00000000 --- a/raptor/core/par_matrix.hpp +++ /dev/null @@ -1,852 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#ifndef PARMATRIX_HPP -#define PARMATRIX_HPP - -#include -#include -#include - -#include "matrix.hpp" -#include "par_vector.hpp" -#include "comm_pkg.hpp" -#include "mpi_types.hpp" -#include "partition.hpp" - -// Making Par Matrix an abstract Class -/************************************************************** - ***** ParMatrix Class - ************************************************************** - ***** This class constructs a parallel matrix object, holding - ***** a local diagonal matrix, a local off-diagonal block matrix, - ***** and communication information - ***** - ***** Attributes - ***** ------------- - ***** global_num_rows : index_t - ***** Number of rows in the global parallel matrix - ***** global_num_cols : index_t - 
***** Number of columns in the parallel matrix - ***** local_nnz : int - ***** Number of nonzeros stored locally - ***** local_num_rows : int - ***** Number of rows stored locally - ***** first_local_row : index_t - ***** Global index of first row local to process - ***** first_local_col : index_t - ***** Global index of first column to fall in local block - ***** diag : Matrix* - ***** Matrix storing local diagonal block - ***** offd : Matrix* - ***** Matrix storing local off-diagonal block - ***** offd_num_cols : index_t - ***** Number of columns in the off-diagonal matrix - ***** offd_column_map : std::vector - ***** Maps local columns of offd Matrix to global - ***** comm : ParComm* - ***** Parallel communicator for matrix - ***** - ***** Methods - ***** ------- - ***** initialize_partition() - ***** Determines which rows are local to process and which - ***** columns fall in local block - ***** add_value() - ***** Adds a value to a given local row and global column. - ***** Determines if this value is in the diagonal or - ***** off-diagonal block. - ***** add_global_value() - ***** Adds a value to a given global row and global column. - ***** Determines if this value is in the diagonal or - ***** off-diagonal block. - ***** finalize() - ***** Finalizes a matrix after values have been added. - ***** Converts the matrices to the appropriate formats and - ***** creates the parallel communicator. 
- **************************************************************/ -namespace raptor -{ - class ParComm; - class TAPComm; - class ParCOOMatrix; - class ParBCOOMatrix; - class ParCSRMatrix; - class ParBSRMatrix; - class ParCSCMatrix; - class ParBSCMatrix; - - class ParMatrix - { - public: - ParMatrix(Partition* part) - { - partition = part; - partition->num_shared++; - - global_num_rows = partition->global_num_rows; - global_num_cols = partition->global_num_cols; - on_proc_num_cols = partition->local_num_cols; - local_num_rows = partition->local_num_rows; - - comm = NULL; - tap_comm = NULL; - tap_mat_comm = NULL; - on_proc = NULL; - off_proc = NULL; - } - - ParMatrix(Partition* part, index_t glob_rows, index_t glob_cols, int local_rows, - int on_proc_cols) - { - partition = part; - partition->num_shared++; - - global_num_rows = glob_rows; - global_num_cols = glob_cols; - on_proc_num_cols = on_proc_cols; - local_num_rows = local_rows; - - comm = NULL; - tap_comm = NULL; - tap_mat_comm = NULL; - on_proc = NULL; - off_proc = NULL; - } - - ParMatrix(index_t glob_rows, index_t glob_cols) - { - partition = new Partition(glob_rows, glob_cols); - - global_num_rows = partition->global_num_rows; - global_num_cols = partition->global_num_cols; - on_proc_num_cols = partition->local_num_cols; - local_num_rows = partition->local_num_rows; - - comm = NULL; - tap_comm = NULL; - tap_mat_comm = NULL; - on_proc = NULL; - off_proc = NULL; - } - - ParMatrix(index_t glob_rows, - index_t glob_cols, - int local_rows, - int local_cols, - index_t first_row, - index_t first_col, - Topology* topology = NULL) - { - partition = new Partition(glob_rows, glob_cols, - local_rows, local_cols, first_row, first_col, topology); - - global_num_rows = partition->global_num_rows; - global_num_cols = partition->global_num_cols; - on_proc_num_cols = partition->local_num_cols; - local_num_rows = partition->local_num_rows; - - comm = NULL; - tap_comm = NULL; - tap_mat_comm = NULL; - on_proc = NULL; - off_proc 
= NULL; - } - - ParMatrix() - { - local_num_rows = 0; - global_num_rows = 0; - global_num_cols = 0; - off_proc_num_cols = 0; - on_proc_num_cols = 0; - - comm = NULL; - tap_comm = NULL; - tap_mat_comm = NULL; - - on_proc = NULL; - off_proc = NULL; - - partition = NULL; - } - - virtual ~ParMatrix() - { - delete off_proc; - delete on_proc; - - if (comm) comm->delete_comm(); - if (tap_comm) tap_comm->delete_comm(); - if (tap_mat_comm) tap_mat_comm->delete_comm(); - - if (partition) - { - if (partition->num_shared) - { - partition->num_shared--; - } - else - { - delete partition; - } - } - } - - /************************************************************** - ***** ParMatrix Add Value - ************************************************************** - ***** Adds a value to the local portion of the parallel matrix, - ***** determining whether it should be added to diagonal or - ***** off-diagonal block. - ***** - ***** Parameters - ***** ------------- - ***** local_row : index_t - ***** Local row of value - ***** global_col : index_t - ***** Global column of value - ***** value : data_t - ***** Value to be added to parallel matrix - **************************************************************/ - void add_value(index_t row, index_t global_col, data_t value); - - /************************************************************** - ***** ParMatrix Add Global Value - ************************************************************** - ***** Adds a value to the local portion of the parallel matrix, - ***** determining whether it should be added to diagonal or - ***** off-diagonal block. 
- ***** - ***** Parameters - ***** ------------- - ***** global_row : index_t - ***** Global row of value - ***** global_col : index_t - ***** Global column of value - ***** value : data_t - ***** Value to be added to parallel matrix - **************************************************************/ - void add_global_value(int row, int global_col, double value); - - /************************************************************** - ***** ParMatrix Finalize - ************************************************************** - ***** Finalizes the diagonal and off-diagonal matrices. Sorts - ***** the local_to_global indices, and creates the parallel - ***** communicator - **************************************************************/ - void finalize(bool create_comm = true); //b_cols added for BSR - - int* map_partition_to_local(); - void condense_off_proc(); - - void residual(ParVector& x, ParVector& b, ParVector& r, bool tap = false); - void tap_residual(ParVector& x, ParVector& b, ParVector& r); - void mult(ParVector& x, ParVector& b, bool tap = false); - void tap_mult(ParVector& x, ParVector& b); - void mult_append(ParVector& x, ParVector& b, bool tap = false); - void tap_mult_append(ParVector& x, ParVector& b); - void mult_T(ParVector& x, ParVector& b, bool tap = false); - void tap_mult_T(ParVector& x, ParVector& b); - ParMatrix* mult(ParCSRMatrix* B, bool tap = false); - ParMatrix* tap_mult(ParCSRMatrix* B); - ParMatrix* mult_T(ParCSCMatrix* B, bool tap = false); - ParMatrix* mult_T(ParCSRMatrix* B, bool tap = false); - ParMatrix* tap_mult_T(ParCSCMatrix* B); - ParMatrix* tap_mult_T(ParCSRMatrix* B); - ParMatrix* add(ParCSRMatrix* A); - ParMatrix* subtract(ParCSRMatrix* A); - - void init_tap_communicators(RAPtor_MPI_Comm comm = RAPtor_MPI_COMM_WORLD); - void update_tap_comm(ParMatrix* old, const std::vector& old_to_new) - { - tap_comm = new TAPComm((TAPComm*) old->tap_comm, old_to_new, NULL); - tap_mat_comm = new TAPComm((TAPComm*) old->tap_mat_comm, old_to_new, - 
tap_comm->local_L_par_comm); - } - void update_tap_comm(ParMatrix* old, const std::vector& on_old_to_new, - const std::vector& off_old_to_new) - { - tap_comm = new TAPComm((TAPComm*) old->tap_comm, on_old_to_new, off_old_to_new, - NULL); - tap_mat_comm = new TAPComm((TAPComm*) old->tap_mat_comm, on_old_to_new, - off_old_to_new, tap_comm->local_L_par_comm); - } - - - - void sort() - { - on_proc->sort(); - off_proc->sort(); - } - - virtual ParMatrix* transpose() = 0; - - std::vector& get_off_proc_column_map() - { - return off_proc_column_map; - } - - std::vector& get_on_proc_column_map() - { - return on_proc_column_map; - } - - std::vector& get_local_row_map() - { - return local_row_map; - } - - virtual ParCOOMatrix* to_ParCOO() = 0; - virtual ParCSRMatrix* to_ParCSR() = 0; - virtual ParCSCMatrix* to_ParCSC() = 0; - virtual ParCOOMatrix* to_ParBCOO() = 0; - virtual ParCSRMatrix* to_ParBSR() = 0; - virtual ParCSCMatrix* to_ParBSC() = 0; - virtual ParMatrix* copy() = 0; - virtual void copy_helper(ParCSRMatrix* A); - virtual void copy_helper(ParCSCMatrix* A); - virtual void copy_helper(ParCOOMatrix* A); - void default_copy_helper(ParMatrix* A); - - // Store dimensions of parallel matrix - int local_nnz; - int local_num_rows; - int global_num_rows; - int global_num_cols; - int off_proc_num_cols; - int on_proc_num_cols; - - // Store two matrices: on_proc containing columns - // corresponding to vector values stored on_process - // and off_proc columns correspond to vector values - // stored off process (on other processes) - Matrix* on_proc; - Matrix* off_proc; - - // Store information about columns of off_proc - // It will be condensed to only store columns with - // nonzeros, and these must be mapped to - // global column indices - std::vector off_proc_column_map; // Maps off_proc local to global - std::vector on_proc_column_map; // Maps on_proc local to global - std::vector local_row_map; // Maps local rows to global - - // Parallel communication package indicating 
which - // processes hold vector values associated with off_proc, - // and which processes need vector values from this proc - Partition* partition; - ParComm* comm; - TAPComm* tap_comm; - TAPComm* tap_mat_comm; - }; - - class ParCOOMatrix : public ParMatrix - { - public: - ParCOOMatrix(bool form_mat = true) : ParMatrix() - { - if (form_mat) - { - on_proc = new COOMatrix(0, 0, 0); - off_proc = new COOMatrix(0, 0, 0); - } - } - - ParCOOMatrix(index_t glob_rows, - index_t glob_cols, - int nnz_per_row = 5, bool form_mat = true) - : ParMatrix(glob_rows, glob_cols) - { - if (form_mat) - { - on_proc = new COOMatrix(partition->local_num_rows, partition->local_num_cols, - nnz_per_row); - off_proc = new COOMatrix(partition->local_num_rows, partition->global_num_cols, - nnz_per_row); - } - } - - ParCOOMatrix(index_t glob_rows, index_t glob_cols, int local_rows, - int local_cols, index_t first_row, index_t first_col, - int nnz_per_row = 5, bool form_mat = true) - : ParMatrix(glob_rows, glob_cols, - local_rows, local_cols, first_row, first_col) - { - if (form_mat) - { - on_proc = new COOMatrix(partition->local_num_rows, partition->local_num_cols, - nnz_per_row); - off_proc = new COOMatrix(partition->local_num_rows, partition->global_num_cols, - nnz_per_row); - } - } - - ParCOOMatrix(Partition* part, - int nnz_per_row = 5, bool form_mat = true) : ParMatrix(part) - { - if (form_mat) - { - on_proc = new COOMatrix(partition->local_num_rows, partition->local_num_cols, - nnz_per_row); - off_proc = new COOMatrix(partition->local_num_rows, partition->global_num_cols, - nnz_per_row); - } - } - - ParCOOMatrix* to_ParCOO(); - ParCSRMatrix* to_ParCSR(); - ParCSCMatrix* to_ParCSC(); - ParCOOMatrix* to_ParBCOO(); - ParCSRMatrix* to_ParBSR(); - ParCSCMatrix* to_ParBSC(); - - ParCOOMatrix* copy() - { - ParCOOMatrix* A = new ParCOOMatrix(); - A->copy_helper(this); - return A; - } - void copy_helper(ParCSRMatrix* A); - void copy_helper(ParCSCMatrix* A); - void copy_helper(ParCOOMatrix* A); - - 
void mult(ParVector& x, ParVector& b, bool tap = false); - void tap_mult(ParVector& x, ParVector& b); - void mult_T(ParVector& x, ParVector& b, bool tap = false); - void tap_mult_T(ParVector& x, ParVector& b); - - ParCOOMatrix* transpose(); - }; - - - class ParBCOOMatrix : public ParCOOMatrix - { - public: - ParBCOOMatrix() : ParCOOMatrix(false) - { - on_proc = new BCOOMatrix(0, 0, 1, 1, 0); - off_proc = new BCOOMatrix(0, 0, 1, 1, 0); - } - - ParBCOOMatrix(int global_block_rows, int global_block_cols, - int block_row_size, int block_col_size, int nnz_per_row) - : ParCOOMatrix(global_block_rows, global_block_cols, nnz_per_row, false) - { - on_proc = new BCOOMatrix(partition->local_num_rows, partition->local_num_cols, - block_row_size, block_col_size, nnz_per_row); - off_proc = new BCOOMatrix(partition->local_num_rows, partition->global_num_cols, - block_row_size, block_col_size, nnz_per_row); - } - - ParBCOOMatrix(int global_block_rows, int global_block_cols, - int local_block_rows, int local_block_cols, - int first_block_row, int first_block_col, - int block_row_size, int block_col_size, int nnz_per_row = 5) - : ParCOOMatrix(global_block_rows, global_block_cols, - local_block_rows, local_block_cols, first_block_row, - first_block_col, nnz_per_row, false) - { - on_proc = new BCOOMatrix(partition->local_num_rows, partition->local_num_cols, - block_row_size, block_col_size, nnz_per_row); - off_proc = new BCOOMatrix(partition->local_num_rows, partition->global_num_cols, - block_row_size, block_col_size, nnz_per_row); - } - - ParBCOOMatrix(Partition* part, int block_row_size, int block_col_size, - int nnz_per_row = 5) : ParCOOMatrix(part, nnz_per_row, false) - { - on_proc = new BCOOMatrix(partition->local_num_rows, partition->local_num_cols, - block_row_size, block_col_size, nnz_per_row); - off_proc = new BCOOMatrix(partition->local_num_rows, partition->global_num_cols, - block_row_size, block_col_size, nnz_per_row); - } - - ParCOOMatrix* to_ParCOO(); - ParCSRMatrix* 
to_ParCSR(); - ParCSCMatrix* to_ParCSC(); - ParCOOMatrix* to_ParBCOO(); - ParCSRMatrix* to_ParBSR(); - ParCSCMatrix* to_ParBSC(); - - ParCOOMatrix* copy() - { - ParCOOMatrix* A = new ParCOOMatrix(); - A->copy_helper(this); - return A; - } - }; - - class ParCSRMatrix : public ParMatrix - { - public: - ParCSRMatrix(bool form_mat = true) : ParMatrix() - { - if (form_mat) - { - on_proc = new CSRMatrix(0, 0, 0); - off_proc = new CSRMatrix(0, 0, 0); - } - } - - ParCSRMatrix(index_t glob_rows, index_t glob_cols, int nnz = 0, - bool form_mat = true) : ParMatrix(glob_rows, glob_cols) - { - if (form_mat) - { - on_proc = new CSRMatrix(partition->local_num_rows, partition->local_num_cols, - nnz); - off_proc = new CSRMatrix(partition->local_num_rows, partition->global_num_cols, - nnz); - } - } - - ParCSRMatrix(index_t glob_rows, index_t glob_cols, int local_rows, - int local_cols, index_t first_row, index_t first_col, Topology* topology = NULL, - int nnz = 0, bool form_mat = true) : ParMatrix(glob_rows, glob_cols, - local_rows, local_cols, first_row, first_col, topology) - { - if (form_mat) - { - on_proc = new CSRMatrix(partition->local_num_rows, partition->local_num_cols, - nnz); - off_proc = new CSRMatrix(partition->local_num_rows, partition->global_num_cols, - nnz); - } - } - - ParCSRMatrix(Partition* part, - int nnz = 0, bool form_mat = true) : ParMatrix(part) - { - if (form_mat) - { - on_proc = new CSRMatrix(partition->local_num_rows, partition->local_num_cols, - nnz); - off_proc = new CSRMatrix(partition->local_num_rows, partition->global_num_cols, - nnz); - } - } - - ParCSRMatrix(Partition* part, Matrix* _on_proc, Matrix* _off_proc) : ParMatrix(part) - { - on_proc = _on_proc; - off_proc = _off_proc; - on_proc_num_cols = on_proc->n_cols; - off_proc_num_cols = off_proc->n_cols; - local_num_rows = on_proc->n_rows; - finalize(); - } - - - ParCSRMatrix(Partition* part, index_t glob_rows, index_t glob_cols, - int local_rows, int on_proc_cols, int off_proc_cols, int nnz = 0, - 
bool form_mat = true) : ParMatrix(part, glob_rows, glob_cols, - local_rows, on_proc_cols) - { - off_proc_num_cols = off_proc_cols; - if (form_mat) - { - on_proc = new CSRMatrix(local_num_rows, on_proc_cols, nnz); - off_proc = new CSRMatrix(local_num_rows, off_proc_num_cols, nnz); - } - } - - ParCOOMatrix* to_ParCOO(); - ParCSRMatrix* to_ParCSR(); - ParCSCMatrix* to_ParCSC(); - ParCOOMatrix* to_ParBCOO(); - ParCSRMatrix* to_ParBSR(); - ParCSCMatrix* to_ParBSC(); - - ParCSRMatrix* copy() - { - ParCSRMatrix* A = new ParCSRMatrix(); - A->copy_helper(this); - return A; - } - - void copy_structure(ParBSRMatrix* A); - - ParBSRMatrix* to_ParBSR(const int block_row_size, const int block_col_size); - - void copy_helper(ParCSRMatrix* A); - void copy_helper(ParCSCMatrix* A); - void copy_helper(ParCOOMatrix* A); - - ParCSRMatrix* strength(strength_t strength_type, double theta = 0.0, - bool tap_amg = false, int num_variables = 1, int* variables = NULL); - ParCSRMatrix* aggregate(); - ParCSRMatrix* fit_candidates(double* B, double* R, int num_candidates, - double tol = 1e-10); - int maximal_independent_set(std::vector& local_states, - std::vector& off_proc_states, int max_iters = -1); - - void mult(ParVector& x, ParVector& b, bool tap = false); - void tap_mult(ParVector& x, ParVector& b); - void mult_T(ParVector& x, ParVector& b, bool tap = false); - void tap_mult_T(ParVector& x, ParVector& b); - ParCSRMatrix* mult(ParCSRMatrix* B, bool tap = false); - ParCSRMatrix* tap_mult(ParCSRMatrix* B); - ParCSRMatrix* mult_T(ParCSCMatrix* A, bool tap = false); - ParCSRMatrix* mult_T(ParCSRMatrix* A, bool tap = false); - ParCSRMatrix* tap_mult_T(ParCSCMatrix* A); - ParCSRMatrix* tap_mult_T(ParCSRMatrix* A); - ParCSRMatrix* add(ParCSRMatrix* A); - ParCSRMatrix* subtract(ParCSRMatrix* B); - - void print_mult(ParCSRMatrix* B); - void print_mult_T(ParCSCMatrix* A); - void print_mult(); - void print_mult_T(); - - void mult_helper(ParCSRMatrix* B, ParCSRMatrix* C, CSRMatrix* recv, - CSRMatrix* 
C_on_on, CSRMatrix* C_on_off); - CSRMatrix* mult_T_partial(ParCSCMatrix* A); - CSRMatrix* mult_T_partial(CSCMatrix* A_off); - void mult_T_combine(ParCSCMatrix* A, ParCSRMatrix* C, CSRMatrix* recv_mat, - CSRMatrix* C_on_on, CSRMatrix* C_off_on); - - ParCSRMatrix* transpose(); - }; - - class ParBSRMatrix : public ParCSRMatrix - { - public: - ParBSRMatrix() : ParCSRMatrix(false) - { - on_proc = new BSRMatrix(0, 0, 1, 1, 0); - off_proc = new BSRMatrix(0, 0, 1, 1, 0); - } - - ParBSRMatrix(int global_block_rows, int global_block_cols, - int block_row_size, int block_col_size, - int nnz = 0) - : ParCSRMatrix(global_block_rows, global_block_cols, nnz, false) - { - on_proc = new BSRMatrix(partition->local_num_rows, partition->local_num_cols, - block_row_size, block_col_size, nnz); - off_proc = new BSRMatrix(partition->local_num_rows, partition->global_num_cols, - block_row_size, block_col_size, nnz); - } - - ParBSRMatrix(int global_block_rows, int global_block_cols, - int local_block_rows, int local_block_cols, - int first_block_row, int first_block_col, - int block_row_size, int block_col_size, - Topology* topology = NULL, int nnz = 0) - : ParCSRMatrix(global_block_rows, global_block_cols, - local_block_rows, local_block_cols, - first_block_row, first_block_col, topology, - nnz, false) - { - on_proc = new BSRMatrix(partition->local_num_rows, partition->local_num_cols, - block_row_size, block_col_size, nnz); - off_proc = new BSRMatrix(partition->local_num_rows, partition->global_num_cols, - block_row_size, block_col_size, nnz); - } - - ParBSRMatrix(Partition* part, int block_row_size, int block_col_size, - int nnz = 0) : ParCSRMatrix(part, nnz, false) - { - on_proc = new BSRMatrix(partition->local_num_rows, partition->local_num_cols, - block_row_size, block_col_size, nnz); - off_proc = new BSRMatrix(partition->local_num_rows, partition->global_num_cols, - block_row_size, block_col_size, nnz); - } - - ParBSRMatrix(Partition* part, BSRMatrix* _on_proc, BSRMatrix* _off_proc) - 
: ParCSRMatrix(part) - { - on_proc = _on_proc; - off_proc = _off_proc; - on_proc_num_cols = on_proc->n_cols; - off_proc_num_cols = off_proc->n_cols; - local_num_rows = on_proc->n_rows; - finalize(); - } - - ParBSRMatrix(Partition* part, int global_block_rows, int global_block_cols, - int local_block_rows, int on_proc_block_cols, int off_proc_block_cols, - int block_row_size, int block_col_size, int nnz = 0) - : ParCSRMatrix(part, global_block_rows, global_block_cols, - local_block_rows, on_proc_block_cols, off_proc_block_cols, - nnz, false) - { - off_proc_num_cols = off_proc_block_cols; - on_proc = new BSRMatrix(local_block_rows, on_proc_block_cols, - block_row_size, block_col_size, nnz); - off_proc = new BSRMatrix(local_block_rows, off_proc_num_cols, - block_row_size, block_col_size, nnz); - } - - ParCOOMatrix* to_ParCOO(); - ParCSRMatrix* to_ParCSR(); - ParCSCMatrix* to_ParCSC(); - ParCOOMatrix* to_ParBCOO(); - ParCSRMatrix* to_ParBSR(); - ParCSCMatrix* to_ParBSC(); - - ParBSRMatrix* copy() - { - ParBSRMatrix* A = new ParBSRMatrix(); - A->copy_helper(this); - return A; - } - - }; - - - - class ParCSCMatrix : public ParMatrix - { - public: - ParCSCMatrix(bool form_mat = true) : ParMatrix() - { - if (form_mat) - { - on_proc = new CSCMatrix(0, 0, 0); - off_proc = new CSCMatrix(0, 0, 0); - } - } - - ParCSCMatrix(index_t glob_rows, index_t glob_cols, int nnz_per_row = 5, - bool form_mat = true) : ParMatrix(glob_rows, glob_cols) - { - if (form_mat) - { - on_proc = new CSCMatrix(partition->local_num_rows, partition->local_num_cols, - nnz_per_row); - off_proc = new CSCMatrix(partition->local_num_rows, partition->global_num_cols, - nnz_per_row); - } - } - - ParCSCMatrix(index_t glob_rows, index_t glob_cols, int local_n_rows, - int local_n_cols, index_t first_row, index_t first_col, - int nnz_per_row = 5, bool form_mat = true) - : ParMatrix(glob_rows, glob_cols, local_n_rows, local_n_cols, - first_row, first_col) - { - if (form_mat) - { - on_proc = new 
CSCMatrix(partition->local_num_rows, partition->local_num_cols, - nnz_per_row); - off_proc = new CSCMatrix(partition->local_num_rows, partition->global_num_cols, - nnz_per_row); - } - } - - ParCSCMatrix(Partition* part, index_t glob_rows, index_t glob_cols, int local_rows, - int on_proc_cols, int off_proc_cols, int nnz_per_row = 5, bool form_mat = true) - : ParMatrix(part, glob_rows, glob_cols, local_rows, on_proc_cols) - { - off_proc_num_cols = off_proc_cols; - if (form_mat) - { - on_proc = new CSCMatrix(local_num_rows, on_proc_cols, nnz_per_row); - off_proc = new CSCMatrix(local_num_rows, off_proc_num_cols, nnz_per_row); - } - } - - - ParCSCMatrix(Partition* part, int nnz_per_row = 5, bool form_mat = true) - : ParMatrix(part) - { - if (form_mat) - { - on_proc = new CSRMatrix(partition->local_num_rows, partition->local_num_cols, - nnz_per_row); - off_proc = new CSRMatrix(partition->local_num_rows, partition->global_num_cols, - nnz_per_row); - } - } - - ParCOOMatrix* to_ParCOO(); - ParCSRMatrix* to_ParCSR(); - ParCSCMatrix* to_ParCSC(); - ParCOOMatrix* to_ParBCOO(); - ParCSRMatrix* to_ParBSR(); - ParCSCMatrix* to_ParBSC(); - - ParCSCMatrix* copy() - { - ParCSCMatrix* A = new ParCSCMatrix(); - A->copy_helper(this); - return A; - } - - void copy_helper(ParCSRMatrix* A); - void copy_helper(ParCSCMatrix* A); - void copy_helper(ParCOOMatrix* A); - - void mult(ParVector& x, ParVector& b, bool tap); - void tap_mult(ParVector& x, ParVector& b); - void mult_T(ParVector& x, ParVector& b, bool tap); - void tap_mult_T(ParVector& x, ParVector& b); - - ParCSCMatrix* transpose(); - }; - - -class ParBSCMatrix : public ParCSCMatrix - { - public: - ParBSCMatrix() : ParCSCMatrix(false) - { - on_proc = new BSCMatrix(0, 0, 1, 1, 0); - off_proc = new BSCMatrix(0, 0, 1, 1, 0); - } - - ParBSCMatrix(int global_block_rows, int global_block_cols, - int block_row_size, int block_col_size, - int nnz = 0) - : ParCSCMatrix(global_block_rows, global_block_cols, nnz, false) - { - on_proc = new 
BSCMatrix(partition->local_num_rows, partition->local_num_cols, - block_row_size, block_col_size, nnz); - off_proc = new BSCMatrix(partition->local_num_rows, partition->global_num_cols, - block_row_size, block_col_size, nnz); - } - - ParBSCMatrix(Partition* part, int block_row_size, int block_col_size, - int nnz = 0) : ParCSCMatrix(part, nnz, false) - { - on_proc = new BSCMatrix(partition->local_num_rows, partition->local_num_cols, - block_row_size, block_col_size, nnz); - off_proc = new BSCMatrix(partition->local_num_rows, partition->global_num_cols, - block_row_size, block_col_size, nnz); - } - - ParBSCMatrix(Partition* part, int global_block_rows, int global_block_cols, - int local_block_rows, int on_proc_block_cols, int off_proc_block_cols, - int block_row_size, int block_col_size, int nnz = 0) - : ParCSCMatrix(part, global_block_rows, global_block_cols, local_block_rows, - on_proc_block_cols, off_proc_block_cols, nnz, false) - { - off_proc_num_cols = off_proc_block_cols; - on_proc = new BSCMatrix(local_num_rows, on_proc_block_cols, - block_row_size, block_col_size, nnz); - off_proc = new BSCMatrix(local_num_rows, off_proc_num_cols, - block_row_size, block_col_size, nnz); - } - - ParCOOMatrix* to_ParCOO(); - ParCSRMatrix* to_ParCSR(); - ParCSCMatrix* to_ParCSC(); - ParCOOMatrix* to_ParBCOO(); - ParCSRMatrix* to_ParBSR(); - ParCSCMatrix* to_ParBSC(); - - ParBSCMatrix* copy() - { - ParBSCMatrix* A = new ParBSCMatrix(); - A->copy_helper(this); - return A; - } - - }; - - -} -#endif diff --git a/raptor/core/par_vector.cpp b/raptor/core/par_vector.cpp deleted file mode 100644 index 0eb2fcdd..00000000 --- a/raptor/core/par_vector.cpp +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#include "par_vector.hpp" - -using namespace raptor; - - -/************************************************************** -***** Vector AXPY 
-************************************************************** -***** Multiplies the local vector by a constant, alpha, and then -***** sums each element with corresponding entry of Y -***** -***** Parameters -***** ------------- -***** y : ParVector* y -***** Vector to be summed with -***** alpha : data_t -***** Constant value to multiply each element of vector by -**************************************************************/ -void ParVector::axpy(ParVector& x, data_t alpha) -{ - if (local_n) - { - local.axpy(x.local, alpha); - } -} - -/************************************************************** -***** Vector Scale -************************************************************** -***** Multiplies the local vector by a constant, alpha -***** -***** Parameters -***** ------------- -***** alpha : data_t -***** Constant value to multiply each element of vector by -**************************************************************/ -void ParVector::scale(data_t alpha) -{ - if (local_n) - { - local.scale(alpha); - } -} - -/************************************************************** -***** ParVector Set Constant Value -************************************************************** -***** Sets each element of the local vector to a constant value -***** -***** Parameters -***** ------------- -***** alpha : data_t -***** Value to set each element of local vector to -**************************************************************/ -void ParVector::set_const_value(data_t alpha) -{ - if (local_n) - { - local.set_const_value(alpha); - } -} - -/************************************************************** -***** ParVector Set Random Values -************************************************************** -***** Sets each element of the local vector to a random value -**************************************************************/ -void ParVector::set_rand_values() -{ - if (local_n) - { - local.set_rand_values(); - } -} - 
-/************************************************************** -***** Vector Norm -************************************************************** -***** Calculates the P norm of the global vector (for a given P) -***** -***** Parameters -***** ------------- -***** p : index_t -***** Determines which p-norm to calculate -**************************************************************/ -data_t ParVector::norm(index_t p) -{ - data_t result = 0.0; - if (local_n) - { - result = local.norm(p); - result = pow(result, p); // undoing root of p from local operation - } - RAPtor_MPI_Allreduce(RAPtor_MPI_IN_PLACE, &result, 1, RAPtor_MPI_DATA_T, RAPtor_MPI_SUM, RAPtor_MPI_COMM_WORLD); - return pow(result, 1./p); -} - - -data_t ParVector::inner_product(ParVector& x) -{ - data_t inner_prod = 0.0; - - if (local_n != x.local_n) - { - int rank; - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - printf("Error. Cannot perform inner product. Dimensions do not match.\n"); - exit(-1); - } - - if (local_n) - { - inner_prod = local.inner_product(x.local); - } - - RAPtor_MPI_Allreduce(RAPtor_MPI_IN_PLACE, &inner_prod, 1, RAPtor_MPI_DATA_T, RAPtor_MPI_SUM, RAPtor_MPI_COMM_WORLD); - - return inner_prod; -} - - diff --git a/raptor/core/par_vector.hpp b/raptor/core/par_vector.hpp deleted file mode 100644 index 4685e16a..00000000 --- a/raptor/core/par_vector.hpp +++ /dev/null @@ -1,179 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#ifndef RAPTOR_CORE_PARVECTOR_HPP -#define RAPTOR_CORE_PARVECTOR_HPP - -#include "assert.h" - -#include -#include - -#include "mpi_types.hpp" -#include "vector.hpp" - -/************************************************************** - ***** ParVector Class - ************************************************************** - ***** This class constructs a parallel vector, containing - ***** values for a local portion - ***** - ***** Attributes - ***** ------------- - ***** local : 
Vector* - ***** Local portion of the parallel vector - ***** global_n : index_t - ***** Number of entries in the global vector - ***** local_n : index_t - ***** Dimension of the local portion of the vector - ***** - ***** Methods - ***** ------- - ***** set_const_value(data_t alpha) - ***** Sets the local vector to a constant value - ***** set_rand_values() - ***** Sets each element of the local vector to a random value - ***** axpy(Vector& y, data_t alpha) - ***** Performs axpy on local portion of vector - ***** scale(data_t alpha) - ***** Multiplies entries of the local vector by a constant - ***** norm(index_t p) - ***** Calculates the p-norm of the global vector - **************************************************************/ -namespace raptor -{ - class ParVector - { - public: - /************************************************************** - ***** ParVector Class Constructor - ************************************************************** - ***** Sets the dimensions of the global vector and initializes - ***** an empty local vector of the given size - ***** - ***** Parameters - ***** ------------- - ***** glbl_n : index_t - ***** Number of entries in global vector - ***** lcl_n : index_t - ***** Number of entries of global vector stored locally - **************************************************************/ - ParVector(index_t glbl_n, int lcl_n) - { - resize(glbl_n, lcl_n); - } - - ParVector(const ParVector& x) - { - copy(x); - } - - /************************************************************** - ***** ParVector Class Constructor - ************************************************************** - ***** Creates an empty ParVector (local_n = 0) - **************************************************************/ - ParVector() - { - local_n = 0; - } - - /************************************************************** - ***** ParVector Class Destructor - ************************************************************** - ***** Deletes the local vector - 
**************************************************************/ - ~ParVector() - { - } - - void resize(index_t glbl_n, int lcl_n) - { - global_n = glbl_n; - local_n = lcl_n; - local.resize(local_n); - } - - void copy(const ParVector& x) - { - global_n = x.global_n; - local_n = x.local_n; - local.copy(x.local); - } - - /************************************************************** - ***** ParVector Set Constant Value - ************************************************************** - ***** Sets each element of the local vector to a constant value - ***** - ***** Parameters - ***** ------------- - ***** alpha : data_t - ***** Value to set each element of local vector to - **************************************************************/ - void set_const_value(data_t alpha); - - /************************************************************** - ***** ParVector Set Random Values - ************************************************************** - ***** Sets each element of the local vector to a random value - **************************************************************/ - void set_rand_values(); - - /************************************************************** - ***** Vector AXPY - ************************************************************** - ***** Multiplies the local vector by a constant, alpha, and then - ***** sums each element with corresponding entry of Y - ***** - ***** Parameters - ***** ------------- - ***** y : ParVector* y - ***** ParVector to be summed with - ***** alpha : data_t - ***** Constant value to multiply each element of vector by - **************************************************************/ - void axpy(ParVector& y, data_t alpha); - - /************************************************************** - ***** Vector Scale - ************************************************************** - ***** Multiplies the local vector by a constant, alpha - ***** - ***** Parameters - ***** ------------- - ***** alpha : data_t - ***** Constant value to 
multiply each element of vector by - **************************************************************/ - void scale(data_t alpha); - - /************************************************************** - ***** Vector Norm - ************************************************************** - ***** Calculates the P norm of the global vector (for a given P) - ***** - ***** Parameters - ***** ------------- - ***** p : index_t - ***** Determines which p-norm to calculate - **************************************************************/ - data_t norm(index_t p); - - data_t inner_product(ParVector& x); - - const data_t& operator[](const int index) const - { - return local.values[index]; - } - - data_t& operator[](const int index) - { - return local.values[index]; - } - - Vector local; - int global_n; - int local_n; - }; - -} -#endif diff --git a/raptor/core/partition.hpp b/raptor/core/partition.hpp deleted file mode 100644 index c1878b9c..00000000 --- a/raptor/core/partition.hpp +++ /dev/null @@ -1,349 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#ifndef PARTITION_HPP -#define PARTITION_HPP - -#include -#include -#include - -#include "types.hpp" -#include "topology.hpp" - -#define STANDARD_PPN 4 -#define STANDARD_PROC_LAYOUT 1 - -/************************************************************** - ***** Partition Class - ************************************************************** - ***** This class holds the partition of a number of vertices - ***** across a number of processes - ***** - ***** Attributes - ***** ------------- - ***** global_num_indices : index_t - ***** Number of rows to be partitioned - ***** first_local_idx : index_t - ***** First global index of a row in partition local to rank - ***** local_num_indices : index_t - ***** Number of rows local to rank's partition - ***** - ***** Methods - ***** --------- - **************************************************************/ 
-namespace raptor -{ - class Partition - { - public: - Partition(index_t _global_num_rows, index_t _global_num_cols, - Topology* _topology = NULL) - { - int rank, num_procs; - int avg_num; - int extra; - - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - RAPtor_MPI_Comm_size(RAPtor_MPI_COMM_WORLD, &num_procs); - - global_num_rows = _global_num_rows; - global_num_cols = _global_num_cols; - - // Partition rows across processes - avg_num = global_num_rows / num_procs; - extra = global_num_rows % num_procs; - first_local_row = avg_num * rank; - local_num_rows = avg_num; - if (extra > rank) - { - first_local_row += rank; - local_num_rows++; - } - else - { - first_local_row += extra; - } - - // Partition cols across processes - if (global_num_rows < num_procs) - { - num_procs = global_num_rows; - } - avg_num = global_num_cols / num_procs; - extra = global_num_cols % num_procs; - if (local_num_rows) - { - first_local_col = avg_num * rank; - local_num_cols = avg_num; - if (extra > rank) - { - first_local_col += rank; - local_num_cols++; - } - else - { - first_local_col += extra; - } - } - else - { - first_local_col = 0; - local_num_cols = 0; - } - - last_local_row = first_local_row + local_num_rows - 1; - last_local_col = first_local_col + local_num_cols - 1; - - num_shared = 0; - - create_assumed_partition(); - - if (_topology == NULL) - { - topology = new Topology(); - } - else - { - topology = _topology; - topology->num_shared++; - } - } - - Partition(index_t _global_num_rows, index_t _global_num_cols, - index_t _brows, index_t _bcols, Topology* _topology = NULL) - { - int rank, num_procs; - int avg_num_blocks, global_num_row_blocks, global_num_col_blocks; - int extra; - - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - RAPtor_MPI_Comm_size(RAPtor_MPI_COMM_WORLD, &num_procs); - - global_num_rows = _global_num_rows; - global_num_cols = _global_num_cols; - - // Partition rows across processes - global_num_row_blocks = global_num_rows / _brows; - avg_num_blocks 
= global_num_row_blocks / num_procs; - extra = global_num_row_blocks % num_procs; - first_local_row = avg_num_blocks * rank * _brows; - local_num_rows = avg_num_blocks * _brows; - if (extra > rank) - { - first_local_row += rank * _brows; - local_num_rows += _brows; - } - else - { - first_local_row += extra * _brows; - } - - // Partition cols across processes - // local_num_cols = number of cols in on_proc matrix - if (global_num_row_blocks < num_procs) - { - num_procs = global_num_row_blocks; - } - - global_num_col_blocks = global_num_cols / _bcols; - avg_num_blocks = global_num_col_blocks / num_procs; - extra = global_num_col_blocks % num_procs; - if (local_num_rows) - { - first_local_col = avg_num_blocks * rank * _bcols; - local_num_cols = avg_num_blocks * _bcols; - if (extra > rank) - { - first_local_col += rank * _bcols; - local_num_cols += _bcols; - } - else - { - first_local_col += extra * _bcols; - } - } - else - { - local_num_cols = 0; - } - - last_local_row = first_local_row + local_num_rows - 1; - last_local_col = first_local_col + local_num_cols - 1; - - num_shared = 0; - - create_assumed_partition(); - - if (_topology == NULL) - { - topology = new Topology(); - } - else - { - topology = _topology; - topology->num_shared++; - } - } - - Partition(index_t _global_num_rows, index_t _global_num_cols, - int _local_num_rows, int _local_num_cols, - index_t _first_local_row, index_t _first_local_col, - Topology* _topology = NULL) - { - global_num_rows = _global_num_rows; - global_num_cols = _global_num_cols; - local_num_rows = _local_num_rows; - local_num_cols = _local_num_cols; - first_local_row = _first_local_row; - first_local_col = _first_local_col; - last_local_row = first_local_row + local_num_rows - 1; - last_local_col = first_local_col + local_num_cols - 1; - - num_shared = 0; - - create_assumed_partition(); - - if (_topology == NULL) - { - topology = new Topology(); - } - else - { - topology = _topology; - topology->num_shared++; - } - } - - 
Partition(Topology* _topology = NULL) - { - if (_topology == NULL) - { - topology = new Topology(); - } - else - { - topology = _topology; - topology->num_shared++; - } - - num_shared = 0; - global_num_rows = 0; - global_num_cols = 0; - local_num_rows = 0; - local_num_cols = 0; - first_local_row = 0; - first_local_col = 0; - last_local_row = 0; - last_local_col = 0; - assumed_num_cols = 0; - } - - Partition(Partition* A, Partition* B) - { - global_num_rows = A->global_num_rows; - global_num_cols = B->global_num_cols; - local_num_rows = A->local_num_rows; - local_num_cols = B->local_num_cols; - first_local_row = A->first_local_row; - first_local_col = B->first_local_col; - last_local_row = A->last_local_row; - last_local_col = B->last_local_col; - - num_shared = 0; - - assumed_num_cols = B->assumed_num_cols; - first_cols.resize(B->first_cols.size()); - std::copy(B->first_cols.begin(), B->first_cols.end(), - first_cols.begin()); - - create_assumed_partition(); - - topology = A->topology; - topology->num_shared++; - } - - Partition* transpose() - { - return new Partition(global_num_cols, global_num_rows, - local_num_cols, local_num_rows, first_local_col, - first_local_row, topology); - } - - ~Partition() - { - if (topology->num_shared) - { - topology->num_shared--; - } - else - { - delete topology; - } - } - - void create_assumed_partition() - { - // Get RAPtor_MPI Information - int rank, num_procs; - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - RAPtor_MPI_Comm_size(RAPtor_MPI_COMM_WORLD, &num_procs); - - assumed_num_cols = global_num_cols / num_procs; - if (global_num_cols % num_procs) assumed_num_cols++; - - first_cols.resize(num_procs+1); - RAPtor_MPI_Allgather(&(first_local_col), 1, RAPtor_MPI_INT, first_cols.data(), 1, RAPtor_MPI_INT, - RAPtor_MPI_COMM_WORLD); - first_cols[num_procs] = global_num_cols; - } - - void form_col_to_proc (const std::vector& off_proc_column_map, - std::vector& off_proc_col_to_proc) - { - int rank, num_procs; - 
RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - RAPtor_MPI_Comm_size(RAPtor_MPI_COMM_WORLD, &num_procs); - - int global_col, assumed_proc; - int ctr = 0; - off_proc_col_to_proc.resize(off_proc_column_map.size()); - for (std::vector::const_iterator it = off_proc_column_map.begin(); - it != off_proc_column_map.end(); ++it) - { - global_col = *it; - assumed_proc = global_col / assumed_num_cols; - while (global_col < first_cols[assumed_proc]) - { - assumed_proc--; - } - while (assumed_proc < num_procs - 1 && global_col >= first_cols[assumed_proc+1]) - { - assumed_proc++; - } - off_proc_col_to_proc[ctr++] = assumed_proc; - } - } - - - index_t global_num_rows; - index_t global_num_cols; - int local_num_rows; - int local_num_cols; - index_t first_local_row; - index_t first_local_col; - index_t last_local_row; - index_t last_local_col; - - int assumed_num_cols; - std::vector first_cols; - - Topology* topology; - - int num_shared; // Number of ParMatrix classes using partition - - }; -} -#endif - - - diff --git a/raptor/core/tap_comm.cpp b/raptor/core/tap_comm.cpp deleted file mode 100644 index 3d744908..00000000 --- a/raptor/core/tap_comm.cpp +++ /dev/null @@ -1,1158 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#include "comm_pkg.hpp" - -//#include -//#include - -using namespace raptor; - -/************************************************************** -***** Split Off Proc Cols -************************************************************** -***** Splits off_proc_column_map into on_node_column_map and -***** off_node_column map. 
Also maps each of these columns to -***** their corresponding process, and maps each local index -***** in on_node and off_node to off_proc -***** -***** Parameters -***** ------------- -***** off_proc_column_map : std::vector& -***** Vector holding rank's off_proc_columns -***** off_proc_col_to_proc : std::vector& -***** Vector mapping rank's off_proc_columns to distant procs -***** on_node_column_map : std::vector& -***** Will be returned holding on_node columns -***** on_node_col_to_proc : std::vector& -***** Will be returned holding procs corresponding to on_node cols -***** on_node_to_off_proc : std::vector& -***** Will be returned holding map from on_node to off_proc -***** off_node_column_map : std::vector& -***** Will be returned holding off_node columns -***** off_node_col_to_node : std::vector& -***** Will be returned holding procs corresponding to off_node cols -***** off_node_to_off_proc : std::vector& -***** Will be returned holding map from off_node to off_proc -**************************************************************/ -void TAPComm::split_off_proc_cols(const std::vector& off_proc_column_map, - const std::vector& off_proc_col_to_proc, - std::vector& on_node_column_map, - std::vector& on_node_col_to_proc, - std::vector& on_node_to_off_proc, - std::vector& off_node_column_map, - std::vector& off_node_col_to_proc, - std::vector& off_node_to_off_proc) -{ - int rank, rank_node, num_procs; - int proc; - int node; - int global_col; - int off_proc_num_cols = off_proc_column_map.size(); - - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - RAPtor_MPI_Comm_size(RAPtor_MPI_COMM_WORLD, &num_procs); - rank_node = topology->get_node(rank); - - // Reserve size in vectors - - on_node_column_map.reserve(off_proc_num_cols); - on_node_col_to_proc.reserve(off_proc_num_cols); - off_node_column_map.reserve(off_proc_num_cols); - off_node_col_to_proc.reserve(off_proc_num_cols); - - for (int i = 0; i < off_proc_num_cols; i++) - { - proc = off_proc_col_to_proc[i]; - 
node = topology->get_node(proc); - global_col = off_proc_column_map[i]; - if (node == rank_node) - { - on_node_column_map.emplace_back(global_col); - on_node_col_to_proc.emplace_back(topology->get_local_proc(proc)); - on_node_to_off_proc.emplace_back(i); - } - else - { - off_node_column_map.emplace_back(global_col); - off_node_col_to_proc.emplace_back(proc); - off_node_to_off_proc.emplace_back(i); - } - } -} - - -/************************************************************** -***** Gather off node nodes -************************************************************** -***** Gathers nodes with which any local processes communicates -***** -***** Parameters -***** ------------- -***** off_node_col_to_node : std::vector& -***** Vector holding rank's off_node_columns -***** recv_nodes : std::vector& -***** Returned holding all nodes with which any local -***** process communicates (union of off_node_col_to_node) -**************************************************************/ -void TAPComm::form_local_R_par_comm(const std::vector& off_node_column_map, - const std::vector& off_node_col_to_proc, - std::vector& orig_procs) -{ - int local_rank; - RAPtor_MPI_Comm_rank(topology->local_comm, &local_rank); - - // Declare Variables - int int_size = sizeof(int); - - int node; - int num_recv_nodes; - int local_proc; - int idx, idx_p, proc; - int start_ctr, ctr; - int local_num_sends; - int recv_start, recv_end; - int recv_proc, recv_s; - int count, pos; - int off_node_num_cols = off_node_column_map.size(); - int N = topology->num_nodes / int_size; - if (topology->num_nodes % int_size) - { - N++; - } - std::vector tmp_recv_nodes(N, 0); - std::vector nodal_recv_nodes(N, 0); - std::vector node_sizes(topology->num_nodes, 0); - std::vector nodal_off_node_sizes; - std::vector node_to_local_proc; - std::vector local_recv_procs(topology->PPN, 0); - std::vector local_recv_sizes(topology->PPN, 0); - std::vector local_send_procs(topology->PPN); - std::vector proc_idx; - std::vector 
off_node_col_to_lcl_proc; - std::vector send_buffer; - std::vector recv_nodes; - - RAPtor_MPI_Status recv_status; - - NonContigData* local_R_recv = (NonContigData*) local_R_par_comm->recv_data; - - // Find nodes from which rank must recv, and the size of each recv - for (std::vector::const_iterator it = off_node_col_to_proc.begin(); - it != off_node_col_to_proc.end(); ++it) - { - node = topology->get_node(*it); - idx = node / int_size; - pos = node % int_size; - tmp_recv_nodes[idx] |= 1 << pos; - node_sizes[node]++; - } - - // Allreduce among procs local to node to find nodes from which rank_node - // recvs - RAPtor_MPI_Allreduce(tmp_recv_nodes.data(), nodal_recv_nodes.data(), N, RAPtor_MPI_INT, - RAPtor_MPI_BOR, topology->local_comm); - - // Add nodes from which rank_node must recv to recv_nodes - for (int i = 0; i < N; i++) - { - for (int j = 0; j < int_size; j++) - { - if ((nodal_recv_nodes[i] >> j) & 1) - { - recv_nodes.emplace_back(i*int_size + j); - } - } - } - - // Find the number of nodes from which rank node recvs - num_recv_nodes = recv_nodes.size(); - - // Find the size of each nodal recv - if (num_recv_nodes) - { - // Collect the number of bytes sent to each node - nodal_off_node_sizes.resize(num_recv_nodes); - for (int i = 0; i < num_recv_nodes; i++) - { - node = recv_nodes[i]; - nodal_off_node_sizes[i] = node_sizes[node]; - } - RAPtor_MPI_Allreduce(RAPtor_MPI_IN_PLACE, nodal_off_node_sizes.data(), num_recv_nodes, RAPtor_MPI_INT, - RAPtor_MPI_SUM, topology->local_comm); - - // Sort nodes, descending by msg size (find permutation) - std::vector p(num_recv_nodes); - std::iota(p.begin(), p.end(), 0); - std::sort(p.begin(), p.end(), - [&](const int lhs, const int rhs) - { - return nodal_off_node_sizes[lhs] > nodal_off_node_sizes[rhs]; - }); - - // Sort recv nodes by total num bytes recvd from node - std::vector done(num_recv_nodes); - for (int i = 0; i < num_recv_nodes; i++) - { - if (done[i]) continue; - - done[i] = true; - int prev_j = i; - int j = p[i]; 
- while (i != j) - { - std::swap(recv_nodes[prev_j], recv_nodes[j]); - std::swap(nodal_off_node_sizes[prev_j], nodal_off_node_sizes[j]); - done[j] = true; - prev_j = j; - j = p[j]; - } - } - - // Map recv nodes to local processes - local_proc = 0; - node_to_local_proc.resize(topology->num_nodes); - for (std::vector::iterator it = recv_nodes.begin(); - it != recv_nodes.end(); ++it) - { - node_to_local_proc[*it] = local_proc++ ; - if (local_proc >= topology->PPN) - { - local_proc = 0; - } - } - } - - if (num_recv_nodes) - { - proc_idx.resize(num_recv_nodes, 0); - } - if (off_node_num_cols) - { - off_node_col_to_lcl_proc.resize(off_node_num_cols); - } - - // Find number of recvd indices per local proc - for (int i = 0; i < off_node_num_cols; i++) - { - proc = off_node_col_to_proc[i]; - node = topology->get_node(proc); - local_proc = node_to_local_proc[node]; - local_recv_sizes[local_proc]++; - off_node_col_to_lcl_proc[i] = local_proc; - } - - // Create displs based on local_recv_sizes - recv_s = 0; - std::vector proc_to_idx(topology->PPN); - for (int i = 0; i < topology->PPN; i++) - { - if (local_recv_sizes[i]) - { - recv_s += local_recv_sizes[i]; - proc_to_idx[i] = local_R_recv->procs.size(); - local_R_recv->procs.emplace_back(i); - local_R_recv->indptr.emplace_back(recv_s); - local_recv_sizes[i] = 0; - local_recv_procs[i] = 1; - } - } - // Add columns to local_recv_indices in location according to - local_R_recv->indices.resize(off_node_num_cols); - for (int i = 0; i < off_node_num_cols; i++) - { - local_proc = off_node_col_to_lcl_proc[i]; - idx_p = proc_to_idx[local_proc]; - idx = local_R_recv->indptr[idx_p] + local_recv_sizes[local_proc]++; - local_R_recv->indices[idx] = i; - } - local_R_recv->num_msgs = local_R_recv->procs.size(); - local_R_recv->size_msgs = local_R_recv->indices.size(); - local_R_recv->finalize(); - - // On node communication-- scalable to do all reduce to find number of - // local processes to send to :) - 
RAPtor_MPI_Allreduce(local_recv_procs.data(), local_send_procs.data(), topology->PPN, RAPtor_MPI_INT, - RAPtor_MPI_SUM, topology->local_comm); - local_num_sends = local_send_procs[local_rank]; - - // Send recv_indices to each recv_proc along with their origin - // node - if (local_R_recv->size_msgs) - { - send_buffer.resize(2*local_R_recv->size_msgs); - } - - - ctr = 0; - start_ctr = 0; - for (int i = 0; i < local_R_recv->num_msgs; i++) - { - recv_proc = local_R_recv->procs[i]; - recv_start = local_R_recv->indptr[i]; - recv_end = local_R_recv->indptr[i+1]; - for (int j = recv_start; j < recv_end; j++) - { - idx = local_R_recv->indices[j]; - send_buffer[ctr++] = off_node_column_map[idx]; - } - for (int j = recv_start; j < recv_end; j++) - { - idx = local_R_recv->indices[j]; - send_buffer[ctr++] = off_node_col_to_proc[idx]; - } - RAPtor_MPI_Isend(&(send_buffer[start_ctr]), 2*(recv_end - recv_start), - RAPtor_MPI_INT, recv_proc, 6543, topology->local_comm, - &(local_R_recv->requests[i])); - start_ctr = ctr; - } - - // Recv messages from local processes and add to send_data - ctr = 0; - for (int i = 0; i < local_num_sends; i++) - { - RAPtor_MPI_Probe(RAPtor_MPI_ANY_SOURCE, 6543, topology->local_comm, &recv_status); - RAPtor_MPI_Get_count(&recv_status, RAPtor_MPI_INT, &count); - proc = recv_status.RAPtor_MPI_SOURCE; - int recvbuf[count]; - RAPtor_MPI_Recv(recvbuf, count, RAPtor_MPI_INT, proc, 6543, topology->local_comm, - &recv_status); - local_R_par_comm->send_data->add_msg(proc, count / 2, recvbuf); - start_ctr = count / 2; - // Add orig nodes for each recvd col (need to know this for - // global communication setup) - for (int j = start_ctr; j < count; j++) - { - orig_procs.emplace_back(recvbuf[j]); - } - } - local_R_par_comm->send_data->finalize(); - - // Wait for all sends to complete - if (local_R_recv->num_msgs) - { - RAPtor_MPI_Waitall(local_R_recv->num_msgs, - local_R_recv->requests.data(), - RAPtor_MPI_STATUS_IGNORE); - } - -} - 
-/************************************************************** -***** Find global comm procs -************************************************************** -***** Determine which processes with which rank will communicate -***** during inter-node communication -***** -***** Parameters -***** ------------- -***** recv_nodes : std::vector& -***** All nodes with which any local process communicates -***** send_procs : std::vector& -***** Returns with all off_node processes to which rank sends -***** recv_procs : std::vector& -***** Returns with all off_node process from which rank recvs -**************************************************************/ -void TAPComm::form_global_par_comm(std::vector& orig_procs) -{ - int rank, num_procs; - int local_rank; - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - RAPtor_MPI_Comm_rank(topology->local_comm, &local_rank); - RAPtor_MPI_Comm_size(RAPtor_MPI_COMM_WORLD, &num_procs); - - int n_sends; - int proc, node; - int recvbuf; - int n_send_procs; - int recv_s; - int idx, node_idx; - int ctr; - int start, end, size; - int count; - RAPtor_MPI_Status recv_status; - - std::vector node_list(topology->num_nodes, 0); - std::vector sendbuf; - std::vector sendbuf_sizes; - std::vector send_procs; - std::vector send_sizes(topology->PPN); - std::vector send_displs(topology->PPN+1); - std::vector node_sizes(topology->num_nodes, 0); - std::vector send_proc_sizes; - std::vector node_to_idx(topology->num_nodes, 0); - std::vector node_recv_idx_orig_procs; - std::vector send_buffer; - - NonContigData* global_recv = (NonContigData*) global_par_comm->recv_data; - - if (local_R_par_comm->send_data->size_msgs) - { - node_recv_idx_orig_procs.resize(local_R_par_comm->send_data->size_msgs); - } - - // Find how many msgs must recv from each node - for (int i = 0; i < local_R_par_comm->send_data->size_msgs; i++) - { - proc = orig_procs[i]; - node = topology->get_node(proc); - node_sizes[node]++; - } - - // Form recv procs and indptr, based on 
node_sizes - recv_s = 0; - for (int i = 0; i < topology->num_nodes; i++) - { - if (node_sizes[i]) - { - recv_s += node_sizes[i]; - node_to_idx[i] = global_recv->procs.size(); - global_recv->indptr.emplace_back(recv_s); - global_recv->procs.emplace_back(i); // currently have node - node_sizes[i] = 0; - } - } - global_recv->num_msgs = global_recv->procs.size(); - global_recv->size_msgs = recv_s; - - // Form recv indices, placing global column in correct position - global_recv->indices.resize(recv_s); - for (int i = 0; i < local_R_par_comm->send_data->size_msgs; i++) - { - proc = orig_procs[i]; - node = topology->get_node(proc); - node_idx = node_to_idx[node]; - idx = global_recv->indptr[node_idx] + node_sizes[node]++; - global_recv->indices[idx] = local_R_par_comm->send_data->indices[i]; - node_recv_idx_orig_procs[idx] = proc; - } - - // Remove duplicates... Likely send same data to multiple local procs, but - // only want to recv this data from a distant node once - ctr = 0; - start = global_recv->indptr[0]; - for (int i = 0; i < global_recv->num_msgs; i++) - { - proc = global_recv->procs[i]; - end = global_recv->indptr[i+1]; - size = end - start; - if (size) - { - // Find permutation of node_recv_indices (between start and end) - // in ascending order - std::vector p(size); - std::iota(p.begin(), p.end(), 0); - std::sort(p.begin(), p.end(), - [&] (int j, int k) - { - return global_recv->indices[j+start] - < global_recv->indices[k+start]; - }); - - // Sort node_recv_indices and node_recv_idx_orig_procs together - std::vector done(size); - for (int j = 0; j < size; j++) - { - if (done[j]) continue; - - done[j] = true; - int prev_k = j; - int k = p[j]; - while (j != k) - { - std::swap(global_recv->indices[prev_k+start], - global_recv->indices[k+start]); - std::swap(node_recv_idx_orig_procs[prev_k+start], - node_recv_idx_orig_procs[k+start]); - done[k] = true; - prev_k = k; - k = p[k]; - } - } - } - - // Add msg to global_par_comm->recv_data - 
node_recv_idx_orig_procs[ctr] = node_recv_idx_orig_procs[start]; - global_recv->indices[ctr++] - = global_recv->indices[start]; - for (int j = start+1; j < end; j++) - { - if (global_recv->indices[j] != global_recv->indices[j-1]) - { - node_recv_idx_orig_procs[ctr] = node_recv_idx_orig_procs[j]; - global_recv->indices[ctr++] = global_recv->indices[j]; - } - } - global_recv->indptr[i + 1] = ctr; - start = end; - } - global_recv->indices.resize(ctr); - global_recv->size_msgs = ctr; - global_recv->finalize(); - - std::vector send_p(num_procs, 0); - for (int i = 0; i < global_recv->num_msgs; i++) - { - node = global_recv->procs[i]; - proc = topology->get_global_proc(node, local_rank); - send_p[proc] = 1; - } - RAPtor_MPI_Allreduce(RAPtor_MPI_IN_PLACE, send_p.data(), num_procs, RAPtor_MPI_INT, - RAPtor_MPI_SUM, RAPtor_MPI_COMM_WORLD); - int recv_n = send_p[rank]; - sendbuf.resize(recv_n); - sendbuf_sizes.resize(recv_n); - // Send recv sizes to corresponding local procs on appropriate nodes - ctr = 0; - for (int i = 0; i < global_recv->num_msgs; i++) - { - node = global_recv->procs[i]; - proc = topology->get_global_proc(node, local_rank); - RAPtor_MPI_Isend(&(node_sizes[node]), 1, RAPtor_MPI_INT, proc, 9876, RAPtor_MPI_COMM_WORLD, - &(global_recv->requests[i])); - } - for (int i = 0; i < recv_n; i++) - { - RAPtor_MPI_Probe(RAPtor_MPI_ANY_SOURCE, 9876, RAPtor_MPI_COMM_WORLD, &recv_status); - proc = recv_status.RAPtor_MPI_SOURCE; - RAPtor_MPI_Recv(&recvbuf, 1, RAPtor_MPI_INT, proc, 9876, RAPtor_MPI_COMM_WORLD, - &recv_status); - sendbuf[i] = proc; - sendbuf_sizes[i] = recvbuf; - } - RAPtor_MPI_Waitall(global_recv->num_msgs, global_recv->requests.data(), - RAPtor_MPI_STATUSES_IGNORE); - - // Gather all procs to which node must send - n_sends = sendbuf.size(); - RAPtor_MPI_Allgather(&n_sends, 1, RAPtor_MPI_INT, send_sizes.data(), 1, RAPtor_MPI_INT, topology->local_comm); - send_displs[0] = 0; - for (int i = 0; i < topology->PPN; i++) - { - send_displs[i+1] = send_displs[i] + 
send_sizes[i]; - } - n_send_procs = send_displs[topology->PPN]; - send_procs.resize(n_send_procs); - send_proc_sizes.resize(n_send_procs); - RAPtor_MPI_Allgatherv(sendbuf.data(), n_sends, RAPtor_MPI_INT, send_procs.data(), - send_sizes.data(), send_displs.data(), RAPtor_MPI_INT, topology->local_comm); - RAPtor_MPI_Allgatherv(sendbuf_sizes.data(), n_sends, RAPtor_MPI_INT, send_proc_sizes.data(), - send_sizes.data(), send_displs.data(), RAPtor_MPI_INT, topology->local_comm); - - // Permute send_procs based on send_proc_sizes - std::vector p(n_send_procs); - std::iota(p.begin(), p.end(), 0); - std::sort(p.begin(), p.end(), - [&](const int lhs, const int rhs) - { - return send_proc_sizes[lhs] > send_proc_sizes[rhs]; - }); - std::vector done(n_send_procs); - for (int i = 0; i < n_send_procs; i++) - { - if (done[i]) continue; - - done[i] = true; - int prev_j = i; - int j = p[i]; - while (i != j) - { - std::swap(send_procs[prev_j], send_procs[j]); - std::swap(send_proc_sizes[prev_j], send_proc_sizes[j]); - done[j] = true; - prev_j = j; - j = p[j]; - } - } - - // Distribute send_procs across local procs - n_sends = 0; - for (size_t i = topology->PPN - local_rank - 1; i < send_procs.size(); i += topology->PPN) - { - global_par_comm->send_data->procs.emplace_back(send_procs[i]); - } - global_par_comm->send_data->num_msgs = global_par_comm->send_data->procs.size(); - global_par_comm->send_data->requests.resize(global_par_comm->send_data->num_msgs); - - - for (int i = 0; i < global_par_comm->send_data->num_msgs; i++) - { - proc = global_par_comm->send_data->procs[i]; - RAPtor_MPI_Isend(&(global_par_comm->send_data->procs[i]), 1, RAPtor_MPI_INT, proc, 6789, - RAPtor_MPI_COMM_WORLD, &(global_par_comm->send_data->requests[i])); - } - // Recv processes from which rank must recv - for (int i = 0; i < global_recv->num_msgs; i++) - { - RAPtor_MPI_Probe(RAPtor_MPI_ANY_SOURCE, 6789, RAPtor_MPI_COMM_WORLD, &recv_status); - proc = recv_status.RAPtor_MPI_SOURCE; - node = 
topology->get_node(proc); - RAPtor_MPI_Recv(&recvbuf, 1, RAPtor_MPI_INT, proc, 6789, RAPtor_MPI_COMM_WORLD, &recv_status); - idx = node_to_idx[node]; - global_recv->procs[idx] = proc; - } - // Wait for sends to complete - if (global_par_comm->send_data->num_msgs) - { - RAPtor_MPI_Waitall(global_par_comm->send_data->num_msgs, - global_par_comm->send_data->requests.data(), RAPtor_MPI_STATUSES_IGNORE); - - } - - - for (int i = 0; i < global_recv->size_msgs; i++) - { - send_buffer.emplace_back(global_recv->indices[i]); - send_buffer.emplace_back(node_recv_idx_orig_procs[i]); - } - - - // Send recv indices to each recv proc along with the process of - // origin for each recv idx - ctr = 0; - - for (int i = 0; i < global_recv->num_msgs; i++) - { - proc = global_recv->procs[i]; - start = global_recv->indptr[i]; - end = global_recv->indptr[i+1]; - RAPtor_MPI_Isend(&(send_buffer[2*start]), 2*(end - start), - RAPtor_MPI_INT, proc, 5432, RAPtor_MPI_COMM_WORLD, - &(global_recv->requests[i])); - - } - - // Recv send data (which indices to send) to global processes - orig_procs.clear(); - for (int i = 0; i < global_par_comm->send_data->num_msgs; i++) - { - proc = global_par_comm->send_data->procs[i]; - RAPtor_MPI_Probe(proc, 5432, RAPtor_MPI_COMM_WORLD, &recv_status); - RAPtor_MPI_Get_count(&recv_status, RAPtor_MPI_INT, &count); - int commbuf[count]; - RAPtor_MPI_Recv(commbuf, count, RAPtor_MPI_INT, proc, 5432, RAPtor_MPI_COMM_WORLD, &recv_status); - for (int j = 0; j < count; j += 2) - { - global_par_comm->send_data->indices.emplace_back(commbuf[j]); - orig_procs.emplace_back(topology->get_local_proc(commbuf[j+1])); - } - global_par_comm->send_data->indptr.emplace_back( - global_par_comm->send_data->indices.size()); - } - global_par_comm->send_data->num_msgs = global_par_comm->send_data->procs.size(); - global_par_comm->send_data->size_msgs = global_par_comm->send_data->indices.size(); - global_par_comm->send_data->finalize(); - - if (global_recv->num_msgs) - { - 
RAPtor_MPI_Waitall(global_recv->num_msgs, - global_recv->requests.data(), - RAPtor_MPI_STATUS_IGNORE); - } -} - - -/************************************************************** -***** Form local_S_par_comm -************************************************************** -***** Find which local processes the values originating on rank -***** must be sent to, and which processes store values rank must -***** send as inter-node communication. -***** -***** Parameters -***** ------------- -**************************************************************/ -void TAPComm::form_local_S_par_comm(std::vector& orig_procs) -{ - int rank; - int local_rank; - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - RAPtor_MPI_Comm_rank(topology->local_comm, &local_rank); - - // Find local_col_starts for all procs local to node, and sort - int start, end; - int proc, proc_idx; - - int ctr, idx; - int size; - - std::vector local_procs(topology->PPN); - std::vector proc_sizes(topology->PPN, 0); - std::vector recv_procs(topology->PPN, 0); - std::vector proc_to_idx(topology->PPN); - - NonContigData* local_S_recv = (NonContigData*) local_S_par_comm->recv_data; - - if (global_par_comm->send_data->num_msgs) - { - local_S_recv->indices.resize(global_par_comm->send_data->size_msgs); - } - - // Find all column indices originating on local procs - for (int i = 0; i < global_par_comm->send_data->size_msgs; i++) - { - proc = orig_procs[i]; - proc_sizes[proc]++; - recv_procs[proc] = 1; - } - - // Reduce recv_procs to how many msgs rank will recv - RAPtor_MPI_Allreduce(RAPtor_MPI_IN_PLACE, recv_procs.data(), topology->PPN, RAPtor_MPI_INT, RAPtor_MPI_SUM, topology->local_comm); - int n_recvs = recv_procs[local_rank]; - - // Form local_S_par_comm recv_data - int recv_s = 0; - for (int i = 0; i < topology->PPN; i++) - { - if (proc_sizes[i]) - { - recv_s += proc_sizes[i]; - proc_to_idx[i] = local_S_recv->procs.size(); - local_S_recv->procs.emplace_back(i); - local_S_recv->indptr.emplace_back(recv_s); - 
} - proc_sizes[i] = 0; - } - local_S_recv->num_msgs = local_S_recv->procs.size(); - for (int i = 0; i < global_par_comm->send_data->size_msgs; i++) - { - proc = orig_procs[i]; - proc_idx = proc_to_idx[proc]; - idx = local_S_recv->indptr[proc_idx] + proc_sizes[proc]++; - local_S_recv->indices[idx] = global_par_comm->send_data->indices[i]; - } - - // Remove duplicate entries from local_S_par_comm recv_data (proc may have - // to send the same data to multiple nodes, but should only recv values a - // single time from each local proc) - ctr = 0; - start = local_S_recv->indptr[0]; - for (int i = 0; i < local_S_recv->num_msgs; i++) - { - end = local_S_recv->indptr[i+1]; - size = end - start; - if (size) - { - std::sort(local_S_recv->indices.begin() + start, - local_S_recv->indices.begin() + end); - local_S_recv->indices[ctr++] = - local_S_recv->indices[start]; - for (int j = start+1; j < end; j++) - { - if (local_S_recv->indices[j] - != local_S_recv->indices[j-1]) - { - local_S_recv->indices[ctr++] - = local_S_recv->indices[j]; - } - } - } - local_S_recv->indptr[i+1] = ctr; - start = end; - } - local_S_recv->indices.resize(ctr); - local_S_recv->size_msgs = ctr; - local_S_recv->finalize(); - - // Send messages to local procs, informing of what data to send - for (int i = 0; i < local_S_recv->num_msgs; i++) - { - proc = local_S_recv->procs[i]; - start = local_S_recv->indptr[i]; - end = local_S_recv->indptr[i+1]; - RAPtor_MPI_Isend(&(local_S_recv->indices[start]), - end - start, RAPtor_MPI_INT, proc, 4321, topology->local_comm, - &(local_S_recv->requests[i])); - } - // Recv messages and form local_S_par_comm send_data - int count; - RAPtor_MPI_Status recv_status; - for (int i = 0; i < n_recvs; i++) - { - RAPtor_MPI_Probe(RAPtor_MPI_ANY_SOURCE, 4321, topology->local_comm, &recv_status); - RAPtor_MPI_Get_count(&recv_status, RAPtor_MPI_INT, &count); - proc = recv_status.RAPtor_MPI_SOURCE; - int recvbuf[count]; - RAPtor_MPI_Recv(recvbuf, count, RAPtor_MPI_INT, proc, 4321, 
topology->local_comm, &recv_status); - for (int j = 0; j < count; j++) - { - local_S_par_comm->send_data->indices.emplace_back(recvbuf[j]); - } - local_S_par_comm->send_data->indptr.emplace_back( - local_S_par_comm->send_data->indices.size()); - local_S_par_comm->send_data->procs.emplace_back(proc); - } - local_S_par_comm->send_data->num_msgs = local_S_par_comm->send_data->procs.size(); - local_S_par_comm->send_data->size_msgs = local_S_par_comm->send_data->indices.size(); - local_S_par_comm->send_data->finalize(); - if (local_S_recv->num_msgs) - { - RAPtor_MPI_Waitall(local_S_recv->num_msgs, - local_S_recv->requests.data(), - RAPtor_MPI_STATUS_IGNORE); - } -} - - -void TAPComm::adjust_send_indices(const int first_local_col) -{ - int idx, idx_pos, size; - int local_S_idx, global_comm_idx; - - if (local_S_par_comm) - { - DuplicateData* local_S_recv = (DuplicateData*) local_S_par_comm->recv_data; - // Update global row index with local row to send - for (int i = 0; i < local_S_par_comm->send_data->size_msgs; i++) - { - local_S_par_comm->send_data->indices[i] -= first_local_col; - } - - // Update global_par_comm->send_data->indices (global rows) to - std::map S_global_to_local; - for (int i = 0; i < local_S_recv->size_msgs; i++) - { - S_global_to_local[local_S_recv->indices[i]] = i; - } - std::vector local_S_num_pos; - if (local_S_recv->size_msgs) - local_S_num_pos.resize(local_S_recv->size_msgs, 0); - for (int i = 0; i < global_par_comm->send_data->size_msgs; i++) - { - idx = global_par_comm->send_data->indices[i]; - local_S_idx = S_global_to_local[idx]; - global_par_comm->send_data->indices[i] = local_S_idx; - local_S_num_pos[local_S_idx]++; - } - local_S_recv->indptr_T.resize(local_S_recv->size_msgs + 1); - local_S_recv->indptr_T[0] = 0; - size = 0; - for (int i = 0; i < local_S_recv->size_msgs; i++) - { - size += local_S_num_pos[i]; - local_S_recv->indptr_T[i+1] = size; - local_S_num_pos[i] = 0; - } - local_S_recv->indices.resize(size); - for(int i = 0; i < 
global_par_comm->send_data->size_msgs; i++) - { - idx = global_par_comm->send_data->indices[i]; - idx_pos = local_S_recv->indptr_T[idx] + local_S_num_pos[idx]++; - local_S_recv->indices[idx_pos] = i; - } - } - else - { - for (int i = 0; i < global_par_comm->send_data->size_msgs; i++) - { - global_par_comm->send_data->indices[i] -= first_local_col; - } - } - - // Update local_R_par_comm->send_data->indices (global_rows) - DuplicateData* global_recv = (DuplicateData*) global_par_comm->recv_data; - std::map global_to_local; - for (int i = 0; i < global_recv->size_msgs; i++) - { - global_to_local[global_recv->indices[i]] = i; - } - std::vector global_num_pos; - if (global_recv->size_msgs) - global_num_pos.resize(global_recv->size_msgs, 0); - for (int i = 0; i < local_R_par_comm->send_data->size_msgs; i++) - { - idx = local_R_par_comm->send_data->indices[i]; - global_comm_idx = global_to_local[idx]; - local_R_par_comm->send_data->indices[i] = global_comm_idx; - global_num_pos[global_comm_idx]++; - } - global_recv->indptr_T.resize(global_recv->size_msgs + 1); - global_recv->indptr_T[0] = 0; - size = 0; - for (int i = 0; i < global_recv->size_msgs; i++) - { - size += global_num_pos[i]; - global_recv->indptr_T[i+1] = size; - global_num_pos[i] = 0; - } - global_recv->indices.resize(size); - for (int i = 0; i < local_R_par_comm->send_data->size_msgs; i++) - { - idx = local_R_par_comm->send_data->indices[i]; - idx_pos = global_recv->indptr_T[idx] + global_num_pos[idx]++; - global_recv->indices[idx_pos] = i; - } - -} - -/************************************************************** -***** Form local_L_par_comm -************************************************************** -***** Adjust send indices from global row index to index of -***** global column in previous recv buffer. 
-***** -***** Parameters -***** ------------- -***** on_node_column_map : std::vector& -***** Columns corresponding to on_node processes -***** on_node_col_to_proc : std::vector& -***** On node process corresponding to each column -***** in on_node_column_map -***** first_local_row : int -***** First row local to rank -**************************************************************/ -void TAPComm::form_local_L_par_comm(const std::vector& on_node_column_map, - const std::vector& on_node_col_to_proc, const int first_local_col) -{ - int local_rank; - RAPtor_MPI_Comm_rank(topology->local_comm, &local_rank); - - int on_node_num_cols = on_node_column_map.size(); - int prev_proc, prev_idx; - int num_sends; - int proc, start, end; - int count; - RAPtor_MPI_Status recv_status; - std::vector recv_procs(topology->PPN, 0); - - NonContigData* local_L_recv = (NonContigData*) local_L_par_comm->recv_data; - - if (on_node_num_cols) - { - prev_proc = on_node_col_to_proc[0]; - recv_procs[prev_proc] = 1; - prev_idx = 0; - for (int i = 1; i < on_node_num_cols; i++) - { - proc = on_node_col_to_proc[i]; - if (proc != prev_proc) - { - local_L_recv->add_msg(prev_proc, i - prev_idx); - prev_proc = proc; - prev_idx = i; - recv_procs[proc] = 1; - } - } - local_L_recv->add_msg(prev_proc, on_node_num_cols - prev_idx); - local_L_recv->finalize(); - - for (int i = 0; i < on_node_num_cols; i++) - { - local_L_recv->indices.emplace_back(i); - } - } - - RAPtor_MPI_Allreduce(RAPtor_MPI_IN_PLACE, recv_procs.data(), topology->PPN, RAPtor_MPI_INT, RAPtor_MPI_SUM, - topology->local_comm); - num_sends = recv_procs[local_rank]; - - for (int i = 0; i < local_L_recv->num_msgs; i++) - { - proc = local_L_recv->procs[i]; - start = local_L_recv->indptr[i]; - end = local_L_recv->indptr[i+1]; - RAPtor_MPI_Isend(&(on_node_column_map[start]), end - start, RAPtor_MPI_INT, proc, - 7890, topology->local_comm, &(local_L_recv->requests[i])); - } - for (int i = 0; i < num_sends; i++) - { - 
RAPtor_MPI_Probe(RAPtor_MPI_ANY_SOURCE, 7890, topology->local_comm, &recv_status); - RAPtor_MPI_Get_count(&recv_status, RAPtor_MPI_INT, &count); - proc = recv_status.RAPtor_MPI_SOURCE; - int recvbuf[count]; - RAPtor_MPI_Recv(recvbuf, count, RAPtor_MPI_INT, proc, 7890, topology->local_comm, &recv_status); - for (int j = 0; j < count; j++) - { - recvbuf[j] -= first_local_col; - } - local_L_par_comm->send_data->add_msg(proc, count, recvbuf); - } - local_L_par_comm->send_data->finalize(); - - if (local_L_recv->num_msgs) - { - RAPtor_MPI_Waitall(local_L_recv->num_msgs, - local_L_recv->requests.data(), - RAPtor_MPI_STATUSES_IGNORE); - } -} - -void TAPComm::form_simple_R_par_comm(std::vector& off_node_column_map, - std::vector& off_node_col_to_proc) -{ - int rank, local_rank; - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - RAPtor_MPI_Comm_rank(topology->local_comm, &local_rank); - - int proc, local_proc; - int proc_idx, idx; - int off_node_num_cols = off_node_column_map.size(); - std::vector local_proc_sizes(topology->PPN, 0); - std::vector proc_size_idx(topology->PPN); - - NonContigData* local_R_recv = (NonContigData*) local_R_par_comm->recv_data; - - // Form local_R_par_comm recv_data (currently with global recv indices) - for (std::vector::iterator it = off_node_col_to_proc.begin(); - it != off_node_col_to_proc.end(); ++it) - { - local_proc = topology->get_local_proc(*it); - local_proc_sizes[local_proc]++; - } - - local_R_recv->size_msgs = 0; - local_R_recv->indptr[0] = local_R_recv->size_msgs; - for (int i = 0; i < topology->PPN; i++) - { - if (local_proc_sizes[i]) - { - local_R_recv->num_msgs++; - local_R_recv->size_msgs += local_proc_sizes[i]; - local_proc_sizes[i] = 0; - - proc_size_idx[i] = local_R_recv->procs.size(); - local_R_recv->procs.emplace_back(i); - local_R_recv->indptr.emplace_back( - local_R_recv->size_msgs); - } - } - if (local_R_recv->size_msgs) - { - local_R_recv->indices.resize(local_R_recv->size_msgs); - } - - for (int i = 0; i < 
off_node_num_cols; i++) - { - proc = off_node_col_to_proc[i]; - local_proc = topology->get_local_proc(proc); - proc_idx = proc_size_idx[local_proc]; - idx = local_R_recv->indptr[proc_idx] + local_proc_sizes[local_proc]++; - local_R_recv->indices[idx] = i; - } - local_R_recv->finalize(); - - // Communicate local_R recv_data so send_data can be formed - RAPtor_MPI_Allreduce(RAPtor_MPI_IN_PLACE, local_proc_sizes.data(), topology->PPN, RAPtor_MPI_INT, - RAPtor_MPI_SUM, topology->local_comm); - - local_R_par_comm->recv_data->send(off_node_column_map.data(), 6543, topology->local_comm); - local_R_par_comm->send_data->probe(local_proc_sizes[local_rank], 6543, topology->local_comm); - local_R_par_comm->recv_data->waitall(); -} - -void TAPComm::form_simple_global_comm(std::vector& off_proc_col_to_proc) -{ - int rank; - int num_procs; - int proc, start, end; - int idx, proc_idx; - int global_idx; - - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - RAPtor_MPI_Comm_size(RAPtor_MPI_COMM_WORLD, &num_procs); - - std::vector proc_sizes(num_procs, 0); - std::vector proc_ctr; - - NonContigData* global_recv = (NonContigData*) global_par_comm->recv_data; - - // Communicate processes on which each index originates - local_R_par_comm->communicate_T(off_proc_col_to_proc.data()); - std::vector& int_send_buffer = local_R_par_comm->send_data->get_buffer(); - - for (int i = 0; i < local_R_par_comm->send_data->size_msgs; i++) - { - proc = int_send_buffer[i]; - if (proc_sizes[proc] == 0) - { - global_recv->procs.emplace_back(proc); - } - proc_sizes[proc]++; - } - - global_recv->num_msgs = global_recv->procs.size(); - global_recv->indptr[0] = 0; - global_recv->size_msgs = 0; - for (int i = 0; i < global_recv->num_msgs; i++) - { - proc = global_recv->procs[i]; - global_recv->size_msgs += proc_sizes[proc]; - proc_sizes[proc] = i; // Will now use this for proc_idx - global_recv->indptr.emplace_back(global_recv->size_msgs); - } - if (global_recv->size_msgs) - { - 
global_recv->indices.resize(global_recv->size_msgs); - proc_ctr.resize(global_recv->num_msgs, 0); - } - - for (int i = 0; i < local_R_par_comm->send_data->size_msgs; i++) - { - global_idx = local_R_par_comm->send_data->indices[i]; - proc = int_send_buffer[i]; - proc_idx = proc_sizes[proc]; - idx = global_recv->indptr[proc_idx] + proc_ctr[proc_idx]++; - global_recv->indices[idx] = global_idx; - } - global_recv->finalize(); - - // Communicate global recv_data so send_data can be formed (dynamic comm) - std::vector recv_sizes(num_procs, 0); - for (int i = 0; i < global_recv->num_msgs; i++) - recv_sizes[global_recv->procs[i]] = global_recv->indptr[i+1] - global_recv->indptr[i]; - RAPtor_MPI_Allreduce(RAPtor_MPI_IN_PLACE, recv_sizes.data(), num_procs, RAPtor_MPI_INT, RAPtor_MPI_SUM, RAPtor_MPI_COMM_WORLD); - - for (int i = 0; i < global_recv->num_msgs; i++) - { - proc = global_recv->procs[i]; - start = global_recv->indptr[i]; - end = global_recv->indptr[i+1]; - RAPtor_MPI_Isend(&(global_recv->indices[start]), end - start, RAPtor_MPI_INT, - proc, 6789, RAPtor_MPI_COMM_WORLD, &(global_recv->requests[i])); - } - global_par_comm->send_data->probe(recv_sizes[rank], 6789, RAPtor_MPI_COMM_WORLD); - global_par_comm->recv_data->waitall(); -} - -void TAPComm::update_recv(const std::vector& on_node_to_off_proc, - const std::vector& off_node_to_off_proc, bool update_L) -{ - int idx; - - // Determine size of final recvs (should be equal to - // number of off_proc cols) - recv_size = local_R_par_comm->recv_data->size_msgs + - local_L_par_comm->recv_data->size_msgs; - NonContigData* local_R_recv = (NonContigData*) local_R_par_comm->recv_data; - NonContigData* local_L_recv = (NonContigData*) local_L_par_comm->recv_data; - if (recv_size) - { - // Want a single recv buffer local_R and local_L par_comms - buffer.resize(recv_size); - int_buffer.resize(recv_size); - - // Map local_R recvs to original off_proc_column_map - if (local_R_recv->size_msgs) - { - for (int i = 0; i < 
local_R_recv->size_msgs; i++) - { - idx = local_R_recv->indices[i]; - local_R_recv->indices[i] = off_node_to_off_proc[idx]; - } - } - - - // Map local_L recvs to original off_proc_column_map - if (update_L && local_L_recv->size_msgs) - { - for (int i = 0; i < local_L_recv->size_msgs; i++) - { - idx = local_L_recv->indices[i]; - local_L_recv->indices[i] = on_node_to_off_proc[idx]; - } - } - } -} - - - - diff --git a/raptor/core/tests/CMakeLists.txt b/raptor/core/tests/CMakeLists.txt deleted file mode 100644 index 0d879f81..00000000 --- a/raptor/core/tests/CMakeLists.txt +++ /dev/null @@ -1,56 +0,0 @@ -if (WITH_MPI) - add_executable(test_par_comm test_par_comm.cpp) - target_link_libraries(test_par_comm raptor ${MPI_LIBRARIES} googletest pthread ) - add_test(ParCommTest ${MPIRUN} -n 1 ${HOST} ./test_par_comm) - add_test(ParCommTest ${MPIRUN} -n 4 ${HOST} ./test_par_comm) - add_test(ParCommTest ${MPIRUN} -n 16 ${HOST} ./test_par_comm) - - add_executable(test_tap_comm test_tap_comm.cpp) - target_link_libraries(test_tap_comm raptor ${MPI_LIBRARIES} googletest pthread ) - add_test(TAPCommTest ${MPIRUN} -n 1 ${HOST} ./test_tap_comm) - add_test(TAPCommTest ${MPIRUN} -n 4 ${HOST} ./test_tap_comm) - add_test(TAPCommTest ${MPIRUN} -n 16 ${HOST} ./test_tap_comm) - - add_executable(test_par_matrix test_par_matrix.cpp) - target_link_libraries(test_par_matrix raptor ${MPI_LIBRARIES} googletest pthread ) - add_test(ParMatrixTest ${MPIRUN} -n 1 ${HOST} ./test_par_matrix) - add_test(ParMatrixTest ${MPIRUN} -n 4 ${HOST} ./test_par_matrix) - add_test(ParMatrixTest ${MPIRUN} -n 16 ${HOST} ./test_par_matrix) - - add_executable(test_par_vector test_par_vector.cpp) - target_link_libraries(test_par_vector raptor ${MPI_LIBRARIES} googletest pthread ) - add_test(ParVectorTest ${MPIRUN} -n 1 ${HOST} ./test_par_vector) - add_test(ParVectorTest ${MPIRUN} -n 4 ${HOST} ./test_par_vector) - add_test(ParVectorTest ${MPIRUN} -n 16 ${HOST} ./test_par_vector) - - add_executable(test_par_transpose 
test_par_transpose.cpp) - target_link_libraries(test_par_transpose raptor ${MPI_LIBRARIES} googletest pthread ) - add_test(ParTransposeTest ${MPIRUN} -n 1 ${HOST} ./test_par_transpose) - add_test(ParTransposeTest ${MPIRUN} -n 4 ${HOST} ./test_par_transpose) - add_test(ParTransposeTest ${MPIRUN} -n 16 ${HOST} ./test_par_transpose) - - add_executable(test_par_block_matrix test_par_block_matrix.cpp) - target_link_libraries(test_par_block_matrix raptor ${MPI_LIBRARIES} googletest pthread ) - add_test(ParBlockMatrixTest ${MPIRUN} -n 1 ${HOST} ./test_par_block_matrix) - add_test(ParBlockMatrixTest ${MPIRUN} -n 4 ${HOST} ./test_par_block_matrix) - add_test(ParBlockMatrixTest ${MPIRUN} -n 16 ${HOST} ./test_par_block_matrix) - - add_executable(test_par_block_conversion test_par_block_conversion.cpp) - target_link_libraries(test_par_block_conversion raptor ${MPI_LIBRARIES} googletest pthread ) - add_test(ParBlockConversionTest ${MPIRUN} -n 1 ${HOST} ./test_par_block_conversion) - add_test(ParBlockConversionTest ${MPIRUN} -n 4 ${HOST} ./test_par_block_conversion) - add_test(ParBlockConversionTest ${MPIRUN} -n 16 ${HOST} ./test_par_block_conversion) - -endif () - -add_executable(test_matrix test_matrix.cpp) -target_link_libraries(test_matrix raptor ${MPI_LIBRARIES} googletest pthread ) -add_test(MatrixTest ./test_matrix) - -add_executable(test_transpose test_transpose.cpp) -target_link_libraries(test_transpose raptor ${MPI_LIBRARIES} googletest pthread ) -add_test(TransposeTest ./test_transpose) - -add_executable(test_bsr_matrix test_bsr_matrix.cpp) -target_link_libraries(test_bsr_matrix raptor ${MPI_LIBRARIES} googletest pthread ) -add_test(BSRMatrixTest ./test_bsr_matrix) diff --git a/raptor/core/tests/test_block_matrix.cpp b/raptor/core/tests/test_block_matrix.cpp deleted file mode 100644 index c37408b0..00000000 --- a/raptor/core/tests/test_block_matrix.cpp +++ /dev/null @@ -1,212 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, 
http://opensource.org/licenses/BSD-2-Clause - -#include "gtest/gtest.h" -#include "raptor.hpp" -using namespace raptor; - -void compare_vals(CSRMatrix* A, BSRMatrix* B) -{ - A->sort(); - B->sort(); - int ctr = 0; - for (int i = 0; i < B->n_rows; i++) - { - for (int k = 0; k < B->b_rows; k++) - { - for (int j = B->idx1[i]; j < B->idx1[i+1]; j++) - { - double* val = B->block_vals[j]; - for (int l = 0; l < B->b_cols; l++) - { - if (fabs(val[k*B->b_cols + l]) > zero_tol) - { - ASSERT_NEAR(val[k*B->b_cols + l], A->vals[ctr++], 1e-10); - } - - } - } - } - } - -} - -int main(int argc, char** argv) -{ - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); - -} // end of main() // - -TEST(BlockMatrixTest, TestsInCore) -{ - int block_row_size = 2; - int block_col_size = 2; - int block_size = 4; - int block_nnz = 5; - int block_num_rows = 3; - int block_num_cols = 3; - int num_rows = block_num_rows * block_row_size; - int num_cols = block_num_cols * block_col_size; - int nnz = block_nnz * block_size; - - std::vector rows = {0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 4, 4, 5, 5}; - std::vector cols = {0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 4, 5, 4, 5}; - std::vector vals = {1.0, 0.0, 2.0, 1.0, 6.0, 7.0, 8.0, 2.0, 1.0, 4.0, 5.0, 1.0, - 4.0, 3.0, 0.0, 0.0, 7.0, 2.0, 0.0, 0.0}; - - std::vector block_row_ptr = {0,2,3,5}; - std::vector block_rows = {0, 0, 1, 2, 2}; - std::vector block_cols = {0, 1, 1, 1, 2}; - std::vector block_vals; - for (int i = 0; i < block_nnz; i++) - { - double* block = new double[block_size]; - for (int j = 0; j < block_size; j++) - { - block[j] = vals[i*block_size+j]; - } - block_vals.push_back(block); - } - - Matrix* A_bcoo = new BCOOMatrix(block_num_rows, block_num_cols, - block_row_size, block_col_size); - for (int i = 0; i < block_nnz; i++) - A_bcoo->add_value(block_rows[i], block_cols[i], block_vals[i]); - - Matrix* A_coo = new COOMatrix(num_rows, num_cols); - for (int i = 0; i < nnz; i++) - A_coo->add_value(rows[i], cols[i], 
vals[i]); - - Matrix* A_bsr = A_bcoo->to_CSR(); - Matrix* A_csr = A_coo->to_CSR(); - Matrix* A_bsc = A_bsr->to_CSC(); - Matrix* A_csc = A_csr->to_CSC(); - Matrix* A_csr_from_bsr = A_bsr->to_CSR(); - - Vector x(num_rows); - Vector b(num_cols); - Vector tmp(num_cols); - x.set_const_value(1.0); - - A_bcoo->sort(); - A_bcoo->move_diag(); - A_bcoo->remove_duplicates(); - - A_bsr->sort(); - A_bsr->move_diag(); - A_bsr->remove_duplicates(); - - A_bsc->sort(); - A_bsc->move_diag(); - A_bsc->remove_duplicates(); - - ASSERT_EQ(A_bcoo->n_rows, A_bsr->n_rows); - ASSERT_EQ(A_bsr->n_rows, A_bsc->n_rows); - ASSERT_EQ(A_bcoo->n_cols, A_bsr->n_cols); - ASSERT_EQ(A_bsr->n_cols, A_bsc->n_cols); - ASSERT_EQ(A_bcoo->nnz, A_bsr->nnz); - ASSERT_EQ(A_bsr->nnz, A_bsc->nnz); - ASSERT_EQ(A_csr_from_bsr->nnz, A_csr->nnz); - - double** bcoo_vals = (double**) A_bcoo->get_data(); - double** bsr_vals = (double**) A_bsr->get_data(); - for (int i = 0; i < A_bcoo->nnz; i++) - { - for (int j = 0; j < A_bcoo->b_size; j++) - { - ASSERT_NEAR(bcoo_vals[i][j], bsr_vals[i][j], 1e-10); - } - } - - Matrix* Atmp = A_bsc->to_CSR(); - Atmp->sort(); - Atmp->move_diag(); - double** tmp_vals = (double**) Atmp->get_data(); - for (int i = 0; i < A_bsr->nnz; i++) - { - for (int j = 0; j < A_bsr->b_size; j++) - { - ASSERT_NEAR(bsr_vals[i][j], tmp_vals[i][j], 1e-10); - } - } - - ASSERT_EQ(A_bcoo->format(), BCOO); - ASSERT_EQ(A_coo->format(), COO); - ASSERT_EQ(A_bsr->format(), BSR); - ASSERT_EQ(A_csr->format(), CSR); - ASSERT_EQ(A_bsc->format(), BSC); - ASSERT_EQ(A_csc->format(), CSC); - ASSERT_EQ(A_csr_from_bsr->format(), CSR); - - A_csr->mult(x, b); - A_bsr->mult(x, tmp); - for (int i = 0; i < num_cols; i++) - ASSERT_NEAR(b[i], tmp[i], 1e-10); - - A_csr_from_bsr->mult(x, tmp); - for (int i = 0; i < num_cols; i++) - ASSERT_NEAR(b[i], tmp[i], 1e-10); - - A_coo->mult(x, tmp); - for (int i = 0; i < num_cols; i++) - ASSERT_NEAR(b[i], tmp[i], 1e-10); - - A_bcoo->mult(x, tmp); - for (int i = 0; i < num_cols; i++) - 
ASSERT_NEAR(b[i], tmp[i], 1e-10); - - A_csc->mult(x, tmp); - for (int i = 0; i < num_cols; i++) - ASSERT_NEAR(b[i], tmp[i], 1e-10); - - A_bsc->mult(x, tmp); - for (int i = 0; i < num_cols; i++) - ASSERT_NEAR(b[i], tmp[i], 1e-10); - - - CSRMatrix* C_csr = A_csr->mult((CSRMatrix*)A_csr); - CSRMatrix* C_bsr = A_bsr->mult((BSRMatrix*)A_bsr); - ASSERT_EQ(C_csr->n_rows, C_bsr->n_rows * C_bsr->b_rows); - ASSERT_EQ(C_csr->n_cols, C_bsr->n_cols * C_bsr->b_cols); - compare_vals(C_csr, (BSRMatrix*) C_bsr); - - CSRMatrix* C_csr_from_bsr = A_csr_from_bsr->mult((CSRMatrix*)A_csr_from_bsr); - ASSERT_EQ(C_csr_from_bsr->n_rows, C_bsr->n_rows * C_bsr->b_rows); - ASSERT_EQ(C_csr_from_bsr->n_cols, C_bsr->n_cols * C_bsr->b_cols); - compare_vals(C_csr_from_bsr, (BSRMatrix*) C_bsr); - - CSRMatrix* D_csr = A_csr->mult_T((CSCMatrix*)A_csc); - CSRMatrix* D_bsr = A_bsr->mult_T((BSCMatrix*)A_bsc); - ASSERT_EQ(D_csr->n_rows, D_bsr->n_rows * D_bsr->b_rows); - ASSERT_EQ(D_csr->n_cols, D_bsr->n_cols * D_bsr->b_cols); - compare_vals(D_csr, (BSRMatrix*) D_bsr); - - CSRMatrix* D_csr_from_bsr = A_csr_from_bsr->mult_T((CSCMatrix*)A_csc); - ASSERT_EQ(D_csr_from_bsr->n_rows, D_bsr->n_rows * D_bsr->b_rows); - ASSERT_EQ(D_csr_from_bsr->n_cols, D_bsr->n_cols * D_bsr->b_cols); - compare_vals(D_csr_from_bsr, (BSRMatrix*) D_bsr); - - delete A_bsr; - delete A_csr; - delete A_bsc; - delete A_csc; - delete A_bcoo; - delete A_coo; - delete A_csr_from_bsr; - - delete C_csr; - delete C_bsr; - delete C_csr_from_bsr; - - delete D_csr; - delete D_bsr; - delete D_csr_from_bsr; - - for (std::vector::iterator it = block_vals.begin(); - it != block_vals.end(); ++it) - delete[] *it; - -} // end of TEST(MatrixTest, TestsInCore) // - - diff --git a/raptor/core/tests/test_bsr_matrix.cpp b/raptor/core/tests/test_bsr_matrix.cpp deleted file mode 100644 index 850a66da..00000000 --- a/raptor/core/tests/test_bsr_matrix.cpp +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified 
BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "gtest/gtest.h" -#include "raptor/raptor.hpp" -using namespace raptor; - - -int main(int argc, char** argv) -{ - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); - -} // end of main() // - -TEST(BSRMatrixTest, TestsInCore) -{ - // Matrix [0, 1], [1, 0] - // [2, 0], [0, 2] - // [3, 0], [0, 0] - // [0, 4], [0, 0] - int n_csr = 4; - int nnz_csr = 6; - std::vector rowptr_csr = {0, 2, 4, 5, 6}; - std::vector col_idx_csr = {1, 2, 0, 3, 0, 1}; - std::vector data_csr = {1, 1, 2, 2, 3, 4}; - CSRMatrix* A_csr = new CSRMatrix(n_csr, n_csr, rowptr_csr, col_idx_csr, data_csr); - - int n = 2; // 2 blocks by 2 blocks - int br = 2; // blocks are each 2x2 - int bs = 4; - int nnz = 3; // 3 blocks - std::vector rowptr = {0, 2, 3}; - std::vector col_idx = {0, 1, 0}; - std::vector data = {0, 1, 2, 0, 1, 0, 0, 2, 3, 0, 0, 4}; - - // Hardcode one BSR Matrix - BSRMatrix* A = new BSRMatrix(n, n, br, br, nnz); - A->idx1[0] = 0; - for (int i = 0; i < n; i++) - { - A->idx1[i+1] = rowptr[i+1]; - for (int j = A->idx1[i]; j < A->idx1[i+1]; j++) - { - A->idx2.push_back(col_idx[j]); - double* vals = new double[bs]; - for (int k = 0; k < bs; k++) - vals[k] = data[j*bs + k]; - A->block_vals.push_back(vals); - } - } - - // Call method that converts CSR to BSR - BSRMatrix* A_conv = new BSRMatrix(A_csr, br, br); - - // Check that both BSR matrices are equivalent - ASSERT_EQ(A_conv->n_rows, A->n_rows); - ASSERT_EQ(A_conv->n_cols, A->n_cols); - ASSERT_EQ(A_conv->b_rows, A->b_rows); - ASSERT_EQ(A_conv->b_cols, A->b_cols); - ASSERT_EQ(A_conv->b_size, A->b_size); - - for (int i = 0; i < A->n_rows; i++) - { - ASSERT_EQ(A_conv->idx1[i+1], A->idx1[i+1]); - for (int j = A->idx1[i]; j < A->idx1[i+1]; j++) - { - ASSERT_EQ(A_conv->idx2[j], A->idx2[j]); - for (int k = 0; k < A->b_size; k++) - ASSERT_EQ(A_conv->block_vals[j][k], A->block_vals[j][k]); - } - } - - delete A_csr; - delete A; - delete A_conv; - -} // end of TEST(MatrixTest, 
TestsInCore) // - diff --git a/raptor/core/tests/test_matrix.cpp b/raptor/core/tests/test_matrix.cpp deleted file mode 100644 index bd4c6078..00000000 --- a/raptor/core/tests/test_matrix.cpp +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "gtest/gtest.h" -#include "raptor/raptor.hpp" -using namespace raptor; - - -int main(int argc, char** argv) -{ - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); - -} // end of main() // - -TEST(MatrixTest, TestsInCore) -{ - int rows[10] = {22, 17, 12, 0, 5, 7, 1, 0, 0, 12}; - int cols[10] = {5, 18, 21, 0, 7, 7, 0, 1, 0, 21}; - double vals[10] = {2.0, 1.0, 0.5, 1.0, 2.0, 1.0, 1.2, 2.2, 1.5, -1.0}; - - int row_ctr[26] = {0, 3, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 9, - 9, 9, 9, 9, 10, 10, 10}; - - // Create COO Matrix (25x25) - COOMatrix* A_coo = new COOMatrix(25, 25, 1); - for (int i = 0; i < 10; i++) - { - A_coo->add_value(rows[i], cols[i], vals[i]); - } - - // Check dimensions of A_coo - ASSERT_EQ(A_coo->n_rows, 25); - ASSERT_EQ(A_coo->n_cols, 25); - ASSERT_EQ(A_coo->nnz, 10); - - // Check that rows, columns, and values in A_coo are correct - for (int i = 0; i < 10; i++) - { - ASSERT_EQ(A_coo->idx1[i], rows[i]); - ASSERT_EQ(A_coo->idx2[i], cols[i]); - ASSERT_EQ(A_coo->vals[i], vals[i]); - } - - // Create CSR Matrix from COO - CSRMatrix* A_csr = A_coo->to_CSR(); - - // Check dimensions of A_csr - ASSERT_EQ(A_csr->n_rows,25); - ASSERT_EQ(A_csr->n_cols,25); - ASSERT_EQ(A_csr->nnz,10); - - // Check that rows, columns, and values in A_coo are correct - - for (int i = 0; i < 26; i++) - { - ASSERT_EQ(A_csr->idx1[i],row_ctr[i]); - } - - delete A_coo; - delete A_csr; - -} // end of TEST(MatrixTest, TestsInCore) // - diff --git a/raptor/core/tests/test_par_block_conversion.cpp b/raptor/core/tests/test_par_block_conversion.cpp deleted file mode 100644 index 624aec8f..00000000 --- 
a/raptor/core/tests/test_par_block_conversion.cpp +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "gtest/gtest.h" -#include "raptor/raptor.hpp" - -using namespace raptor; - -int main(int argc, char** argv) -{ - MPI_Init(&argc, &argv); - ::testing::InitGoogleTest(&argc, argv); - int temp=RUN_ALL_TESTS(); - MPI_Finalize(); - return temp; - -} // end of main() // - -TEST(ParBlockConversionTest, TestsInCore) -{ - int rank, num_procs; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &num_procs); - - setenv("PPN", "4", 1); - - // Form standard anisotropic matrix - double eps = 0.001; - double theta = M_PI / 8.0; - int block_n = 2; - std::vector grid(2, num_procs*block_n); - double* stencil = diffusion_stencil_2d(eps, theta); - ParCSRMatrix* A = par_stencil_grid(stencil, grid.data(), 2); - - ParBSRMatrix* A_bsr = A->to_ParBSR(block_n, block_n); - ParCSRMatrix* A_csr_from_bsr = A_bsr->to_ParCSR(); - - ASSERT_EQ(A->local_nnz,A_csr_from_bsr->local_nnz); - - // Test Partition of BSR to CSR - for (int i = 0; i < (int)A_csr_from_bsr->partition->first_cols.size(); i++) - { - ASSERT_EQ(A->partition->first_cols[i], A_csr_from_bsr->partition->first_cols[i]); - } - ASSERT_EQ(A->partition->local_num_rows, A_csr_from_bsr->partition->local_num_rows); - ASSERT_EQ(A->partition->local_num_cols, A_csr_from_bsr->partition->local_num_cols); - ASSERT_EQ(A->partition->first_local_row, A_csr_from_bsr->partition->first_local_row); - ASSERT_EQ(A->partition->first_local_col, A_csr_from_bsr->partition->first_local_col); - ASSERT_EQ(A->partition->last_local_row, A_csr_from_bsr->partition->last_local_row); - ASSERT_EQ(A->partition->last_local_col, A_csr_from_bsr->partition->last_local_col); - - // Test Row and Column Maps of BSR to CSR - for (int i = 0; i < (int)A_csr_from_bsr->off_proc_column_map.size(); i++) - { - ASSERT_EQ(A->off_proc_column_map[i], 
A_csr_from_bsr->off_proc_column_map[i]); - } - for (int i = 0; i < (int)A_csr_from_bsr->on_proc_column_map.size(); i++) - { - ASSERT_EQ(A->on_proc_column_map[i], A_csr_from_bsr->on_proc_column_map[i]); - } - for (int i = 0; i < (int)A_csr_from_bsr->local_row_map.size(); i++) - { - ASSERT_EQ(A->local_row_map[i], A_csr_from_bsr->local_row_map[i]); - } - - ParVector x(A->global_num_rows, A->local_num_rows); - ParVector b(A->global_num_rows, A->local_num_rows); - ParVector tmp(A->global_num_rows, A->local_num_rows); - x.set_const_value(1.0); - - // Test BSR to CSR SpMV - A_bsr->mult(x, b); - A_csr_from_bsr->mult(x, tmp); - for (int i = 0; i < A_csr_from_bsr->local_num_rows; i++) - ASSERT_NEAR(tmp[i], b[i], 1e-10); - - // Test BSR to CSR Transpose SpMV - A_bsr->mult_T(x, b); - A_csr_from_bsr->mult_T(x, tmp); - for (int i = 0; i < A_csr_from_bsr->local_num_rows; i++) - ASSERT_NEAR(tmp[i], b[i], 1e-10); - - // Test BSR to CSR TAPSpMVs - A_bsr->tap_mult(x, b); - A_csr_from_bsr->tap_mult(x, tmp); - for (int i = 0; i < A_csr_from_bsr->local_num_rows; i++) - ASSERT_NEAR(tmp[i], b[i], 1e-10); - - // Test BSR to CSR Transpose TAPSpMV - A_bsr->tap_mult_T(x, b); - A_csr_from_bsr->tap_mult_T(x, tmp); - for (int i = 0; i < A_csr_from_bsr->local_num_rows; i++) - ASSERT_NEAR(tmp[i], b[i], 1e-10); - - delete A; - delete A_bsr; - delete A_csr_from_bsr; - - setenv("PPN", "16", 1); - - -} // end of TEST(ParBlockConversionTest, TestsInCore) // - - - diff --git a/raptor/core/tests/test_par_block_matrix.cpp b/raptor/core/tests/test_par_block_matrix.cpp deleted file mode 100644 index 6668ae79..00000000 --- a/raptor/core/tests/test_par_block_matrix.cpp +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "gtest/gtest.h" -#include "raptor/raptor.hpp" - -using namespace raptor; - -void compare_vals(CSRMatrix* A, BSRMatrix* B) -{ - A->sort(); - B->sort(); - int ctr = 0; - for (int i 
= 0; i < B->n_rows; i++) - { - for (int k = 0; k < B->b_rows; k++) - { - for (int j = B->idx1[i]; j < B->idx1[i+1]; j++) - { - double* val = B->block_vals[j]; - for (int l = 0; l < B->b_cols; l++) - { - if (fabs(val[(k*B->b_cols) + l]) > zero_tol) - { - ASSERT_NEAR(val[k*B->b_cols + l], A->vals[ctr++], 1e-10); - } - } - } - } - } - -} - -int main(int argc, char** argv) -{ - MPI_Init(&argc, &argv); - ::testing::InitGoogleTest(&argc, argv); - int temp=RUN_ALL_TESTS(); - MPI_Finalize(); - return temp; - -} // end of main() // - -TEST(ParBlockMatrixTest, TestsInCore) -{ - int rank, num_procs; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &num_procs); - - setenv("PPN", "4", 1); - - // Form standard anisotropic matrix - double eps = 0.001; - double theta = M_PI / 8.0; - int block_n = 2; - std::vector grid(2, num_procs*block_n); - double* stencil = diffusion_stencil_2d(eps, theta); - ParCSRMatrix* A = par_stencil_grid(stencil, grid.data(), 2); - ParBSRMatrix* A_bsr = A->to_ParBSR(block_n, block_n); - - ParVector x(A->global_num_rows, A->local_num_rows); - ParVector b(A->global_num_rows, A->local_num_rows); - ParVector tmp(A->global_num_rows, A->local_num_rows); - x.set_const_value(1.0); - - // Test Blocked Communication - std::vector std; - std::vector blocked; - std = A->comm->communicate(x); - blocked = A_bsr->comm->communicate(x, A_bsr->off_proc->b_cols); - ASSERT_EQ(std.size(), blocked.size()); - int n = std.size(); - for (int i = 0; i < n; i++) - ASSERT_NEAR(std[i], blocked[i], 1e-10); - - // Test Blocked SpMV - A->mult(x, b); - A_bsr->mult(x, tmp); - for (int i = 0; i < A->local_num_rows; i++) - ASSERT_NEAR(tmp[i], b[i], 1e-10); - - // Test Blocked Transpose Communication - A->comm->communicate_T(*x.local.storage, *b.local.storage); - A_bsr->comm->communicate_T(*x.local.storage, *tmp.local.storage, A_bsr->off_proc->b_cols); - ASSERT_EQ(std.size(), blocked.size()); - for (int i = 0; i < n; i++) - ASSERT_NEAR(b[i], tmp[i], 1e-10); - - // Test 
Blocked Transpose SpMV - A->mult_T(x, b); - A_bsr->mult_T(x, tmp); - for (int i = 0; i < A->local_num_rows; i++) - ASSERT_NEAR(tmp[i], b[i], 1e-10); - - // Test Blocked TAPSpMVs - A->tap_comm = new TAPComm(A->partition, A->off_proc_column_map); - A_bsr->tap_comm = new TAPComm(A_bsr->partition, A_bsr->off_proc_column_map); - std = A->tap_comm->communicate(x); - blocked = A_bsr->tap_comm->communicate(x, A_bsr->off_proc->b_cols); - ASSERT_EQ(std.size(), blocked.size()); - - A->tap_mult(x, b); - A_bsr->tap_mult(x, tmp); - for (int i = 0; i < A->local_num_rows; i++) - ASSERT_NEAR(b[i], tmp[i], 1e-10); - - // Test Blocked Transpose TAPSpMVs - A->tap_comm->communicate_T(*x.local.storage, *b.local.storage); - A_bsr->comm->communicate_T(*x.local.storage, *tmp.local.storage, A_bsr->off_proc->b_cols); - ASSERT_EQ(std.size(), blocked.size()); - for (int i = 0; i < n; i++) - ASSERT_NEAR(b[i], tmp[i], 1e-10); - - // Test Blocked Transpose TAPSpMV - A->tap_mult_T(x, b); - A_bsr->tap_mult_T(x, tmp); - for (int i = 0; i < A->local_num_rows; i++) - ASSERT_NEAR(tmp[i], b[i], 1e-10); - - // Test Blocked Matrix Communication - CSRMatrix* C = A->comm->communicate(A); - BSRMatrix* C_bsr = (BSRMatrix*) A_bsr->comm->communicate(A_bsr); - C->sort(); - C_bsr->sort(); - ASSERT_EQ(C->n_rows, C_bsr->n_rows * C_bsr->b_rows); - compare_vals(C, C_bsr); - delete C; - delete C_bsr; - - delete A; - delete A_bsr; - - setenv("PPN", "16", 1); - - -} // end of TEST(MatrixTest, TestsInCore) // diff --git a/raptor/core/tests/test_par_bsr.cpp b/raptor/core/tests/test_par_bsr.cpp deleted file mode 100644 index ebf4b3b5..00000000 --- a/raptor/core/tests/test_par_bsr.cpp +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "gtest/gtest.h" -#include "raptor.hpp" - -using namespace raptor; - -int main(int argc, char** argv) -{ - MPI_Init(&argc, &argv); - ::testing::InitGoogleTest(&argc, argv); - int 
temp=RUN_ALL_TESTS(); - MPI_Finalize(); - return temp; - -} // end of main() // - -TEST(ParBSRMatrixTest, TestsInCore) -{ - /*int rank, num_procs; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &num_procs); - - std::vector row_ptr = {0, 3, 5, 8, 11, 13, 16}; - std::vector indices = {0, 1, 4, 1, 3, 1, 2, 5, 1, 3, 4, 0, 4, 2, 4, 5}; - std::vector data = {1,0,2,1, 6,7,8,2, 1,0,0,1, 1,4,5,1, 2,0,0,0, 4,3,0,0, - 7,2,0,0, 3,0,1,0, 1,0,0,1, 1,0,2,1, 6,7,8,2, 2,0,0,0, - 1,4,5,1, 3,0,1,0, 4,3,0,0, 7,2,0,0}; - - std::vector> on_blocks = {{1,0,2,1}, {6,7,8,2}, {1,4,5,1}, - {4,3,0,0}, {7,2,0,0}}; - std::vector> on_indx = {{0,0}, {0,1}, {1,1}, {2,1}, {2,2}}; - - std::vector> off_blocks = {{1,0,0,1}, {2,0,0,0}, {3,0,1,0}}; - std::vector> off_indx = {{0,4}, {1,3}, {2,5}}; - - // Create matrices for comparison - BSRMatrix* A_bsr = new BSRMatrix(12, 12, 2, 2, row_ptr, indices, data); - COOMatrix* A_coo = A_bsr->to_COO(); - ParBSRMatrix* A_par_bsr = new ParBSRMatrix(12, 12, 2, 2); - - // Add on_proc blocks - for (int i=0; iadd_block(on_indx[i][0], on_indx[i][1], on_blocks[i]); - A_par_bsr->add_block(on_indx[i][0]+3, on_indx[i][1]+3, on_blocks[i]); - } - - // Add off_proc blocks - for(int i=0; iadd_block(off_indx[i][0], off_indx[i][1], off_blocks[i]); - A_par_bsr->add_block(off_indx[i][0]+3, off_indx[i][1]-3, off_blocks[i]); - } - - // Finalize ParBSRMatrix and create on and off process maps - A_par_bsr->finalize(true, 2); - - // Compare nnz - int lcl_nnz = A_par_bsr->local_nnz; - int nnz; - MPI_Allreduce(&lcl_nnz, &nnz, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - ASSERT_EQ(A_bsr->nnz, nnz); - - // Compare n_blocks - int lcl_nblocks = A_par_bsr->on_proc->idx2.size() + A_par_bsr->off_proc->idx2.size(); - int nblocks; - MPI_Allreduce(&lcl_nblocks,& nblocks, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - ASSERT_EQ(A_bsr->n_blocks, nblocks); - - // Create dense matrix to compare against - std::vector A_dense = A_bsr->to_dense(); - - // Compare row_ptrs, indices, and data - 
if (num_procs <= 1) - { - for (int i=0; ion_proc->idx1.size(); i++) - { - ASSERT_EQ(A_bsr->idx1[i], A_par_bsr->on_proc->idx1[i]); - } - for (int i=0; ion_proc->idx2.size(); i++) - { - ASSERT_EQ(A_bsr->idx2[i], A_par_bsr->on_proc->idx2[i]); - } - for (int i=0; ion_proc->vals.size(); i++) - { - ASSERT_EQ(A_bsr->vals[i], A_par_bsr->on_proc->vals[i]); - } - - } - else - { - int block_rows = A_par_bsr->b_rows; - int block_cols = A_par_bsr->b_cols; - int local_rows = A_par_bsr->local_num_rows; - - for (int i = 0; i < local_rows/block_rows; i++) - { - int start = A_par_bsr->on_proc->idx1[i]; - int end = A_par_bsr->on_proc->idx1[i+1]; - for (int j = start; j < end; j++) - { - int upper_i = A_par_bsr->local_row_map[i*block_rows]; - int upper_j = A_par_bsr->on_proc_column_map[(A_par_bsr->on_proc->idx2[j])*block_cols]; - int data_offset = j * block_rows * block_cols; - for (int bi = 0; bi < block_rows; bi++) - { - for (int bj = 0; bj < block_cols; bj++) - { - int glob_i = upper_i + bi; - int glob_j = upper_j + bj; - int ind = bi * block_cols + bj + data_offset; - double val = A_par_bsr->on_proc->vals[ind]; - int glob_ind = glob_i*12+glob_j; - ASSERT_NEAR(A_dense[glob_ind], val, zero_tol); - } - } - } - - start = A_par_bsr->off_proc->idx1[i]; - end = A_par_bsr->off_proc->idx1[i+1]; - for (int j = start; j < end; j++) - { - int upper_i = A_par_bsr->local_row_map[i*block_rows]; - int upper_j = A_par_bsr->off_proc_column_map[(A_par_bsr->off_proc->idx2[j])*block_cols]; - int data_offset = j * block_rows * block_cols; - for (int bi = 0; bi < block_rows; bi++) - { - for (int bj = 0; bj < block_cols; bj++) - { - int glob_i = upper_i + bi; - int glob_j = upper_j + bj; - int ind = bi * block_cols + bj + data_offset; - int glob_ind = glob_i*12+glob_j; - double val = A_par_bsr->off_proc->vals[ind]; - ASSERT_NEAR(A_dense[glob_i*12+glob_j], val, zero_tol); - } - } - } - } - } - - // Delete pointers - delete A_par_bsr; - delete A_bsr; - - */ - -} // end of TEST(ParMatrixTest, TestsInCore) 
// diff --git a/raptor/core/tests/test_par_comm.cpp b/raptor/core/tests/test_par_comm.cpp deleted file mode 100644 index d6d249d9..00000000 --- a/raptor/core/tests/test_par_comm.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "gtest/gtest.h" - -#include "raptor/raptor.hpp" - -using namespace raptor; - -int main(int argc, char** argv) -{ - MPI_Init(&argc, &argv); - ::testing::InitGoogleTest(&argc, argv); - int temp=RUN_ALL_TESTS(); - MPI_Finalize(); - return temp; -} // end of main() // - -TEST(ParCommTest, TestsInCore) -{ - int rank, num_procs; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &num_procs); - - double eps = 0.001; - double theta = M_PI / 8.0; - int grid[2] = {10, 10}; - int global_row, global_col; - int start, end; - double val; - double* stencil = diffusion_stencil_2d(eps, theta); - std::vector sendbuf; - std::vector seq_row; - - CSRMatrix* A_seq = stencil_grid(stencil, grid, 2); - - ParCSRMatrix* A = par_stencil_grid(stencil, grid, 2); - - ParVector x(A->global_num_rows, A->local_num_rows); - if (A->local_num_rows) - { - sendbuf.resize(A->local_num_rows); - for (int i = 0; i < A->local_num_rows; i++) - { - sendbuf[i] = A->local_row_map[i]; - } - } - - A->comm->communicate(sendbuf); - - for (int i = 0; i < A->off_proc_num_cols; i++) - { - ASSERT_EQ(A->comm->recv_data->int_buffer[i], A->off_proc_column_map[i]); - } - - seq_row.resize(A_seq->n_cols); - CSRMatrix* recv_mat = A->comm->communicate(A); - for (int i = 0; i < A->off_proc_num_cols; i++) - { - global_row = A->off_proc_column_map[i]; - start = A_seq->idx1[global_row]; - end = A_seq->idx1[global_row+1]; - for (int j = start; j < end; j++) - { - seq_row[A_seq->idx2[j]] = A_seq->vals[j]; - } - - start = recv_mat->idx1[i]; - end = recv_mat->idx1[i+1]; - for (int j = start; j < end; j++) - { - global_col = recv_mat->idx2[j]; - val = recv_mat->vals[j]; - 
ASSERT_NEAR(seq_row[global_col], val, 1e-06); - } - } - - delete recv_mat; - delete[] stencil; - delete A; - delete A_seq; - -} // end of TEST(ParCommTest, TestsInCore) // diff --git a/raptor/core/tests/test_par_matrix.cpp b/raptor/core/tests/test_par_matrix.cpp deleted file mode 100644 index e56fb1cd..00000000 --- a/raptor/core/tests/test_par_matrix.cpp +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "gtest/gtest.h" -#include "raptor/raptor.hpp" - -using namespace raptor; - -int main(int argc, char** argv) -{ - MPI_Init(&argc, &argv); - ::testing::InitGoogleTest(&argc, argv); - int temp=RUN_ALL_TESTS(); - MPI_Finalize(); - return temp; - -} // end of main() // - -TEST(ParMatrixTest, TestsInCore) -{ - int rank, num_procs; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &num_procs); - - double eps = 0.001; - double theta = M_PI / 8.0; - int grid[2] = {10, 10}; - double* stencil = diffusion_stencil_2d(eps, theta); - CSRMatrix* A = stencil_grid(stencil, grid, 2); - ParCSRMatrix* A_par = par_stencil_grid(stencil, grid, 2); - - ParCSCMatrix* A_par_csc = A_par->to_ParCSC(); - - int lcl_nnz = A_par->local_nnz; - int nnz; - MPI_Allreduce(&lcl_nnz, &nnz, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - - ASSERT_EQ(A->nnz,nnz); - - double A_dense[10000] = {0}; - for (int i = 0; i < A->n_rows; i++) - { - for (int j = A->idx1[i]; j < A->idx1[i+1]; j++) - { - A_dense[i*100 + A->idx2[j]] = A->vals[j]; - } - } - - // Compare A_par to A_dense - for (int i = 0; i < A_par->local_num_rows; i++) - { - int row = A_par->local_row_map[i]; - for (int j = A_par->on_proc->idx1[i]; j < A_par->on_proc->idx1[i+1]; j++) - { - int col = A_par->on_proc_column_map[A_par->on_proc->idx2[j]]; - //ASSERT_LT((fabs(A_dense[row*100+col] - A_par->on_proc->vals[j])), zero_tol); - ASSERT_NEAR(A_dense[row*100+col], A_par->on_proc->vals[j], zero_tol); - } - - for (int j = 
A_par->off_proc->idx1[i]; j < A_par->off_proc->idx1[i+1]; j++) - { - int col = A_par->off_proc_column_map[A_par->off_proc->idx2[j]]; - ASSERT_NEAR(A_dense[row*100+col], A_par->off_proc->vals[j], zero_tol); - } - } - - // Compare A_par_csc to A_dense - for (int i = 0; i < A_par_csc->on_proc_num_cols; i++) - { - int col = A_par_csc->on_proc_column_map[i]; - for (int j = A_par_csc->on_proc->idx1[i]; j < A_par_csc->on_proc->idx1[i+1]; j++) - { - int row = A_par_csc->local_row_map[A_par_csc->on_proc->idx2[j]]; - ASSERT_NEAR(A_dense[row*100+col],A_par_csc->on_proc->vals[j], zero_tol); - } - } - - for (int i = 0; i < A_par_csc->off_proc_num_cols; i++) - { - int col = A_par_csc->off_proc_column_map[i]; - for (int j = A_par_csc->off_proc->idx1[i]; j < A_par_csc->off_proc->idx1[i+1]; j++) - { - int row = A_par_csc->local_row_map[A_par_csc->off_proc->idx2[j]]; - ASSERT_NEAR(A_dense[row*100+col], A_par_csc->off_proc->vals[j], zero_tol); - } - } - - delete[] stencil; - -} // end of TEST(ParMatrixTest, TestsInCore) // diff --git a/raptor/core/tests/test_par_transpose.cpp b/raptor/core/tests/test_par_transpose.cpp deleted file mode 100644 index 1c82d513..00000000 --- a/raptor/core/tests/test_par_transpose.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "gtest/gtest.h" -#include "raptor/raptor.hpp" -#include "raptor/tests/par_compare.hpp" - -using namespace raptor; - -int main(int argc, char** argv) -{ - MPI_Init(&argc, &argv); - ::testing::InitGoogleTest(&argc, argv); - int temp=RUN_ALL_TESTS(); - MPI_Finalize(); - return temp; - -} // end of main() // - -TEST(ParMatrixTest, TestsInCore) -{ - int rank, num_procs; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &num_procs); - - ParCSRMatrix* A = readParMatrix("../../../../test_data/aniso.pm"); - ParCSRMatrix* AT_py = readParMatrix("../../../../test_data/aniso_T.pm"); - ParCSRMatrix* AT 
= (ParCSRMatrix*) A->transpose(); - A->sort(); - AT->sort(); - AT_py->sort(); - compare(AT, AT_py); - delete A; - delete AT_py; - delete AT; - - A = readParMatrix("../../../../test_data/laplacian.pm"); - AT_py = readParMatrix("../../../../test_data/laplacian_T.pm"); - AT = (ParCSRMatrix*) A->transpose(); - A->sort(); - AT->sort(); - AT_py->sort(); - compare(AT, AT_py); - delete A; - delete AT_py; - delete AT; - - -} // end of TEST(ParMatrixTest, TestsInCore) // - diff --git a/raptor/core/tests/test_par_vector.cpp b/raptor/core/tests/test_par_vector.cpp deleted file mode 100644 index 55843165..00000000 --- a/raptor/core/tests/test_par_vector.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "gtest/gtest.h" -#include "raptor/raptor.hpp" - -using namespace raptor; - -int main(int argc, char** argv) -{ - MPI_Init(&argc, &argv); - ::testing::InitGoogleTest(&argc, argv); - int temp=RUN_ALL_TESTS(); - MPI_Finalize(); - return temp; -} // end of main() // - -TEST(ParVectorTest, TestsInCore) -{ - int rank, num_procs; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &num_procs); - - int global_n = 100; - int local_n = global_n / num_procs; - int first_n = rank * ( global_n / num_procs); - - if (global_n % num_procs > rank) - { - local_n++; - first_n += rank; - } - else - { - first_n += (global_n % num_procs); - } - - Vector v(global_n); - ParVector v_par(global_n, local_n); - - v.set_const_value(1.0); - v_par.set_const_value(1.0); - - Vector& v_par_l = v_par.local; - for (int i = 0; i < local_n; i++) - { - ASSERT_EQ( v[first_n+i], v_par_l[i] ); - //EXPECT_EQ( v[first_n+i], v_par_l[i] ); - //EXPECT_DOUBLE_EQ( v[first_n+i], v_par_l[i] ); - //EXPECT_FLOAT_EQ( v[first_n+i], v_par_l[i] ); - } - - for (int i = 0; i < global_n; i++) - { - srand(i); - v[i] = ((double)rand()) / RAND_MAX; - } - for (int i = 0; i < local_n; i++) - { - 
srand(i+first_n); - v_par_l[i] = ((double)rand()) / RAND_MAX; - } - - for (int i = 0; i < local_n; i++) - { - ASSERT_EQ(v[first_n+i], v_par_l[i]); - } - -} // end of TEST(ParVectorTest, TestsInCore) // - diff --git a/raptor/core/tests/test_tap_comm.cpp b/raptor/core/tests/test_tap_comm.cpp deleted file mode 100644 index b20c0134..00000000 --- a/raptor/core/tests/test_tap_comm.cpp +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "gtest/gtest.h" -#include "raptor/raptor.hpp" -#include "raptor/tests/compare.hpp" - -using namespace raptor; - -int main(int argc, char** argv) -{ - MPI_Init(&argc, &argv); - ::testing::InitGoogleTest(&argc, argv); - int temp=RUN_ALL_TESTS(); - MPI_Finalize(); - return temp; - -} // end of main() // -TEST(TAPCommTest, TestsInCore) -{ - int rank, num_procs; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &num_procs); - - double eps = 0.001; - double theta = M_PI / 8.0; - int grid[2] = {25, 25}; - double* stencil = diffusion_stencil_2d(eps, theta); - std::vector tap_recv; - std::vector par_recv; - std::vector tap_simp_recv; - - ParCSRMatrix* A = par_stencil_grid(stencil, grid, 2); - //A->tap_comm = new TAPComm(A->partition, A->off_proc_column_map); - //TAPComm* simple_tap = new TAPComm(A->partition, A->off_proc_column_map, false); - A->init_tap_communicators(MPI_COMM_WORLD); - - ParVector x(A->global_num_rows, A->local_num_rows); - ParCSRMatrix* B = A->copy(); - - for (int i = 0; i < A->local_num_rows; i++) - { - x[i] = A->local_row_map[i]; - } - tap_recv = A->tap_comm->communicate(x); - tap_simp_recv = A->tap_mat_comm->communicate(x); - par_recv = A->comm->communicate(x); - ASSERT_EQ(tap_recv.size(), par_recv.size()); - ASSERT_EQ(tap_recv.size(), tap_simp_recv.size()); - for (int i = 0; i < (int)par_recv.size(); i++) - { - ASSERT_NEAR(par_recv[i], tap_recv[i], zero_tol); - ASSERT_NEAR(tap_recv[i], 
tap_simp_recv[i], zero_tol); - } - - x.set_rand_values(); - tap_recv = A->tap_comm->communicate(x); - tap_simp_recv = A->tap_mat_comm->communicate(x); - par_recv = A->comm->communicate(x); - ASSERT_EQ(tap_recv.size(), par_recv.size()); - ASSERT_EQ(tap_simp_recv.size(), tap_recv.size()); - for (int i = 0; i < (int)par_recv.size(); i++) - { - ASSERT_NEAR(par_recv[i], tap_recv[i], zero_tol); - ASSERT_NEAR(tap_recv[i], tap_simp_recv[i], zero_tol); - } - - CSRMatrix* recv_mat = A->comm->communicate(B); - CSRMatrix* tap_recv_mat = A->tap_comm->communicate(B); - CSRMatrix* tap_recv_simp_mat = A->tap_mat_comm->communicate(B); - compare(recv_mat, tap_recv_mat); - compare(tap_recv_mat, tap_recv_simp_mat); - delete recv_mat; - delete tap_recv_mat; - delete tap_recv_simp_mat; - - delete[] stencil; - delete A; - - -} // end of TEST(TAPCommTest, TestsInCore) // diff --git a/raptor/core/tests/test_transpose.cpp b/raptor/core/tests/test_transpose.cpp deleted file mode 100644 index 2dcd22de..00000000 --- a/raptor/core/tests/test_transpose.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "gtest/gtest.h" -#include "raptor/raptor.hpp" -#include "raptor/tests/compare.hpp" - -using namespace raptor; - - -int main(int argc, char** argv) -{ - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); - -} // end of main() // - -TEST(MatrixTest, TestsInCore) -{ - CSRMatrix* A = readMatrix("../../../../test_data/aniso.pm"); - CSRMatrix* AT_py = readMatrix("../../../../test_data/aniso_T.pm"); - CSRMatrix* AT = (CSRMatrix*) A->transpose(); - A->sort(); - AT->sort(); - AT_py->sort(); - compare(AT, AT_py); - delete A; - delete AT_py; - delete AT; - - A = readMatrix("../../../../test_data/laplacian.pm"); - AT_py = readMatrix("../../../../test_data/laplacian_T.pm"); - AT = (CSRMatrix*) A->transpose(); - A->sort(); - AT->sort(); - AT_py->sort(); - compare(AT, AT_py); - 
delete A; - delete AT_py; - delete AT; - - -} // end of TEST(MatrixTest, TestsInCore) // - - diff --git a/raptor/core/topology.hpp b/raptor/core/topology.hpp deleted file mode 100644 index ab8c772e..00000000 --- a/raptor/core/topology.hpp +++ /dev/null @@ -1,174 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#ifndef TOPOLOGY_HPP -#define TOPOLOGY_HPP - -#include -#include -#include - -#include "types.hpp" - -/************************************************************** - ***** Topology Class - ************************************************************** - ***** This class holds information about the topology of - ***** the parallel computer on which Raptor is being run - ***** - ***** Attributes - ***** ------------- - ***** global_num_indices : index_t - ***** Number of rows to be partitioned - ***** first_local_idx : index_t - ***** First global index of a row in partition local to rank - ***** local_num_indices : index_t - ***** Number of rows local to rank's partition - ***** - ***** Methods - ***** --------- - **************************************************************/ -namespace raptor -{ - class Topology - { - public: - Topology(int _PPN = 16, int _standard_rank_ordering = 1) - { - int rank, num_procs; - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - RAPtor_MPI_Comm_size(RAPtor_MPI_COMM_WORLD, &num_procs); - - int rank_node; - - char* proc_layout_c = getenv("RAPtor_MPICH_RANK_REORDER_METHOD"); - char* PPN_c = getenv("PPN"); - if (PPN_c) - { - PPN = atoi(PPN_c); - } - else - { - PPN = _PPN; - } - - if (proc_layout_c) - { - rank_ordering = atoi(proc_layout_c); - } - else - { - rank_ordering = _standard_rank_ordering; - } - - num_nodes = num_procs / PPN; - if (num_procs % PPN) num_nodes++; - rank_node = get_node(rank); - - // Create intra-node communicator - RAPtor_MPI_Comm_split(RAPtor_MPI_COMM_WORLD, rank_node, rank, &local_comm); - num_shared = 0; - } - - 
~Topology() - { - RAPtor_MPI_Comm_free(&local_comm); - } - - int get_node(int proc) - { - if (rank_ordering == 0) - { - return proc % num_nodes; - } - else if (rank_ordering == 1) - { - return proc / PPN; - } - else if (rank_ordering == 2) - { - if ((proc / num_nodes) % 2 == 0) - { - return proc % num_nodes; - } - else - { - return num_nodes - (proc % num_nodes) - 1; - } - } - else - { - int rank; - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - if (rank == 0) - { - printf("This RAPtor_MPI rank ordering is not supported!\n"); - } - return -1; - } - } - - int get_local_proc(int proc) - { - if (rank_ordering == 0 || rank_ordering == 2) - { - return proc / num_nodes; - } - else if (rank_ordering == 1) - { - return proc % PPN; - } - else - { - int rank; - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - if (rank == 0) - { - printf("This RAPtor_MPI rank ordering is not supported!\n"); - } - return -1; - } - } - - int get_global_proc(int node, int local_proc) - { - if (rank_ordering == 0) - { - return local_proc * num_nodes + node; - } - else if (rank_ordering == 1) - { - return local_proc + (node * PPN); - } - else if (rank_ordering == 2) - { - if (local_proc % 2 == 0) - { - return local_proc * num_nodes + node; - } - else - { - return local_proc * num_nodes + num_nodes - node - 1; - } - } - else - { - int rank; - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - if (rank == 0) - { - printf("This RAPtor_MPI rank ordering is not supported!\n"); - } - return -1; - } - } - - int PPN; - int rank_ordering; - int num_shared; - int num_nodes; - - RAPtor_MPI_Comm local_comm; - }; -} - -#endif diff --git a/raptor/core/types.hpp b/raptor/core/types.hpp deleted file mode 100644 index 850a4627..00000000 --- a/raptor/core/types.hpp +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#ifndef RAPTOR_CORE_TYPES_HPP_ -#define RAPTOR_CORE_TYPES_HPP_ - -#include 
-#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#define zero_tol 1e-16 -#define RAPtor_MPI_INDEX_T MPI_INT -#define RAPtor_MPI_DATA_T MPI_DOUBLE - -// Defines for CF splitting and aggregation -#define TmpSelection 4 -#define NewSelection 3 -#define NewUnselection 2 -#define Selected 1 -#define Unselected 0 -#define Unassigned -1 -#define NoNeighbors -2 - - -// Global Timing Variables -struct PairData -{ - double val; - int index; -}; - -namespace raptor -{ - using data_t = double; - using index_t = int; - enum strength_t {Classical, Symmetric}; - enum format_t {COO, CSR, CSC, BCOO, BSR, BSC}; - enum coarsen_t {RS, CLJP, Falgout, PMIS, HMIS}; - enum interp_t {Direct, ModClassical, Extended}; - enum agg_t {MIS}; - enum prolong_t {JacobiProlongation}; - enum relax_t {Jacobi, SOR, SSOR}; - - template - U sum_func(const U& a, const T&b) - { - return a + b; - } - - template - U max_func(const U& a, const T&b) - { - if (a > b) - { - return a; - } - else - { - return b; - } - } -} - -#endif diff --git a/raptor/core/utilities.hpp b/raptor/core/utilities.hpp deleted file mode 100644 index e586826b..00000000 --- a/raptor/core/utilities.hpp +++ /dev/null @@ -1,211 +0,0 @@ -#ifndef RAPTOR_CORE_UTILITIES_HPP -#define RAPTOR_CORE_UTILITIES_HPP - -#include -#include -#include - -#include "types.hpp" - -// BLAS LU routine that is used for coarse solve -extern "C" void dgetrf_(int* dim1, int* dim2, double* a, int* lda, - int* ipiv, int* info); -extern "C" void dgetrs_(char *TRANS, int *N, int *NRHS, double *A, - int *LDA, int *IPIV, double *B, int *LDB, int *INFO ); - -namespace raptor { -template -void vec_sort(std::vector& vec1, std::vector& vec2, int start = 0, int end = -1) -{ - vec1.shrink_to_fit(); - vec2.shrink_to_fit(); - - int k, prev_k; - int n = vec1.size(); - if (end < 0) end = n; - int size = end - start; - - std::vector p(size); - std::vector done(size, false); - - 
std::iota(p.begin(), p.end(), 0); - std::sort(p.begin(), p.end(), - [&](const int i, const int j) - { - return vec1[i+start] < vec1[j+start]; - }); - for (int i = 0; i < size; i++) - { - if (done[i]) continue; - done[i] = true; - prev_k = i; - k = p[i]; - while (i != k) - { - std::swap(vec1[prev_k + start], vec1[k + start]); - std::swap(vec2[prev_k + start], vec2[k + start]); - done[k] = true; - prev_k = k; - k = p[k]; - } - } -} - -template -void vec_sort(std::vector& vec1, std::vector& vec2, - std::vector& vec3, - int start = 0, int end = -1) -{ - vec1.shrink_to_fit(); - vec2.shrink_to_fit(); - vec3.shrink_to_fit(); - - int k, prev_k; - int n = vec1.size(); - if (end < 0) end = n; - int size = end - start; - - std::vector p(size); - std::vector done(size, false); - - std::iota(p.begin(), p.end(), 0); - std::sort(p.begin(), p.end(), - [&](const int i, const int j) - { - int idx1 = i + start; - int idx2 = j + start; - if (vec1[idx1] == vec1[idx2]) - return vec2[idx1] < vec2[idx2]; - else - return vec1[idx1] < vec1[idx2]; - }); - for (int i = 0; i < size; i++) - { - if (done[i]) continue; - done[i] = true; - prev_k = i; - k = p[i]; - while (i != k) - { - std::swap(vec1[prev_k + start], vec1[k + start]); - std::swap(vec2[prev_k + start], vec2[k + start]); - std::swap(vec3[prev_k + start], vec3[k + start]); - done[k] = true; - prev_k = k; - k = p[k]; - } - } -} - - -enum extents : std::size_t { - dynamic_extent = std::numeric_limits::max() -}; -template -struct extent_storage -{ - extent_storage(std::size_t) {} - constexpr std::size_t value() const { return E; } -}; -template <> -struct extent_storage -{ - constexpr std::size_t value() const { return e; } - std::size_t e; -}; - - -template -struct span { - using element_type = T; - using value_type = typename std::remove_cv::type; - using size_type = std::size_t; - using difference_type = std::ptrdiff_t; - using pointer = T*; - using const_pointer = const T*; - using reference = T&; - using const_reference = const T&; 
- using iterator = T*; - using reverse_iterator = std::reverse_iterator; - - static constexpr std::size_t extent = Extent; - - - template::type> - span() : b(nullptr), ext{0} {} - - constexpr span(pointer p, size_type s) : b(p), ext{s} {} - - constexpr span(std::vector & v) : - span(v.data(), v.size()) {} - - constexpr iterator begin() const noexcept { - return b; - } - - constexpr iterator end() const noexcept { - return b + size(); - } - - constexpr reverse_iterator rbegin() const noexcept { - return reverse_iterator(end()); - } - - constexpr reverse_iterator rend() const noexcept { - return reverse_iterator(begin()); - } - - constexpr reference front() const { - return *b; - } - - constexpr reference back() const { - return *(b + (size() - 1)); - } - - constexpr reference operator[](size_type idx) const { - return begin()[idx]; - } - - constexpr pointer data() const noexcept { - return b; - } - - constexpr size_type size() const noexcept { - return ext.value(); - } - - constexpr size_type size_bytes() const noexcept { - return sizeof(T)*size(); - } - - [[nodiscard]] constexpr bool empty() const noexcept { - return size() == 0; - } - - template - constexpr span first() const noexcept { - return {b, Count}; - } - - constexpr span first(size_type count) const noexcept { - return {data(), count}; - } - - template - constexpr span last() const noexcept { - return {data() + (size() - Count), Count}; - } - - constexpr span last(size_type count) const noexcept { - return {data() + (size() - count), count}; - } - -protected: - pointer b; - extent_storage ext; -}; - -} -#endif diff --git a/raptor/core/vector.cpp b/raptor/core/vector.cpp deleted file mode 100644 index 5154a139..00000000 --- a/raptor/core/vector.cpp +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#include "vector.hpp" - -using namespace raptor; - 
-/************************************************************** -***** Vector Set Constant Value -************************************************************** -***** Initializes the vector to a constant value -***** -***** Parameters -***** ------------- -***** alpha : data_t -***** Constant value to set each element of vector to -**************************************************************/ -void Vector::set_const_value(data_t alpha) -{ - for (index_t i = 0; i < size(); i++) - { - values[i] = alpha; - } -} - -/************************************************************** -***** Vector Set Random Values -************************************************************** -***** Initializes each element of the vector to a random -***** value -**************************************************************/ -void Vector::set_rand_values() -{ - srand(time(NULL)); - for (index_t i = 0; i < size(); i++) - { - values[i] = ((double)rand()) / RAND_MAX; - } -} - -/************************************************************** -***** Vector AXPY -************************************************************** -***** Multiplies the vector x by a constant, alpha, and then -***** sums each element with corresponding local entry -***** -***** Parameters -***** ------------- -***** x : Vector& -***** Vector to be summed with -***** alpha : data_t -***** Constant value to multiply each element of vector by -**************************************************************/ -void Vector::axpy(Vector& x, data_t alpha) -{ - for (index_t i = 0; i < size(); i++) - { - values[i] += x.values[i]*alpha; - } -} - -/************************************************************** -***** Vector Copy -************************************************************** -***** Copies each vector value of y into values -***** -***** Parameters -***** ------------- -***** y : Vector& -***** Vector to be copied. 
Must have same local rows -***** and same first row -**************************************************************/ -void Vector::copy(const Vector& y) -{ - if (!storage) storage = std::make_shared(); - resize(y.size()); - std::copy(y.values.begin(), y.values.end(), values.begin()); -} - -/************************************************************** -***** Vector Scale -************************************************************** -***** Multiplies each element of the vector by a constant value -***** -***** Parameters -***** ------------- -***** alpha : data_t -***** Constant value to set multiply element of vector by -**************************************************************/ -void Vector::scale(data_t alpha) -{ - for (index_t i = 0; i < size(); i++) - { - values[i] *= alpha; - } -} - -/************************************************************** -***** Vector Norm -************************************************************** -***** Calculates the P norm of the vector (for a given P) -***** -***** Parameters -***** ------------- -***** p : index_t -***** Determines which p-norm to calculate -**************************************************************/ -data_t Vector::norm(index_t p) -{ - data_t result = 0.0; - double val; - for (index_t i = 0; i < size(); i++) - { - val = values[i]; - if (fabs(val) > zero_tol) - result += pow(val, p); - } - return pow(result, 1.0/p); -} - -/************************************************************** -***** Print Vector -************************************************************** -***** Prints all nonzero elements in vector -***** -***** Parameters -***** ------------- -***** vec_name : const char* (optional) -***** Name to be printed. Default prints Vec[%d] = %e. 
-**************************************************************/ -void Vector::print(const char* vec_name) -{ - printf("Size = %d\n", size()); - for (int i = 0; i < size(); i++) - { - if (fabs(values[i]) > zero_tol) - printf("%s[%d] = %e\n", vec_name, i, values[i]); - } -} - -/************************************************************** -***** Vector Element Access -************************************************************** -***** Function overload for element access -***** -***** Returns -***** ------------ -***** data_t& element at position passed -**************************************************************/ -data_t& Vector::operator[](const int index) -{ - return values[index]; -} - - -data_t Vector::inner_product(Vector& x) -{ - data_t result = 0.0; - - for (int i = 0; i < size(); i++) - { - result += values[i] * x[i]; - } - - return result; -} - - diff --git a/raptor/core/vector.hpp b/raptor/core/vector.hpp deleted file mode 100644 index 12f235fc..00000000 --- a/raptor/core/vector.hpp +++ /dev/null @@ -1,224 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#ifndef RAPTOR_CORE_VECTOR_HPP_ -#define RAPTOR_CORE_VECTOR_HPP_ - -#include - -#include "types.hpp" -#include "utilities.hpp" - -// Vector Class -// -// This class constructs a vector, supporting simple linear -// algebra operations. 
-// -// Attributes -// ------------- -// values : std::vector -// stl vector of vector values -// size : index_t -// Dimension of vector -// -// Methods -// ------- -// set_const_value(data_t alpha) -// Sets the vector to a constant value -// set_rand_values() -// Sets each element of the vector to a random value -// axpy(Vector& y, data_t alpha) -// Multiplies each element by a constant, alpha, and then -// adds corresponding values from y -// scale(data_t alpha) -// Multiplies entries of vector by a constant -// norm(index_t p) -// Calculates the p-norm of the vector -// print() -// Prints the nonzero values and positions -// data() -// Returns the data values as a data_t* -// -namespace raptor { - -class Vector -{ -public: - /************************************************************** - ***** Vector Class Constructor - ************************************************************** - ***** Initializes an empty vector of the given size - ***** - ***** Parameters - ***** ------------- - ***** len : index_t - ***** Size of the vector - **************************************************************/ - Vector(int len) : - storage(std::make_shared(len)), - values(*storage) - {} - - /************************************************************** - ***** Vector Class Constructor - ************************************************************** - ***** Initializes an empty vector without setting the size - **************************************************************/ - Vector() : - storage(std::make_shared()), - values(*storage) - {} - - Vector(double * base, std::size_t len) : - values(base, len) {} - - Vector(const Vector& v) - { - copy(v); - } - - void resize(std::size_t len) - { - if (owns_data()) { - storage->resize(len); - values = span(*storage); - } else { - assert(len <= values.size()); - values = values.first(len); - } - } - - bool owns_data() { - return static_cast(storage); - } - - /************************************************************** - ***** 
Vector Set Constant Value - ************************************************************** - ***** Initializes the vector to a constant value - ***** - ***** Parameters - ***** ------------- - ***** alpha : data_t - ***** Constant value to set each element of vector to - **************************************************************/ - void set_const_value(data_t alpha); - - /************************************************************** - ***** Vector Set Random Values - ************************************************************** - ***** Initializes each element of the vector to a random - ***** value - **************************************************************/ - void set_rand_values(); - - /************************************************************** - ***** Vector AXPY - ************************************************************** - ***** Multiplies the vector by a constant, alpha, and then - ***** sums each element with corresponding entry of Y - ***** - ***** Parameters - ***** ------------- - ***** y : Vector& - ***** Vector to be summed with - ***** alpha : data_t - ***** Constant value to multiply each element of vector by - **************************************************************/ - void axpy(Vector& y, data_t alpha); - - /************************************************************** - ***** Vector Copy - ************************************************************** - ***** Copies each vector value of y into values - ***** - ***** Parameters - ***** ------------- - ***** y : Vector& - ***** Vector to be copied - **************************************************************/ - void copy(const Vector& y); - - /************************************************************** - ***** Vector Scale - ************************************************************** - ***** Multiplies each element of the vector by a constant value - ***** - ***** Parameters - ***** ------------- - ***** alpha : data_t - ***** Constant value to set multiply element 
of vector by - **************************************************************/ - void scale(data_t alpha); - - /************************************************************** - ***** Vector Norm - ************************************************************** - ***** Calculates the P norm of the vector (for a given P) - ***** - ***** Parameters - ***** ------------- - ***** p : index_t - ***** Determines which p-norm to calculate - **************************************************************/ - data_t norm(index_t p); - - /************************************************************** - ***** Print Vector - ************************************************************** - ***** Prints all nonzero elements in vector - ***** - ***** Parameters - ***** ------------- - ***** vec_name : const char* (optional) - ***** Name to be printed. Default prints Vec[%d] = %e. - **************************************************************/ - void print(const char* vec_name = "Vec"); - - /************************************************************** - ***** Vector Element Access - ************************************************************** - ***** Function overload for element access - ***** - ***** Returns - ***** ------------ - ***** data_t& element at position passed - **************************************************************/ - data_t& operator[](const int index); - - /************************************************************** - ***** Vector Data - ************************************************************** - ***** Returns pointer to vector entries - ***** - ***** Returns - ***** ------------- - ***** data_t* - ***** Pointer to values of vector - **************************************************************/ - data_t* data() - { - return values.data(); - } - - index_t size() const - { - return values.size(); - } - - data_t inner_product(Vector& x); - - void set_base(double *base) { - auto sz = size(); - if (storage) storage.reset(); - values = span(base, sz); 
- } - - using storage_type = std::vector; - std::shared_ptr storage; - span values; -}; - -} - - -#endif diff --git a/raptor/gallery/CMakeLists.txt b/raptor/gallery/CMakeLists.txt deleted file mode 100644 index 9ee05889..00000000 --- a/raptor/gallery/CMakeLists.txt +++ /dev/null @@ -1,47 +0,0 @@ -# Include the directory itself as a path to include directories -set(CMAKE_INCLUDE_CURRENT_DIR ON) - -# Create a variable called gallery_SOURCES containing all .cpp files: -if (WITH_MPI) - set(par_gallery_HEADERS - gallery/par_stencil.hpp - gallery/par_random.hpp - gallery/par_matrix_IO.hpp - gallery/par_matrix_market.hpp - ) - set(par_gallery_SOURCES - gallery/par_stencil.cpp - gallery/par_random.cpp - gallery/par_matrix_IO.cpp - gallery/par_matrix_market.cpp - ) -else () - set(par_gallery_HEADERS - "" - ) - set(par_gallery_SOURCES - "" - ) -endif() - -set(gallery_HEADERS - gallery/diffusion.hpp - gallery/laplacian27pt.hpp - gallery/stencil.hpp - gallery/random.hpp - gallery/matrix_IO.hpp - gallery/matrix_market.hpp - ${par_gallery_HEADERS} - PARENT_SCOPE) - -set(gallery_SOURCES - gallery/diffusion.cpp - gallery/laplacian27pt.cpp - gallery/stencil.cpp - gallery/random.cpp - gallery/matrix_IO.cpp - gallery/matrix_market.cpp - ${par_gallery_SOURCES} - PARENT_SCOPE) - - diff --git a/raptor/gallery/diffusion.cpp b/raptor/gallery/diffusion.cpp deleted file mode 100644 index 470fd626..00000000 --- a/raptor/gallery/diffusion.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "diffusion.hpp" - -namespace raptor { -// diffusion_stencil_2d -// -// Generate a diffusion stencil -// -// Supports isotropic diffusion (FE,FD), anisotropic diffusion (FE, FD), and -// rotated anisotropic diffusion (FD). 
-// -// Rotated Anisotropic diffusion in 2d of the form: -// -// -div Q A Q^T grad u -// -// Q = [cos(theta) -sin(theta)] -// [sin(theta) cos(theta)] -// -// A = [1 0 ] -// [0 eps ] -// -// Parameters -// ---------- -// epsilon : double, optional -// Anisotropic diffusion coefficient: -div A grad u, -// where A = [1 0; 0 epsilon]. The default is isotropic, epsilon=1.0 -// theta : double, optional -// Rotation angle `theta` in radians defines -div Q A Q^T grad, -// where Q = [cos(`theta`) -sin(`theta`); sin(`theta`) cos(`theta`)]. -// type : {'FE','FD'} -// Specifies the discretization as Q1 finite element (FE) or 2nd order -// finite difference (FD) -// The default is `theta` = 0.0 -// -// Returns -// ------- -// stencil : numpy array -// A 3x3 diffusion stencil -// -// See Also -// -------- -// stencil_grid -// -// Notes -// ----- -// Not all combinations are supported. -// -// TODO -// ---- -// Add FD option -// -data_t* diffusion_stencil_2d(data_t eps, data_t theta) -{ - data_t* stencil = new data_t[9]; - - data_t C = cos(theta); - data_t S = sin(theta); - data_t CS = C*S; - data_t CC = C*C; - data_t SS = S*S; - - data_t val1 = ((-1*eps - 1)*CC + (-1*eps - 1)*SS + ( 3*eps - 3)*CS) / 6.0; - data_t val2 = (( 2*eps - 4)*CC + (-4*eps + 2)*SS) / 6.0; - data_t val3 = ((-1*eps - 1)*CC + (-1*eps - 1)*SS + (-3*eps + 3)*CS) / 6.0; - data_t val4 = ((-4*eps + 2)*CC + ( 2*eps - 4)*SS) / 6.0; - data_t val5 = (( 8*eps + 8)*CC + ( 8*eps + 8)*SS) / 6.0; - - stencil[0] = val1; - stencil[1] = val2; - stencil[2] = val3; - stencil[3] = val4; - stencil[4] = val5; - stencil[5] = val4; - stencil[6] = val3; - stencil[7] = val2; - stencil[8] = val1; - - return stencil; -} - -} diff --git a/raptor/gallery/diffusion.hpp b/raptor/gallery/diffusion.hpp deleted file mode 100644 index bc49904a..00000000 --- a/raptor/gallery/diffusion.hpp +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - 
-#ifndef DIFFUSION_HPP -#define DIFFUSION_HPP - -#include "raptor/core/types.hpp" - -namespace raptor { -// diffusion_stencil_2d -// -// Generate a diffusion stencil -// -// Supports isotropic diffusion (FE,FD), anisotropic diffusion (FE, FD), and -// rotated anisotropic diffusion (FD). -// -// Rotated Anisotropic diffusion in 2d of the form: -// -// -div Q A Q^T grad u -// -// Q = [cos(theta) -sin(theta)] -// [sin(theta) cos(theta)] -// -// A = [1 0 ] -// [0 eps ] -// -// Parameters -// ---------- -// epsilon : double, optional -// Anisotropic diffusion coefficient: -div A grad u, -// where A = [1 0; 0 epsilon]. The default is isotropic, epsilon=1.0 -// theta : double, optional -// Rotation angle `theta` in radians defines -div Q A Q^T grad, -// where Q = [cos(`theta`) -sin(`theta`); sin(`theta`) cos(`theta`)]. -// type : {'FE','FD'} -// Specifies the discretization as Q1 finite element (FE) or 2nd order -// finite difference (FD) -// The default is `theta` = 0.0 -// -// Returns -// ------- -// stencil : numpy array -// A 3x3 diffusion stencil -// -// See Also -// -------- -// stencil_grid -// -// Notes -// ----- -// Not all combinations are supported. 
-// -// TODO -// ---- -// Add FD option -// -data_t* diffusion_stencil_2d(data_t eps = 1.0, data_t theta = 0.0); - -} - -#endif diff --git a/raptor/gallery/laplacian27pt.cpp b/raptor/gallery/laplacian27pt.cpp deleted file mode 100644 index 012037a3..00000000 --- a/raptor/gallery/laplacian27pt.cpp +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "laplacian27pt.hpp" - -namespace raptor { -// 27 Point Laplacian Stencil -// -// Generate a 27-point laplacian stencil -// -// Returns -// ------- -// stencil : numpy array -// A 3x3 diffusion stencil -// -// See Also -// -------- -// stencil_grid -// -// - -data_t* laplace_stencil_27pt() -{ - data_t* stencil = new data_t[27]; - - for (int i = 0; i < 27; i++) - { - stencil[i] = -1; - } - - stencil[13] = 26; - - return stencil; -} - -} diff --git a/raptor/gallery/laplacian27pt.hpp b/raptor/gallery/laplacian27pt.hpp deleted file mode 100644 index 3a843ec2..00000000 --- a/raptor/gallery/laplacian27pt.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#ifndef LAPLACIAN27PT_HPP -#define LAPLACIAN27PT_HPP - -//#include -#include "raptor/core/types.hpp" -#include - -namespace raptor { - -// 27 Point Laplacian Stencil -// -// Generate a 27-point laplacian stencil -// -// Returns -// ------- -// stencil : numpy array -// A 3x3 diffusion stencil -// -// See Also -// -------- -// stencil_grid -// -// - -data_t* laplace_stencil_27pt(); - -} -#endif diff --git a/raptor/gallery/matrix_IO.cpp b/raptor/gallery/matrix_IO.cpp deleted file mode 100644 index e1eb166c..00000000 --- a/raptor/gallery/matrix_IO.cpp +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "matrix_IO.hpp" -#include -#include -#include 
-#include // std::cout -#include // std::ifstream - -namespace { -bool little_endian() -{ - int num = 1; - return (*(char *)&num == 1); -} - -template -void endian_swap(T *objp) -{ - unsigned char *memp = reinterpret_cast(objp); - std::reverse(memp, memp + sizeof(T)); -} -} - -namespace raptor { -CSRMatrix* readMatrix(const char* filename) -{ - CSRMatrix* A; - - int32_t code; - int32_t n_rows; - int32_t n_cols; - int32_t nnz; - int32_t idx; - double val; - - int sizeof_dbl = sizeof(val); - int sizeof_int32 = sizeof(code); - bool is_little_endian = false; - - std::ifstream ifs (filename, std::ifstream::binary); - ifs.read(reinterpret_cast(&code), sizeof_int32); - ifs.read(reinterpret_cast(&n_rows), sizeof_int32); - ifs.read(reinterpret_cast(&n_cols), sizeof_int32); - ifs.read(reinterpret_cast(&nnz), sizeof_int32); - - if (code != PETSC_MAT_CODE) - { - is_little_endian = true; - endian_swap(&code); - endian_swap(&n_rows); - endian_swap(&n_cols); - endian_swap(&nnz); - } - - assert(code == PETSC_MAT_CODE); - - A = new CSRMatrix(n_rows, n_cols, nnz); - - int displ = 0; - A->idx1[0] = 0; - if (is_little_endian) - { - for (int32_t i = 0; i < n_rows; i++) - { - ifs.read(reinterpret_cast(&idx), sizeof_int32); - endian_swap(&idx); - displ += idx; - A->idx1[i+1] = displ; - } - for (int32_t i = 0; i < nnz; i++) - { - ifs.read(reinterpret_cast(&idx), sizeof_int32); - endian_swap(&idx); - A->idx2.emplace_back(idx); - } - for (int32_t i = 0; i < nnz; i++) - { - ifs.read(reinterpret_cast(&val), sizeof_dbl); - endian_swap(&val); - A->vals.emplace_back(val); - } - } - else - { - for (int32_t i = 0; i < n_rows; i++) - { - ifs.read(reinterpret_cast(&idx), sizeof_int32); - displ += idx; - A->idx1[i+1] = displ; - } - for (int32_t i = 0; i < nnz; i++) - { - ifs.read(reinterpret_cast(&idx), sizeof_int32); - A->idx2.emplace_back(idx); - } - for (int32_t i = 0; i < nnz; i++) - { - ifs.read(reinterpret_cast(&val), sizeof_dbl); - endian_swap(&val); - A->vals.emplace_back(val); - } - } - 
A->nnz = A->idx2.size(); - - ifs.close(); - - return A; - -} -} diff --git a/raptor/gallery/matrix_IO.hpp b/raptor/gallery/matrix_IO.hpp deleted file mode 100644 index b48e1578..00000000 --- a/raptor/gallery/matrix_IO.hpp +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#ifndef MATRIX_IO_H -#define MATRIX_IO_H - -#define PETSC_MAT_CODE 1211216 - -//#include -#include -#include -#include -#include - -#include "raptor/core/matrix.hpp" -#include "raptor/core/types.hpp" - -namespace raptor { - -CSRMatrix* readMatrix(const char* filename); - -} -#endif - diff --git a/raptor/gallery/matrix_market.cpp b/raptor/gallery/matrix_market.cpp deleted file mode 100644 index 822ac102..00000000 --- a/raptor/gallery/matrix_market.cpp +++ /dev/null @@ -1,538 +0,0 @@ -/* -* Matrix Market I/O library for ANSI C -* -* See http://math.nist.gov/MatrixMarket for details. -* -* -*/ - -#include -#include -#include -#include - -#include "matrix_market.hpp" - -namespace raptor { - -// Declare Private Methods -char *mm_strdup(const char *s); -int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J, - double **val, MM_typecode *matcode); - -CSRMatrix* read_mm(const char *fname) -{ - FILE *f; - MM_typecode matcode; - int M, N, nz; - int i; - int row, col; - double val; - - if ((f = fopen(fname, "r")) == NULL) - return NULL; - - - if (mm_read_banner(f, &matcode) != 0) - { - printf("mm_read_unsymetric: Could not process Matrix Market banner "); - printf(" in file [%s]\n", fname); - return NULL; - } - - - - if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) && - mm_is_sparse(matcode))) - { - fprintf(stderr, "Sorry, this application does not support "); - fprintf(stderr, "Market Market type: [%s]\n", - mm_typecode_to_str(matcode)); - return NULL; - } - - /* find out size of sparse matrix: M, N, nz .... 
*/ - - if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0) - { - fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n"); - return NULL; - } - - /* reseve memory for matrices */ - COOMatrix* A = new COOMatrix(M, N, nz); - - /* NOTE: when reading in doubles, ANSI C requires the use of the "l" */ - /* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */ - /* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15) */ - - for (i=0; iadd_value(row - 1, col - 1, val); - } - fclose(f); - - CSRMatrix* A_csr = A->to_CSR(); - delete A; - - return A_csr; -} - - -void write_mm(CSRMatrix* A, const char *fname) -{ - FILE *f; - MM_typecode matcode; - int start, end; - - if ((f = fopen(fname, "w")) == NULL) - return; - - mm_initialize_typecode(&matcode); - mm_set_matrix(&matcode); - mm_set_coordinate(&matcode); - mm_set_real(&matcode); - - mm_write_banner(f, matcode); - fprintf(f, "%%\n"); - mm_write_mtx_crd_size(f, A->n_rows, A->n_cols, A->nnz); - for (int i = 0; i < A->n_rows; i++) - { - start = A->idx1[i]; - end = A->idx1[i+1]; - for (int j = start; j < end; j++) - { - fprintf(f, "%d %d %2.15e\n", i+1, A->idx2[j]+1, A->vals[j]); - } - } - fclose(f); -} - -int mm_is_valid(MM_typecode matcode) -{ - if (!mm_is_matrix(matcode)) return 0; - if (mm_is_dense(matcode) && mm_is_pattern(matcode)) return 0; - if (mm_is_real(matcode) && mm_is_hermitian(matcode)) return 0; - if (mm_is_pattern(matcode) && (mm_is_hermitian(matcode) || - mm_is_skew(matcode))) return 0; - return 1; -} - -int mm_read_banner(FILE *f, MM_typecode *matcode) -{ - char line[MM_MAX_LINE_LENGTH]; - char banner[MM_MAX_TOKEN_LENGTH]; - char mtx[MM_MAX_TOKEN_LENGTH]; - char crd[MM_MAX_TOKEN_LENGTH]; - char data_type[MM_MAX_TOKEN_LENGTH]; - char storage_scheme[MM_MAX_TOKEN_LENGTH]; - char *p; - - mm_clear_typecode(matcode); - - if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL) - return MM_PREMATURE_EOF; - - if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, data_type, - storage_scheme) != 5) - 
return MM_PREMATURE_EOF; - - for (p=mtx; *p!='\0'; *p=tolower(*p),p++); /* convert to lower case */ - for (p=crd; *p!='\0'; *p=tolower(*p),p++); - for (p=data_type; *p!='\0'; *p=tolower(*p),p++); - for (p=storage_scheme; *p!='\0'; *p=tolower(*p),p++); - - /* check for banner */ - if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0) - return MM_NO_HEADER; - - /* first field should be "mtx" */ - if (strcmp(mtx, MM_MTX_STR) != 0) - return MM_UNSUPPORTED_TYPE; - mm_set_matrix(matcode); - - - /* second field describes whether this is a sparse matrix (in coordinate - storgae) or a dense array */ - - - if (strcmp(crd, MM_SPARSE_STR) == 0) - mm_set_sparse(matcode); - else - if (strcmp(crd, MM_DENSE_STR) == 0) - mm_set_dense(matcode); - else - return MM_UNSUPPORTED_TYPE; - - - /* third field */ - - if (strcmp(data_type, MM_REAL_STR) == 0) - mm_set_real(matcode); - else - if (strcmp(data_type, MM_COMPLEX_STR) == 0) - mm_set_complex(matcode); - else - if (strcmp(data_type, MM_PATTERN_STR) == 0) - mm_set_pattern(matcode); - else - if (strcmp(data_type, MM_INT_STR) == 0) - mm_set_integer(matcode); - else - return MM_UNSUPPORTED_TYPE; - - - /* fourth field */ - - if (strcmp(storage_scheme, MM_GENERAL_STR) == 0) - mm_set_general(matcode); - else - if (strcmp(storage_scheme, MM_SYMM_STR) == 0) - mm_set_symmetric(matcode); - else - if (strcmp(storage_scheme, MM_HERM_STR) == 0) - mm_set_hermitian(matcode); - else - if (strcmp(storage_scheme, MM_SKEW_STR) == 0) - mm_set_skew(matcode); - else - return MM_UNSUPPORTED_TYPE; - - - return 0; -} - -int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz) -{ - if (fprintf(f, "%d %d %d\n", M, N, nz) != 3) - return MM_COULD_NOT_WRITE_FILE; - else - return 0; -} - -int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz ) -{ - char line[MM_MAX_LINE_LENGTH]; - int num_items_read; - - /* set return null parameter values, in case we exit with errors */ - *M = *N = *nz = 0; - - /* now continue scanning until you reach the 
end-of-comments */ - do - { - if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL) - return MM_PREMATURE_EOF; - }while (line[0] == '%'); - - /* line[] is either blank or has M,N, nz */ - if (sscanf(line, "%d %d %d", M, N, nz) == 3) - return 0; - - else - do - { - num_items_read = fscanf(f, "%d %d %d", M, N, nz); - if (num_items_read == EOF) return MM_PREMATURE_EOF; - } - while (num_items_read != 3); - - return 0; -} - - -int mm_read_mtx_array_size(FILE *f, int *M, int *N) -{ - char line[MM_MAX_LINE_LENGTH]; - int num_items_read; - /* set return null parameter values, in case we exit with errors */ - *M = *N = 0; - - /* now continue scanning until you reach the end-of-comments */ - do - { - if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL) - return MM_PREMATURE_EOF; - }while (line[0] == '%'); - - /* line[] is either blank or has M,N, nz */ - if (sscanf(line, "%d %d", M, N) == 2) - return 0; - - else /* we have a blank line */ - do - { - num_items_read = fscanf(f, "%d %d", M, N); - if (num_items_read == EOF) return MM_PREMATURE_EOF; - } - while (num_items_read != 2); - - return 0; -} - -int mm_write_mtx_array_size(FILE *f, int M, int N) -{ - if (fprintf(f, "%d %d\n", M, N) != 2) - return MM_COULD_NOT_WRITE_FILE; - else - return 0; -} - - - -/*-------------------------------------------------------------------------*/ - -/******************************************************************/ -/* use when I[], J[], and val[]J, and val[] are already allocated */ -/******************************************************************/ - -int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[], - double val[], MM_typecode matcode) -{ - int i; - if (mm_is_complex(matcode)) - { - for (i=0; i -#include "limits.h" - -namespace { -bool little_endian() -{ - int num = 1; - return (*(char *)&num == 1); -} - -template -void endian_swap(T *objp) -{ - unsigned char *memp = reinterpret_cast(objp); - std::reverse(memp, memp + sizeof(T)); -} -} - -namespace raptor { -ParCSRMatrix* 
readParMatrix(const char* filename, - int local_num_rows, int local_num_cols, - int first_local_row, int first_local_col, - RAPtor_MPI_Comm comm) -{ - int rank, num_procs; - RAPtor_MPI_Comm_rank(comm, &rank); - RAPtor_MPI_Comm_size(comm, &num_procs); - - ParCSRMatrix* A = NULL; - - int64_t pos; - int32_t code; - int32_t global_num_rows; - int32_t global_num_cols; - int32_t global_nnz; - int32_t idx; - int n_items_read; - double val; - - bool is_little_endian = false; - - int ctr, size; - - int sizeof_dbl = sizeof(val); - int sizeof_int32 = sizeof(code); - - FILE* ifile = fopen(filename, "rb"); - if (fseek(ifile, 0, SEEK_SET)) printf("Error seeking beginning of file\n"); - - // Read code, and determine if little endian, or if long int - int32_t header[4]; - n_items_read = fread(header, sizeof_int32, 4, ifile); - code = header[0]; - global_num_rows = header[1]; - global_num_cols = header[2]; - global_nnz = header[3]; - if (code != PETSC_MAT_CODE) - { - endian_swap(&code); - endian_swap(&global_num_rows); - endian_swap(&global_num_cols); - endian_swap(&global_nnz); - is_little_endian = true; - } - - if (first_local_col >= 0) - { - A = new ParCSRMatrix(global_num_rows, global_num_cols, - local_num_rows, local_num_cols, - first_local_row, first_local_col); - } - else - { - A = new ParCSRMatrix(global_num_rows, global_num_cols); - } - - std::vector row_sizes; - std::vector col_indices; - std::vector vals; - std::vector proc_nnz(num_procs); - if (A->local_num_rows) - row_sizes.resize(A->local_num_rows); - int nnz = 0; - - // Find row sizes - pos = (4 + A->partition->first_local_row) * sizeof_int32; - if (fseek(ifile, pos, SEEK_SET)) printf("Error seeking pos\n"); - if (A->local_num_rows) - { - n_items_read = fread(row_sizes.data(), sizeof_int32, A->local_num_rows, ifile); - if (n_items_read == EOF) printf("EOF reading code\n"); - if (ferror(ifile)) printf("Error reading row_size\n"); - if (is_little_endian) - { - for (int i = 0; i < A->local_num_rows; i++) - { - 
endian_swap(&(row_sizes[i])); - nnz += row_sizes[i]; - } - } - else - { - for (int i = 0; i < A->local_num_rows; i++) - { - nnz += row_sizes[i]; - } - } - } - - // Find nnz per proc (to find first_nnz) - RAPtor_MPI_Allgather(&nnz, 1, RAPtor_MPI_INT, proc_nnz.data(), 1, RAPtor_MPI_INT, comm); - long first_nnz = 0; - for (int i = 0; i < rank; i++) - first_nnz += proc_nnz[i]; - long total_nnz = first_nnz; - for (int i = rank; i < num_procs; i++) - total_nnz += proc_nnz[i]; - - // Resize variables - if (nnz) - { - col_indices.resize(nnz); - vals.resize(nnz); - } - - // Read in col_indices - pos = (4 + A->global_num_rows + first_nnz) * sizeof_int32; - if (fseek(ifile, pos, SEEK_SET)) printf("Error seeking pos\n"); - n_items_read = fread(col_indices.data(), sizeof_int32, nnz, ifile); - if (n_items_read == EOF) printf("EOF reading code\n"); - if (ferror(ifile)) printf("Error reading col idx\n"); - - pos = (4 + A->global_num_rows + total_nnz) * sizeof_int32 + (first_nnz * sizeof_dbl); - if (fseek(ifile, pos, SEEK_SET)) printf("Error seeking pos\n"); - n_items_read = fread(vals.data(), sizeof_dbl, nnz, ifile); - if (n_items_read == EOF) printf("EOF reading code\n"); - if (ferror(ifile)) printf("Error reading value\n"); - - if (is_little_endian) - { - for (int i = 0; i < nnz; i++) - { - endian_swap(&(col_indices[i])); - endian_swap(&(vals[i])); - } - } - - fclose(ifile); - - A->on_proc->idx1[0] = 0; - A->off_proc->idx1[0] = 0; - ctr = 0; - for (int i = 0; i < A->local_num_rows; i++) - { - size = row_sizes[i]; - for (int j = 0; j < size; j++) - { - idx = col_indices[ctr]; - val = vals[ctr++]; - if ((int) idx >= A->partition->first_local_col && - (int) idx <= A->partition->last_local_col) - { - A->on_proc->idx2.emplace_back(idx - A->partition->first_local_col); - A->on_proc->vals.emplace_back(val); - } - else - { - A->off_proc->idx2.emplace_back(idx); - A->off_proc->vals.emplace_back(val); - } - } - A->on_proc->idx1[i+1] = A->on_proc->idx2.size(); - A->off_proc->idx1[i+1] = 
A->off_proc->idx2.size(); - } - A->on_proc->nnz = A->on_proc->idx2.size(); - A->off_proc->nnz = A->off_proc->idx2.size(); - - A->finalize(); - - return A; -} -} diff --git a/raptor/gallery/par_matrix_IO.hpp b/raptor/gallery/par_matrix_IO.hpp deleted file mode 100644 index a5ea6ef2..00000000 --- a/raptor/gallery/par_matrix_IO.hpp +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#ifndef PAR_MATRIX_IO_H -#define PAR_MATRIX_IO_H - -#include -#include -#include -#include -#include -#include // std::cout -#include // std::ifstream - -#include "raptor/core/par_matrix.hpp" -#include "raptor/core/types.hpp" - -namespace raptor { - -ParCSRMatrix* readParMatrix(const char* filename, - int local_num_rows = -1, int local_num_cols = -1, - int first_local_row = -1, int first_local_col = -1, - RAPtor_MPI_Comm comm = RAPtor_MPI_COMM_WORLD); - -} -#endif diff --git a/raptor/gallery/par_matrix_market.cpp b/raptor/gallery/par_matrix_market.cpp deleted file mode 100644 index eabcd962..00000000 --- a/raptor/gallery/par_matrix_market.cpp +++ /dev/null @@ -1,309 +0,0 @@ -/* -* Matrix Market I/O library for ANSI C -* -* See http://math.nist.gov/MatrixMarket for details. 
-* -* -*/ - -#include -#include -#include -#include - -#include "par_matrix_market.hpp" - -namespace raptor { - -// Declare Private Methods -void write_par_data(FILE* f, int n, int* rowptr, int* col_idx, - double* vals, int first_row, int* col_map); - -ParCSRMatrix* read_par_mm(const char *fname) -{ - FILE *f; - MM_typecode matcode; - int M, N, nz; - int i; - int row, col; - int n_items_read; - double val; - - if ((f = fopen(fname, "r")) == NULL) - return NULL; - - - if (mm_read_banner(f, &matcode) != 0) - { - printf("mm_read_unsymetric: Could not process Matrix Market banner "); - printf(" in file [%s]\n", fname); - return NULL; - } - - - if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) && - mm_is_sparse(matcode))) - { - fprintf(stderr, "Sorry, this application does not support "); - fprintf(stderr, "Market Market type: [%s]\n", - mm_typecode_to_str(matcode)); - return NULL; - } - - /* find out size of sparse matrix: M, N, nz .... */ - - if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0) - { - fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n"); - return NULL; - } - - int row_nnz = nz / M; - ParCOOMatrix* A = new ParCOOMatrix(M, N); - A->on_proc->vals.reserve(row_nnz); - A->off_proc->vals.reserve(row_nnz); - - /* NOTE: when reading in doubles, ANSI C requires the use of the "l" */ - /* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */ - /* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 
136 lines 13-15) */ - - bool symmetric = mm_is_symmetric(matcode); - bool row_local; - bool col_local; - for (i=0; i= A->partition->first_local_row && row <= A->partition->last_local_row) - { - row_local = true; - row -= A->partition->first_local_row; - } - else - { - row_local = false; - if (!symmetric) - continue; - } - if (col >= A->partition->first_local_col && col <= A->partition->last_local_col) - { - col_local = true; - col -= A->partition->first_local_col; - } - else - { - col_local = false; - if (!row_local) - continue; - } - - if (row_local) - { - if (col_local) - { - A->on_proc->add_value(row, col, val); - } - else - { - A->off_proc->add_value(row, col, val); - } - } - - if (symmetric) - { - if (col_local) - { - if (row_local) - { - A->on_proc->add_value(col, row, val); - } - else - { - A->off_proc->add_value(col, row, val); - } - } - } - } - - A->finalize(); - ParCSRMatrix* A_csr = A->to_ParCSR(); - delete A; - - fclose(f); - - return A_csr; -} - -void write_par_data(FILE* f, int n, int* rowptr, int* col_idx, - double* vals, int first_row, int* col_map) -{ - int start, end, global_row; - - for (int i = 0; i < n; i++) - { - global_row = first_row + i; - start = rowptr[i]; - end = rowptr[i+1]; - for (int j = start; j < end; j++) - { - fprintf(f, "%d %d %2.15e\n", global_row + 1, - col_map[col_idx[j]] + 1, vals[j]); - } - } -} - - -void write_par_mm(ParCSRMatrix* A, const char *fname) -{ - int rank, num_procs; - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - RAPtor_MPI_Comm_size(RAPtor_MPI_COMM_WORLD, &num_procs); - - FILE *f; - MM_typecode matcode; - int pos; - int int_bytes, double_bytes; - int num_ints, num_doubles; - int comm_size; - - std::vector buffer; - - int nnz = A->local_nnz; - int global_nnz; - RAPtor_MPI_Reduce(&nnz, &global_nnz, 1, RAPtor_MPI_INT, RAPtor_MPI_SUM, 0, RAPtor_MPI_COMM_WORLD); - - std::vector proc_dims(5*num_procs); - int dims[5]; - dims[0] = A->local_num_rows + 1; - dims[1] = A->on_proc_num_cols; - dims[2] = 
A->off_proc_num_cols; - dims[3] = A->on_proc->nnz; - dims[4] = A->off_proc->nnz; - RAPtor_MPI_Gather(dims, 5, RAPtor_MPI_INT, proc_dims.data(), 5, RAPtor_MPI_INT, 0, RAPtor_MPI_COMM_WORLD); - - if (rank == 0) // RANK 0 IS ONLY ONE WRITING TO FILE - { - f = fopen(fname, "w"); - - mm_initialize_typecode(&matcode); - mm_set_matrix(&matcode); - mm_set_coordinate(&matcode); - mm_set_real(&matcode); - - mm_write_banner(f, matcode); - fprintf(f, "%%\n"); - mm_write_mtx_crd_size(f, A->global_num_rows, A->global_num_cols, - global_nnz); - - // Write local data - int first_row = 0; - write_par_data(f, A->local_num_rows, A->on_proc->idx1.data(), - A->on_proc->idx2.data(), A->on_proc->vals.data(), - first_row, A->on_proc_column_map.data()); - write_par_data(f, A->local_num_rows, A->off_proc->idx1.data(), - A->off_proc->idx2.data(), A->off_proc->vals.data(), - first_row, A->off_proc_column_map.data()); - first_row += A->local_num_rows; - - // Write data from other processes - std::vector idx1; - std::vector idx2; - std::vector vals; - std::vector row_map; - std::vector col_map; - for (int i = 1; i < num_procs; i++) - { - // Calculate comm_size and allocate recv_buf - int* i_dims = &proc_dims[i*5]; - num_ints = i_dims[0] * 2 + i_dims[1] + i_dims[3] + i_dims[3] + i_dims[4]; - num_doubles = i_dims[3] + i_dims[4]; - RAPtor_MPI_Pack_size(num_ints, RAPtor_MPI_INT, RAPtor_MPI_COMM_WORLD, &int_bytes); - RAPtor_MPI_Pack_size(num_doubles, RAPtor_MPI_DOUBLE, RAPtor_MPI_COMM_WORLD, &double_bytes); - comm_size = int_bytes + double_bytes; - if ((int)buffer.size() < comm_size) buffer.resize(comm_size); - - // Resize Matrix Arrays - int row_max = i_dims[0]; - int col_max = i_dims[1]; - int nnz_max = i_dims[3]; - if (i_dims[2] > i_dims[1]) col_max = i_dims[2]; - if (i_dims[4] > i_dims[3]) nnz_max = i_dims[4]; - if ((int)col_map.size() < col_max) col_map.resize(col_max); - if ((int)idx1.size() < row_max) idx1.resize(row_max); - if ((int)idx2.size() < nnz_max) - { - idx2.resize(nnz_max); - 
vals.resize(nnz_max); - } - - // Recv Packed Buffer - RAPtor_MPI_Recv(buffer.data(), comm_size, RAPtor_MPI_PACKED, i, 1234, RAPtor_MPI_COMM_WORLD, - RAPtor_MPI_STATUS_IGNORE); - - // Unpack On Proc Data - pos = 0; - RAPtor_MPI_Unpack(buffer.data(), comm_size, &pos, col_map.data(), i_dims[1], - RAPtor_MPI_INT, RAPtor_MPI_COMM_WORLD); - RAPtor_MPI_Unpack(buffer.data(), comm_size, &pos, idx1.data(), i_dims[0], - RAPtor_MPI_INT, RAPtor_MPI_COMM_WORLD); - RAPtor_MPI_Unpack(buffer.data(), comm_size, &pos, idx2.data(), i_dims[3], - RAPtor_MPI_INT, RAPtor_MPI_COMM_WORLD); - RAPtor_MPI_Unpack(buffer.data(), comm_size, &pos, vals.data(), i_dims[3], - RAPtor_MPI_DOUBLE, RAPtor_MPI_COMM_WORLD); - write_par_data(f, i_dims[0] - 1, idx1.data(), idx2.data(), - vals.data(), first_row, col_map.data()); - - RAPtor_MPI_Unpack(buffer.data(), comm_size, &pos, col_map.data(), i_dims[2], - RAPtor_MPI_INT, RAPtor_MPI_COMM_WORLD); - RAPtor_MPI_Unpack(buffer.data(), comm_size, &pos, idx1.data(), i_dims[0], - RAPtor_MPI_INT, RAPtor_MPI_COMM_WORLD); - RAPtor_MPI_Unpack(buffer.data(), comm_size, &pos, idx2.data(), i_dims[4], - RAPtor_MPI_INT, RAPtor_MPI_COMM_WORLD); - RAPtor_MPI_Unpack(buffer.data(), comm_size, &pos, vals.data(), i_dims[4], - RAPtor_MPI_DOUBLE, RAPtor_MPI_COMM_WORLD); - write_par_data(f, i_dims[0] - 1, idx1.data(), idx2.data(), - vals.data(), first_row, col_map.data()); - - first_row += i_dims[0] - 1; - } - - fclose(f); - } - else // All processes that are not 0, send to 0 - { - // Determine send size (in bytes) - num_ints = dims[0] * 2 + dims[1] + dims[3] + dims[3] + dims[4]; - num_doubles = dims[3] + dims[4]; - RAPtor_MPI_Pack_size(num_ints, RAPtor_MPI_INT, RAPtor_MPI_COMM_WORLD, &int_bytes); - RAPtor_MPI_Pack_size(num_doubles, RAPtor_MPI_DOUBLE, RAPtor_MPI_COMM_WORLD, &double_bytes); - comm_size = int_bytes + double_bytes; - buffer.resize(comm_size); - - // Pack Data - pos = 0; - RAPtor_MPI_Pack(A->on_proc_column_map.data(), dims[1], RAPtor_MPI_INT, buffer.data(), comm_size, 
- &pos, RAPtor_MPI_COMM_WORLD); - RAPtor_MPI_Pack(A->on_proc->idx1.data(), dims[0], RAPtor_MPI_INT, buffer.data(), comm_size, - &pos, RAPtor_MPI_COMM_WORLD); - RAPtor_MPI_Pack(A->on_proc->idx2.data(), dims[3], RAPtor_MPI_INT, buffer.data(), comm_size, - &pos, RAPtor_MPI_COMM_WORLD); - RAPtor_MPI_Pack(A->on_proc->vals.data(), dims[3], RAPtor_MPI_DOUBLE, buffer.data(), comm_size, - &pos, RAPtor_MPI_COMM_WORLD); - - RAPtor_MPI_Pack(A->off_proc_column_map.data(), dims[2], RAPtor_MPI_INT, buffer.data(), comm_size, - &pos, RAPtor_MPI_COMM_WORLD); - RAPtor_MPI_Pack(A->off_proc->idx1.data(), dims[0], RAPtor_MPI_INT, buffer.data(), comm_size, - &pos, RAPtor_MPI_COMM_WORLD); - RAPtor_MPI_Pack(A->off_proc->idx2.data(), dims[4], RAPtor_MPI_INT, buffer.data(), comm_size, - &pos, RAPtor_MPI_COMM_WORLD); - RAPtor_MPI_Pack(A->off_proc->vals.data(), dims[4], RAPtor_MPI_DOUBLE, buffer.data(), comm_size, - &pos, RAPtor_MPI_COMM_WORLD); - - // Send Packed Data - RAPtor_MPI_Send(buffer.data(), comm_size, RAPtor_MPI_PACKED, 0, 1234, RAPtor_MPI_COMM_WORLD); - } -} - -} diff --git a/raptor/gallery/par_matrix_market.hpp b/raptor/gallery/par_matrix_market.hpp deleted file mode 100644 index 5a7af8d5..00000000 --- a/raptor/gallery/par_matrix_market.hpp +++ /dev/null @@ -1,23 +0,0 @@ -/* -* Matrix Market I/O library for ANSI C -* -* See http://math.nist.gov/MatrixMarket for details. 
-* -* -*/ - -#ifndef PAR_MM_IO_H -#define PAR_MM_IO_H - -#include "matrix_market.hpp" -#include "raptor/core/types.hpp" -#include "raptor/core/par_matrix.hpp" - -namespace raptor { - -/* high level routines */ -ParCSRMatrix* read_par_mm(const char *fname); -void write_par_mm(ParCSRMatrix* A, const char *fname); -} - -#endif diff --git a/raptor/gallery/par_random.cpp b/raptor/gallery/par_random.cpp deleted file mode 100644 index bb49d5ce..00000000 --- a/raptor/gallery/par_random.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#include "par_random.hpp" - -namespace raptor { -ParCSRMatrix* par_random(int global_rows, int global_cols, int nnz_per_row) -{ - int rank, num_procs; - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - RAPtor_MPI_Comm_size(RAPtor_MPI_COMM_WORLD, &num_procs); - - ParCOOMatrix* A_coo; - double val = 1.0; - - A_coo = new ParCOOMatrix(global_rows, global_cols); - int local_nnz = nnz_per_row * A_coo->local_num_rows; - for (int i = 0; i < local_nnz; i++) - { - A_coo->add_value(rand() % A_coo->local_num_rows, rand() % global_cols, val); - } - A_coo->finalize(); - - ParCSRMatrix* A = A_coo->to_ParCSR(); - delete A_coo; - - return A; - -} - -} diff --git a/raptor/gallery/par_random.hpp b/raptor/gallery/par_random.hpp deleted file mode 100644 index 1a93a128..00000000 --- a/raptor/gallery/par_random.hpp +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#ifndef RAPTOR_GALLERY_PARRANDOM_HPP -#define RAPTOR_GALLERY_PARRANDOM_HPP - -#include -#include -#include -#include - -#include "raptor/core/par_matrix.hpp" -#include "raptor/core/types.hpp" - -namespace raptor { - -ParCSRMatrix* par_random(int global_rows, int global_cols, int nnz_per_row); -} -#endif diff --git a/raptor/gallery/par_stencil.cpp b/raptor/gallery/par_stencil.cpp deleted 
file mode 100644 index ab18f3a3..00000000 --- a/raptor/gallery/par_stencil.cpp +++ /dev/null @@ -1,228 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#include "par_stencil.hpp" - -namespace raptor { -ParCSRMatrix* par_stencil_grid(data_t* stencil, int* grid, int dim) -{ - // Get MPI Information - int rank, num_procs; - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - RAPtor_MPI_Comm_size(RAPtor_MPI_COMM_WORLD, &num_procs); - - std::vector diags; - std::vector nonzero_stencil; - std::vector strides(dim); - std::vector data; - std::vector stack_indices; - - int stencil_len, ctr; - int N_v; // Number of rows (and cols) in matrix - int N_s; // Number of nonzero stencil entries - int n_v; // Local number of rows (and cols) - - int init_step, idx; - int len, step, current_step; - int col; - double value; - - // Initialize variables - stencil_len = (index_t)pow(3, dim); // stencil - 3 ^ dim - - //N_v is global number of rows - N_v = 1; - for (index_t i = 0; i < dim; i++) - { - N_v *= grid[i]; - } - - //N_s is number of nonzero stencil entries - N_s = 0; - for (index_t i = 0; i < stencil_len; i++) - { - if (fabs(stencil[i]) > zero_tol) - { - N_s++; - } - } - - ParCSRMatrix* A = new ParCSRMatrix(N_v, N_v); - - n_v = A->partition->local_num_rows; - int first_local_row = A->partition->first_local_row; - int last_local_row = first_local_row + n_v - 1; - - A->on_proc->n_rows = n_v; - A->on_proc->n_cols = n_v; - A->on_proc->nnz = 0; - A->on_proc->idx1.resize(n_v+1); - A->on_proc->idx2.reserve(n_v*stencil_len); - A->on_proc->vals.reserve(n_v*stencil_len); - - A->off_proc->n_rows = n_v; - A->off_proc->n_cols = N_v; - A->off_proc->nnz = 0; - A->off_proc->idx1.resize(n_v+1); - A->off_proc->idx2.reserve(0.3*n_v*stencil_len); - A->off_proc->vals.reserve(0.3*n_v*stencil_len); - - - diags.resize(N_s, 0); - nonzero_stencil.resize(N_s); - strides.resize(dim); - //Calculate strides for index offset 
for each dof in stencil - strides[0] = 1; - for (index_t i = 0; i < dim-1; i++) - { - strides[i+1] = grid[dim-i-1] * strides[i]; - } - - //Calculate indices of nonzeros in stencil - index_t indices[N_s][dim]; - ctr = 0; - for (index_t i = 0; i < stencil_len; i++) - { - if (fabs(stencil[i]) > zero_tol) - { - for (index_t j = 0; j < dim; j++) - { - //index_t power = pow(3, j); - index_t idiv = i / pow(3, j); - indices[ctr][dim-j-1] = (idiv % 3) - (3 / 2); - } - nonzero_stencil[ctr] = stencil[i]; - ctr++; - } - } - - //Add strides to diags - for (index_t i = 0; i < dim; i++) - { - for (index_t j = 0; j < N_s; j++) - { - diags[j] += strides[i] * indices[j][dim-i-1]; - } - } - - //Initial data array - data.resize(N_s*n_v); - for (index_t i = 0; i < N_s; i++) - { - for (index_t j = 0; j < n_v; j++) - { - data[i*n_v + j] = nonzero_stencil[i]; - } - } - - //Vertically stack indices (reorder) - stack_indices.resize(N_s*dim); - for (index_t i = 0; i < N_s; i++) - { - for (index_t j = 0; j < dim; j++) - { - stack_indices[i*dim+j] = indices[i][j]; - } - } - - //Zero boundary conditions - for (index_t i = 0; i < N_s; i++) - { - //get correct chunk of data - //(corresponding to single stencil entry) - init_step = i*n_v; - for (index_t j = 0; j < dim; j++) - { - //If main diagonal, no boundary conditions - idx = stack_indices[i*dim + j]; - if (idx == 0) - { - continue; - } - - //Calculate length of chunks that are to - // be set to zero, and step size between - // these blocks of data - len = 1; - step = 1; - for (index_t k = 0; k < (dim-j-1); k++) - { - len *= grid[k]; - } - step = len * grid[0]; - - //zeros at beginning - if (idx > 0) - { - current_step = step * (first_local_row / step); - - //If previous boundary lies on processor - for (index_t k = current_step; k < last_local_row+1; k+=step) - { - for (index_t l = 0; l < len; l++) - { - if (k+l > last_local_row) - { - break; - } - if (k+l < first_local_row) - { - continue; - } - data[init_step + (k-first_local_row) + l] = 0; 
- } - } - } - - //zeros at end - else if (idx < 0) - { - current_step = step*(((last_local_row-1)/step)+1); - - //If previous boundary lies on processor - for (index_t k = current_step; k > first_local_row; k-=step) - { - for (index_t l = 0; l < len; l++) - { - if (k - l - 1 < first_local_row) - { - break; - } - else if (k - l - 1 > last_local_row) - { - continue; - } - data[init_step + (k-l-first_local_row) -1] = 0; - } - } - } - } - } - - //Add diagonals to ParMatrix A - A->on_proc->idx1[0] = 0; - A->off_proc->idx1[0] = 0; - for (index_t i = 0; i < n_v; i++) - { - for (index_t d = 0; d < N_s; d++) - { - //add data[i] if nonzero - col = diags[d] + i + first_local_row; - value = data[(N_s-d-1)*n_v+i]; - if (col >= 0 && col < N_v && fabs(value) > zero_tol) - { - A->add_value(i, col, value); - } - } - A->on_proc->idx1[i+1] = A->on_proc->idx2.size(); - A->off_proc->idx1[i+1] = A->off_proc->idx2.size(); - } - - A->on_proc->nnz = A->on_proc->idx2.size(); - A->off_proc->nnz = A->off_proc->idx2.size(); - - A->finalize(); - - return A; -} - -} diff --git a/raptor/gallery/par_stencil.hpp b/raptor/gallery/par_stencil.hpp deleted file mode 100644 index 407734b4..00000000 --- a/raptor/gallery/par_stencil.hpp +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#ifndef PARSTENCIL_HPP -#define PARSTENCIL_HPP - -#include -#include -#include - -#include "raptor/core/types.hpp" -#include "raptor/core/par_matrix.hpp" - -namespace raptor { - -ParCSRMatrix* par_stencil_grid(data_t* stencil, int* grid, int dim); - -} -#endif diff --git a/raptor/gallery/random.cpp b/raptor/gallery/random.cpp deleted file mode 100644 index 878ce332..00000000 --- a/raptor/gallery/random.cpp +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "random.hpp" - -namespace raptor { -CSRMatrix* 
random(int rows, int cols, int nnz_per_row) -{ - CSRMatrix* A; - COOMatrix* Atmp = new COOMatrix(rows, cols, nnz_per_row); - - int nnz = nnz_per_row * rows; - for (int i = 0; i < nnz; i++) - { - Atmp->idx1.emplace_back(rand() % rows); - Atmp->idx2.emplace_back(rand() % cols); - Atmp->vals.emplace_back(1.0); - } - Atmp->nnz = nnz; - - A = Atmp->to_CSR(); - delete Atmp; - - return A; - -} - -} diff --git a/raptor/gallery/random.hpp b/raptor/gallery/random.hpp deleted file mode 100644 index 166a150f..00000000 --- a/raptor/gallery/random.hpp +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#ifndef RAPTOR_GALLERY_RANDOM_HPP -#define RAPTOR_GALLERY_RANDOM_HPP - -#include -#include -#include -#include - -#include "raptor/core/matrix.hpp" -#include "raptor/core/types.hpp" - -namespace raptor { - -CSRMatrix* random(int rows, int cols, int nnz_per_row); -} -#endif diff --git a/raptor/gallery/stencil.cpp b/raptor/gallery/stencil.cpp deleted file mode 100644 index 863ec797..00000000 --- a/raptor/gallery/stencil.cpp +++ /dev/null @@ -1,196 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "stencil.hpp" - -namespace raptor { -// Stencils are symmetric, so A could be CSR or CSC -CSRMatrix* stencil_grid(data_t* stencil, int* grid, int dim) -{ - std::vector diags; - std::vector nonzero_stencil; - std::vector strides(dim); - std::vector data; - std::vector stack_indices; - - int stencil_len, ctr; - int N_v; // Number of rows (and cols) in matrix - int N_s; // Number of nonzero stencil entries - int init_step, idx; - int len, step; - int col; - double value; - - stencil_len = (int)pow(3, dim); - - N_v = 1; - for (int i = 0; i < dim; i++) - { - N_v *= grid[i]; - } - - N_s = 0; - for (int i = 0; i < stencil_len; i++) - { - if (fabs(stencil[i]) > zero_tol) - { - N_s++; - } - } - - // Set 
dimensions of A - CSRMatrix* A = new CSRMatrix(N_v, N_v); - - diags.resize(N_s, 0); - nonzero_stencil.resize(N_s); - strides[0] = 1; - for (int i = 0; i < dim - 1; i++) - { - strides[i+1] = grid[dim-i-1] * strides[i]; - } - - // Calculate indices of nonzeros in stencil - int indices[N_s][dim]; - ctr = 0; - for (int i = 0; i < stencil_len; i++) - { - if (fabs(stencil[i]) > zero_tol) - { - for (int j = 0; j < dim; j++) - { - //int power = pow(3, j); - int idiv = i / pow(3, j); - indices[ctr][dim-j-1] = (idiv % 3) - (3 / 2); - } - nonzero_stencil[ctr] = stencil[i]; - ctr++; - } - } - - // Add strides to diags - for (int i = 0; i < dim; i++) - { - for (int j = 0; j < N_s; j++) - { - diags[j] += strides[i] * indices[j][dim-i-1]; - } - } - - // Initial data array - data.resize(N_s*N_v); - for (int i = 0; i < N_s; i++) - { - for (int j = 0; j < N_v; j++) - { - data[i*N_v + j] = nonzero_stencil[i]; - } - } - - // Vertically stack indices (reorder) - stack_indices.resize(N_s*dim); - for (int i = 0; i < N_s; i++) - { - for (int j = 0; j < dim; j++) - { - stack_indices[i*dim+j] = indices[i][j]; - } - } - - - //Zero boundary conditions - for (int i = 0; i < N_s; i++) - { - //get correct chunk of data - //(corresponding to single stencil entry) - init_step = i*N_v; - for (int j = 0; j < dim; j++) - { - //If main diagonal, no boundary conditions - idx = stack_indices[i*dim + j]; - if (idx == 0) - { - continue; - } - - //Calculate length of chunks that are to - // be set to zero, and step size between - // these blocks of data - len = 1; - step = 1; - for (int k = 0; k < (dim-j-1); k++) - { - len *= grid[k]; - } - step = len * grid[0]; - - //zeros at beginning - if (idx > 0) - { - //If previous boundary lies on processor - for (int k = 0; k < N_v; k+=step) - { - for (int l = 0; l < len; l++) - { - if (k+l > N_v) - { - break; - } - if (k+l < 0) - { - continue; - } - data[init_step + (k-0) + l] = 0; - } - } - } - - //zeros at end - else if (idx < 0) - { - //If previous boundary 
lies on processor - for (int k = N_v; k > 0; k-=step) - { - for (int l = 0; l < len; l++) - { - if (k - l - 1 < 0) - { - break; - } - else if (k - l - 1 > N_v) - { - continue; - } - data[init_step + (k-l-0) -1] = 0; - } - } - } - } - } - - //Add diagonals to ParMatrix A - A->idx2.reserve(N_s*N_v); - A->vals.reserve(N_s*N_v); - - A->idx1[0] = 0; - for (int i = 0; i < N_v; i++) - { - for (int d = 0; d < N_s; d++) - { - //add data[i] if nonzero - col = diags[d] + i; - value = data[(N_s-d-1)*N_v+i]; - if (col >= 0 && col < N_v && fabs(value) > zero_tol) - //if (fabs(value) > zero_tol) - { - A->idx2.emplace_back(col); - A->vals.emplace_back(value); - } - } - A->idx1[i+1] = A->idx2.size(); - } - A->nnz = A->idx2.size(); - - return A; -} - -} diff --git a/raptor/gallery/stencil.hpp b/raptor/gallery/stencil.hpp deleted file mode 100644 index 4f0c643b..00000000 --- a/raptor/gallery/stencil.hpp +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#ifndef STENCIL_HPP -#define STENCIL_HPP - -#include -#include -#include - -#include "raptor/core/types.hpp" -#include "raptor/core/matrix.hpp" - -namespace raptor { - -// Stencils are symmetric, so A could be CSR or CSC -CSRMatrix* stencil_grid(data_t* stencil, int* grid, int dim); -} -#endif - diff --git a/raptor/gallery/tests/CMakeLists.txt b/raptor/gallery/tests/CMakeLists.txt deleted file mode 100644 index 57719cba..00000000 --- a/raptor/gallery/tests/CMakeLists.txt +++ /dev/null @@ -1,34 +0,0 @@ -add_executable(test_stencil test_stencil.cpp) -target_link_libraries(test_stencil raptor ${MPI_LIBRARIES} googletest pthread ) -add_test(StencilTest ./test_stencil) - -add_executable(test_laplacian test_laplacian.cpp) -target_link_libraries(test_laplacian raptor ${MPI_LIBRARIES} googletest pthread ) -add_test(LaplacianTest ./test_laplacian) - -add_executable(test_aniso test_aniso.cpp) -target_link_libraries(test_aniso raptor 
${MPI_LIBRARIES} googletest pthread ) -add_test(AnisoTest ./test_aniso) - -add_executable(test_matrix_market test_matrix_market.cpp) -target_link_libraries(test_matrix_market raptor ${MPI_LIBRARIES} googletest pthread ) -add_test(MatrixMarketTest ./test_matrix_market) - - -if (WITH_MPI) - add_executable(test_par_laplacian test_par_laplacian.cpp) - target_link_libraries(test_par_laplacian raptor ${MPI_LIBRARIES} googletest pthread ) - add_test(ParLaplacianTest ${MPIRUN} -n 1 ${HOST} ./test_par_laplacian) - add_test(ParLaplacianTest ${MPIRUN} -n 2 ${HOST} ./test_par_laplacian) - - add_executable(test_par_aniso test_par_aniso.cpp) - target_link_libraries(test_par_aniso raptor ${MPI_LIBRARIES} googletest pthread ) - add_test(ParAnisoTest ${MPIRUN} -n 1 ${HOST} ./test_par_aniso) - add_test(ParAnisoTest ${MPIRUN} -n 2 ${HOST} ./test_par_aniso) - - add_executable(test_par_matrix_market test_par_matrix_market.cpp) - target_link_libraries(test_par_matrix_market raptor ${MPI_LIBRARIES} googletest pthread ) - add_test(ParMatrixMarketTest ${MPIRUN} -n 1 ${HOST} ./test_par_matrix_market) - add_test(ParMatrixMarketTest ${MPIRUN} -n 2 ${HOST} ./test_par_matrix_market) -endif() - diff --git a/raptor/gallery/tests/test_aniso.cpp b/raptor/gallery/tests/test_aniso.cpp deleted file mode 100644 index 19223e0b..00000000 --- a/raptor/gallery/tests/test_aniso.cpp +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "gtest/gtest.h" -#include "raptor/raptor.hpp" -using namespace raptor; - -int main(int argc, char** argv) -{ - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} // end of main() // - -TEST(AnisoTest, TestsInGallery) -{ - - - int start, end; - - int grid[2] = {25, 25}; - double eps = 0.001; - double theta = M_PI/8.0; - double* stencil = diffusion_stencil_2d(eps, theta); - CSRMatrix* A_sten = stencil_grid(stencil, grid, 2); - CSRMatrix* A_io = 
readMatrix("../../../../test_data/aniso.pm"); - - // Compare shapes - ASSERT_EQ(A_io->n_rows, A_sten->n_rows); - ASSERT_EQ(A_io->n_cols, A_sten->n_cols); - - A_sten->sort(); - //A_sten->remove_duplicates(); - - A_io->sort(); - //A_io->remove_duplicates(); - - ASSERT_EQ(A_sten->idx1[0], A_io->idx1[0]); - for (int i = 0; i < A_io->n_rows; i++) - { - // Check correct row_ptrs - ASSERT_EQ(A_sten->idx1[i+1], A_io->idx1[i+1]); - start = A_sten->idx1[i]; - end = A_sten->idx1[i+1]; - - // Check correct col indices / values - - for (int j = start; j < end; j++) - { - ASSERT_EQ(A_sten->idx2[j], A_io->idx2[j]); - //ASSERT_NEAR(A_sten->vals[j], A_io->vals[j], 1e-12); - } - } - - delete A_io; - delete[] stencil; - delete A_sten; -} // end of TEST(AnisoTest, TestsInGallery) // - diff --git a/raptor/gallery/tests/test_laplacian.cpp b/raptor/gallery/tests/test_laplacian.cpp deleted file mode 100644 index adbe094f..00000000 --- a/raptor/gallery/tests/test_laplacian.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "gtest/gtest.h" -#include "raptor/raptor.hpp" - -using namespace raptor; - -int main(int argc, char** argv) -{ - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} // end of main() // - -TEST(LaplacianTest, TestsInGallery) -{ - int start, end; - - int grid[3] = {10, 10, 10}; - double* stencil = laplace_stencil_27pt(); - CSRMatrix* A_sten = stencil_grid(stencil, grid, 3); - - const char* mat_fn = "../../../../test_data/laplacian.pm"; - CSRMatrix* A_io = readMatrix(mat_fn); - - // Compare shapes - ASSERT_EQ(A_io->n_rows, A_sten->n_rows); - ASSERT_EQ(A_io->n_cols, A_sten->n_cols); - - A_sten->sort(); - A_io->sort(); - - ASSERT_EQ(A_sten->idx1[0], A_io->idx1[0]); - - for (int i = 0; i < A_io->n_rows; i++) - { - // Check correct row_ptrs - ASSERT_EQ(A_sten->idx1[i+1], A_io->idx1[i+1]); - start = A_sten->idx1[i]; - end = A_sten->idx1[i+1]; - 
- // Check correct col indices / values - for (int j = start; j < end; j++) - { - ASSERT_EQ(A_sten->idx2[j], A_io->idx2[j]); - ASSERT_NEAR(A_sten->vals[j], A_io->vals[j], zero_tol); - } - } - - delete[] stencil; - delete A_sten; - delete A_io; -} // end of TEST(LaplacianTest, TestsInGallery) // - diff --git a/raptor/gallery/tests/test_matrix_market.cpp b/raptor/gallery/tests/test_matrix_market.cpp deleted file mode 100644 index 2efa6bde..00000000 --- a/raptor/gallery/tests/test_matrix_market.cpp +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "gtest/gtest.h" -#include "raptor/raptor.hpp" -#include "raptor/tests/compare.hpp" - -using namespace raptor; - -int main(int argc, char** argv) -{ - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} // end of main() // - -TEST(AnisoTest, TestsInGallery) -{ - const char* f_in = "../../../../test_data/sas_P0.mtx"; - const char* f_out = "../../../../test_data/sas_P0_out.mtx"; - CSRMatrix* Amm = read_mm(f_in); - write_mm(Amm, f_out); - CSRMatrix* Amm_out = read_mm(f_out); - compare(Amm, Amm_out); - - // Diff the two mtx files - std::string command = "diff "; - command += f_in; - command += " "; - command += f_out; - int err = system(command.c_str()); - ASSERT_EQ(err, 0); - - - remove(f_out); - - delete Amm; -} // end of TEST(AnisoTest, TestsInGallery) // - - diff --git a/raptor/gallery/tests/test_par_aniso.cpp b/raptor/gallery/tests/test_par_aniso.cpp deleted file mode 100644 index 7ef40dc6..00000000 --- a/raptor/gallery/tests/test_par_aniso.cpp +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "gtest/gtest.h" -#include "raptor/raptor.hpp" - -using namespace raptor; - -int main(int argc, char** argv) -{ - MPI_Init(&argc, &argv); - ::testing::InitGoogleTest(&argc, argv); - int 
temp = RUN_ALL_TESTS(); - MPI_Finalize(); - return temp; -} // end of main() // - -TEST(ParAnisoTest, TestsInGallery) -{ - int rank, num_procs; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &num_procs); - - const char* A0_fn = "../../../../test_data/aniso.pm"; - - - int start, end; - int grid[2] = {25, 25}; - double eps = 0.001; - double theta = M_PI/8.0; - double* stencil = diffusion_stencil_2d(eps, theta); - ParCSRMatrix* A_sten = par_stencil_grid(stencil, grid, 2); - ParCSRMatrix* A_io = readParMatrix(A0_fn); - - // Compare shapes - ASSERT_EQ(A_io->global_num_rows, A_sten->global_num_rows); - ASSERT_EQ(A_io->global_num_cols, A_sten->global_num_cols); - - ASSERT_EQ(A_sten->local_num_rows, A_io->local_num_rows); - ASSERT_EQ(A_sten->on_proc_num_cols, A_io->on_proc_num_cols); - ASSERT_EQ(A_sten->partition->first_local_row, A_io->partition->first_local_row); - ASSERT_EQ(A_sten->partition->last_local_row, A_io->partition->last_local_row); - ASSERT_EQ(A_sten->partition->first_local_col, A_io->partition->first_local_col); - ASSERT_EQ(A_sten->partition->last_local_col, A_io->partition->last_local_col); - - std::vector global_col_starts(num_procs+1); - std::vector global_row_starts(num_procs+1); - MPI_Allgather(&A_sten->partition->first_local_row, 1, MPI_INT, &global_row_starts[0], - 1, MPI_INT, MPI_COMM_WORLD); - MPI_Allgather(&A_sten->partition->first_local_col, 1, MPI_INT, &global_col_starts[0], - 1, MPI_INT, MPI_COMM_WORLD); - global_row_starts[num_procs] = A_sten->global_num_rows; - global_col_starts[num_procs] = A_sten->global_num_cols; - - ASSERT_EQ(A_sten->local_num_rows, (global_row_starts[rank+1] - global_row_starts[rank])); - ASSERT_EQ(A_sten->on_proc_num_cols, (global_col_starts[rank+1] - global_col_starts[rank])); - - if (A_sten->local_num_rows) - { - ASSERT_EQ(A_sten->partition->last_local_row, (global_col_starts[rank+1] - 1)); - } - if (A_sten->on_proc_num_cols) - { - ASSERT_EQ(A_sten->partition->last_local_col, 
(global_col_starts[rank+1] - 1)); - } - - A_sten->sort(); - A_io->sort(); - - ASSERT_EQ(A_sten->on_proc->idx1[0], A_io->on_proc->idx1[0]); - ASSERT_EQ(A_sten->off_proc->idx1[0], A_io->off_proc->idx1[0]); - - for (int i = 0; i < A_sten->local_num_rows; i++) - { - ASSERT_EQ(A_sten->on_proc->idx1[i+1], A_io->on_proc->idx1[i+1]); - start = A_sten->on_proc->idx1[i]; - end = A_sten->on_proc->idx1[i+1]; - - for (int j = start; j < end; j++) - { - ASSERT_EQ(A_sten->on_proc->idx2[j], A_io->on_proc->idx2[j]); - ASSERT_NEAR(A_sten->on_proc->vals[j], A_io->on_proc->vals[j], 1e-05); - } - - ASSERT_EQ(A_sten->off_proc->idx1[i+1], A_io->off_proc->idx1[i+1]); - start = A_sten->off_proc->idx1[i]; - end = A_sten->off_proc->idx1[i+1]; - for (int j = start; j < end; j++) - { - ASSERT_EQ(A_sten->off_proc->idx2[j], A_io->off_proc->idx2[j]); - ASSERT_NEAR(A_sten->off_proc->vals[j], A_io->off_proc->vals[j], 1e-05); - } - } - - delete A_io; - delete A_sten; - delete[] stencil; -} // end of TEST(ParAnisoTest, TestsInGallery) // - diff --git a/raptor/gallery/tests/test_par_laplacian.cpp b/raptor/gallery/tests/test_par_laplacian.cpp deleted file mode 100644 index 398a103f..00000000 --- a/raptor/gallery/tests/test_par_laplacian.cpp +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "gtest/gtest.h" -#include "raptor/raptor.hpp" - -using namespace raptor; - -int main(int argc, char** argv) -{ - MPI_Init(&argc, &argv); - ::testing::InitGoogleTest(&argc, argv); - int temp = RUN_ALL_TESTS(); - MPI_Finalize(); - return temp; -} // end of main() // - -TEST(ParLaplacianTest, TestsInGallery) -{ - int rank, num_procs; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &num_procs); - - - - int start, end; - int grid[3] = {10, 10, 10}; - double* stencil = laplace_stencil_27pt(); - ParCSRMatrix* A_sten = par_stencil_grid(stencil, grid, 3); - - ParCSRMatrix* A_io = 
readParMatrix("../../../../test_data/laplacian27.pm"); - - // Compare shapes - ASSERT_EQ(A_io->global_num_rows, A_sten->global_num_rows); - ASSERT_EQ(A_io->global_num_rows, A_sten->global_num_cols); - - ASSERT_EQ(A_sten->local_num_rows, A_io->local_num_rows); - ASSERT_EQ(A_sten->on_proc_num_cols, A_io->on_proc_num_cols); - ASSERT_EQ(A_sten->partition->first_local_row, A_io->partition->first_local_row); - ASSERT_EQ(A_sten->partition->last_local_row, A_io->partition->last_local_row); - ASSERT_EQ(A_sten->partition->first_local_col, A_io->partition->first_local_col); - ASSERT_EQ(A_sten->partition->last_local_col, A_io->partition->last_local_col); - - std::vector global_col_starts(num_procs+1); - std::vector global_row_starts(num_procs+1); - MPI_Allgather(&A_sten->partition->first_local_row, 1, MPI_INT, &global_row_starts[0], - 1, MPI_INT, MPI_COMM_WORLD); - MPI_Allgather(&A_sten->partition->first_local_col, 1, MPI_INT, &global_col_starts[0], - 1, MPI_INT, MPI_COMM_WORLD); - global_row_starts[num_procs] = A_sten->global_num_rows; - global_col_starts[num_procs] = A_sten->global_num_cols; - - ASSERT_EQ( A_sten->local_num_rows, (global_row_starts[rank+1] - global_row_starts[rank])); - ASSERT_EQ( A_sten->on_proc_num_cols, (global_col_starts[rank+1] - global_col_starts[rank])); - - if (A_sten->local_num_rows) - { - ASSERT_EQ(A_sten->partition->last_local_row, (global_col_starts[rank+1] - 1)); - } - if (A_sten->on_proc_num_cols) - { - ASSERT_EQ(A_sten->partition->last_local_col, (global_col_starts[rank+1] - 1)); - } - - A_sten->sort(); - A_io->sort(); - - ASSERT_EQ(A_sten->on_proc->idx1[0], A_io->on_proc->idx1[0]); - ASSERT_EQ(A_sten->off_proc->idx1[0],A_io->off_proc->idx1[0]); - - for (int i = 0; i < A_sten->local_num_rows; i++) - { - ASSERT_EQ(A_sten->on_proc->idx1[i+1], A_io->on_proc->idx1[i+1]); - start = A_sten->on_proc->idx1[i]; - end = A_sten->on_proc->idx1[i+1]; - - for (int j = start; j < end; j++) - { - ASSERT_EQ(A_sten->on_proc->idx2[j], A_io->on_proc->idx2[j]); - 
ASSERT_NEAR(A_sten->on_proc->vals[j], A_io->on_proc->vals[j], 1e-05); - } - - ASSERT_EQ(A_sten->off_proc->idx1[i+1], A_io->off_proc->idx1[i+1]); - start = A_sten->off_proc->idx1[i]; - end = A_sten->off_proc->idx1[i+1]; - for (int j = start; j < end; j++) - { - ASSERT_EQ(A_sten->off_proc->idx2[j], A_io->off_proc->idx2[j]); - ASSERT_NEAR(A_sten->off_proc->vals[j], A_io->off_proc->vals[j], 1e-05); - } - } - - delete A_io; - delete A_sten; - delete[] stencil; - -} // end of TEST(ParLaplacianTest, TestsInGallery) // - diff --git a/raptor/gallery/tests/test_par_matrix_market.cpp b/raptor/gallery/tests/test_par_matrix_market.cpp deleted file mode 100644 index 000678c8..00000000 --- a/raptor/gallery/tests/test_par_matrix_market.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "gtest/gtest.h" -#include "raptor/raptor.hpp" -#include "raptor/tests/par_compare.hpp" - -using namespace raptor; - -int main(int argc, char** argv) -{ - MPI_Init(&argc, &argv); - ::testing::InitGoogleTest(&argc, argv); - int temp = RUN_ALL_TESTS(); - MPI_Finalize(); - return temp; -} // end of main() // - -TEST(ParAnisoTest, TestsInGallery) -{ - int rank, num_procs; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &num_procs); - - const char* f_in = "../../../../test_data/sas_P0.mtx"; - const char* f_out = "../../../../test_data/sas_P0_out.mtx"; - - ParCSRMatrix* Amm = read_par_mm(f_in); - - MPI_Barrier(MPI_COMM_WORLD); - write_par_mm(Amm, f_out); - - MPI_Barrier(MPI_COMM_WORLD); - ParCSRMatrix* Amm_out = read_par_mm(f_out); - - MPI_Barrier(MPI_COMM_WORLD); - compare(Amm, Amm_out); - - // Diff the two mtx files - if (rank == 0) - { - remove(f_out); - } - - delete Amm_out; - delete Amm; - - } // end of TEST(ParAnisoTest, TestsInGallery) // - - diff --git a/raptor/gallery/tests/test_stencil.cpp b/raptor/gallery/tests/test_stencil.cpp deleted file mode 100644 
index ebe62ae1..00000000 --- a/raptor/gallery/tests/test_stencil.cpp +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "gtest/gtest.h" -#include "raptor/raptor.hpp" - -using namespace raptor; - -int main(int argc, char** argv) -{ - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} // end of main() // - -TEST(StencilTest, TestsInGallery) -{ - // Create A from diffusion stencil - int dim = 2; - std::vector grid(2, 4); - double eps = 0.001; - double theta = M_PI / 8.0; - double* stencil = diffusion_stencil_2d(eps, theta); - CSRMatrix* A = stencil_grid(stencil, grid.data(), dim); - delete[] stencil; - - std::vector A_python(16 * 16, 0); - std::vector A_dense(16 * 16, 0); - - // Add values of A_python - int rows[100] = {0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, 5, 5, 5, 5, 5, - 6, 6, 6, 6, 6, 6, 6, 6, 6, - 7, 7, 7, 7, 7, 7, - 8, 8, 8, 8, 8, 8, - 9, 9, 9, 9, 9, 9, 9, 9, 9, - 10, 10, 10, 10, 10, 10, 10, 10, 10, - 11, 11, 11, 11, 11, 11, - 12, 12, 12, 12, - 13, 13, 13, 13, 13, 13, - 14, 14, 14, 14, 14, 14, - 15, 15, 15, 15}; - - int cols[100] = {0, 1, 4, 5, - 0, 1, 2, 4, 5, 6, - 1, 2, 3, 5, 6, 7, - 2, 3, 6, 7, - 0, 1, 4, 5, 8, 9, - 0, 1, 2, 4, 5, 6, 8, 9, 10, - 1, 2, 3, 5, 6, 7, 9, 10, 11, - 2, 3, 6, 7, 10, 11, - 4, 5, 8, 9, 12, 13, - 4, 5, 6, 8, 9, 10, 12, 13, 14, - 5, 6, 7, 9, 10, 11, 13, 14, 15, - 6, 7, 10, 11, 14, 15, - 8, 9, 12, 13, - 8, 9, 10, 12, 13, 14, - 9, 10, 11, 13, 14, 15, - 10, 11, 14, 15}; - - double data[100] = {1.33466666667, 0.186366503869, -0.520033170536, -0.343433251935, 0.186366503869, 1.33466666667, 0.186366503869, 0.00976658526801, -0.520033170536, -0.343433251935, 0.186366503869, 1.33466666667, 0.186366503869, 0.00976658526801, -0.520033170536, -0.343433251935, 0.186366503869, 1.33466666667, 0.00976658526801, -0.520033170536, -0.520033170536, 
0.00976658526801, 1.33466666667, 0.186366503869, -0.520033170536, -0.343433251935, -0.343433251935, -0.520033170536, 0.00976658526801, 0.186366503869, 1.33466666667, 0.186366503869, 0.00976658526801, -0.520033170536, -0.343433251935, -0.343433251935, -0.520033170536, 0.00976658526801, 0.186366503869, 1.33466666667, 0.186366503869, 0.00976658526801, -0.520033170536, -0.343433251935, -0.343433251935, -0.520033170536, 0.186366503869, 1.33466666667, 0.00976658526801, -0.520033170536, -0.520033170536, 0.00976658526801, 1.33466666667, 0.186366503869, -0.520033170536, -0.343433251935, -0.343433251935, -0.520033170536, 0.00976658526801, 0.186366503869, 1.33466666667, 0.186366503869, 0.00976658526801, -0.520033170536, -0.343433251935, -0.343433251935, -0.520033170536, 0.00976658526801, 0.186366503869, 1.33466666667, 0.186366503869, 0.00976658526801, -0.520033170536, -0.343433251935, -0.343433251935, -0.520033170536, 0.186366503869, 1.33466666667, 0.00976658526801, -0.520033170536, -0.520033170536, 0.00976658526801, 1.33466666667, 0.186366503869, -0.343433251935, -0.520033170536, 0.00976658526801, 0.186366503869, 1.33466666667, 0.186366503869, -0.343433251935, -0.520033170536, 0.00976658526801, 0.186366503869, 1.33466666667, 0.186366503869, -0.343433251935, -0.520033170536, 0.186366503869, 1.33466666667}; - - for (int i = 0; i < 100; i++) - { - int row = rows[i]; - int col = cols[i]; - A_python[row*16 + col] = data[i]; - } - - - for (int i = 0; i < A->n_rows; i++) - { - int row_start = A->idx1[i]; - int row_end = A->idx1[i+1]; - for (int j = row_start; j < row_end; j++) - { - int col = A->idx2[j]; - A_dense[i*16 + col] = A->vals[j]; - } - } - - for (int i = 0; i < 16; i++) - { - for (int j = 0; j < 16; j++) - { - ASSERT_NEAR(A_python[i*16+j], A_dense[i*16+j], 1e-06); - } - } - -} // end of TEST(StencilTest, TestsInGallery)// - diff --git a/raptor/krylov/bicgstab.hpp b/raptor/krylov/bicgstab.hpp index a0487fea..473a851e 100644 --- a/raptor/krylov/bicgstab.hpp +++ 
b/raptor/krylov/bicgstab.hpp @@ -3,9 +3,7 @@ #include -#include "raptor/core/types.hpp" -#include "raptor/core/matrix.hpp" -#include "raptor/core/vector.hpp" +#include "raptor-sparse.hpp" namespace raptor { diff --git a/raptor/krylov/cg.hpp b/raptor/krylov/cg.hpp index 737d64ac..73ae66fe 100644 --- a/raptor/krylov/cg.hpp +++ b/raptor/krylov/cg.hpp @@ -3,9 +3,7 @@ #include -#include "raptor/core/types.hpp" -#include "raptor/core/matrix.hpp" -#include "raptor/core/vector.hpp" +#include "raptor-sparse.hpp" namespace raptor { diff --git a/raptor/krylov/par_bicgstab.hpp b/raptor/krylov/par_bicgstab.hpp index 64652cb6..af4f4120 100644 --- a/raptor/krylov/par_bicgstab.hpp +++ b/raptor/krylov/par_bicgstab.hpp @@ -3,9 +3,7 @@ #include -#include "raptor/core/types.hpp" -#include "raptor/core/par_matrix.hpp" -#include "raptor/core/par_vector.hpp" +#include "raptor-sparse.hpp" #include "raptor/multilevel/par_multilevel.hpp" #include "raptor/aggregation/par_smoothed_aggregation_solver.hpp" diff --git a/raptor/krylov/par_cg.hpp b/raptor/krylov/par_cg.hpp index 5a667671..e9e53381 100644 --- a/raptor/krylov/par_cg.hpp +++ b/raptor/krylov/par_cg.hpp @@ -3,9 +3,7 @@ #include -#include "raptor/core/types.hpp" -#include "raptor/core/par_matrix.hpp" -#include "raptor/core/par_vector.hpp" +#include "raptor-sparse.hpp" #include "raptor/multilevel/par_multilevel.hpp" namespace raptor { diff --git a/raptor/krylov/partial_inner.hpp b/raptor/krylov/partial_inner.hpp index 36f7ba02..98f543b8 100644 --- a/raptor/krylov/partial_inner.hpp +++ b/raptor/krylov/partial_inner.hpp @@ -3,8 +3,7 @@ #include -#include "raptor/core/types.hpp" -#include "raptor/core/par_vector.hpp" +#include "raptor-sparse.hpp" namespace raptor { diff --git a/raptor/multilevel/level.hpp b/raptor/multilevel/level.hpp index 174a9f95..757345eb 100644 --- a/raptor/multilevel/level.hpp +++ b/raptor/multilevel/level.hpp @@ -3,9 +3,7 @@ #ifndef RAPTOR_ML_LEVEL_H #define RAPTOR_ML_LEVEL_H -#include "raptor/core/types.hpp" 
-#include "raptor/core/matrix.hpp" -#include "raptor/core/vector.hpp" +#include "raptor-sparse.hpp" // Coarse Matrices (A) are CSC // Prolongation Matrices (P) are CSC diff --git a/raptor/multilevel/multilevel.hpp b/raptor/multilevel/multilevel.hpp index 97f578d3..c5bc7a8e 100644 --- a/raptor/multilevel/multilevel.hpp +++ b/raptor/multilevel/multilevel.hpp @@ -3,11 +3,9 @@ #ifndef RAPTOR_ML_MULTILEVEL_H #define RAPTOR_ML_MULTILEVEL_H -#include "raptor/core/types.hpp" -#include "raptor/core/matrix.hpp" -#include "raptor/core/vector.hpp" +#include "raptor-sparse.hpp" #include "level.hpp" -#include "raptor/util/linalg/relax.hpp" +#include "raptor/precondition/relax.hpp" // Coarse Matrices (A) are CSC // Prolongation Matrices (P) are CSC diff --git a/raptor/multilevel/par_level.hpp b/raptor/multilevel/par_level.hpp index a116bb38..53428e40 100644 --- a/raptor/multilevel/par_level.hpp +++ b/raptor/multilevel/par_level.hpp @@ -3,9 +3,7 @@ #ifndef RAPTOR_ML_PARLEVEL_H #define RAPTOR_ML_PARLEVEL_H -#include "raptor/core/types.hpp" -#include "raptor/core/par_matrix.hpp" -#include "raptor/core/par_vector.hpp" +#include "raptor-sparse.hpp" // Coarse Matrices (A) are CSR // Prolongation Matrices (P) are CSR diff --git a/raptor/multilevel/par_multilevel.hpp b/raptor/multilevel/par_multilevel.hpp index a55db77d..be4ffc5d 100644 --- a/raptor/multilevel/par_multilevel.hpp +++ b/raptor/multilevel/par_multilevel.hpp @@ -3,11 +3,9 @@ #ifndef RAPTOR_ML_PARMULTILEVEL_H #define RAPTOR_ML_PARMULTILEVEL_H -#include "raptor/core/types.hpp" -#include "raptor/core/par_matrix.hpp" -#include "raptor/core/par_vector.hpp" +#include "raptor-sparse.hpp" #include "raptor/multilevel/par_level.hpp" -#include "raptor/util/linalg/par_relax.hpp" +#include "raptor/precondition/par_relax.hpp" #include "raptor/ruge_stuben/par_interpolation.hpp" #include "raptor/ruge_stuben/par_cf_splitting.hpp" diff --git a/raptor/util/CMakeLists.txt b/raptor/precondition/CMakeLists 2.txt similarity index 100% rename from 
raptor/util/CMakeLists.txt rename to raptor/precondition/CMakeLists 2.txt diff --git a/raptor/precondition/CMakeLists.txt b/raptor/precondition/CMakeLists.txt new file mode 100644 index 00000000..f4c3f827 --- /dev/null +++ b/raptor/precondition/CMakeLists.txt @@ -0,0 +1,37 @@ +# Include the directory itself as a path to include directories +set(CMAKE_INCLUDE_CURRENT_DIR ON) + +#Create a variable called linalg_SOURCES containing all .cpp files: + +if (WITH_MPI) + set(par_precond_HEADERS + precondition/par_relax.hpp + precondition/par_diag_scale.hpp + ) + set(par_precond_SOURCES + precondition/par_relax.cpp + precondition/par_diag_scale.cpp + ) +else () + set(par_precond_HEADERS + "" + ) + set (par_precond_SOURCES + "" + ) +endif() + +set(precond_HEADERS + precondition/relax.hpp + ${par_precond_HEADERS} + PARENT_SCOPE + ) +set(precond_SOURCES + precondition/relax.cpp + ${par_precond_SOURCES} + PARENT_SCOPE + ) + + + + diff --git a/raptor/util/linalg/par_diag_scale.cpp b/raptor/precondition/par_diag_scale.cpp similarity index 100% rename from raptor/util/linalg/par_diag_scale.cpp rename to raptor/precondition/par_diag_scale.cpp diff --git a/raptor/util/linalg/par_diag_scale.hpp b/raptor/precondition/par_diag_scale.hpp similarity index 86% rename from raptor/util/linalg/par_diag_scale.hpp rename to raptor/precondition/par_diag_scale.hpp index f734118d..ec896328 100644 --- a/raptor/util/linalg/par_diag_scale.hpp +++ b/raptor/precondition/par_diag_scale.hpp @@ -6,8 +6,7 @@ #include #include -#include "raptor/core/par_vector.hpp" -#include "raptor/core/par_matrix.hpp" +#include "raptor-sparse.hpp" namespace raptor { diff --git a/raptor/util/linalg/par_relax.cpp b/raptor/precondition/par_relax.cpp similarity index 99% rename from raptor/util/linalg/par_relax.cpp rename to raptor/precondition/par_relax.cpp index 2cc253a8..f22b26b0 100644 --- a/raptor/util/linalg/par_relax.cpp +++ b/raptor/precondition/par_relax.cpp @@ -1,9 +1,7 @@ // Copyright (c) 2015-2017, RAPtor 
Developer Team // License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#include "raptor/core/types.hpp" #include "par_relax.hpp" -#include "raptor/core/par_matrix.hpp" namespace raptor { // Declare Private Methods diff --git a/raptor/util/linalg/par_relax.hpp b/raptor/precondition/par_relax.hpp similarity index 90% rename from raptor/util/linalg/par_relax.hpp rename to raptor/precondition/par_relax.hpp index 24cfc8f9..90430f68 100644 --- a/raptor/util/linalg/par_relax.hpp +++ b/raptor/precondition/par_relax.hpp @@ -6,8 +6,7 @@ #include #include -#include "raptor/core/par_vector.hpp" -#include "raptor/core/par_matrix.hpp" +#include "raptor-sparse.hpp" #include "raptor/multilevel/par_level.hpp" namespace raptor { diff --git a/raptor/util/linalg/relax.cpp b/raptor/precondition/relax.cpp similarity index 98% rename from raptor/util/linalg/relax.cpp rename to raptor/precondition/relax.cpp index b3942a37..58a31d2e 100644 --- a/raptor/util/linalg/relax.cpp +++ b/raptor/precondition/relax.cpp @@ -1,8 +1,6 @@ // Copyright (c) 2015-2017, RAPtor Developer Team // License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#include "raptor/core/types.hpp" -#include "raptor/core/matrix.hpp" -#include "raptor/core/vector.hpp" + #include "relax.hpp" extern "C" { diff --git a/raptor/util/linalg/relax.hpp b/raptor/precondition/relax.hpp similarity index 94% rename from raptor/util/linalg/relax.hpp rename to raptor/precondition/relax.hpp index 6f87fc37..50b04ff3 100644 --- a/raptor/util/linalg/relax.hpp +++ b/raptor/precondition/relax.hpp @@ -4,8 +4,7 @@ #define RAPTOR_UTILS_LINALG_SEQ_RELAX_H #include -#include "raptor/core/vector.hpp" -#include "raptor/core/matrix.hpp" +#include "raptor-sparse.hpp" #include "raptor/multilevel/level.hpp" #include diff --git a/raptor/precondition/tests/CMakeLists.txt b/raptor/precondition/tests/CMakeLists.txt new file mode 100644 index 00000000..08abb192 --- /dev/null +++ b/raptor/precondition/tests/CMakeLists.txt @@ -0,0 
+1,49 @@ +add_executable(test_jacobi_aniso test_jacobi_aniso.cpp) +target_link_libraries(test_jacobi_aniso raptor ${MPI_LIBRARIES} googletest pthread ) +add_test(AnisoJacobiTest ./test_jacobi_aniso) + +add_executable(test_jacobi_laplacian test_jacobi_laplacian.cpp) +target_link_libraries(test_jacobi_laplacian raptor ${MPI_LIBRARIES} googletest pthread ) +add_test(LaplaceJacobiTest ./test_jacobi_laplacian) + +add_executable(test_gs_aniso test_gs_aniso.cpp) +target_link_libraries(test_gs_aniso raptor ${MPI_LIBRARIES} googletest pthread ) +add_test(AnisoGSTest ./test_gs_aniso) + +add_executable(test_gs_laplacian test_gs_laplacian.cpp) +target_link_libraries(test_gs_laplacian raptor ${MPI_LIBRARIES} googletest pthread ) +add_test(LaplaceGSTest ./test_gs_laplacian) + +# CANNOT CURRENTLY RUN THESE TESTS, BUT RAPTOR SEEMS CORRECT +# TODO : UNCOMMENT WHEN PYAMG BUG IS FIXED +# +#add_executable(test_sor_aniso test_sor_aniso.cpp) +#target_link_libraries(test_sor_aniso raptor ${MPI_LIBRARIES} googletest pthread ) +#add_test(AnisoSORTest ./test_sor_aniso) +# +#add_executable(test_sor_laplacian test_sor_laplacian.cpp) +#target_link_libraries(test_sor_laplacian raptor ${MPI_LIBRARIES} googletest pthread ) +#add_test(LaplaceSORTest ./test_sor_laplacian) + +add_executable(test_bsr_jacobi_aniso test_bsr_jacobi_aniso.cpp) +target_link_libraries(test_bsr_jacobi_aniso raptor ${MPI_LIBRARIES} googletest pthread ) +add_test(BSRAnisoJacobiTest ./test_bsr_jacobi_aniso) + +add_executable(test_bsr_jacobi_laplacian test_bsr_jacobi_laplacian.cpp) +target_link_libraries(test_bsr_jacobi_laplacian raptor ${MPI_LIBRARIES} googletest pthread ) +add_test(BSRLaplaceJacobiTest ./test_bsr_jacobi_laplacian) + +add_executable(test_bsr_gs_aniso test_bsr_gs_aniso.cpp)
+target_link_libraries(test_bsr_gs_aniso raptor ${MPI_LIBRARIES} googletest pthread ) +add_test(BSRAnisoGSTest ./test_bsr_gs_aniso) + +add_executable(test_bsr_gs_laplacian test_bsr_gs_laplacian.cpp) +target_link_libraries(test_bsr_gs_laplacian raptor ${MPI_LIBRARIES} googletest pthread ) +add_test(BSRLaplaceGSTest ./test_bsr_gs_laplacian) + +if (WITH_MPI) + #TODO Add Parallel Relaxation Tests Here + + # TODO Add Parallel Diagonal Scale Tests Here +endif() + diff --git a/raptor/core/tests/README.md b/raptor/precondition/tests/README.md similarity index 100% rename from raptor/core/tests/README.md rename to raptor/precondition/tests/README.md diff --git a/raptor/util/tests/test_bsr_gs_aniso.cpp b/raptor/precondition/tests/test_bsr_gs_aniso.cpp similarity index 100% rename from raptor/util/tests/test_bsr_gs_aniso.cpp rename to raptor/precondition/tests/test_bsr_gs_aniso.cpp diff --git a/raptor/util/tests/test_bsr_gs_laplacian.cpp b/raptor/precondition/tests/test_bsr_gs_laplacian.cpp similarity index 100% rename from raptor/util/tests/test_bsr_gs_laplacian.cpp rename to raptor/precondition/tests/test_bsr_gs_laplacian.cpp diff --git a/raptor/util/tests/test_bsr_jacobi_aniso.cpp b/raptor/precondition/tests/test_bsr_jacobi_aniso.cpp similarity index 100% rename from raptor/util/tests/test_bsr_jacobi_aniso.cpp rename to raptor/precondition/tests/test_bsr_jacobi_aniso.cpp diff --git a/raptor/util/tests/test_bsr_jacobi_laplacian.cpp b/raptor/precondition/tests/test_bsr_jacobi_laplacian.cpp similarity index 100% rename from raptor/util/tests/test_bsr_jacobi_laplacian.cpp rename to raptor/precondition/tests/test_bsr_jacobi_laplacian.cpp diff --git a/raptor/util/tests/test_bsr_spmv_aniso.cpp b/raptor/precondition/tests/test_bsr_spmv_aniso.cpp similarity index 100% rename from raptor/util/tests/test_bsr_spmv_aniso.cpp rename to raptor/precondition/tests/test_bsr_spmv_aniso.cpp diff --git a/raptor/util/tests/test_bsr_spmv_laplacian.cpp 
b/raptor/precondition/tests/test_bsr_spmv_laplacian.cpp similarity index 100% rename from raptor/util/tests/test_bsr_spmv_laplacian.cpp rename to raptor/precondition/tests/test_bsr_spmv_laplacian.cpp diff --git a/raptor/util/tests/test_bsr_spmv_random.cpp b/raptor/precondition/tests/test_bsr_spmv_random.cpp similarity index 100% rename from raptor/util/tests/test_bsr_spmv_random.cpp rename to raptor/precondition/tests/test_bsr_spmv_random.cpp diff --git a/raptor/util/tests/test_gs_aniso.cpp b/raptor/precondition/tests/test_gs_aniso.cpp similarity index 100% rename from raptor/util/tests/test_gs_aniso.cpp rename to raptor/precondition/tests/test_gs_aniso.cpp diff --git a/raptor/util/tests/test_gs_laplacian.cpp b/raptor/precondition/tests/test_gs_laplacian.cpp similarity index 100% rename from raptor/util/tests/test_gs_laplacian.cpp rename to raptor/precondition/tests/test_gs_laplacian.cpp diff --git a/raptor/util/tests/test_jacobi_aniso.cpp b/raptor/precondition/tests/test_jacobi_aniso.cpp similarity index 100% rename from raptor/util/tests/test_jacobi_aniso.cpp rename to raptor/precondition/tests/test_jacobi_aniso.cpp diff --git a/raptor/util/tests/test_jacobi_laplacian.cpp b/raptor/precondition/tests/test_jacobi_laplacian.cpp similarity index 100% rename from raptor/util/tests/test_jacobi_laplacian.cpp rename to raptor/precondition/tests/test_jacobi_laplacian.cpp diff --git a/raptor/util/tests/test_par_add.cpp b/raptor/precondition/tests/test_par_add.cpp similarity index 100% rename from raptor/util/tests/test_par_add.cpp rename to raptor/precondition/tests/test_par_add.cpp diff --git a/raptor/util/tests/test_par_scale_aniso.cpp b/raptor/precondition/tests/test_par_scale_aniso.cpp similarity index 100% rename from raptor/util/tests/test_par_scale_aniso.cpp rename to raptor/precondition/tests/test_par_scale_aniso.cpp diff --git a/raptor/util/tests/test_par_spmv_aniso.cpp b/raptor/precondition/tests/test_par_spmv_aniso.cpp similarity index 100% rename from 
raptor/util/tests/test_par_spmv_aniso.cpp rename to raptor/precondition/tests/test_par_spmv_aniso.cpp diff --git a/raptor/util/tests/test_par_spmv_laplacian.cpp b/raptor/precondition/tests/test_par_spmv_laplacian.cpp similarity index 100% rename from raptor/util/tests/test_par_spmv_laplacian.cpp rename to raptor/precondition/tests/test_par_spmv_laplacian.cpp diff --git a/raptor/util/tests/test_par_spmv_random.cpp b/raptor/precondition/tests/test_par_spmv_random.cpp similarity index 100% rename from raptor/util/tests/test_par_spmv_random.cpp rename to raptor/precondition/tests/test_par_spmv_random.cpp diff --git a/raptor/util/tests/test_parmetis.cpp b/raptor/precondition/tests/test_parmetis.cpp similarity index 100% rename from raptor/util/tests/test_parmetis.cpp rename to raptor/precondition/tests/test_parmetis.cpp diff --git a/raptor/util/tests/test_ptscotch.cpp b/raptor/precondition/tests/test_ptscotch.cpp similarity index 100% rename from raptor/util/tests/test_ptscotch.cpp rename to raptor/precondition/tests/test_ptscotch.cpp diff --git a/raptor/util/tests/test_repartition.cpp b/raptor/precondition/tests/test_repartition.cpp similarity index 100% rename from raptor/util/tests/test_repartition.cpp rename to raptor/precondition/tests/test_repartition.cpp diff --git a/raptor/util/tests/test_sor_aniso.cpp b/raptor/precondition/tests/test_sor_aniso.cpp similarity index 100% rename from raptor/util/tests/test_sor_aniso.cpp rename to raptor/precondition/tests/test_sor_aniso.cpp diff --git a/raptor/util/tests/test_sor_laplacian.cpp b/raptor/precondition/tests/test_sor_laplacian.cpp similarity index 100% rename from raptor/util/tests/test_sor_laplacian.cpp rename to raptor/precondition/tests/test_sor_laplacian.cpp diff --git a/raptor/util/tests/test_spmv_aniso.cpp b/raptor/precondition/tests/test_spmv_aniso.cpp similarity index 100% rename from raptor/util/tests/test_spmv_aniso.cpp rename to raptor/precondition/tests/test_spmv_aniso.cpp diff --git 
a/raptor/util/tests/test_spmv_laplacian.cpp b/raptor/precondition/tests/test_spmv_laplacian.cpp similarity index 100% rename from raptor/util/tests/test_spmv_laplacian.cpp rename to raptor/precondition/tests/test_spmv_laplacian.cpp diff --git a/raptor/util/tests/test_spmv_random.cpp b/raptor/precondition/tests/test_spmv_random.cpp similarity index 100% rename from raptor/util/tests/test_spmv_random.cpp rename to raptor/precondition/tests/test_spmv_random.cpp diff --git a/raptor/util/tests/test_tap_spmv_aniso.cpp b/raptor/precondition/tests/test_tap_spmv_aniso.cpp similarity index 100% rename from raptor/util/tests/test_tap_spmv_aniso.cpp rename to raptor/precondition/tests/test_tap_spmv_aniso.cpp diff --git a/raptor/util/tests/test_tap_spmv_laplacian.cpp b/raptor/precondition/tests/test_tap_spmv_laplacian.cpp similarity index 100% rename from raptor/util/tests/test_tap_spmv_laplacian.cpp rename to raptor/precondition/tests/test_tap_spmv_laplacian.cpp diff --git a/raptor/util/tests/test_tap_spmv_random.cpp b/raptor/precondition/tests/test_tap_spmv_random.cpp similarity index 100% rename from raptor/util/tests/test_tap_spmv_random.cpp rename to raptor/precondition/tests/test_tap_spmv_random.cpp diff --git a/raptor/profiling/profile_comm.cpp b/raptor/profiling/profile_comm.cpp index da4da2ed..74e956aa 100644 --- a/raptor/profiling/profile_comm.cpp +++ b/raptor/profiling/profile_comm.cpp @@ -1,4 +1,4 @@ -#include "raptor/core/par_matrix.hpp" +#include "raptor-sparse.hpp" using namespace raptor; #define short_cutoff 500 diff --git a/raptor/raptor.hpp b/raptor/raptor.hpp index 23590b98..967a20e3 100644 --- a/raptor/raptor.hpp +++ b/raptor/raptor.hpp @@ -3,48 +3,7 @@ #ifndef RAPTOR_HPP #define RAPTOR_HPP -// Define types such as int and double sizes -#include "core/types.hpp" -#include "core/utilities.hpp" - -// Data about topology and matrix partitions -#ifndef NO_MPI - #include "core/mpi_types.hpp" - #include "core/partition.hpp" - #include "core/topology.hpp" -#endif - 
-// Matrix and vector classes -#include "core/matrix.hpp" -#include "core/vector.hpp" -#ifndef NO_MPI - #include "core/par_matrix.hpp" - #include "core/par_vector.hpp" -#endif - -// Communication classes -#ifndef NO_MPI - #include "core/comm_data.hpp" - #include "core/comm_pkg.hpp" -#endif - -// Stencil and diffusion classes -#include "gallery/laplacian27pt.hpp" -#include "gallery/diffusion.hpp" -#include "gallery/stencil.hpp" -#include "gallery/random.hpp" -#ifndef NO_MPI - #include "gallery/par_stencil.hpp" - #include "gallery/par_random.hpp" -#endif - -// Matrix IO -#include "gallery/matrix_IO.hpp" -#include "gallery/matrix_market.hpp" -#ifndef NO_MPI - #include "gallery/par_matrix_IO.hpp" - #include "gallery/par_matrix_market.hpp" -#endif +#include "raptor-sparse.hpp" // External #ifdef USING_HYPRE @@ -96,25 +55,14 @@ #include "krylov/par_bicgstab.hpp" // Relaxation methods -#include "util/linalg/relax.hpp" +#include "precondition/relax.hpp" #ifndef NO_MPI - #include "util/linalg/par_relax.hpp" -#endif - -// Repartitioning matrix methods -#ifndef NO_MPI -#include "util/linalg/repartition.hpp" -#endif -#ifdef USING_PTSCOTCH - #include "util/linalg/external/ptscotch_wrapper.hpp" -#endif -#ifdef USING_PARMETIS - #include "util/linalg/external/parmetis_wrapper.hpp" + #include "precondition/par_relax.hpp" #endif // Preconditioning Methods #ifndef NO_MPI - #include "util/linalg/par_diag_scale.hpp" + #include "precondition/par_diag_scale.hpp" #endif diff --git a/raptor/ruge_stuben/cf_splitting.hpp b/raptor/ruge_stuben/cf_splitting.hpp index 2d478fa5..93372f81 100644 --- a/raptor/ruge_stuben/cf_splitting.hpp +++ b/raptor/ruge_stuben/cf_splitting.hpp @@ -3,8 +3,7 @@ #ifndef RAPTOR_SPLITTING_HPP #define RAPTOR_SPLITTING_HPP -#include "raptor/core/types.hpp" -#include "raptor/core/matrix.hpp" +#include "raptor-sparse.hpp" namespace raptor { diff --git a/raptor/ruge_stuben/interpolation.cpp b/raptor/ruge_stuben/interpolation.cpp index a1a3501b..34596e52 100644 --- 
a/raptor/ruge_stuben/interpolation.cpp +++ b/raptor/ruge_stuben/interpolation.cpp @@ -1,7 +1,6 @@ // Copyright (c) 2015-2017, RAPtor Developer Team // License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause #include "assert.h" -#include "raptor/core/types.hpp" #include "interpolation.hpp" namespace raptor { diff --git a/raptor/ruge_stuben/interpolation.hpp b/raptor/ruge_stuben/interpolation.hpp index e9d24392..67a1aa1f 100644 --- a/raptor/ruge_stuben/interpolation.hpp +++ b/raptor/ruge_stuben/interpolation.hpp @@ -3,8 +3,7 @@ #ifndef RAPTOR_DIRECT_INTERPOLATION_HPP #define RAPTOR_DIRECT_INTERPOLATION_HPP -#include "raptor/core/types.hpp" -#include "raptor/core/matrix.hpp" +#include "raptor-sparse.hpp" namespace raptor { diff --git a/raptor/ruge_stuben/par_cf_splitting.hpp b/raptor/ruge_stuben/par_cf_splitting.hpp index 420b375c..a44ebffd 100644 --- a/raptor/ruge_stuben/par_cf_splitting.hpp +++ b/raptor/ruge_stuben/par_cf_splitting.hpp @@ -3,8 +3,7 @@ #ifndef RAPTOR_PAR_SPLITTING_HPP #define RAPTOR_PAR_SPLITTING_HPP -#include "raptor/core/types.hpp" -#include "raptor/core/par_matrix.hpp" +#include "raptor-sparse.hpp" #include "cf_splitting.hpp" namespace raptor { diff --git a/raptor/ruge_stuben/par_interpolation.cpp b/raptor/ruge_stuben/par_interpolation.cpp index a77a5355..c79ba180 100644 --- a/raptor/ruge_stuben/par_interpolation.cpp +++ b/raptor/ruge_stuben/par_interpolation.cpp @@ -1,8 +1,7 @@ // Copyright (c) 2015-2017, RAPtor Developer Team // License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause #include "assert.h" -#include "raptor/core/types.hpp" -#include "raptor/core/par_matrix.hpp" +#include "par_interpolation.hpp" namespace raptor { diff --git a/raptor/ruge_stuben/par_interpolation.hpp b/raptor/ruge_stuben/par_interpolation.hpp index ac546954..0e29b935 100644 --- a/raptor/ruge_stuben/par_interpolation.hpp +++ b/raptor/ruge_stuben/par_interpolation.hpp @@ -3,8 +3,7 @@ #ifndef RAPTOR_PAR_DIRECT_INTERPOLATION_HPP #define 
RAPTOR_PAR_DIRECT_INTERPOLATION_HPP -#include "raptor/core/types.hpp" -#include "raptor/core/par_matrix.hpp" +#include "raptor-sparse.hpp" namespace raptor { diff --git a/raptor/tests/compare.hpp b/raptor/tests/compare.hpp index bfb0cf10..fc6196b1 100644 --- a/raptor/tests/compare.hpp +++ b/raptor/tests/compare.hpp @@ -9,8 +9,7 @@ #ifndef RAPTOR_TEST_COMPARE_HPP #define RAPTOR_TEST_COMPARE_HPP -#include "raptor/core/types.hpp" -#include "raptor/core/matrix.hpp" +#include "raptor-sparse.hpp" namespace raptor { void compare(CSRMatrix* A, CSRMatrix* A_rap) diff --git a/raptor/tests/par_compare.hpp b/raptor/tests/par_compare.hpp index 6b1256e1..0e32869f 100644 --- a/raptor/tests/par_compare.hpp +++ b/raptor/tests/par_compare.hpp @@ -8,8 +8,7 @@ #ifndef RAPTOR_TEST_PAR_COMPARE_HPP #define RAPTOR_TEST_PAR_COMPARE_HPP -#include "raptor/core/types.hpp" -#include "raptor/core/par_matrix.hpp" +#include "raptor-sparse.hpp" namespace raptor { void compare(ParCSRMatrix* A, ParCSRMatrix* A_rap) diff --git a/raptor/util/linalg/CMakeLists.txt b/raptor/util/linalg/CMakeLists.txt deleted file mode 100644 index 39db5374..00000000 --- a/raptor/util/linalg/CMakeLists.txt +++ /dev/null @@ -1,46 +0,0 @@ -# Include the directory itself as a path to include directories -set(CMAKE_INCLUDE_CURRENT_DIR ON) - -#Create a variable called linalg_SOURCES containing all .cpp files: - -if (WITH_MPI) - set(par_linalg_HEADERS - util/linalg/repartition.hpp - util/linalg/par_relax.hpp - util/linalg/par_diag_scale.hpp - ) - set(par_linalg_SOURCES - util/linalg/par_spmv.cpp - util/linalg/par_matmult.cpp - util/linalg/par_add.cpp - util/linalg/par_relax.cpp - util/linalg/repartition.cpp - util/linalg/par_diag_scale.cpp - ) -else () - set(par_linalg_HEADERS - "" - ) - set (par_linalg_SOURCES - "" - ) -endif() - -set(linalg_HEADERS - util/linalg/relax.hpp - ${par_linalg_HEADERS} - ${external_linalg_HEADERS} - PARENT_SCOPE - ) -set(linalg_SOURCES - util/linalg/matmult.cpp - util/linalg/relax.cpp - 
util/linalg/add.cpp - util/linalg/spmv.cpp - ${par_linalg_SOURCES} - PARENT_SCOPE - ) - - - - diff --git a/raptor/util/linalg/add.cpp b/raptor/util/linalg/add.cpp deleted file mode 100644 index 765c91e5..00000000 --- a/raptor/util/linalg/add.cpp +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#include "assert.h" -#include "raptor/core/matrix.hpp" - -using namespace raptor; - -// TODO -- currently assumes partitions are the same -Matrix* Matrix::add(CSRMatrix* B, bool remove_dup) -{ - CSRMatrix* A = to_CSR(); - CSRMatrix* C = new CSRMatrix(n_rows, n_cols, 2*nnz); - A->add_append(B, C, remove_dup); - delete A; - return C; -} -void Matrix::add_append(CSRMatrix* B, CSRMatrix* C, bool remove_dup) -{ - CSRMatrix* A = to_CSR(); - A->add_append(B, C, remove_dup); - delete A; -} -Matrix* Matrix::subtract(CSRMatrix* B) -{ - CSRMatrix* A = to_CSR(); - CSRMatrix* C = A->subtract(B); - delete A; - return C; -} - - -CSRMatrix* CSRMatrix::add(CSRMatrix* B, bool remove_dup) -{ - CSRMatrix* C = new CSRMatrix(n_rows, n_cols, 2*nnz); - add_append(B, C, remove_dup); - return C; -} - -void CSRMatrix::add_append(CSRMatrix* B, CSRMatrix* C, bool remove_dup) -{ - int start, end; - - C->resize(n_rows, n_cols); - int C_nnz = nnz + B->nnz; - C->idx2.resize(C_nnz); - C->vals.resize(C_nnz); - - C_nnz = 0; - C->idx1[0] = 0; - for (int i = 0; i < n_rows; i++) - { - start = idx1[i]; - end = idx1[i+1]; - std::copy(idx2.begin() + start, - idx2.begin() + end, - C->idx2.begin() + C_nnz); - std::copy(vals.begin() + start, - vals.begin() + end, - C->vals.begin() + C_nnz); - C_nnz += (end - start); - - start = B->idx1[i]; - end = B->idx1[i+1]; - std::copy(B->idx2.begin() + start, - B->idx2.begin() + end, - C->idx2.begin() + C_nnz); - std::copy(B->vals.begin() + start, - B->vals.begin() + end, - C->vals.begin() + C_nnz); - C_nnz += (end - start); - - C->idx1[i+1] = C_nnz; - } - C->nnz = C_nnz; - 
C->sort(); - if (remove_dup) - C->remove_duplicates(); -} - -CSRMatrix* CSRMatrix::subtract(CSRMatrix* B) -{ - int start, end; - - assert(n_rows == B->n_rows); - assert(n_cols == B->n_cols); - - CSRMatrix* C = new CSRMatrix(n_rows, n_cols, 2*nnz); - C->idx1[0] = 0; - for (int i = 0; i < n_rows; i++) - { - start = idx1[i]; - end = idx1[i+1]; - for (int j = start; j < end; j++) - { - C->idx2.emplace_back(idx2[j]); - C->vals.emplace_back(vals[j]); - } - start = B->idx1[i]; - end = B->idx1[i+1]; - for (int j = start; j < end; j++) - { - C->idx2.emplace_back(B->idx2[j]); - C->vals.emplace_back(-B->vals[j]); - } - C->idx1[i+1] = C->idx2.size(); - } - C->nnz = C->idx2.size(); - C->sort(); - C->remove_duplicates(); - - return C; -} - - diff --git a/raptor/util/linalg/external/CMakeLists.txt b/raptor/util/linalg/external/CMakeLists.txt deleted file mode 100644 index 9bf557f3..00000000 --- a/raptor/util/linalg/external/CMakeLists.txt +++ /dev/null @@ -1,34 +0,0 @@ -# Include the directory itself as a path to include directories -set(CMAKE_INCLUDE_CURRENT_DIR ON) - -#Create a variable called linalg_SOURCES containing all .cpp files: - -if (WITH_PTSCOTCH) - set(ptscotch_linalg_HEADERS - util/linalg/external/ptscotch_wrapper.hpp - ) -else() - set(ptscotch_linalg_HEADERS - "" - ) -endif() - -if (WITH_PARMETIS) - set(parmetis_linalg_HEADERS - util/linalg/external/parmetis_wrapper.hpp - ) -else() - set(parmetis_linalg_HEADERS - "" - ) -endif() - -set(ext_linalg_HEADERS - ${ptscotch_linalg_HEADERS} - ${parmetis_linalg_HEADERS} - PARENT_SCOPE - ) - - - - diff --git a/raptor/util/linalg/external/parmetis_wrapper.hpp b/raptor/util/linalg/external/parmetis_wrapper.hpp deleted file mode 100644 index 6f04be6d..00000000 --- a/raptor/util/linalg/external/parmetis_wrapper.hpp +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#ifndef RAPTOR_GALLERY_PARMETIS_HPP -#define 
RAPTOR_GALLERY_PARMETIS_HPP - -#include "raptor/raptor.hpp" -#include "parmetis.h" - -using namespace raptor; - -int* parmetis_partition(ParCSRMatrix* A) -{ - int rank, num_procs; - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - RAPtor_MPI_Comm_size(RAPtor_MPI_COMM_WORLD, &num_procs); - - int start, end; - int col, global_col; - - // ParMetis Partitioner Variables - RAPtor_MPI_Comm comm = RAPtor_MPI_COMM_WORLD; - - // How vertices of graph are distributed among processes; - // Array size num_procs+1 - // Range of vertices local to each processor - int* vtxdist = A->partition->first_cols.data(); - - // Local adjacency structure - std::vector xadj(A->local_num_rows+1); - std::vector adjncy(A->local_nnz); - xadj[0] = 0; - int nnz = 0; - for (int i = 0; i < A->local_num_rows; i++) - { - start = A->on_proc->idx1[i]; - end = A->on_proc->idx1[i+1]; - for (int j = start; j < end; j++) - { - col = A->on_proc->idx2[j]; - global_col = A->on_proc_column_map[col]; - adjncy[nnz++] = global_col; - } - - start = A->off_proc->idx1[i]; - end = A->off_proc->idx1[i+1]; - for (int j = start; j < end; j++) - { - col = A->off_proc->idx2[j]; - global_col = A->off_proc_column_map[col]; - adjncy[nnz++] = global_col; - } - - xadj[i+1] = nnz; - } - - // Weights of vertices and edges - int* vwgt = NULL; - int* adjwgt = NULL; - - // Is the graph weighted? 
- // 0 - No weighting - // 1 - Edges only - // 2 - Vertices only - // 3 - Both edges and vertices - int wgtflag = 0; - - // Numbering scheme - // 0 - Cstyle - // 1 - Fortran - int numflag = 0; - - // Number of weights that each vertex has; - int ncon = 1; - - // Number of sub-domains desired; - int nparts = num_procs; - - // Fraction of vertex weight distributed to each subdomain - // Array size ncon x nparts - // For balanced sub-domains, each part gets 1/nparts - std::vector tpwgts(nparts, 1.0/nparts); - - // Imbalance tolerance for each vertex weight - // Array size ncon - // Perfect balance: 1 - // Perfect imbalance: nparts - // Recommended: 1.05 - std::vector ubvec(1, 1.05); - - // Additional Options: - // Options[0] = 0 (default values) or 1 (specify options[1], options[2]) - // Options[1]: levels of info to be returned (0-default, 1-timing info) - // Options[2]: random number seed for routine - std::vector options(3, 0); - - // Return value: Number of edges that are cut by partitioning - int edgecut; - - // Return value: Array (size of local_num_rows) of partition for each row - int* part = NULL; - if (A->local_num_rows) - part = new int[A->local_num_rows]; - - int err = ParMETIS_V3_PartKway(vtxdist, xadj.data(), adjncy.data(), vwgt, adjwgt, - &wgtflag, &numflag, &ncon, &nparts, tpwgts.data(), ubvec.data(), options.data(), - &edgecut, part, &comm); - - return part; -} - -#endif diff --git a/raptor/util/linalg/external/ptscotch_wrapper.hpp b/raptor/util/linalg/external/ptscotch_wrapper.hpp deleted file mode 100644 index fd83bb2f..00000000 --- a/raptor/util/linalg/external/ptscotch_wrapper.hpp +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#ifndef RAPTOR_GALLERY_PTSCOTCH_HPP -#define RAPTOR_GALLERY_PTSCOTCH_HPP - -#include -#include "core/types.hpp" -#include "ptscotch.h" -#include -#include -#include "core/par_matrix.hpp" -#include - -using 
namespace raptor; - -int* ptscotch_partition(ParCSRMatrix* A) -{ - int rank, num_procs; - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - RAPtor_MPI_Comm_size(RAPtor_MPI_COMM_WORLD, &num_procs); - - // Variables for Graph Partitioning - SCOTCH_Num* partition = new SCOTCH_Num[A->local_num_rows + 2]; - SCOTCH_Num baseval = 0; - SCOTCH_Num vertlocnbr = A->local_num_rows; - SCOTCH_Num vertlocmax = A->local_num_rows; - SCOTCH_Num* vertloctab = new SCOTCH_Num[vertlocnbr + 2]; - SCOTCH_Num* vendloctab = &vertloctab[1]; - SCOTCH_Num* veloloctab = NULL; - SCOTCH_Num* vlblloctab = NULL; - SCOTCH_Num edgelocnbr = A->local_nnz; - SCOTCH_Num edgelocsiz = A->local_nnz; - SCOTCH_Num* edgeloctab = new SCOTCH_Num[edgelocsiz + 1]; - SCOTCH_Num* edgegsttab = NULL; - SCOTCH_Num* edloloctab = NULL; - - int row_start, row_end; - int idx, gbl_idx, ctr; - int err; - - // Find matrix edge indices for PT Scotch - ctr = 0; - vertloctab[0] = 0; - for (int row = 0; row < A->local_num_rows; row++) - { - row_start = A->on_proc->idx1[row]; - row_end = A->on_proc->idx1[row+1]; - for (int j = row_start; j < row_end; j++) - { - idx = A->on_proc->idx2[j]; - if (idx == row) continue; - gbl_idx = A->on_proc_column_map[idx]; - edgeloctab[ctr] = gbl_idx; - ctr++; - } - - if (A->off_proc_num_cols) - { - row_start = A->off_proc->idx1[row]; - row_end = A->off_proc->idx1[row+1]; - for (int j = row_start; j < row_end; j++) - { - idx = A->off_proc->idx2[j]; - gbl_idx = A->off_proc_column_map[idx]; - edgeloctab[ctr] = gbl_idx; - ctr++; - } - } - vertloctab[row+1] = ctr; - } - edgelocnbr = ctr; - edgelocsiz = ctr; - - - SCOTCH_Dgraph dgraphdata; - SCOTCH_Strat stratdata; - SCOTCH_Arch archdata; - - RAPtor_MPI_Comm comm; - RAPtor_MPI_Comm_dup(RAPtor_MPI_COMM_WORLD, &comm); - - SCOTCH_dgraphInit(&dgraphdata, comm); - SCOTCH_dgraphBuild(&dgraphdata, baseval, vertlocnbr, vertlocmax, - vertloctab, vendloctab, veloloctab, vlblloctab, edgelocnbr, edgelocsiz, - edgeloctab, edgegsttab, edloloctab); - 
SCOTCH_dgraphCheck(&dgraphdata); - - SCOTCH_stratInit(&stratdata); - SCOTCH_dgraphPart(&dgraphdata, num_procs, &stratdata, partition); - - SCOTCH_stratExit(&stratdata); - SCOTCH_dgraphExit(&dgraphdata); - - delete[] vertloctab; - delete[] edgeloctab; - - RAPtor_MPI_Comm_free(&comm); - - return partition; -} - - -#endif - diff --git a/raptor/util/linalg/matmult.cpp b/raptor/util/linalg/matmult.cpp deleted file mode 100644 index 3552ae6d..00000000 --- a/raptor/util/linalg/matmult.cpp +++ /dev/null @@ -1,352 +0,0 @@ -#include "raptor/core/matrix.hpp" - -using namespace raptor; - -// Declare Private Methods -std::vector& form_new(const CSRMatrix* A, const CSRMatrix* B, - CSRMatrix** C_ptr, std::vector& A_vals); -std::vector& form_new(const CSRMatrix* A, const CSRMatrix* B, - CSRMatrix** C_ptr, std::vector& A_vals); -std::vector& form_new(const CSCMatrix* A, const CSRMatrix* B, - CSRMatrix** C_ptr, std::vector& A_vals); -std::vector& form_new(const CSCMatrix* A, const CSRMatrix* B, - CSRMatrix** C_ptr, std::vector& A_vals); -void init_sums(std::vector& sums, int size, int b_size); -void init_sums(std::vector& sums, int size, int b_size); -void zero_sum(double* sum, int b_size); -void zero_sum(double** sum, int b_size); -void finalize_sums(std::vector& sums); -void finalize_sums(std::vector& sums); - - -std::vector& form_new(const CSRMatrix* A, const CSRMatrix* B, - CSRMatrix** C_ptr, std::vector& A_vals) -{ - CSRMatrix* C = new CSRMatrix(A->n_rows, B->n_cols); - *C_ptr = C; - return C->vals; -} -std::vector& form_new(const CSRMatrix* A, const CSRMatrix* B, - CSRMatrix** C_ptr, std::vector& A_vals) -{ - BSRMatrix* C = new BSRMatrix(A->n_rows, B->n_cols, - A->b_rows, B->b_cols); - *C_ptr = C; - return C->block_vals; -} -std::vector& form_new(const CSCMatrix* A, const CSRMatrix* B, - CSRMatrix** C_ptr, std::vector& A_vals) -{ - CSRMatrix* C = new CSRMatrix(A->n_cols, B->n_cols); - *C_ptr = C; - return C->vals; -} -std::vector& form_new(const CSCMatrix* A, const CSRMatrix* 
B, - CSRMatrix** C_ptr, std::vector& A_vals) -{ - BSRMatrix* C = new BSRMatrix(A->n_cols, B->n_cols, - A->b_cols, B->b_cols); - *C_ptr = C; - return C->block_vals; -} - -void init_sums(std::vector& sums, int size, int b_size) -{ - sums.resize(size, 0); -} -void init_sums(std::vector& sums, int size, int b_size) -{ - for (int i = 0; i < size; i++) - { - sums.emplace_back(new double[b_size]); - for (int j = 0; j < b_size; j++) - sums[i][j] = 0.0; - } -} - -void zero_sum(double* sum, int b_size) -{ - *sum = 0; -} -void zero_sum(double** sum, int b_size) -{ - (*sum) = new double[b_size]; - for (int i = 0; i < b_size; i++) - (*sum)[i] = 0; -} - -void finalize_sums(std::vector& sums) -{ - return; -} -void finalize_sums(std::vector& sums) -{ - for (std::vector::iterator it = sums.begin(); - it != sums.end(); ++it) - delete[] *it; -} - -template -CSRMatrix* spgemm_helper(const CSRMatrix* A, const CSRMatrix* B, - std::vector& A_vals, std::vector& B_vals, - int* B_to_C = NULL) -{ - std::vector next(B->n_cols, -1); - std::vector sums; - init_sums(sums, B->n_cols, B->b_size); - - CSRMatrix* C = NULL; - std::vector& C_vals = form_new(A, B, &C, A_vals); - C->reserve_size(1.5*A->nnz); - - C->idx1[0] = 0; - for (int i = 0; i < A->n_rows; i++) - { - int head = -2; - int length = 0; - int row_start_A = A->idx1[i]; - int row_end_A = A->idx1[i+1]; - for (int j = row_start_A; j < row_end_A; j++) - { - int col_A = A->idx2[j]; - T val_A = A_vals[j]; - int row_start_B = B->idx1[col_A]; - int row_end_B = B->idx1[col_A+1]; - for (int k = row_start_B; k < row_end_B; k++) - { - int col_B = B->idx2[k]; - A->mult_vals(val_A, B_vals[k], &sums[col_B], - A->b_rows, B->b_cols, A->b_cols); - if (next[col_B] == -1) - { - next[col_B] = head; - head = col_B; - length++; - } - } - } - for (int j = 0; j < length; j++) - { - double val = A->abs_val(sums[head]); - if (val > zero_tol) - { - if (B_to_C) - { - C->idx2.emplace_back(B_to_C[head]); - } - else - { - C->idx2.emplace_back(head); - } - 
C_vals.emplace_back(sums[head]); - } - int tmp = head; - head = next[head]; - next[tmp] = -1; - zero_sum(&sums[tmp], A->b_size); - } - C->idx1[i+1] = C->idx2.size(); - } - C->nnz = C->idx2.size(); - - finalize_sums(sums); - - return C; -} - -template -CSRMatrix* spgemm_T_helper(const CSCMatrix* A, const CSRMatrix* B, - std::vector& A_vals, std::vector& B_vals, - int* C_map = NULL) -{ - CSRMatrix* C; - std::vector& C_vals = form_new(A, B, &C, A_vals); - C->reserve_size(1.5*B->nnz); - - std::vector next(B->n_cols, -1); - std::vector sums; - init_sums(sums, B->n_cols, A->b_size); - - C->idx1[0] = 0; - for (int i = 0; i < A->n_cols; i++) - { - int head = -2; - int length = 0; - int row_start_AT = A->idx1[i]; - int row_end_AT = A->idx1[i+1]; - for (int j = row_start_AT; j < row_end_AT; j++) - { - int col_AT = A->idx2[j]; - T val_AT = A_vals[j]; - int row_start = B->idx1[col_AT]; - int row_end = B->idx1[col_AT+1]; - for (int k = row_start; k < row_end; k++) - { - int col = B->idx2[k]; - A->mult_T_vals(val_AT, B_vals[k], &sums[col], - A->b_cols, B->b_cols, A->b_rows); - if (next[col] == -1) - { - next[col] = head; - head = col; - length++; - } - } - } - for (int j = 0; j < length; j++) - { - if (A->abs_val(sums[head]) > zero_tol) - { - if (C_map) - { - C->idx2.emplace_back(C_map[head]); - } - else - { - C->idx2.emplace_back(head); - } - C_vals.emplace_back(sums[head]); - } - int tmp = head; - head = next[head]; - next[tmp] = -1; - zero_sum(&sums[tmp], A->b_size); - } - C->idx1[i+1] = C->idx2.size(); - } - C->nnz = C->idx2.size(); - - finalize_sums(sums); - - return C; -} - - -CSRMatrix* Matrix::mult(CSRMatrix* B, int* B_to_C) -{ - return spgemm(B, B_to_C); -} -CSRMatrix* Matrix::mult(CSCMatrix* B, int* B_to_C) -{ - CSRMatrix* B_csr = B->to_CSR(); - CSRMatrix* C = spgemm(B_csr, B_to_C); - delete B_csr; - return C; -} -CSRMatrix* Matrix::mult(COOMatrix* B, int* B_to_C) -{ - CSRMatrix* B_csr = B->to_CSR(); - CSRMatrix* C = spgemm(B_csr, B_to_C); - delete B_csr; - return C; 
-} - -CSRMatrix* Matrix::mult_T(CSCMatrix* A, int* C_map) -{ - return spgemm_T(A, C_map); -} -CSRMatrix* Matrix::mult_T(CSRMatrix* A, int* C_map) -{ - CSCMatrix* A_csc = A->to_CSC(); - CSRMatrix* C = spgemm_T(A_csc, C_map); - delete A_csc; - return C; -} -CSRMatrix* Matrix::mult_T(COOMatrix* A, int* C_map) -{ - CSCMatrix* A_csc = A->to_CSC(); - CSRMatrix* C = spgemm_T(A_csc, C_map); - delete A_csc; - return C; -} - -CSRMatrix* CSRMatrix::spgemm(CSRMatrix* B, int* B_to_C) -{ - return spgemm_helper(this, B, vals, B->vals, B_to_C); -} -BSRMatrix* BSRMatrix::spgemm(CSRMatrix* B, int* B_to_C) -{ - BSRMatrix* B_bsr = (BSRMatrix*) B; - return (BSRMatrix*) spgemm_helper(this, B_bsr, block_vals, - B_bsr->block_vals, B_to_C); -} -CSRMatrix* COOMatrix::spgemm(CSRMatrix* B, int* B_to_C) -{ - CSRMatrix* A_csr = to_CSR(); - CSRMatrix* C = spgemm_helper(A_csr, B, A_csr->vals, B->vals, - B_to_C); - delete A_csr; - return C; -} -BSRMatrix* BCOOMatrix::spgemm(CSRMatrix* B, int* B_to_C) -{ - BSRMatrix* A_bsr = (BSRMatrix*) to_BSR(); - BSRMatrix* B_bsr = (BSRMatrix*) B; - BSRMatrix* C = (BSRMatrix*) spgemm_helper(A_bsr, B_bsr, - A_bsr->block_vals, B_bsr->block_vals, B_to_C); - delete A_bsr; - return C; -} -CSRMatrix* CSCMatrix::spgemm(CSRMatrix* B, int* B_to_C) -{ - CSRMatrix* A_csr = to_CSR(); - CSRMatrix* C = spgemm_helper(A_csr, B, A_csr->vals, B->vals, - B_to_C); - delete A_csr; - return C; -} -BSRMatrix* BSCMatrix::spgemm(CSRMatrix* B, int* B_to_C) -{ - BSRMatrix* A_bsr = (BSRMatrix*) to_BSR(); - BSRMatrix* B_bsr = (BSRMatrix*) B; - BSRMatrix* C = (BSRMatrix*) spgemm_helper(A_bsr, B_bsr, - A_bsr->block_vals, B_bsr->block_vals, B_to_C); - delete A_bsr; - return C; -} - - -CSRMatrix* CSRMatrix::spgemm_T(CSCMatrix* A, int* C_map) -{ - return spgemm_T_helper(A, this, A->vals, vals, C_map); -} -BSRMatrix* BSRMatrix::spgemm_T(CSCMatrix* A, int* C_map) -{ - BSCMatrix* A_bsc = (BSCMatrix*) A; - return (BSRMatrix*) spgemm_T_helper(A_bsc, this, - A_bsc->block_vals, block_vals, C_map); -} 
-CSRMatrix* COOMatrix::spgemm_T(CSCMatrix* A, int* C_map) -{ - CSRMatrix* B_csr = to_CSR(); - CSRMatrix* C = spgemm_T_helper(A, B_csr, A->vals, - B_csr->vals, C_map); - delete B_csr; - return C; -} -BSRMatrix* BCOOMatrix::spgemm_T(CSCMatrix* A, int* C_map) -{ - BSCMatrix* A_bsc = (BSCMatrix*) A; - BSRMatrix* B_bsr = (BSRMatrix*) to_BSR(); - BSRMatrix* C = (BSRMatrix*) spgemm_T_helper(A_bsc, B_bsr, - A_bsc->block_vals, B_bsr->block_vals, C_map); - delete B_bsr; - return C; -} -CSRMatrix* CSCMatrix::spgemm_T(CSCMatrix* A, int* C_map) -{ - CSRMatrix* B_csr = to_CSR(); - CSRMatrix* C = spgemm_T_helper(A, B_csr, A->vals, - B_csr->vals, C_map); - delete B_csr; - return C; -} -BSRMatrix* BSCMatrix::spgemm_T(CSCMatrix* A, int* C_map) -{ - BSCMatrix* A_bsc = (BSCMatrix*) A; - BSRMatrix* B_bsr = (BSRMatrix*) to_BSR(); - BSRMatrix* C = (BSRMatrix*) spgemm_T_helper(A_bsc, B_bsr, - A_bsc->block_vals, B_bsr->block_vals, C_map); - delete B_bsr; - return C; -} diff --git a/raptor/util/linalg/par_add.cpp b/raptor/util/linalg/par_add.cpp deleted file mode 100644 index 73caa115..00000000 --- a/raptor/util/linalg/par_add.cpp +++ /dev/null @@ -1,309 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#include "assert.h" -#include "raptor/core/par_matrix.hpp" - -using namespace raptor; - -// TODO -- currently assumes partitions are the same -ParMatrix* ParMatrix::add(ParCSRMatrix* B) -{ - return NULL; -} -ParMatrix* ParMatrix::subtract(ParCSRMatrix* B) -{ - return NULL; -} - -ParCSRMatrix* ParCSRMatrix::add(ParCSRMatrix* B) -{ - ParCSRMatrix* C = new ParCSRMatrix(partition, global_num_rows, global_num_cols, - local_num_rows, on_proc_num_cols, 0); - int start, end; - - std::vector off_proc_to_new; - std::vector B_off_proc_to_new; - if (off_proc_num_cols) off_proc_to_new.resize(off_proc_num_cols, 0); - if (B->off_proc_num_cols) B_off_proc_to_new.resize(B->off_proc_num_cols, 0); - - int ctr = 0; - int ctr_B = 
0; - int global_col = 0; - int global_col_B = 0; - while (ctr < off_proc_num_cols || ctr_B < B->off_proc_num_cols) - { - if (ctr < off_proc_num_cols) global_col = off_proc_column_map[ctr]; - else global_col = partition->global_num_cols; - - if (ctr_B < B->off_proc_num_cols) global_col_B = B->off_proc_column_map[ctr_B]; - else global_col_B = B->partition->global_num_cols; - - if (global_col == global_col_B) - { - off_proc_to_new[ctr++] = C->off_proc_column_map.size(); - B_off_proc_to_new[ctr_B++] = C->off_proc_column_map.size(); - C->off_proc_column_map.emplace_back(global_col); - } - else if (global_col < global_col_B) - { - off_proc_to_new[ctr++] = C->off_proc_column_map.size(); - C->off_proc_column_map.emplace_back(global_col); - } - else - { - B_off_proc_to_new[ctr_B++] = C->off_proc_column_map.size(); - C->off_proc_column_map.emplace_back(global_col_B); - } - } - C->off_proc_num_cols = C->off_proc_column_map.size(); - - - C->on_proc->idx1[0] = 0; - C->off_proc->idx1[0] = 0; - int on_nnz = on_proc->nnz + B->on_proc->nnz; - int off_nnz = off_proc->nnz + B->off_proc->nnz; - C->on_proc->idx2.resize(on_nnz); - C->on_proc->vals.resize(on_nnz); - C->off_proc->idx2.resize(off_nnz); - C->off_proc->vals.resize(off_nnz); - on_nnz = 0; - off_nnz = 0; - for (int i = 0; i < local_num_rows; i++) - { - // Add on_proc column indices and values - start = on_proc->idx1[i]; - end = on_proc->idx1[i+1]; - std::copy(on_proc->idx2.begin() + start, - on_proc->idx2.begin() + end, - C->on_proc->idx2.begin() + on_nnz); - std::copy(on_proc->vals.begin() + start, - on_proc->vals.begin() + end, - C->on_proc->vals.begin() + on_nnz); - on_nnz += (end - start); - - // Add on_proc columns and values from B - start = B->on_proc->idx1[i]; - end = B->on_proc->idx1[i+1]; - std::copy(B->on_proc->idx2.begin() + start, - B->on_proc->idx2.begin() + end, - C->on_proc->idx2.begin() + on_nnz); - std::copy(B->on_proc->vals.begin() + start, - B->on_proc->vals.begin() + end, - C->on_proc->vals.begin() + 
on_nnz); - on_nnz += (end - start); - - // Update rowptr - C->on_proc->idx1[i+1] = on_nnz; - - - // Add off_proc columns and values - start = off_proc->idx1[i]; - end = off_proc->idx1[i+1]; - std::copy(off_proc->idx2.begin() + start, - off_proc->idx2.begin() + end, - C->off_proc->idx2.begin() + off_nnz); - std::copy(off_proc->vals.begin() + start, - off_proc->vals.begin() + end, - C->off_proc->vals.begin() + off_nnz); - for (std::vector::iterator it = C->off_proc->idx2.begin() + off_nnz; - it != C->off_proc->idx2.begin() + off_nnz + (end - start); ++it) - { - *it = off_proc_to_new[*it]; - } - off_nnz += (end - start); - - // Add off_proc columns and values from B - start = B->off_proc->idx1[i]; - end = B->off_proc->idx1[i+1]; - std::copy(B->off_proc->idx2.begin() + start, - B->off_proc->idx2.begin() + end, - C->off_proc->idx2.begin() + off_nnz); - std::copy(B->off_proc->vals.begin() + start, - B->off_proc->vals.begin() + end, - C->off_proc->vals.begin() + off_nnz); - for (std::vector::iterator it = C->off_proc->idx2.begin() + off_nnz; - it != C->off_proc->idx2.begin() + off_nnz + (end - start); ++it) - { - *it = off_proc_to_new[*it]; - } - off_nnz += (end - start); - - // Update rowptr - C->off_proc->idx1[i+1] = off_nnz; - } - C->on_proc->nnz = C->on_proc->idx2.size(); - C->off_proc->nnz = C->off_proc->idx2.size(); - - C->on_proc_column_map.resize(on_proc_column_map.size()); - std::copy(on_proc_column_map.begin(), on_proc_column_map.end(), - C->on_proc_column_map.begin()); - C->local_row_map.resize(local_row_map.size()); - std::copy(local_row_map.begin(), local_row_map.end(), - C->local_row_map.begin()); - - C->on_proc->sort(); - C->on_proc->remove_duplicates(); - C->on_proc->move_diag(); - - C->off_proc->sort(); - C->off_proc->remove_duplicates(); - - if (C->off_proc_num_cols) - { - std::vector new_col(C->off_proc_num_cols, 0); - for (std::vector::iterator it = C->off_proc->idx2.begin(); - it != C->off_proc->idx2.end(); ++it) - { - new_col[*it] = 1; - } - ctr = 0; 
- for (int i = 0; i < C->off_proc_num_cols; i++) - { - if (new_col[i]) - new_col[i] = ctr++; - else - new_col[i] = -1; - } - C->off_proc_num_cols = ctr; - C->off_proc->n_cols = ctr; - C->off_proc_column_map.resize(ctr); - - for (std::vector::iterator it = C->off_proc->idx2.begin(); - it != C->off_proc->idx2.end(); ++it) - { - *it = new_col[*it]; - } - } - - C->local_nnz = C->on_proc->nnz + C->off_proc->nnz; - - return C; -} - - -ParCSRMatrix* ParCSRMatrix::subtract(ParCSRMatrix* B) -{ - ParCSRMatrix* C = new ParCSRMatrix(partition, global_num_rows, global_num_cols, - local_num_rows, on_proc_num_cols, 0); - int start, end; - - std::vector off_proc_to_new; - std::vector B_off_proc_to_new; - if (off_proc_num_cols) off_proc_to_new.resize(off_proc_num_cols, 0); - if (B->off_proc_num_cols) B_off_proc_to_new.resize(B->off_proc_num_cols, 0); - - int ctr = 0; - int ctr_B = 0; - int global_col = 0; - int global_col_B = 0; - while (ctr < off_proc_num_cols || ctr_B < B->off_proc_num_cols) - { - if (ctr < off_proc_num_cols) global_col = off_proc_column_map[ctr]; - else global_col = partition->global_num_cols; - - if (ctr_B < B->off_proc_num_cols) global_col_B = B->off_proc_column_map[ctr_B]; - else global_col_B = B->partition->global_num_cols; - - if (global_col == global_col_B) - { - off_proc_to_new[ctr++] = C->off_proc_column_map.size(); - B_off_proc_to_new[ctr_B++] = C->off_proc_column_map.size(); - C->off_proc_column_map.emplace_back(global_col); - } - else if (global_col < global_col_B) - { - off_proc_to_new[ctr++] = C->off_proc_column_map.size(); - C->off_proc_column_map.emplace_back(global_col); - } - else - { - B_off_proc_to_new[ctr_B++] = C->off_proc_column_map.size(); - C->off_proc_column_map.emplace_back(global_col_B); - } - } - C->off_proc_num_cols = C->off_proc_column_map.size(); - - - C->on_proc->idx1[0] = 0; - C->off_proc->idx1[0] = 0; - for (int i = 0; i < local_num_rows; i++) - { - start = on_proc->idx1[i]; - end = on_proc->idx1[i+1]; - for (int j = start; j < 
end; j++) - { - C->on_proc->idx2.emplace_back(on_proc->idx2[j]); - C->on_proc->vals.emplace_back(on_proc->vals[j]); - } - start = B->on_proc->idx1[i]; - end = B->on_proc->idx1[i+1]; - for (int j = start; j < end; j++) - { - C->on_proc->idx2.emplace_back(B->on_proc->idx2[j]); - C->on_proc->vals.emplace_back(-B->on_proc->vals[j]); - } - C->on_proc->idx1[i+1] = C->on_proc->idx2.size(); - - - start = off_proc->idx1[i]; - end = off_proc->idx1[i+1]; - for (int j = start; j < end; j++) - { - C->off_proc->idx2.emplace_back(off_proc_to_new[off_proc->idx2[j]]); - C->off_proc->vals.emplace_back(off_proc->vals[j]); - } - start = B->off_proc->idx1[i]; - end = B->off_proc->idx1[i+1]; - for (int j = start; j < end; j++) - { - C->off_proc->idx2.emplace_back(B_off_proc_to_new[B->off_proc->idx2[j]]); - C->off_proc->vals.emplace_back(-B->off_proc->vals[j]); - } - C->off_proc->idx1[i+1] = C->off_proc->idx2.size(); - } - C->on_proc->nnz = C->on_proc->idx2.size(); - C->off_proc->nnz = C->off_proc->idx2.size(); - - C->on_proc_column_map.resize(on_proc_column_map.size()); - std::copy(on_proc_column_map.begin(), on_proc_column_map.end(), C->on_proc_column_map.begin()); - C->local_row_map.resize(local_row_map.size()); - std::copy(local_row_map.begin(), local_row_map.end(), C->local_row_map.begin()); - - C->on_proc->sort(); - C->on_proc->remove_duplicates(); - C->on_proc->move_diag(); - - C->off_proc->sort(); - C->off_proc->remove_duplicates(); - - if (C->off_proc_num_cols) - { - std::vector new_col(C->off_proc_num_cols, 0); - for (std::vector::iterator it = C->off_proc->idx2.begin(); - it != C->off_proc->idx2.end(); ++it) - { - new_col[*it] = 1; - } - ctr = 0; - for (int i = 0; i < C->off_proc_num_cols; i++) - { - if (new_col[i]) - new_col[i] = ctr++; - else - new_col[i] = -1; - } - C->off_proc_num_cols = ctr; - C->off_proc->n_cols = ctr; - C->off_proc_column_map.resize(ctr); - - for (std::vector::iterator it = C->off_proc->idx2.begin(); - it != C->off_proc->idx2.end(); ++it) - { - *it = 
new_col[*it]; - } - } - - C->local_nnz = C->on_proc->nnz + C->off_proc->nnz; - - return C; -} diff --git a/raptor/util/linalg/par_matmult.cpp b/raptor/util/linalg/par_matmult.cpp deleted file mode 100644 index ae65e0f1..00000000 --- a/raptor/util/linalg/par_matmult.cpp +++ /dev/null @@ -1,563 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#include "raptor/core/par_matrix.hpp" - -using namespace raptor; - -// Declare Private Methods -ParCSRMatrix* init_mat(ParCSCMatrix* A); -ParCSRMatrix* init_mat(ParCSRMatrix* A); -ParBSRMatrix* init_mat(ParBSRMatrix* A); -ParBSRMatrix* init_mat(ParBSCMatrix* A); - -ParCSRMatrix* init_mat(ParCSCMatrix* A) -{ - return new ParCSRMatrix(A->partition); -} -ParCSRMatrix* init_mat(ParCSRMatrix* A) -{ - return new ParCSRMatrix(A->partition); -} -template -ParCSRMatrix* init_mat(ParCSRMatrix* A, T* B) -{ - Partition* part = new Partition(A->partition, B->partition); - ParCSRMatrix* C = new ParCSRMatrix(part); - part->num_shared = 0; - return C; -} -ParBSRMatrix* init_mat(ParBSRMatrix* A) -{ - return new ParBSRMatrix(A->partition, A->on_proc->b_rows, A->on_proc->b_cols); -} -ParBSRMatrix* init_mat(ParBSCMatrix* A) -{ - return new ParBSRMatrix(A->partition, A->on_proc->b_rows, A->on_proc->b_cols); -} -template -ParBSRMatrix* init_mat(ParBSRMatrix* A, T* B) -{ - Partition* part = new Partition(A->partition, B->partition); - ParBSRMatrix* C = new ParBSRMatrix(part, A->on_proc->b_rows, A->on_proc->b_cols); - part->num_shared = 0; - return C; -} -template -ParCSRMatrix* init_matrix(T* A, U* B) -{ - ParCSRMatrix* C; - - if (A->partition == B->partition) - { - C = init_mat(A); - } - else - { - if (A->partition->global_num_rows == B->partition->global_num_rows && - A->partition->local_num_rows == B->partition->local_num_rows && - A->partition->first_local_row == B->partition->first_local_row && - A->partition->last_local_row == B->partition->last_local_row) - { - 
C = init_mat(B); - } - else if (A->partition->global_num_cols == B->partition->global_num_cols && - A->partition->local_num_cols == B->partition->local_num_cols && - A->partition->first_local_col == B->partition->first_local_col && - A->partition->last_local_col == B->partition->last_local_col) - { - C = init_mat(A); - } - else - { - C = init_mat(A, B); - } - } - - return C; -} - -ParCSRMatrix* ParCSRMatrix::mult(ParCSRMatrix* B, bool tap) -{ - if (tap) - { - return this->tap_mult(B); - } - - // Check that communication package has been initialized - if (comm == NULL) - { - comm = new ParComm(partition, off_proc_column_map, on_proc_column_map); - } - - // Initialize C (matrix to be returned) - ParCSRMatrix* C = init_matrix(this, B); - std::vector send_buffer; - - // Communicate data and multiply - comm->init_par_mat_comm(B, send_buffer); - - // Fully Local Computation - CSRMatrix* C_on_on = on_proc->mult((CSRMatrix*) B->on_proc); - CSRMatrix* C_on_off = on_proc->mult((CSRMatrix*) B->off_proc); - - CSRMatrix* recv_mat = comm->complete_mat_comm(); - - mult_helper(B, C, recv_mat, C_on_on, C_on_off); - - delete C_on_on; - delete C_on_off; - delete recv_mat; - - // Return matrix containing product - return C; -} - -ParCSRMatrix* ParCSRMatrix::tap_mult(ParCSRMatrix* B) -{ - // Check that communication package has been initialized - if (tap_mat_comm == NULL) - { - // Always 2-step - tap_mat_comm = new TAPComm(partition, off_proc_column_map, - on_proc_column_map, false); - } - - // Initialize C (matrix to be returned) - ParCSRMatrix* C = init_matrix(this, B);; - std::vector send_buffer; - - // Communicate data and multiply - tap_mat_comm->init_par_mat_comm(B, send_buffer); - - // Fully Local Computation - CSRMatrix* C_on_on = on_proc->mult((CSRMatrix*) B->on_proc); - CSRMatrix* C_on_off = on_proc->mult((CSRMatrix*) B->off_proc); - - CSRMatrix* recv_mat = tap_mat_comm->complete_mat_comm(); - - mult_helper(B, C, recv_mat, C_on_on, C_on_off); - delete C_on_on; - delete 
C_on_off; - delete recv_mat; - - // Return matrix containing product - return C; -} - -ParCSRMatrix* ParCSRMatrix::mult_T(ParCSRMatrix* A, bool tap) -{ - ParCSCMatrix* Acsc = A->to_ParCSC(); - ParCSRMatrix* C = this->mult_T(Acsc, tap); - delete Acsc; - return C; -} - -ParCSRMatrix* ParCSRMatrix::tap_mult_T(ParCSRMatrix* A) -{ - ParCSCMatrix* Acsc = A->to_ParCSC(); - ParCSRMatrix* C = this->tap_mult_T(Acsc); - delete Acsc; - return C; -} - -ParCSRMatrix* ParCSRMatrix::mult_T(ParCSCMatrix* A, bool tap) -{ - if (tap) - { - return this->tap_mult_T(A); - } - - if (A->comm == NULL) - { - A->comm = new ParComm(A->partition, A->off_proc_column_map, A->on_proc_column_map); - } - - // Initialize C (matrix to be returned) - ParCSRMatrix* C = init_matrix(this, A);; - - CSRMatrix* Ctmp = mult_T_partial(A); - std::vector send_buffer; - - A->comm->init_mat_comm_T(send_buffer, Ctmp->idx1, Ctmp->idx2, - Ctmp->vals); - - CSRMatrix* C_on_on = on_proc->mult_T((CSCMatrix*) A->on_proc); - CSRMatrix* C_off_on = off_proc->mult_T((CSCMatrix*) A->on_proc); - - CSRMatrix* recv_mat = A->comm->complete_mat_comm_T(A->on_proc_num_cols); - - mult_T_combine(A, C, recv_mat, C_on_on, C_off_on); - - // Clean up - delete Ctmp; - delete C_on_on; - delete C_off_on; - delete recv_mat; - - // Return matrix containing product - return C; -} - -ParCSRMatrix* ParCSRMatrix::tap_mult_T(ParCSCMatrix* A) -{ - if (A->tap_mat_comm == NULL) - { - A->tap_mat_comm = new TAPComm(A->partition, A->off_proc_column_map, - A->on_proc_column_map, false); - } - - // Initialize C (matrix to be returned) - ParCSRMatrix* C = init_matrix(this, A); - - CSRMatrix* Ctmp = mult_T_partial(A); - std::vector send_buffer; - - A->tap_mat_comm->init_mat_comm_T(send_buffer, Ctmp->idx1, Ctmp->idx2, - Ctmp->vals); - - CSRMatrix* C_on_on = on_proc->mult_T((CSCMatrix*) A->on_proc); - CSRMatrix* C_off_on = off_proc->mult_T((CSCMatrix*) A->on_proc); - - CSRMatrix* recv_mat = A->tap_mat_comm->complete_mat_comm_T(A->on_proc_num_cols); - - 
mult_T_combine(A, C, recv_mat, C_on_on, C_off_on); - - // Clean up - delete Ctmp; - delete recv_mat; - delete C_on_on; - delete C_off_on; - - // Return matrix containing product - return C; -} - -ParMatrix* ParMatrix::mult(ParCSRMatrix* B, bool tap) -{ - int rank; - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - if (rank == 0) - printf("Multiplication is not implemented for these ParMatrix types.\n"); - return NULL; -} - -void ParCSRMatrix::mult_helper(ParCSRMatrix* B, ParCSRMatrix* C, - CSRMatrix* recv_mat, CSRMatrix* C_on_on, CSRMatrix* C_on_off) -{ - // Set dimensions of C - C->global_num_rows = global_num_rows; - C->global_num_cols = B->global_num_cols; - C->local_num_rows = local_num_rows; - - C->on_proc_column_map = B->get_on_proc_column_map(); - C->local_row_map = get_local_row_map(); - C->on_proc_num_cols = C->on_proc_column_map.size(); - - // Initialize nnz as 0 (will increment this as nonzeros are added) - C->local_nnz = 0; - - // Declare Variables - int row_start, row_end; - int global_col; - - // Split recv_mat into on and off proc portions - CSRMatrix* recv_on = new CSRMatrix(recv_mat->n_rows, -1); - CSRMatrix* recv_off = new CSRMatrix(recv_mat->n_rows, -1); - - int* part_to_col = B->map_partition_to_local(); - recv_on->idx1[0] = 0; - recv_off->idx1[0] = 0; - for (int i = 0; i < recv_mat->n_rows; i++) - { - row_start = recv_mat->idx1[i]; - row_end = recv_mat->idx1[i+1]; - for (int j = row_start; j < row_end; j++) - { - global_col = recv_mat->idx2[j]; - if (global_col < B->partition->first_local_col || - global_col > B->partition->last_local_col) - { - recv_off->idx2.emplace_back(global_col); - recv_off->vals.emplace_back(recv_mat->vals[j]); - } - else - { - recv_on->idx2.emplace_back(part_to_col[global_col - - B->partition->first_local_col]); - recv_on->vals.emplace_back(recv_mat->vals[j]); - } - } - recv_on->idx1[i+1] = recv_on->idx2.size(); - recv_off->idx1[i+1] = recv_off->idx2.size(); - } - recv_on->nnz = recv_on->idx2.size(); - 
recv_off->nnz = recv_off->idx2.size(); - delete[] part_to_col; - - // Calculate global_to_C and B_to_C column maps - std::map global_to_C; - std::vector B_to_C(B->off_proc_num_cols); - - std::copy(recv_off->idx2.begin(), recv_off->idx2.end(), - std::back_inserter(C->off_proc_column_map)); - for (std::vector::iterator it = B->off_proc_column_map.begin(); - it != B->off_proc_column_map.end(); ++it) - { - C->off_proc_column_map.emplace_back(*it); - } - std::sort(C->off_proc_column_map.begin(), C->off_proc_column_map.end()); - - int prev_col = -1; - C->off_proc_num_cols = 0; - for (std::vector::iterator it = C->off_proc_column_map.begin(); - it != C->off_proc_column_map.end(); ++it) - { - if (*it != prev_col) - { - global_to_C[*it] = C->off_proc_num_cols; - C->off_proc_column_map[C->off_proc_num_cols++] = *it; - prev_col = *it; - } - } - C->off_proc_column_map.resize(C->off_proc_num_cols); - - for (int i = 0; i < B->off_proc_num_cols; i++) - { - global_col = B->off_proc_column_map[i]; - B_to_C[i] = global_to_C[global_col]; - } - for (std::vector::iterator it = recv_off->idx2.begin(); - it != recv_off->idx2.end(); ++it) - { - *it = global_to_C[*it]; - } - - for (std::vector::iterator it = C_on_off->idx2.begin(); - it != C_on_off->idx2.end(); ++it) - { - *it = B_to_C[*it]; - } - C->off_proc_num_cols = C->off_proc_column_map.size(); - recv_on->n_cols = B->on_proc->n_cols; - recv_off->n_cols = C->off_proc_num_cols; - C_on_off->n_cols = C->off_proc_num_cols; - - // Multiply A->off_proc * B->recv_on -> C_off_on - CSRMatrix* C_off_on = off_proc->mult(recv_on); - delete recv_on; - - // Multiply A->off_proc * B->recv_off -> C_off_off - CSRMatrix* C_off_off = off_proc->mult(recv_off); - delete recv_off; - - // Create C->on_proc by adding C_on_on + C_off_on - C_on_on->add_append(C_off_on, (CSRMatrix*) C->on_proc); - delete C_off_on; - - // Create C->off_proc by adding C_off_on + C_off_off - C_on_off->add_append(C_off_off, (CSRMatrix*) C->off_proc); - delete C_off_off; - - 
C->local_nnz = C->on_proc->nnz + C->off_proc->nnz; -} - -CSRMatrix* ParCSRMatrix::mult_T_partial(CSCMatrix* A_off) -{ - CSRMatrix* C_off_on = on_proc->mult_T(A_off, on_proc_column_map.data()); - CSRMatrix* C_off_off = off_proc->mult_T(A_off, off_proc_column_map.data()); - CSRMatrix* Ctmp = C_off_on->add(C_off_off, false); - - delete C_off_on; - delete C_off_off; - - return Ctmp; -} - -// A_T * self -CSRMatrix* ParCSRMatrix::mult_T_partial(ParCSCMatrix* A) -{ - // Declare Variables - return mult_T_partial((CSCMatrix*) A->off_proc); -} - -void ParCSRMatrix::mult_T_combine(ParCSCMatrix* P, ParCSRMatrix* C, CSRMatrix* recv_mat, - CSRMatrix* C_on_on, CSRMatrix* C_off_on) -{ - int start, end, ctr; - int col, col_C; - - std::vector sums; - std::vector next; - - // Split recv_mat into recv_on and recv_off - // Split recv_mat into on and off proc portions - CSRMatrix* recv_on = new CSRMatrix(recv_mat->n_rows, -1); - CSRMatrix* recv_off = new CSRMatrix(recv_mat->n_rows, -1); - for (int i = 0; i < recv_mat->n_rows; i++) - { - start = recv_mat->idx1[i]; - end = recv_mat->idx1[i+1]; - for (int j = start; j < end; j++) - { - col = recv_mat->idx2[j]; - if (col < partition->first_local_col - || col > partition->last_local_col) - { - recv_off->idx2.emplace_back(col); - recv_off->vals.emplace_back(recv_mat->vals[j]); - } - else - { - recv_on->idx2.emplace_back(col); - recv_on->vals.emplace_back(recv_mat->vals[j]); - } - } - recv_on->idx1[i+1] = recv_on->idx2.size(); - recv_off->idx1[i+1] = recv_off->idx2.size(); - } - recv_on->nnz = recv_on->idx2.size(); - recv_off->nnz = recv_off->idx2.size(); - - - // Set dimensions of C - C->global_num_rows = P->global_num_cols; // AT global rows - C->global_num_cols = global_num_cols; - C->local_num_rows = P->on_proc_num_cols; // AT local rows - - // Initialize nnz as 0 (will increment this as nonzeros are added) - C->local_nnz = 0; - - /****************************** - * Form on_proc - ******************************/ - // Resize variables in 
on_proc - C->on_proc_column_map = get_on_proc_column_map(); - C->local_row_map = P->get_on_proc_column_map(); - C->on_proc_num_cols = C->on_proc_column_map.size(); - - // Update recv_on columns (to match local cols) - int* part_to_col = map_partition_to_local(); - for (std::vector::iterator it = recv_on->idx2.begin(); - it != recv_on->idx2.end(); ++it) - { - *it = part_to_col[(*it - partition->first_local_col)]; - } - delete[] part_to_col; - - // Multiply on_proc - recv_on->n_cols = C->on_proc_num_cols; - C_on_on->add_append(recv_on, (CSRMatrix*) C->on_proc); - - /****************************** - * Form off_proc - ******************************/ - // Calculate global_to_C and map_to_C column maps - std::map global_to_C; - std::vector map_to_C; - if (off_proc_num_cols) - { - map_to_C.reserve(off_proc_num_cols); - } - - // Create set of global columns in B_off_proc and recv_mat - std::set C_col_set; - for (std::vector::iterator it = recv_off->idx2.begin(); - it != recv_off->idx2.end(); ++it) - { - C_col_set.insert(*it); - } - for (std::vector::iterator it = off_proc_column_map.begin(); - it != off_proc_column_map.end(); ++it) - { - C_col_set.insert(*it); - } - - C->off_proc_num_cols = C_col_set.size(); - if (C->off_proc_num_cols) - { - C->off_proc_column_map.reserve(C->off_proc_num_cols); - } - for (std::set::iterator it = C_col_set.begin(); - it != C_col_set.end(); ++it) - { - global_to_C[*it] = C->off_proc_column_map.size(); - C->off_proc_column_map.emplace_back(*it); - } - - // Map local off_proc_cols to C->off_proc_column_map - for (std::vector::iterator it = off_proc_column_map.begin(); - it != off_proc_column_map.end(); ++it) - { - col_C = global_to_C[*it]; - map_to_C.emplace_back(col_C); - } - - // Update recvd cols from global_col to local col in C - for (std::vector::iterator it = recv_off->idx2.begin(); - it != recv_off->idx2.end(); ++it) - { - *it = global_to_C[*it]; - } - - recv_off->n_cols = C->off_proc_num_cols; - for (std::vector::iterator it = 
C_off_on->idx2.begin(); - it != C_off_on->idx2.end(); ++it) - { - *it = map_to_C[*it]; - } - C_off_on->add_append(recv_off, (CSRMatrix*) C->off_proc); - - C->local_nnz = C->on_proc->nnz + C->off_proc->nnz; - - // Condense columns! A lot of them are zero columns... - // Could instead add global column indices, and then map to local - std::vector off_col_sizes; - std::vector col_orig_to_new; - if (C->off_proc_num_cols) - { - off_col_sizes.resize(C->off_proc_num_cols, 0); - col_orig_to_new.resize(C->off_proc_num_cols); - } - for (int i = 0; i < C->local_num_rows; i++) - { - start = C->off_proc->idx1[i]; - end = C->off_proc->idx1[i+1]; - for (int j = start; j < end; j++) - { - off_col_sizes[C->off_proc->idx2[j]]++; - } - } - ctr = 0; - for (int i = 0; i < C->off_proc_num_cols; i++) - { - if (off_col_sizes[i]) - { - col_orig_to_new[i] = ctr; - C->off_proc_column_map[ctr++] = C->off_proc_column_map[i]; - } - } - C->off_proc_num_cols = ctr; - C->off_proc->n_cols = ctr; - if (ctr) - { - C->off_proc_column_map.resize(ctr); - } - else - { - C->off_proc_column_map.clear(); - } - for (int i = 0; i < C->local_num_rows; i++) - { - start = C->off_proc->idx1[i]; - end = C->off_proc->idx1[i+1]; - for (int j = start; j < end; j++) - { - col = C->off_proc->idx2[j]; - C->off_proc->idx2[j] = col_orig_to_new[col]; - } - } - - delete recv_on; - delete recv_off; -} - diff --git a/raptor/util/linalg/par_spmv.cpp b/raptor/util/linalg/par_spmv.cpp deleted file mode 100644 index c2596955..00000000 --- a/raptor/util/linalg/par_spmv.cpp +++ /dev/null @@ -1,342 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "raptor/core/types.hpp" -#include "raptor/core/par_matrix.hpp" -#include "raptor/core/par_vector.hpp" - -#include "assert.h" - -using namespace raptor; - -/************************************************************** - ***** Parallel Matrix-Vector Multiplication - 
************************************************************** - ***** Performs parallel matrix-vector multiplication - ***** b = A*x - ***** - ***** Parameters - ***** ------------- - ***** x : ParVector* - ***** Parallel vector to be multiplied - ***** b : ParVector* - ***** Parallel vector result is returned in - **************************************************************/ -void ParMatrix::mult(ParVector& x, ParVector& b, bool tap) -{ - if (tap) - { - this->tap_mult(x, b); - return; - } - - // Check that communication package has been initialized - if (comm == NULL) - { - comm = new ParComm(partition, off_proc_column_map, on_proc_column_map); - } - - // Initialize Isends and Irecvs to communicate - // values of x - comm->init_comm(x, off_proc->b_cols); - - // Multiply the diagonal portion of the matrix, - // setting b = A_diag*x_local - if (local_num_rows) - { - on_proc->mult(x.local, b.local); - } - - // Wait for Isends and Irecvs to complete - std::vector& x_tmp = comm->complete_comm(off_proc->b_cols); - - // Multiply remaining columns, appending to previous - // solution in b (b += A_offd * x_distant) - if (off_proc_num_cols) - { - off_proc->mult_append(x_tmp, b.local); - } -} - -void ParMatrix::tap_mult(ParVector& x, ParVector& b) -{ - // Check that communication package has been initialized - if (tap_comm == NULL) - { - tap_comm = new TAPComm(partition, off_proc_column_map, on_proc_column_map); - } - - // Initialize Isends and Irecvs to communicate - // values of x - tap_comm->init_comm(x, off_proc->b_cols); - - // Multiply the diagonal portion of the matrix, - // setting b = A_diag*x_local - if (local_num_rows) - { - on_proc->mult(x.local, b.local); - } - - // Wait for Isends and Irecvs to complete - std::vector& x_tmp = tap_comm->complete_comm(off_proc->b_cols); - - // Multiply remaining columns, appending to previous - // solution in b (b += A_offd * x_distant) - if (off_proc_num_cols) - { - off_proc->mult_append(x_tmp, b.local); - } -} - -void 
ParMatrix::mult_append(ParVector& x, ParVector& b, bool tap) -{ - if (tap) - { - this->tap_mult_append(x, b); - return; - } - - // Check that communication package has been initialized - if (comm == NULL) - { - comm = new ParComm(partition, off_proc_column_map, on_proc_column_map); - } - - // Initialize Isends and Irecvs to communicate - // values of x - comm->init_comm(x, off_proc->b_cols); - - // Multiply the diagonal portion of the matrix, - // setting b = A_diag*x_local - if (local_num_rows) - { - on_proc->mult_append(x.local, b.local); - } - - // Wait for Isends and Irecvs to complete - std::vector& x_tmp = comm->complete_comm(off_proc->b_cols); - - // Multiply remaining columns, appending to previous - // solution in b (b += A_offd * x_distant) - if (off_proc_num_cols) - { - off_proc->mult_append(x_tmp, b.local); - } -} - -void ParMatrix::tap_mult_append(ParVector& x, ParVector& b) -{ - // Check that communication package has been initialized - if (tap_comm == NULL) - { - tap_comm = new TAPComm(partition, off_proc_column_map, on_proc_column_map); - } - - // Initialize Isends and Irecvs to communicate - // values of x - tap_comm->init_comm(x, off_proc->b_cols); - - // Multiply the diagonal portion of the matrix, - // setting b = A_diag*x_local - if (local_num_rows) - { - on_proc->mult_append(x.local, b.local); - } - - // Wait for Isends and Irecvs to complete - std::vector& x_tmp = tap_comm->complete_comm(off_proc->b_cols); - - // Multiply remaining columns, appending to previous - // solution in b (b += A_offd * x_distant) - if (off_proc_num_cols) - { - off_proc->mult_append(x_tmp, b.local); - } -} - -void ParMatrix::mult_T(ParVector& x, ParVector& b, bool tap) -{ - if (tap) - { - this->tap_mult_T(x, b); - return; - } - - // Check that communication package has been initialized - if (comm == NULL) - { - comm = new ParComm(partition, off_proc_column_map, on_proc_column_map); - } - - std::vector& x_tmp = comm->get_buffer(); - if ((int)x_tmp.size() < 
comm->recv_data->size_msgs * off_proc->b_cols) - x_tmp.resize(comm->recv_data->size_msgs * off_proc->b_cols); - - off_proc->mult_T(x.local, x_tmp); - - comm->init_comm_T(x_tmp, off_proc->b_cols); - - if (local_num_rows) - { - on_proc->mult_T(x.local, b.local); - } - - comm->complete_comm_T(b.local.values, off_proc->b_cols); -} - -void ParMatrix::tap_mult_T(ParVector& x, ParVector& b) -{ - // Check that communication package has been initialized - if (tap_comm == NULL) - { - tap_comm = new TAPComm(partition, off_proc_column_map, on_proc_column_map); - } - - std::vector& x_tmp = tap_comm->get_buffer(); - if ((int)x_tmp.size() < tap_comm->recv_size * off_proc->b_cols) - x_tmp.resize(tap_comm->recv_size * off_proc->b_cols); - - off_proc->mult_T(x.local, x_tmp); - - tap_comm->init_comm_T(x_tmp, off_proc->b_cols); - - if (local_num_rows) - { - on_proc->mult_T(x.local, b.local); - } - - tap_comm->complete_comm_T(b.local.values, off_proc->b_cols); -} - -void ParMatrix::residual(ParVector& x, ParVector& b, ParVector& r, bool tap) -{ - if (tap) - { - this->tap_residual(x, b, r); - return; - } - - // Check that communication package has been initialized - if (comm == NULL) - { - comm = new ParComm(partition, off_proc_column_map, on_proc_column_map); - } - - // Initialize Isends and Irecvs to communicate - // values of x - comm->init_comm(x, off_proc->b_cols); - - std::copy(b.local.values.begin(), b.local.values.end(), - r.local.values.begin()); - - // Multiply the diagonal portion of the matrix, - // setting b = A_diag*x_local - if (local_num_rows && on_proc_num_cols) - { - on_proc->residual(x.local, b.local, r.local); - } - - // Wait for Isends and Irecvs to complete - std::vector& x_tmp = comm->complete_comm(off_proc->b_cols); - - // Multiply remaining columns, appending to previous - // solution in b (b += A_offd * x_distant) - if (off_proc_num_cols) - { - off_proc->mult_append_neg(x_tmp, r.local); - } -} - -void ParMatrix::tap_residual(ParVector& x, ParVector& b, 
ParVector& r) -{ - // Check that communication package has been initialized - if (tap_comm == NULL) - { - tap_comm = new TAPComm(partition, off_proc_column_map, on_proc_column_map); - } - - // Initialize Isends and Irecvs to communicate - // values of x - tap_comm->init_comm(x, off_proc->b_cols); - - std::copy(b.local.values.begin(), b.local.values.end(), r.local.values.begin()); - - // Multiply the diagonal portion of the matrix, - // setting b = A_diag*x_local - if (local_num_rows && on_proc_num_cols) - { - on_proc->mult_append_neg(x.local, r.local); - } - - // Wait for Isends and Irecvs to complete - std::vector& x_tmp = tap_comm->complete_comm(off_proc->b_cols); - - // Multiply remaining columns, appending to previous - // solution in b (b += A_offd * x_distant) - if (off_proc_num_cols) - { - off_proc->mult_append_neg(x_tmp, r.local); - } -} - - -void ParCOOMatrix::mult(ParVector& x, ParVector& b, bool tap) -{ - ParMatrix::mult(x, b, tap); -} - -void ParCSRMatrix::mult(ParVector& x, ParVector& b, bool tap) -{ - ParMatrix::mult(x, b, tap); -} - -void ParCSCMatrix::mult(ParVector& x, ParVector& b, bool tap) -{ - ParMatrix::mult(x, b, tap); -} - -void ParCOOMatrix::tap_mult(ParVector& x, ParVector& b) -{ - ParMatrix::tap_mult(x, b); -} - -void ParCSRMatrix::tap_mult(ParVector& x, ParVector& b) -{ - ParMatrix::tap_mult(x, b); -} - -void ParCSCMatrix::tap_mult(ParVector& x, ParVector& b) -{ - ParMatrix::tap_mult(x, b); -} - -void ParCOOMatrix::mult_T(ParVector& x, ParVector& b, bool tap) -{ - ParMatrix::mult_T(x, b, tap); -} - -void ParCSRMatrix::mult_T(ParVector& x, ParVector& b, bool tap) -{ - ParMatrix::mult_T(x, b, tap); -} - -void ParCSCMatrix::mult_T(ParVector& x, ParVector& b, bool tap) -{ - ParMatrix::mult_T(x, b, tap); -} - -void ParCOOMatrix::tap_mult_T(ParVector& x, ParVector& b) -{ - ParMatrix::tap_mult_T(x, b); -} - -void ParCSRMatrix::tap_mult_T(ParVector& x, ParVector& b) -{ - ParMatrix::tap_mult_T(x, b); -} - -void ParCSCMatrix::tap_mult_T(ParVector& 
x, ParVector& b) -{ - ParMatrix::tap_mult_T(x, b); -} - diff --git a/raptor/util/linalg/repartition.cpp b/raptor/util/linalg/repartition.cpp deleted file mode 100644 index 82312331..00000000 --- a/raptor/util/linalg/repartition.cpp +++ /dev/null @@ -1,392 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -#include "repartition.hpp" - -namespace raptor { -void make_contiguous(ParCSRMatrix* A, std::vector& off_proc_part_map) -{ - int rank, num_procs; - RAPtor_MPI_Comm_rank(RAPtor_MPI_COMM_WORLD, &rank); - RAPtor_MPI_Comm_size(RAPtor_MPI_COMM_WORLD, &num_procs); - - std::map global_to_local; - std::vector proc_num_cols(num_procs); - std::vector recvvec; - - int ctr = 0; - for (std::vector::const_iterator it = A->off_proc_column_map.begin(); - it != A->off_proc_column_map.end(); ++it) - { - global_to_local[*it] = ctr++; - } - - // Find how many columns are local to each process - RAPtor_MPI_Allgather(&(A->on_proc_num_cols), 1, RAPtor_MPI_INT, proc_num_cols.data(), 1, RAPtor_MPI_INT, - RAPtor_MPI_COMM_WORLD); - - // Determine the new first local row / first local col of rank - A->partition->first_local_col = 0; - for (int i = 0; i < rank; i++) - { - A->partition->first_local_col += proc_num_cols[i]; - } - A->partition->first_local_row = A->partition->first_local_col; - - // Determine the global number of columns and rows - A->global_num_cols = A->partition->first_local_col; - for (int i = rank; i < num_procs; i++) - { - A->global_num_cols += proc_num_cols[i]; - } - A->global_num_rows = A->global_num_cols; - - A->comm = new ParComm(A->partition->topology, A->off_proc_column_map, - off_proc_part_map, A->local_row_map); - - for (int i = 0; i < A->local_num_rows; i++) - { - A->on_proc_column_map[i] = A->partition->first_local_col + i; - } - A->local_row_map = A->get_on_proc_column_map(); - recvvec = A->comm->communicate(A->local_row_map); - for (int i = 0; i < A->off_proc_num_cols; i++) - 
A->off_proc_column_map[i] = recvvec[i]; - - - // Sort rows, removing duplicate entries and moving diagonal - // value to first - A->on_proc->sort(); - A->on_proc->move_diag(); - A->off_proc->sort(); - -} - -ParCSRMatrix* repartition_matrix(ParCSRMatrix* A, int* partition, std::vector& new_local_rows) -{ - int rank, num_procs; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &num_procs); - - - ParCSRMatrix* A_part = NULL; - - int proc, start, end; - int row_start, row_end, row_size; - int num_sends, num_recvs; - int proc_idx, idx, ctr, prev_ctr; - int row, col, global_row, global_col; - int count, first_row; - int recv_size; - double val; - std::vector proc_rows(num_procs, 0); - std::vector proc_to_idx(num_procs); - std::vector send_procs(num_procs); - std::vector send_ptr; - std::vector send_requests; - std::vector send_indices; - std::vector send_buffer; - std::vector recv_buffer; - std::vector recv_rows; - std::vector recv_row_ptr; - std::vector recv_cols; - std::vector recv_vals; - MPI_Status recv_status; - - int num_ints = 2*A->local_num_rows + A->local_nnz; - int num_dbls = A->local_nnz; - int int_bytes, dbl_bytes; - MPI_Pack_size(num_ints, MPI_INT, MPI_COMM_WORLD, &int_bytes); - MPI_Pack_size(num_dbls, MPI_DOUBLE, MPI_COMM_WORLD, &dbl_bytes); - - int tag = 29485; - - std::vector off_parts(A->off_proc_num_cols); - std::vector& recvvec = A->comm->communicate(partition); - std::copy(recvvec.begin(), recvvec.end(), off_parts.begin()); - - num_sends = 0; - for (int i = 0; i < A->local_num_rows; i++) - { - proc = partition[i]; - if (proc_rows[proc] == 0) - { - send_procs[num_sends++] = proc; - } - proc_rows[proc]++; - } - send_procs.resize(num_sends); - - - send_ptr.resize(num_sends+1); - send_requests.resize(num_sends); - send_ptr[0] = 0; - for (int i = 0; i < num_sends; i++) - { - proc = send_procs[i]; - proc_to_idx[proc] = i; - send_ptr[i+1] = send_ptr[i] + proc_rows[proc]; - proc_rows[proc] = 0; - } - - 
send_indices.resize(A->local_num_rows); - for (int i = 0; i < A->local_num_rows; i++) - { - proc = partition[i]; - proc_idx = proc_to_idx[proc]; - idx = send_ptr[proc_idx] + proc_rows[proc]++; - send_indices[idx] = i; - } - for (int i = 0; i < num_sends; i++) - { - proc = send_procs[i]; - proc_rows[proc] = 1; - } - MPI_Allreduce(MPI_IN_PLACE, proc_rows.data(), num_procs, MPI_INT, MPI_SUM, - MPI_COMM_WORLD); - num_recvs = proc_rows[rank]; - - // TODO -- send partitions for each global col (both on and off proc) if part[row] != part[col] - std::vector col_bool(A->local_num_rows, 0); - std::vector off_col_bool(A->off_proc_num_cols, 0); - std::vector send_cols(A->local_num_rows); - std::vector off_send_cols(A->off_proc_num_cols); - int n_cols, off_n_cols; - int n_rows, part; - int off_col_size = 2 * (A->off_proc_num_cols + A->local_num_rows) * num_sends; - int off_col_bytes, row_bytes; - MPI_Pack_size(off_col_size, MPI_INT, MPI_COMM_WORLD, &off_col_bytes); - MPI_Pack_size(num_sends, MPI_INT, MPI_COMM_WORLD, &row_bytes); - send_buffer.resize(int_bytes + dbl_bytes + off_col_bytes + row_bytes); - ctr = 0; - for (int i = 0; i < num_sends; i++) - { - prev_ctr = ctr; - proc = send_procs[i]; - start = send_ptr[i]; - end = send_ptr[i+1]; - n_rows = end - start; - proc_rows[proc] = n_rows; - MPI_Pack(&n_rows, 1, MPI_INT, send_buffer.data(), send_buffer.size(), - &ctr, MPI_COMM_WORLD); - n_cols = 0; - off_n_cols = 0; - for (int j = start; j < end; j++) - { - row = send_indices[j]; - global_row = A->local_row_map[row]; - row_size = A->on_proc->idx1[row+1] - A->on_proc->idx1[row] - + A->off_proc->idx1[row+1] - A->off_proc->idx1[row]; - - MPI_Pack(&global_row, 1, MPI_INT, send_buffer.data(), send_buffer.size(), - &ctr, MPI_COMM_WORLD); - MPI_Pack(&row_size, 1, MPI_INT, send_buffer.data(), send_buffer.size(), - &ctr, MPI_COMM_WORLD); - - row_start = A->on_proc->idx1[row]; - row_end = A->on_proc->idx1[row+1]; - for (int k = row_start; k < row_end; k++) - { - col = 
A->on_proc->idx2[k]; - global_col = A->on_proc_column_map[col]; - val = A->on_proc->vals[k]; - MPI_Pack(&global_col, 1, MPI_INT, send_buffer.data(), send_buffer.size(), - &ctr, MPI_COMM_WORLD); - MPI_Pack(&val, 1, MPI_DOUBLE, send_buffer.data(), send_buffer.size(), - &ctr, MPI_COMM_WORLD); - if (partition[col] != proc && col_bool[col] == 0) - { - send_cols[n_cols++] = col; - col_bool[col] = 1; - } - } - row_start = A->off_proc->idx1[row]; - row_end = A->off_proc->idx1[row+1]; - for (int k = row_start; k < row_end; k++) - { - col = A->off_proc->idx2[k]; - global_col = A->off_proc_column_map[col]; - val = A->off_proc->vals[k]; - MPI_Pack(&global_col, 1, MPI_INT, send_buffer.data(), send_buffer.size(), - &ctr, MPI_COMM_WORLD); - MPI_Pack(&val, 1, MPI_DOUBLE, send_buffer.data(), send_buffer.size(), - &ctr, MPI_COMM_WORLD); - if (off_parts[col] != proc && off_col_bool[col] == 0) - { - off_send_cols[off_n_cols++] = col; - off_col_bool[col] = 1; - } - } - } - for (int j = 0; j < n_cols; j++) - { - col = send_cols[j]; - col_bool[col] = 0; - global_col = A->local_row_map[col]; - part = partition[col]; - MPI_Pack(&global_col, 1, MPI_INT, send_buffer.data(), send_buffer.size(), - &ctr, MPI_COMM_WORLD); - MPI_Pack(&part, 1, MPI_INT, send_buffer.data(), send_buffer.size(), &ctr, - MPI_COMM_WORLD); - } - for (int j = 0; j < off_n_cols; j++) - { - col = off_send_cols[j]; - off_col_bool[col] = 0; - global_col = A->off_proc_column_map[col]; - part = off_parts[col]; - MPI_Pack(&global_col, 1, MPI_INT, send_buffer.data(), send_buffer.size(), - &ctr, MPI_COMM_WORLD); - MPI_Pack(&part, 1, MPI_INT, send_buffer.data(), send_buffer.size(), &ctr, - MPI_COMM_WORLD); - } - MPI_Isend(&(send_buffer[prev_ctr]), ctr - prev_ctr, MPI_PACKED, proc, tag, - MPI_COMM_WORLD, &(send_requests[i])); - } - - std::map off_proc_to_local; - std::vector off_col_to_global; - std::vector off_col_parts; - recv_size = 0; - recv_row_ptr.push_back(recv_size); - for (int i = 0; i < num_recvs; i++) - { - 
MPI_Probe(MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &recv_status); - proc = recv_status.MPI_SOURCE; - MPI_Get_count(&recv_status, MPI_PACKED, &count); - recv_buffer.resize(count); - MPI_Recv(recv_buffer.data(), count, MPI_PACKED, proc, tag, MPI_COMM_WORLD, - &recv_status); - - ctr = 0; - - MPI_Unpack(recv_buffer.data(), count, &ctr, &n_rows, 1, MPI_INT, - MPI_COMM_WORLD); - for (int j = 0; j < n_rows; j++) - { - MPI_Unpack(recv_buffer.data(), count, &ctr, &global_row, 1, MPI_INT, - MPI_COMM_WORLD); - recv_rows.push_back(global_row); - MPI_Unpack(recv_buffer.data(), count, &ctr, &row_size, 1, MPI_INT, - MPI_COMM_WORLD); - recv_size += row_size; - recv_row_ptr.push_back(recv_size); - for (int k = 0; k < row_size; k++) - { - MPI_Unpack(recv_buffer.data(), count, &ctr, &global_col, 1, MPI_INT, - MPI_COMM_WORLD); - recv_cols.push_back(global_col); - MPI_Unpack(recv_buffer.data(), count, &ctr, &val, 1, MPI_DOUBLE, - MPI_COMM_WORLD); - recv_vals.push_back(val); - } - } - while (ctr < count) - { - MPI_Unpack(recv_buffer.data(), count, &ctr, &global_col, 1, MPI_INT, - MPI_COMM_WORLD); - MPI_Unpack(recv_buffer.data(), count, &ctr, &part, 1, MPI_INT, MPI_COMM_WORLD); - if (off_proc_to_local.find(global_col) == off_proc_to_local.end()) - { - off_proc_to_local[global_col] = off_col_to_global.size(); - off_col_to_global.push_back(global_col); - off_col_parts.push_back(part); - } - - } - } - - int num_rows = recv_rows.size(); - MPI_Waitall(num_sends, send_requests.data(), MPI_STATUSES_IGNORE); - - MPI_Allgather(&num_rows, 1, MPI_INT, proc_rows.data(), 1, MPI_INT, MPI_COMM_WORLD); - first_row = 0; - for (int i = 0; i < rank; i++) - { - first_row += proc_rows[i]; - } - - A_part = new ParCSRMatrix(A->global_num_rows, A->global_num_rows, num_rows, num_rows, - first_row, first_row, A->partition->topology); - A_part->off_proc_num_cols = off_col_parts.size(); - A_part->off_proc_column_map.resize(A_part->off_proc_num_cols); - A_part->on_proc_column_map.resize(A_part->local_num_rows); - - 
std::vector off_proc_part_map(A_part->off_proc_num_cols); - std::vector off_col_order(A_part->off_proc_num_cols); - std::iota(off_col_order.begin(), off_col_order.end(), 0); - std::sort(off_col_order.begin(), off_col_order.end(), - [&](const int i, const int j) - { - if (off_col_parts[i] == off_col_parts[j]) - return off_col_to_global[i] < off_col_to_global[j]; - return off_col_parts[i] < off_col_parts[j]; - }); - for (int i = 0; i < A_part->off_proc_num_cols; i++) - { - col = off_col_order[i]; - global_col = off_col_to_global[col]; - off_proc_to_local[global_col] = i; - A_part->off_proc_column_map[i] = global_col; - off_proc_part_map[i] = off_col_parts[col]; - } - - // Create row_ptr - // Add values/indices to appropriate positions - std::vector row_order(A_part->local_num_rows); - std::iota(row_order.begin(), row_order.end(), 0); - std::sort(row_order.begin(), row_order.end(), - [&](const int i, const int j) - { - return recv_rows[i] < recv_rows[j]; - }); - std::map on_proc_to_local; - for(int i = 0; i < num_rows; i++) - { - row = row_order[i]; - global_row = recv_rows[row]; - on_proc_to_local[global_row] = i; - A_part->on_proc_column_map[i] = global_row; - } - A_part->local_row_map = A_part->get_on_proc_column_map(); - A_part->on_proc_num_cols = A_part->on_proc_column_map.size(); - - A_part->on_proc->idx1[0] = 0; - A_part->off_proc->idx1[0] = 0; - for (int i = 0; i < num_rows; i++) - { - row = row_order[i]; - row_start = recv_row_ptr[row]; - row_end = recv_row_ptr[row+1]; - for (int j = row_start; j < row_end; j++) - { - col = recv_cols[j]; - val = recv_vals[j]; - - if (on_proc_to_local.find(col) != on_proc_to_local.end()) - { - A_part->on_proc->idx2.push_back(on_proc_to_local[col]); - A_part->on_proc->vals.push_back(val); - } - else - { - A_part->off_proc->idx2.push_back(off_proc_to_local[col]); - A_part->off_proc->vals.push_back(val); - } - } - A_part->on_proc->idx1[i+1] = A_part->on_proc->idx2.size(); - A_part->off_proc->idx1[i+1] = 
A_part->off_proc->idx2.size(); - } - A_part->on_proc->nnz = A_part->on_proc->idx2.size(); - A_part->off_proc->nnz = A_part->off_proc->idx2.size(); - A_part->local_nnz = A_part->on_proc->nnz + A_part->off_proc->nnz; - - new_local_rows.resize(A_part->on_proc_num_cols); - std::copy(A_part->on_proc_column_map.begin(), A_part->on_proc_column_map.end(), - new_local_rows.begin()); - - make_contiguous(A_part, off_proc_part_map); - - return A_part; -} - -} diff --git a/raptor/util/linalg/repartition.hpp b/raptor/util/linalg/repartition.hpp deleted file mode 100644 index 2bd704af..00000000 --- a/raptor/util/linalg/repartition.hpp +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause -// -#ifndef RAPTOR_GALLERY_REPARTITION_HPP -#define RAPTOR_GALLERY_REPARTITION_HPP - -#include -#include -#include -#include - -#include "raptor/core/types.hpp" -#include "raptor/core/mpi_types.hpp" -#include "raptor/core/par_matrix.hpp" - -namespace raptor { - -ParCSRMatrix* repartition_matrix(ParCSRMatrix* A, int* partition, std::vector& new_local_rows); -void make_contiguous(ParCSRMatrix* A); - -} -#endif diff --git a/raptor/util/linalg/spmv.cpp b/raptor/util/linalg/spmv.cpp deleted file mode 100644 index cefd8f72..00000000 --- a/raptor/util/linalg/spmv.cpp +++ /dev/null @@ -1,437 +0,0 @@ -// Copyright (c) 2015-2017, RAPtor Developer Team -// License: Simplified BSD, http://opensource.org/licenses/BSD-2-Clause - -#include "raptor/core/matrix.hpp" - -using namespace raptor; - -// Declare Private Methods -void CSR_spmv(const CSRMatrix* A, const double* x, double* b); -void CSR_residual(const CSRMatrix* A, const double* x, - const double* b, double* r); -void CSR_append(const CSRMatrix* A, const double* x, double* b); -void BSR_spmv(const BSRMatrix* A, const double* x, double* b); - -// COOMatrix SpMV Methods (or BCOO) -template -void COO_append(const COOMatrix* A, const std::vector& vals, - 
const double* x, double* b) -{ - for (int i = 0; i < A->nnz; i++) - { - A->append(A->idx1[i], A->idx2[i], b, x, vals[i]); - } -} -template -void COO_append_T(const COOMatrix* A, const std::vector& vals, - const double* x, double* b) -{ - for (int i = 0; i < A->nnz; i++) - { - A->append_T(A->idx2[i], A->idx1[i], b, x, vals[i]); - } -} -template -void COO_append_neg(const COOMatrix* A, const std::vector& vals, - const double* x, double* b) -{ - for (int i = 0; i < A->nnz; i++) - { - A->append_neg(A->idx1[i], A->idx2[i], b, x, vals[i]); - } -} -template -void COO_append_neg_T(const COOMatrix* A, const std::vector& vals, - const double* x, double* b) -{ - for (int i = 0; i < A->nnz; i++) - { - A->append_neg_T(A->idx1[i], A->idx2[i], b, x, vals[i]); - } -} - - - - - -// CSRMatrix SpMV Methods (or BSR) -// Optimized CSR and BSR standard SpMVs -void CSR_spmv(const CSRMatrix* A, const double* x, double* b) -{ - int start, end; - double val; - for (int i = 0; i < A->n_rows; i++) - { - start = A->idx1[i]; - end = A->idx1[i+1]; - val = 0; - for (int j = start; j < end; j++) - { - val += A->vals[j] * x[A->idx2[j]]; - } - b[i] = val; - } -} - -void CSR_residual(const CSRMatrix* A, const double* x, - const double* b, double* r) -{ - int start, end; - double val; - for (int i = 0; i < A->n_rows; i++) - { - start = A->idx1[i]; - end = A->idx1[i+1]; - val = b[i]; - for (int j = start; j < end; j++) - { - val -= A->vals[j] * x[A->idx2[j]]; - } - r[i] = val; - } -} - - -void CSR_append(const CSRMatrix* A, const double* x, double* b) -{ - int start, end; - double val; - for (int i = 0; i < A->n_rows; i++) - { - start = A->idx1[i]; - end = A->idx1[i+1]; - val = 0; - for (int j = start; j < end; j++) - { - val += A->vals[j] * x[A->idx2[j]]; - } - b[i] += val; - } -} - -template -void BSR_append(const CSRMatrix* A, const std::vector& vals, - const double* x, double* b) -{ - int start, end; - for (int i = 0; i < A->n_rows; i++) - { - start = A->idx1[i]; - end = A->idx1[i+1]; - for (int j 
= start; j < end; j++) - { - A->append(i, A->idx2[j], b, x, vals[j]); - } - } -} - -void BSR_spmv(const BSRMatrix* A, const double* x, double* b) -{ - int start, end, idx; - int first_row, first_col; - double val; - double* block_val; - for (int i = 0; i < A->n_rows; i++) - { - start = A->idx1[i]; - end = A->idx1[i+1]; - first_row = i*A->b_rows; - for (int row = 0; row < A->b_rows; row++) - { - val = 0; - idx = row * A->b_cols; - for (int j = start; j < end; j++) - { - first_col = A->idx2[j]*A->b_cols; - block_val = A->block_vals[j]; - for (int col = 0; col < A->b_cols; col++) - { - val += (block_val[idx + col] * x[first_col + col]); - } - } - b[first_row + row] = val; - } - } -} -template -void CSR_append_T(const CSRMatrix* A, const std::vector& vals, - const double* x, double* b) -{ - int start, end; - for (int i = 0; i < A->n_rows; i++) - { - start = A->idx1[i]; - end = A->idx1[i+1]; - for (int j = start; j < end; j++) - { - A->append_T(i, A->idx2[j], b, x, vals[j]); - } - } -} -template -void CSR_append_neg(const CSRMatrix* A, const std::vector& vals, - const double* x, double* b) -{ - int start, end; - for (int i = 0; i < A->n_rows; i++) - { - start = A->idx1[i]; - end = A->idx1[i+1]; - for (int j = start; j < end; j++) - { - A->append_neg(i, A->idx2[j], b, x, vals[j]); - } - } -} -template -void CSR_append_neg_T(const CSRMatrix* A, const std::vector& vals, - const double* x, double* b) -{ - int start, end; - for (int i = 0; i < A->n_rows; i++) - { - start = A->idx1[i]; - end = A->idx1[i+1]; - for (int j = start; j < end; j++) - { - A->append_neg_T(i, A->idx2[j], b, x, vals[j]); - } - } -} - - - -// CSCMatrix SpMV Methods (or BSC) -template -void CSC_append(const CSCMatrix* A, const std::vector& vals, - const double* x, double* b) -{ - int start, end; - for (int i = 0; i < A->n_cols; i++) - { - start = A->idx1[i]; - end = A->idx1[i+1]; - for (int j = start; j < end; j++) - { - A->append(A->idx2[j], i, b, x, vals[j]); - } - } -} -template -void 
CSC_append_T(const CSCMatrix* A, const std::vector& vals, - const double* x, double* b) -{ - int start, end; - for (int i = 0; i < A->n_cols; i++) - { - start = A->idx1[i]; - end = A->idx1[i+1]; - for (int j = start; j < end; j++) - { - A->append_T(A->idx2[j], i, b, x, vals[j]); - } - } -} -template -void CSC_append_neg(const CSCMatrix* A, const std::vector& vals, - const double* x, double* b) -{ - int start, end; - for (int i = 0; i < A->n_cols; i++) - { - start = A->idx1[i]; - end = A->idx1[i+1]; - for (int j = start; j < end; j++) - { - A->append_neg(A->idx2[j], i, b, x, vals[j]); - } - } -} -template -void CSC_append_neg_T(const CSCMatrix* A, const std::vector& vals, - const double* x, double* b) -{ - int start, end; - for (int i = 0; i < A->n_cols; i++) - { - start = A->idx1[i]; - end = A->idx1[i+1]; - for (int j = start; j < end; j++) - { - A->append_neg_T(A->idx2[j], i, b, x, vals[j]); - } - } -} - - -void COOMatrix::spmv(const double* x, double* b) const -{ - for (int i = 0; i < n_rows; i++) - b[i] = 0; - COO_append(this, vals, x, b); -} -void COOMatrix::spmv_append(const double* x, double* b) const -{ - COO_append(this, vals, x, b); -} -void COOMatrix::spmv_append_T(const double* x, double* b) const -{ - COO_append_T(this, vals, x, b); -} -void COOMatrix::spmv_append_neg(const double* x, double* b) const -{ - COO_append_neg(this, vals, x, b); -} -void COOMatrix::spmv_append_neg_T(const double* x, double* b) const -{ - COO_append_neg_T(this, vals, x, b); -} -void COOMatrix::spmv_residual(const double* x, const double* b, double* r) const -{ - for (int i = 0; i < n_rows; i++) - r[i] = b[i]; - COO_append_neg(this, vals, x, r); -} -void BCOOMatrix::spmv(const double* x, double* b) const -{ - for (int i = 0; i < n_rows * b_rows; i++) - b[i] = 0; - COO_append(this, block_vals, x, b); -} -void BCOOMatrix::spmv_append(const double* x,double* b) const -{ - COO_append(this, block_vals, x, b); -} -void BCOOMatrix::spmv_append_T(const double* x,double* b) const -{ - 
COO_append_T(this, block_vals, x, b); -} -void BCOOMatrix::spmv_append_neg(const double* x,double* b) const -{ - COO_append_neg(this, block_vals, x, b); -} -void BCOOMatrix::spmv_append_neg_T(const double* x,double* b) const -{ - COO_append_neg_T(this, block_vals, x, b); -} -void BCOOMatrix::spmv_residual(const double* x, const double* b, double* r) const -{ - for (int i = 0; i < n_rows * b_rows; i++) - r[i] = b[i]; - COO_append_neg(this, block_vals, x, r); -} - - - -void CSRMatrix::spmv(const double* x, double* b) const -{ - CSR_spmv(this, x, b); -} -void CSRMatrix::spmv_append(const double* x, double* b) const -{ - CSR_append(this, x, b); -} -void CSRMatrix::spmv_append_T(const double* x, double* b) const -{ - CSR_append_T(this, vals, x, b); -} -void CSRMatrix::spmv_append_neg(const double* x, double* b) const -{ - CSR_append_neg(this, vals, x, b); -} -void CSRMatrix::spmv_append_neg_T(const double* x, double* b) const -{ - CSR_append_neg_T(this, vals, x, b); -} -void CSRMatrix::spmv_residual(const double* x, const double* b, double* r) const -{ - CSR_residual(this, x, b, r); -} -void BSRMatrix::spmv(const double* x, double* b) const -{ - BSR_spmv(this, x, b); -} -void BSRMatrix::spmv_append(const double* x,double* b) const -{ - BSR_append(this, block_vals, x, b); -} -void BSRMatrix::spmv_append_T(const double* x,double* b) const -{ - CSR_append_T(this, block_vals, x, b); -} -void BSRMatrix::spmv_append_neg(const double* x,double* b) const -{ - CSR_append_neg(this, block_vals, x, b); -} -void BSRMatrix::spmv_append_neg_T(const double* x,double* b) const -{ - CSR_append_neg_T(this, block_vals, x, b); -} -void BSRMatrix::spmv_residual(const double* x, const double* b, double* r) const -{ - for (int i = 0; i < n_rows * b_rows; i++) - r[i] = b[i]; - CSR_append_neg(this, block_vals, x, r); -} - - - -void CSCMatrix::spmv(const double* x, double* b) const -{ - for (int i = 0; i < n_rows; i++) - b[i] = 0; - CSC_append(this, vals, x, b); -} -void 
CSCMatrix::spmv_append(const double* x, double* b) const -{ - CSC_append(this, vals, x, b); -} -void CSCMatrix::spmv_append_T(const double* x, double* b) const -{ - CSC_append_T(this, vals, x, b); -} -void CSCMatrix::spmv_append_neg(const double* x, double* b) const -{ - CSC_append_neg(this, vals, x, b); -} -void CSCMatrix::spmv_append_neg_T(const double* x, double* b) const -{ - CSC_append_neg_T(this, vals, x, b); -} -void CSCMatrix::spmv_residual(const double* x, const double* b, double* r) const -{ - for (int i = 0; i < n_rows; i++) - r[i] = b[i]; - CSC_append_neg(this, vals, x, r); -} -void BSCMatrix::spmv(const double* x, double* b) const -{ - for (int i = 0; i < n_rows * b_rows; i++) - b[i] = 0; - CSC_append(this, block_vals, x, b); -} -void BSCMatrix::spmv_append(const double* x,double* b) const -{ - CSC_append(this, block_vals, x, b); -} -void BSCMatrix::spmv_append_T(const double* x,double* b) const -{ - CSC_append_T(this, block_vals, x, b); -} -void BSCMatrix::spmv_append_neg(const double* x,double* b) const -{ - CSC_append_neg(this, block_vals, x, b); -} -void BSCMatrix::spmv_append_neg_T(const double* x,double* b) const -{ - CSC_append_neg_T(this, block_vals, x, b); -} -void BSCMatrix::spmv_residual(const double* x, const double* b, double* r) const -{ - for (int i = 0; i < n_rows * b_rows; i++) - r[i] = b[i]; - CSC_append_neg(this, block_vals, x, r); -} - - - diff --git a/raptor/util/tests/CMakeLists.txt b/raptor/util/tests/CMakeLists.txt deleted file mode 100644 index d78f8224..00000000 --- a/raptor/util/tests/CMakeLists.txt +++ /dev/null @@ -1,155 +0,0 @@ -add_executable(test_spmv_laplacian test_spmv_laplacian.cpp) -target_link_libraries(test_spmv_laplacian raptor ${MPI_LIBRARIES} googletest pthread ) -add_test(LaplacianSpMVTest ./test_spmv_laplacian) - -add_executable(test_spmv_aniso test_spmv_aniso.cpp) -target_link_libraries(test_spmv_aniso raptor ${MPI_LIBRARIES} googletest pthread ) -add_test(AnisoSpMVTest ./test_spmv_aniso) - 
-add_executable(test_spmv_random test_spmv_random.cpp) -target_link_libraries(test_spmv_random raptor ${MPI_LIBRARIES} googletest pthread ) -add_test(RandomSpMVTest ./test_spmv_random) - -add_executable(test_bsr_spmv_laplacian test_bsr_spmv_laplacian.cpp) -target_link_libraries(test_bsr_spmv_laplacian raptor ${MPI_LIBRARIES} googletest pthread ) -add_test(BSRLaplacianSpMVTest ./test_bsr_spmv_laplacian) - -add_executable(test_bsr_spmv_aniso test_bsr_spmv_aniso.cpp) -target_link_libraries(test_bsr_spmv_aniso raptor ${MPI_LIBRARIES} googletest pthread ) -add_test(BSRAnisoSpMVTest ./test_bsr_spmv_aniso) - -add_executable(test_bsr_spmv_random test_bsr_spmv_random.cpp) -target_link_libraries(test_bsr_spmv_random raptor ${MPI_LIBRARIES} googletest pthread ) -add_test(BSRRandomSpMVTest ./test_bsr_spmv_random) - - -add_executable(test_jacobi_aniso test_jacobi_aniso.cpp) -target_link_libraries(test_jacobi_aniso raptor ${MPI_LIBRARIES} googletest pthread ) -add_test(AnisoJacobiTest ./test_jacobi_aniso) - -add_executable(test_jacobi_laplacian test_jacobi_laplacian.cpp) -target_link_libraries(test_jacobi_laplacian raptor ${MPI_LIBRARIES} googletest pthread ) -add_test(LaplaceJacobiTest ./test_jacobi_laplacian) - -add_executable(test_gs_aniso test_gs_aniso.cpp) -target_link_libraries(test_gs_aniso raptor ${MPI_LIBRARIES} googletest pthread ) -add_test(AnisoGSTest ./test_gs_aniso) - -add_executable(test_gs_laplacian test_gs_laplacian.cpp) -target_link_libraries(test_gs_laplacian raptor ${MPI_LIBRARIES} googletest pthread ) -add_test(LaplaceGSTest ./test_gs_laplacian) - -# CANNOT CURRENTLY RUN THESE TESTS, BUT RAPTOR SEEMS CORRECT -# TODO : UNCOMMENT WHEN PYAMG BUG IS FIXED -# -#add_executable(test_sor_aniso test_sor_aniso.cpp) -#target_link_libraries(test_sor_aniso raptor ${MPI_LIBRARIES} googletest pthread ) -#add_test(AnisoSORTest ./test_sor_aniso) -# -#add_executable(test_sor_laplacian test_sor_laplacian.cpp) -#target_link_libraries(test_sor_laplacian raptor ${MPI_LIBRARIES} 
googletest pthread ) -#add_test(LaplaceSORTest ./test_sor_laplacian) - -#add_executable(test_bsr_jacobi_aniso test_bsr_jacobi_aniso.cpp) -#target_link_libraries(test_bsr_jacobi_aniso raptor ${MPI_LIBRARIES} googletest pthread ) -#add_test(BSRAnisoJacobiTest ./test_bsr_jacobi_aniso) - -add_executable(test_bsr_jacobi_aniso test_bsr_jacobi_aniso.cpp) -target_link_libraries(test_bsr_jacobi_aniso raptor ${MPI_LIBRARIES} googletest pthread ) -add_test(BSRAnisoJacobiTest ./test_bsr_jacobi_aniso) - -add_executable(test_bsr_jacobi_laplacian test_bsr_jacobi_laplacian.cpp) -target_link_libraries(test_bsr_jacobi_laplacian raptor ${MPI_LIBRARIES} googletest pthread ) -add_test(BSRLaplaceJacobiTest ./test_bsr_jacobi_laplacian) - -add_executable(test_bsr_gs_aniso test_bsr_gs_aniso.cpp) -target_link_libraries(test_bsr_gs_aniso raptor ${MPI_LIBRARIES} googletest pthread ) -add_test(BSRAnisoGSTest ./test_bsr_gs_aniso) - -add_executable(test_bsr_gs_laplacian test_bsr_gs_laplacian.cpp) -target_link_libraries(test_bsr_gs_laplacian raptor ${MPI_LIBRARIES} googletest pthread ) -add_test(BSRLaplaceGSTest ./test_bsr_gs_laplacian) - -if (WITH_MPI) - add_executable(test_par_add test_par_add.cpp) - target_link_libraries(test_par_add raptor ${MPI_LIBRARIES} googletest pthread ) - add_test(ParAddTest ${MPIRUN} -n 1 ${HOST} ./test_par_add) - add_test(ParAddTest ${MPIRUN} -n 16 ${HOST} ./test_par_add) - - add_executable(test_par_spmv_laplacian test_par_spmv_laplacian.cpp) - target_link_libraries(test_par_spmv_laplacian raptor ${MPI_LIBRARIES} googletest pthread ) - add_test(ParLaplacianSpMVTest ${MPIRUN} -n 1 ${HOST} ./test_par_spmv_laplacian) - add_test(ParLaplacianSpMVTest ${MPIRUN} -n 2 ${HOST} ./test_par_spmv_laplacian) - add_test(ParLaplacianSpMVTest ${MPIRUN} -n 4 ${HOST} ./test_par_spmv_laplacian) - add_test(ParLaplacianSpMVTest ${MPIRUN} -n 8 ${HOST} ./test_par_spmv_laplacian) - add_test(ParLaplacianSpMVTest ${MPIRUN} -n 16 ${HOST} ./test_par_spmv_laplacian) - - 
add_executable(test_par_spmv_aniso test_par_spmv_aniso.cpp) - target_link_libraries(test_par_spmv_aniso raptor ${MPI_LIBRARIES} googletest pthread ) - add_test(ParAnisoSpMVTest ${MPIRUN} -n 1 ${HOST} ./test_par_spmv_aniso) - add_test(ParAnisoSpMVTest ${MPIRUN} -n 2 ${HOST} ./test_par_spmv_aniso) - add_test(ParAnisoSpMVTest ${MPIRUN} -n 4 ${HOST} ./test_par_spmv_aniso) - add_test(ParAnisoSpMVTest ${MPIRUN} -n 8 ${HOST} ./test_par_spmv_aniso) - add_test(ParAnisoSpMVTest ${MPIRUN} -n 16 ${HOST} ./test_par_spmv_aniso) - - add_executable(test_par_spmv_random test_par_spmv_random.cpp) - target_link_libraries(test_par_spmv_random ${MPI_LIBRARIES} raptor googletest pthread ) - add_test(ParRandomSpMVTest ${MPIRUN} -n 1 ${HOST} ./test_par_spmv_random) - add_test(ParRandomSpMVTest ${MPIRUN} -n 2 ${HOST} ./test_par_spmv_random) - add_test(ParRandomSpMVTest ${MPIRUN} -n 4 ${HOST} ./test_par_spmv_random) - add_test(ParRandomSpMVTest ${MPIRUN} -n 8 ${HOST} ./test_par_spmv_random) - add_test(ParRandomSpMVTest ${MPIRUN} -n 16 ${HOST} ./test_par_spmv_random) - - add_executable(test_tap_spmv_laplacian test_tap_spmv_laplacian.cpp) - target_link_libraries(test_tap_spmv_laplacian raptor ${MPI_LIBRARIES} googletest pthread ) - add_test(TAPLaplacianSpMVTest ${MPIRUN} -n 2 ${HOST} ./test_tap_spmv_laplacian) - add_test(TAPLaplacianSpMVTest ${MPIRUN} -n 4 ${HOST} ./test_tap_spmv_laplacian) - add_test(TAPLaplacianSpMVTest ${MPIRUN} -n 8 ${HOST} ./test_tap_spmv_laplacian) - add_test(TAPLaplacianSpMVTest ${MPIRUN} -n 16 ${HOST} ./test_tap_spmv_laplacian) - - add_executable(test_tap_spmv_aniso test_tap_spmv_aniso.cpp) - target_link_libraries(test_tap_spmv_aniso raptor ${MPI_LIBRARIES} googletest pthread ) - add_test(TAPAnisoSpMVTest ${MPIRUN} -n 2 ${HOST} ./test_tap_spmv_aniso) - add_test(TAPAnisoSpMVTest ${MPIRUN} -n 4 ${HOST} ./test_tap_spmv_aniso) - add_test(TAPAnisoSpMVTest ${MPIRUN} -n 8 ${HOST} ./test_tap_spmv_aniso) - add_test(TAPAnisoSpMVTest ${MPIRUN} -n 16 ${HOST} 
./test_tap_spmv_aniso) - - add_executable(test_tap_spmv_random test_tap_spmv_random.cpp) - target_link_libraries(test_tap_spmv_random ${MPI_LIBRARIES} raptor googletest pthread ) - add_test(TAPRandomSpMVTest ${MPIRUN} -n 2 ${HOST} ./test_tap_spmv_random) - add_test(TAPRandomSpMVTest ${MPIRUN} -n 4 ${HOST} ./test_tap_spmv_random) - add_test(TAPRandomSpMVTest ${MPIRUN} -n 8 ${HOST} ./test_tap_spmv_random) - - add_executable(test_par_scale_aniso test_par_scale_aniso.cpp) - target_link_libraries(test_par_scale_aniso raptor ${MPI_LIBRARIES} googletest pthread ) - add_test(ParScaleAnisoTest ${MPIRUN} -n 1 ${HOST} ./test_par_scale_aniso) - add_test(ParScaleAnisoTest ${MPIRUN} -n 2 ${HOST} ./test_par_scale_aniso) - add_test(ParScaleAnisoTest ${MPIRUN} -n 3 ${HOST} ./test_par_scale_aniso) - add_test(ParScaleAnisoTest ${MPIRUN} -n 6 ${HOST} ./test_par_scale_aniso) - - add_executable(test_repartition test_repartition.cpp) - target_link_libraries(test_repartition raptor ${MPI_LIBRARIES} googletest pthread ) - add_test(RepartitionTest ${MPIRUN} -n 1 ${HOST} ./test_repartition) - add_test(RepartitionTest ${MPIRUN} -n 2 ${HOST} ./test_repartition) - add_test(RepartitionTest ${MPIRUN} -n 3 ${HOST} ./test_repartition) - add_test(RepartitionTest ${MPIRUN} -n 6 ${HOST} ./test_repartition) - add_test(RepartitionTest ${MPIRUN} -n 16 ${HOST} ./test_repartition) - - if (WITH_PTSCOTCH) - add_executable(test_ptscotch test_ptscotch.cpp) - target_link_libraries(test_ptscotch raptor ${MPI_LIBRARIES} googletest pthread ) - add_test(PTScotchTest ${MPIRUN} -n 1 ${HOST} ./test_ptscotch) - add_test(PTScotchTest ${MPIRUN} -n 2 ${HOST} ./test_ptscotch) - add_test(PTScotchTest ${MPIRUN} -n 3 ${HOST} ./test_ptscotch) - add_test(PTScotchTest ${MPIRUN} -n 6 ${HOST} ./test_ptscotch) - endif() - - if (WITH_PARMETIS) - add_executable(test_parmetis test_parmetis.cpp) - target_link_libraries(test_parmetis raptor ${MPI_LIBRARIES} googletest pthread ) - add_test(ParMetisTest ${MPIRUN} -n 1 ${HOST} 
./test_parmetis) - add_test(ParMetisTest ${MPIRUN} -n 2 ${HOST} ./test_parmetis) - add_test(ParMetisTest ${MPIRUN} -n 3 ${HOST} ./test_parmetis) - add_test(ParMetisTest ${MPIRUN} -n 6 ${HOST} ./test_parmetis) - endif() -endif() diff --git a/raptor/util/tests/README.md b/raptor/util/tests/README.md deleted file mode 100644 index 038d718d..00000000 --- a/raptor/util/tests/README.md +++ /dev/null @@ -1 +0,0 @@ -testing