Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Addresses some Tribol requests. #164

Merged
merged 1 commit into from
Jun 19, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ if( ENABLE_CHAI )
set( lvarray_dependencies ${lvarray_dependencies} chai umpire )
endif()

if( ${ENABLE_MPI} )
if( ENABLE_MPI )
set( lvarray_dependencies ${lvarray_dependencies} mpi )
endif()

Expand Down
7 changes: 4 additions & 3 deletions benchmarks/benchmarkArray1DR2TensorMultiplication.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,8 @@ INDEX_TYPE const SERIAL_SIZE = (2 << 18) - 87;
INDEX_TYPE const OMP_SIZE = (2 << 22) - 87;
#endif

#if defined(USE_CUDA)
// The non Array benchmarks could be run without chai, but then what's the point.
#if defined(USE_CUDA) && defined(USE_CHAI)
constexpr INDEX_TYPE CUDA_SIZE = (2 << 24) - 87;
#endif

Expand Down Expand Up @@ -240,7 +241,7 @@ void registerBenchmarks()
, std::make_tuple( OMP_SIZE, RAJA::PERM_IJK {}, parallelHostPolicy {} )
, std::make_tuple( OMP_SIZE, RAJA::PERM_KJI {}, parallelHostPolicy {} )
#endif
#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
, std::make_tuple( CUDA_SIZE, RAJA::PERM_IJK {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
, std::make_tuple( CUDA_SIZE, RAJA::PERM_KJI {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
#endif
Expand All @@ -264,7 +265,7 @@ int main( int argc, char * * argv )
LVARRAY_LOG( "OMP problems of size ( " << LvArray::benchmarking::OMP_SIZE << ", 3, 3 )." );
#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
LVARRAY_LOG( "CUDA problems of size ( " << LvArray::benchmarking::CUDA_SIZE << ", 3, 3 )." );
#endif

Expand Down
10 changes: 5 additions & 5 deletions benchmarks/benchmarkArray1DR2TensorMultiplicationKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,10 @@ namespace benchmarking


template< typename VALUE_TYPE_CONST, int USD >
RAJA_INLINE LVARRAY_HOST_DEVICE constexpr
void R2TensorMultiply( LvArray::ArraySlice< VALUE_TYPE_CONST, 2, USD > const & a,
LvArray::ArraySlice< VALUE_TYPE_CONST, 2, USD > const & b,
LvArray::ArraySlice< VALUE_TYPE, 2, USD > const & c )
inline LVARRAY_HOST_DEVICE constexpr
corbett5 marked this conversation as resolved.
Show resolved Hide resolved
void R2TensorMultiply( LvArray::ArraySlice< VALUE_TYPE_CONST, 2, USD, INDEX_TYPE > const & a,
LvArray::ArraySlice< VALUE_TYPE_CONST, 2, USD, INDEX_TYPE > const & b,
LvArray::ArraySlice< VALUE_TYPE, 2, USD, INDEX_TYPE > const & c )
{ INNER_LOOP( a( j, l ), b( l, k ), c( j, k ) ) }


Expand Down Expand Up @@ -265,7 +265,7 @@ template class ArrayOfR2TensorsRAJA< RAJA::PERM_IJK, parallelHostPolicy >;
template class ArrayOfR2TensorsRAJA< RAJA::PERM_KJI, parallelHostPolicy >;
#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
template class ArrayOfR2TensorsRAJA< RAJA::PERM_IJK, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
template class ArrayOfR2TensorsRAJA< RAJA::PERM_KJI, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ class ArrayOfR2TensorsRAJA : private ArrayOfR2TensorsNative< PERMUTATION >
}

~ArrayOfR2TensorsRAJA()
{ this->m_c.move( chai::CPU ); }
{ this->m_c.move( MemorySpace::CPU ); }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this a move from a chai scope to an umpire scope?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's an enum I defined myself. I think camp and umpire both have similar features but this works fine for now.


void fortranView() const
{
Expand Down
53 changes: 0 additions & 53 deletions benchmarks/benchmarkCommon.hpp

This file was deleted.

73 changes: 54 additions & 19 deletions benchmarks/benchmarkHelpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@
* Free Software Foundation) version 2.1 dated February 1999.
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
/* *UNCRUSTIFY-OFF* */

#pragma once

// Source includes
#include "benchmarkCommon.hpp"
#include "../unitTests/testUtils.hpp"
#include "StringUtilities.hpp"

// System includes
#include <random>
Expand All @@ -30,8 +30,8 @@
#if defined(USE_CALIPER)

#include <caliper/cali.h>
#define LVARRAY_MARK_FUNCTION_TAG( name ) cali::Function __cali_ann##__LINE__( STRINGIZE_NX( name ) )
#define LVARRAY_MARK_FUNCTION_TAG_STRING( string ) cali::Function __cali_ann##__LINE__( ( string ).data() )
#define LVARRAY_MARK_FUNCTION_TAG( name ) cali::Function __cali_ann ## __LINE__( STRINGIZE_NX( name ) )
#define LVARRAY_MARK_FUNCTION_TAG_STRING( string ) cali::Function __cali_ann ## __LINE__( ( string ).data() )

#else

Expand All @@ -45,6 +45,12 @@ namespace LvArray

using namespace testing;


#if defined(USE_CHAI)
static_assert( std::is_same< DEFAULT_BUFFER< int >, NewChaiBuffer< int > >::value,
"The default buffer should be NewChaiBuffer when chai is enabled." );
#endif

namespace benchmarking
{

Expand All @@ -65,6 +71,36 @@ inline std::string typeToString( RAJA::PERM_KJI const & ) { return "RAJA::PERM_K

} // namespace internal

#define ACCESS_IJ( N, M, i, j ) M * i + j
#define ACCESS_JI( N, M, i, j ) N * j + i

#define ACCESS_IJK( N, M, P, i, j, k ) M * P * i + P * j + k
#define ACCESS_KJI( N, M, P, i, j, k ) M * N * k + N * j + i

using INDEX_TYPE = std::ptrdiff_t;

template< typename T, typename PERMUTATION >
using Array = LvArray::Array< T, getDimension( PERMUTATION {} ), PERMUTATION, INDEX_TYPE, DEFAULT_BUFFER >;

template< typename T, typename PERMUTATION >
using ArrayView = LvArray::ArrayView< T,
getDimension( PERMUTATION {} ),
getStrideOneDimension( PERMUTATION {} ),
INDEX_TYPE,
DEFAULT_BUFFER >;

template< typename T, typename PERMUTATION >
using ArraySlice = LvArray::ArraySlice< T,
getDimension( PERMUTATION {} ),
getStrideOneDimension( PERMUTATION {} ),
INDEX_TYPE >;

template< typename T, typename PERMUTATION >
using RajaView = RAJA::View< T,
RAJA::Layout< getDimension( PERMUTATION {} ),
INDEX_TYPE,
getStrideOneDimension( PERMUTATION {} ) >>;

template< typename ARG0 >
std::string typeListToString()
{ return internal::typeToString( ARG0 {} ); }
Expand All @@ -77,12 +113,12 @@ std::string typeListToString()
#define REGISTER_BENCHMARK( args, func ) \
{ \
::benchmark::RegisterBenchmark( STRINGIZE( func ), func ) \
->Args( args ) \
->UseRealTime() \
->ComputeStatistics( "min", []( std::vector< double > const & times ) \
{ return *std::min_element( times.begin(), times.end() ); } ) \
->ComputeStatistics( "max", []( std::vector< double > const & times ) \
{ return *std::max_element( times.begin(), times.end() ); } ); \
->Args( args ) \
->UseRealTime() \
->ComputeStatistics( "min", []( std::vector< double > const & times ) \
{ return *std::min_element( times.begin(), times.end() ); } ) \
->ComputeStatistics( "max", []( std::vector< double > const & times ) \
{ return *std::max_element( times.begin(), times.end() ); } ); \
}


Expand All @@ -91,12 +127,12 @@ std::string typeListToString()
std::string functionName = STRINGIZE( func ) "< "; \
functionName += typeListToString< __VA_ARGS__ >() + " >"; \
::benchmark::RegisterBenchmark( functionName.c_str(), func< __VA_ARGS__ > ) \
->Args( args ) \
->UseRealTime() \
->ComputeStatistics( "min", []( std::vector< double > const & times ) \
{ return *std::min_element( times.begin(), times.end() ); } ) \
->ComputeStatistics( "max", []( std::vector< double > const & times ) \
{ return *std::max_element( times.begin(), times.end() ); } ); \
->Args( args ) \
->UseRealTime() \
->ComputeStatistics( "min", []( std::vector< double > const & times ) \
{ return *std::min_element( times.begin(), times.end() ); } ) \
->ComputeStatistics( "max", []( std::vector< double > const & times ) \
{ return *std::max_element( times.begin(), times.end() ); } ); \
}


Expand Down Expand Up @@ -135,7 +171,7 @@ RajaView< T, PERMUTATION > makeRajaView( Array< T, PERMUTATION > const & array )
constexpr int NDIM = getDimension( PERMUTATION {} );
std::array< INDEX_TYPE, NDIM > sizes;

for( int i = 0 ; i < NDIM ; ++i )
for( int i = 0; i < NDIM; ++i )
{
sizes[ i ] = array.dims()[ i ];
}
Expand Down Expand Up @@ -215,7 +251,7 @@ inline int verifyResults( ResultsMap< T, N > const & benchmarkResults )

std::cout << "### The benchmarks produced different results with arguments ";
std::cout << args[ 0 ];
for( unsigned long i = 1 ; i < N ; ++i )
for( unsigned long i = 1; i < N; ++i )
{
std::cout << ", " << args[ i ];
}
Expand Down Expand Up @@ -261,4 +297,3 @@ inline int verifyResults( ResultsMap< T, N > const & benchmarkResults )

} // namespace benchmarking
} // namespace LvArray
/* *UNCRUSTIFY-ON* */
8 changes: 5 additions & 3 deletions benchmarks/benchmarkInnerProduct.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,9 @@ INDEX_TYPE const SERIAL_SIZE = (2 << 20) + 573;
#if defined(USE_OPENMP)
INDEX_TYPE const OMP_SIZE = SERIAL_SIZE;
#endif
#if defined(USE_CUDA)

// The non Array benchmarks could be run without chai, but then what's the point.
#if defined(USE_CUDA) && defined(USE_CHAI)
INDEX_TYPE const CUDA_SIZE = SERIAL_SIZE;
#endif

Expand Down Expand Up @@ -165,7 +167,7 @@ void registerBenchmarks()
#if defined(USE_OPENMP)
, std::make_tuple( OMP_SIZE, parallelHostPolicy {} )
#endif
#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
, std::make_tuple( CUDA_SIZE, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
#endif
);
Expand All @@ -191,7 +193,7 @@ int main( int argc, char * * argv )
LVARRAY_LOG( "OMP problems of size ( " << LvArray::benchmarking::OMP_SIZE << " )." );
#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
LVARRAY_LOG( "CUDA problems of size ( " << LvArray::benchmarking::CUDA_SIZE << " )." );
#endif

Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmarkInnerProductKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ template class InnerProductRAJA< serialPolicy >;
template class InnerProductRAJA< parallelHostPolicy >;
#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
template class InnerProductRAJA< RAJA::cuda_exec< THREADS_PER_BLOCK > >;
#endif

Expand Down
6 changes: 3 additions & 3 deletions benchmarks/benchmarkMatrixMatrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ INDEX_TYPE const OMP_L = SERIAL_L;
INDEX_TYPE const OMP_M = SERIAL_M;
#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
INDEX_TYPE const CUDA_N = SERIAL_N;
INDEX_TYPE const CUDA_L = SERIAL_L;
INDEX_TYPE const CUDA_M = SERIAL_M;
Expand Down Expand Up @@ -184,7 +184,7 @@ void registerBenchmarks()
, std::make_tuple( OMP_N, SERIAL_L, OMP_M, RAJA::PERM_IJ {}, parallelHostPolicy {} )
, std::make_tuple( OMP_N, SERIAL_L, OMP_M, RAJA::PERM_JI {}, parallelHostPolicy {} )
#endif
#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
, std::make_tuple( CUDA_N, SERIAL_L, CUDA_M, RAJA::PERM_IJ {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
, std::make_tuple( CUDA_N, SERIAL_L, CUDA_M, RAJA::PERM_JI {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
#endif
Expand Down Expand Up @@ -212,7 +212,7 @@ int main( int argc, char * * argv )
LvArray::benchmarking::OMP_L << ", " << LvArray::benchmarking::OMP_M << " )." );
#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
LVARRAY_LOG( "CUDA problems of size ( " << LvArray::benchmarking::CUDA_N << ", " <<
LvArray::benchmarking::CUDA_L << ", " << LvArray::benchmarking::CUDA_M << " )." );
#endif
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmarkMatrixMatrixKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ template class MatrixMatrixRAJA< RAJA::PERM_JI, parallelHostPolicy >;

#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)

template class MatrixMatrixRAJA< RAJA::PERM_IJ, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
template class MatrixMatrixRAJA< RAJA::PERM_JI, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/benchmarkMatrixVector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ void registerBenchmarks()
, std::make_tuple( OMP_N, OMP_M, RAJA::PERM_IJ {}, parallelHostPolicy {} )
, std::make_tuple( OMP_N, OMP_M, RAJA::PERM_JI {}, parallelHostPolicy {} )
#endif
#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
, std::make_tuple( CUDA_N, CUDA_M, RAJA::PERM_IJ {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
, std::make_tuple( CUDA_N, CUDA_M, RAJA::PERM_JI {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
#endif
Expand Down Expand Up @@ -204,7 +204,7 @@ int main( int argc, char * * argv )
LvArray::benchmarking::OMP_M << " )." );
#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
LVARRAY_LOG( "CUDA problems of size ( " << LvArray::benchmarking::CUDA_N << ", " <<
LvArray::benchmarking::CUDA_M << " )." );
#endif
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmarkMatrixVectorKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ template class MatrixVectorRAJA< RAJA::PERM_JI, parallelHostPolicy >;

#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)

template class MatrixVectorRAJA< RAJA::PERM_IJ, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
template class MatrixVectorRAJA< RAJA::PERM_JI, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
Expand Down
6 changes: 3 additions & 3 deletions benchmarks/benchmarkOuterProduct.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ INDEX_TYPE const OMP_N = (2 << 9) + 73;
INDEX_TYPE const OMP_M = (2 << 9) - 71;
#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
INDEX_TYPE const CUDA_N = (2 << 9) + 73;
INDEX_TYPE const CUDA_M = (2 << 9) - 71;
#endif
Expand Down Expand Up @@ -180,7 +180,7 @@ void registerBenchmarks()
, std::make_tuple( OMP_N, OMP_M, RAJA::PERM_IJ {}, parallelHostPolicy {} )
, std::make_tuple( OMP_N, OMP_M, RAJA::PERM_JI {}, parallelHostPolicy {} )
#endif
#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
, std::make_tuple( CUDA_N, CUDA_M, RAJA::PERM_IJ {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
, std::make_tuple( CUDA_N, CUDA_M, RAJA::PERM_JI {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
#endif
Expand Down Expand Up @@ -208,7 +208,7 @@ int main( int argc, char * * argv )
LvArray::benchmarking::OMP_M << " )." );
#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
LVARRAY_LOG( "CUDA problems of size ( " << LvArray::benchmarking::CUDA_N << ", " <<
LvArray::benchmarking::CUDA_M << " )." );
#endif
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmarkOuterProductKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ template class OuterProductRAJA< RAJA::PERM_IJ, parallelHostPolicy >;
template class OuterProductRAJA< RAJA::PERM_JI, parallelHostPolicy >;
#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
template class OuterProductRAJA< RAJA::PERM_IJ, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
template class OuterProductRAJA< RAJA::PERM_JI, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
#endif
Expand Down
Loading