Addresses some Tribol requests.
corbett5 committed Jun 17, 2020
1 parent 54272b7 commit 4d22d70
Showing 112 changed files with 7,324 additions and 5,942 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -31,7 +31,7 @@ if( ENABLE_CHAI )
set( lvarray_dependencies ${lvarray_dependencies} chai umpire )
endif()

-if( ${ENABLE_MPI} )
+if( ENABLE_MPI )
set( lvarray_dependencies ${lvarray_dependencies} mpi )
endif()

7 changes: 4 additions & 3 deletions benchmarks/benchmarkArray1DR2TensorMultiplication.cpp
@@ -193,7 +193,8 @@ INDEX_TYPE const SERIAL_SIZE = (2 << 18) - 87;
INDEX_TYPE const OMP_SIZE = (2 << 22) - 87;
#endif

-#if defined(USE_CUDA)
+// The non Array benchmarks could be run without chai, but then what's the point.
+#if defined(USE_CUDA) && defined(USE_CHAI)
constexpr INDEX_TYPE CUDA_SIZE = (2 << 24) - 87;
#endif

@@ -240,7 +241,7 @@ void registerBenchmarks()
, std::make_tuple( OMP_SIZE, RAJA::PERM_IJK {}, parallelHostPolicy {} )
, std::make_tuple( OMP_SIZE, RAJA::PERM_KJI {}, parallelHostPolicy {} )
#endif
-#if defined(USE_CUDA)
+#if defined(USE_CUDA) && defined(USE_CHAI)
, std::make_tuple( CUDA_SIZE, RAJA::PERM_IJK {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
, std::make_tuple( CUDA_SIZE, RAJA::PERM_KJI {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
#endif
@@ -264,7 +265,7 @@ int main( int argc, char * * argv )
LVARRAY_LOG( "OMP problems of size ( " << LvArray::benchmarking::OMP_SIZE << ", 3, 3 )." );
#endif

-#if defined(USE_CUDA)
+#if defined(USE_CUDA) && defined(USE_CHAI)
LVARRAY_LOG( "CUDA problems of size ( " << LvArray::benchmarking::CUDA_SIZE << ", 3, 3 )." );
#endif

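Note (editorial, not part of the commit): the recurring guard change ties every CUDA benchmark path to CHAI as well, since the Array benchmarks rely on a CHAI-backed buffer to move their data to the device; that is what the comment about the non Array benchmarks is getting at. A minimal sketch of the pattern, using only names visible in this diff:

// Sketch: the CUDA problem size exists only when both backends are enabled,
// so every later reference to it must sit behind the same pair of defines.
#include <cstddef>

using INDEX_TYPE = std::ptrdiff_t;

#if defined(USE_CUDA) && defined(USE_CHAI)
constexpr INDEX_TYPE CUDA_SIZE = (2 << 24) - 87;
#endif

int main()
{
#if defined(USE_CUDA) && defined(USE_CHAI)
  // ... register and log the CUDA benchmarks using CUDA_SIZE ...
#endif
  return 0;
}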
10 changes: 5 additions & 5 deletions benchmarks/benchmarkArray1DR2TensorMultiplicationKernels.cpp
@@ -64,10 +64,10 @@ namespace benchmarking


template< typename VALUE_TYPE_CONST, int USD >
-RAJA_INLINE LVARRAY_HOST_DEVICE constexpr
-void R2TensorMultiply( LvArray::ArraySlice< VALUE_TYPE_CONST, 2, USD > const & a,
-                       LvArray::ArraySlice< VALUE_TYPE_CONST, 2, USD > const & b,
-                       LvArray::ArraySlice< VALUE_TYPE, 2, USD > const & c )
+inline LVARRAY_HOST_DEVICE constexpr
+void R2TensorMultiply( LvArray::ArraySlice< VALUE_TYPE_CONST, 2, USD, INDEX_TYPE > const & a,
+                       LvArray::ArraySlice< VALUE_TYPE_CONST, 2, USD, INDEX_TYPE > const & b,
+                       LvArray::ArraySlice< VALUE_TYPE, 2, USD, INDEX_TYPE > const & c )
{ INNER_LOOP( a( j, l ), b( l, k ), c( j, k ) ) }
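An editorial aside: INNER_LOOP is defined elsewhere in the benchmark sources; judging from its arguments it presumably expands to the textbook 3x3 tensor contraction, roughly the following (a sketch, not the actual macro):

// Hedged sketch of the loop nest the macro arguments suggest, i.e.
// c( j, k ) += a( j, l ) * b( l, k ) for all j, k, l in [0, 3).
for( INDEX_TYPE j = 0; j < 3; ++j )
{
  for( INDEX_TYPE k = 0; k < 3; ++k )
  {
    for( INDEX_TYPE l = 0; l < 3; ++l )
    { c( j, k ) += a( j, l ) * b( l, k ); }
  }
}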


@@ -265,7 +265,7 @@ template class ArrayOfR2TensorsRAJA< RAJA::PERM_IJK, parallelHostPolicy >;
template class ArrayOfR2TensorsRAJA< RAJA::PERM_KJI, parallelHostPolicy >;
#endif

-#if defined(USE_CUDA)
+#if defined(USE_CUDA) && defined(USE_CHAI)
template class ArrayOfR2TensorsRAJA< RAJA::PERM_IJK, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
template class ArrayOfR2TensorsRAJA< RAJA::PERM_KJI, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
#endif
@@ -220,7 +220,7 @@ class ArrayOfR2TensorsRAJA : private ArrayOfR2TensorsNative< PERMUTATION >
}

~ArrayOfR2TensorsRAJA()
-{ this->m_c.move( chai::CPU ); }
+{ this->m_c.move( MemorySpace::CPU ); }

void fortranView() const
{
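A note on the hunk above (editorial): moving data back through LvArray's MemorySpace enum instead of chai's space tag keeps benchmark teardown free of a direct chai dependency. A short usage sketch that leans on the Array alias and MemorySpace names appearing elsewhere in this commit; it is not standalone code:

// Sketch: callers name LvArray's MemorySpace; the buffer decides how to move.
void touchOnHost( Array< double, RAJA::PERM_IJ > & a )
{
  a.move( MemorySpace::CPU );   // previously spelled chai::CPU
  a( 0, 0 ) = 1.0;              // the data is now valid on the host
}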
53 changes: 0 additions & 53 deletions benchmarks/benchmarkCommon.hpp

This file was deleted.

73 changes: 54 additions & 19 deletions benchmarks/benchmarkHelpers.hpp
@@ -15,12 +15,12 @@
* Free Software Foundation) version 2.1 dated February 1999.
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
+/* *UNCRUSTIFY-OFF* */

#pragma once

// Source includes
#include "benchmarkCommon.hpp"
#include "../unitTests/testUtils.hpp"
#include "StringUtilities.hpp"

// System includes
#include <random>
@@ -30,8 +30,8 @@
#if defined(USE_CALIPER)

#include <caliper/cali.h>
-#define LVARRAY_MARK_FUNCTION_TAG( name ) cali::Function __cali_ann##__LINE__( STRINGIZE_NX( name ) )
-#define LVARRAY_MARK_FUNCTION_TAG_STRING( string ) cali::Function __cali_ann##__LINE__( ( string ).data() )
+#define LVARRAY_MARK_FUNCTION_TAG( name ) cali::Function __cali_ann ## __LINE__( STRINGIZE_NX( name ) )
+#define LVARRAY_MARK_FUNCTION_TAG_STRING( string ) cali::Function __cali_ann ## __LINE__( ( string ).data() )

#else

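One subtlety in the macros above (an editorial aside): ## suppresses macro expansion of its operands, so with or without the new spaces the paste yields an identifier literally named __cali_ann__LINE__, not one containing the line number. That is harmless here, since presumably only one tag appears per scope, and the uncrustify respacing changes nothing semantically. Pasting the real line number would take the usual two-level helper, sketched below; CONCAT and CONCAT_IMPL are hypothetical, not LvArray macros:

// Hedged illustration of the token-pasting subtlety.
#define CONCAT_IMPL( a, b ) a ## b
#define CONCAT( a, b ) CONCAT_IMPL( a, b )          // the extra level lets __LINE__ expand

#define TAG_DIRECT() int tag ## __LINE__            // declares: int tag__LINE__
#define TAG_EXPANDED() int CONCAT( tag, __LINE__ )  // declares e.g.: int tag9

TAG_DIRECT();    // fine once per scope
TAG_EXPANDED();  // unique name on every line

int main() { return 0; }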
@@ -45,6 +45,12 @@ namespace LvArray

using namespace testing;


+#if defined(USE_CHAI)
+static_assert( std::is_same_v< DEFAULT_BUFFER< int >, NewChaiBuffer< int > >,
+               "The default buffer should be NewChaiBuffer when chai is enabled." );
+#endif

namespace benchmarking
{

@@ -65,6 +71,36 @@ inline std::string typeToString( RAJA::PERM_KJI const & ) { return "RAJA::PERM_K

} // namespace internal

+#define ACCESS_IJ( N, M, i, j ) M * i + j
+#define ACCESS_JI( N, M, i, j ) N * j + i
+
+#define ACCESS_IJK( N, M, P, i, j, k ) M * P * i + P * j + k
+#define ACCESS_KJI( N, M, P, i, j, k ) M * N * k + N * j + i
+
+using INDEX_TYPE = std::ptrdiff_t;
+
+template< typename T, typename PERMUTATION >
+using Array = LvArray::Array< T, getDimension( PERMUTATION {} ), PERMUTATION, INDEX_TYPE, DEFAULT_BUFFER >;
+
+template< typename T, typename PERMUTATION >
+using ArrayView = LvArray::ArrayView< T,
+                                      getDimension( PERMUTATION {} ),
+                                      getStrideOneDimension( PERMUTATION {} ),
+                                      INDEX_TYPE,
+                                      DEFAULT_BUFFER >;
+
+template< typename T, typename PERMUTATION >
+using ArraySlice = LvArray::ArraySlice< T,
+                                        getDimension( PERMUTATION {} ),
+                                        getStrideOneDimension( PERMUTATION {} ),
+                                        INDEX_TYPE >;
+
+template< typename T, typename PERMUTATION >
+using RajaView = RAJA::View< T,
+                             RAJA::Layout< getDimension( PERMUTATION {} ),
+                                           INDEX_TYPE,
+                                           getStrideOneDimension( PERMUTATION {} ) >>;

template< typename ARG0 >
std::string typeListToString()
{ return internal::typeToString( ARG0 {} ); }
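An editorial note on the ACCESS macros added above: each one spells out the linear offset for one permutation of an N x M (or N x M x P) array. ACCESS_IJ is row-major with j the unit-stride index, ACCESS_JI is column-major with i unit-stride, and the 3D forms follow the same pattern. A self-contained illustration with hypothetical sizes:

#include <cassert>
#include <cstddef>

#define ACCESS_IJ( N, M, i, j ) M * i + j   // row-major: j is unit-stride
#define ACCESS_JI( N, M, i, j ) N * j + i   // column-major: i is unit-stride

int main()
{
  constexpr std::ptrdiff_t N = 3, M = 4;
  double rowMajor[ N * M ];
  double colMajor[ N * M ];

  for( std::ptrdiff_t i = 0; i < N; ++i )
  {
    for( std::ptrdiff_t j = 0; j < M; ++j )
    {
      rowMajor[ ACCESS_IJ( N, M, i, j ) ] = 10.0 * i + j;
      colMajor[ ACCESS_JI( N, M, i, j ) ] = 10.0 * i + j;
    }
  }

  // The same logical entry lands at different storage offsets: 6 vs 7 here.
  assert( rowMajor[ ACCESS_IJ( N, M, 1, 2 ) ] == colMajor[ ACCESS_JI( N, M, 1, 2 ) ] );
  return 0;
}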
@@ -77,12 +113,12 @@ std::string typeListToString()
#define REGISTER_BENCHMARK( args, func ) \
{ \
::benchmark::RegisterBenchmark( STRINGIZE( func ), func ) \
-  ->Args( args ) \
-  ->UseRealTime() \
-  ->ComputeStatistics( "min", []( std::vector< double > const & times ) \
-  { return *std::min_element( times.begin(), times.end() ); } ) \
-  ->ComputeStatistics( "max", []( std::vector< double > const & times ) \
-  { return *std::max_element( times.begin(), times.end() ); } ); \
+  ->Args( args ) \
+  ->UseRealTime() \
+  ->ComputeStatistics( "min", []( std::vector< double > const & times ) \
+  { return *std::min_element( times.begin(), times.end() ); } ) \
+  ->ComputeStatistics( "max", []( std::vector< double > const & times ) \
+  { return *std::max_element( times.begin(), times.end() ); } ); \
}
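The hunks in this region appear to be whitespace-only: the continuation lines of REGISTER_BENCHMARK, and of its templated variant below, were realigned, and the exact indentation is lost in this view. For orientation, a hedged sketch of roughly what one expansion of the macro does, assuming Google Benchmark's registration API; myFunc and its body are hypothetical stand-ins:

// Sketch of the registration pattern, not the macro verbatim.
#include <benchmark/benchmark.h>

#include <algorithm>
#include <vector>

static void myFunc( benchmark::State & state )
{
  for( auto _ : state )
  { benchmark::DoNotOptimize( state.range( 0 ) * 2 ); }
}

int main( int argc, char * * argv )
{
  ::benchmark::RegisterBenchmark( "myFunc", myFunc )
    ->Args( { 1024 } )
    ->UseRealTime()
    ->ComputeStatistics( "min", []( std::vector< double > const & times )
      { return *std::min_element( times.begin(), times.end() ); } )
    ->ComputeStatistics( "max", []( std::vector< double > const & times )
      { return *std::max_element( times.begin(), times.end() ); } );

  ::benchmark::Initialize( &argc, argv );
  ::benchmark::RunSpecifiedBenchmarks();
  return 0;
}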


@@ -91,12 +127,12 @@ std::string typeListToString()
std::string functionName = STRINGIZE( func ) "< "; \
functionName += typeListToString< __VA_ARGS__ >() + " >"; \
::benchmark::RegisterBenchmark( functionName.c_str(), func< __VA_ARGS__ > ) \
-  ->Args( args ) \
-  ->UseRealTime() \
-  ->ComputeStatistics( "min", []( std::vector< double > const & times ) \
-  { return *std::min_element( times.begin(), times.end() ); } ) \
-  ->ComputeStatistics( "max", []( std::vector< double > const & times ) \
-  { return *std::max_element( times.begin(), times.end() ); } ); \
+  ->Args( args ) \
+  ->UseRealTime() \
+  ->ComputeStatistics( "min", []( std::vector< double > const & times ) \
+  { return *std::min_element( times.begin(), times.end() ); } ) \
+  ->ComputeStatistics( "max", []( std::vector< double > const & times ) \
+  { return *std::max_element( times.begin(), times.end() ); } ); \
}


@@ -135,7 +171,7 @@ RajaView< T, PERMUTATION > makeRajaView( Array< T, PERMUTATION > const & array )
constexpr int NDIM = getDimension( PERMUTATION {} );
std::array< INDEX_TYPE, NDIM > sizes;

-for( int i = 0 ; i < NDIM ; ++i )
+for( int i = 0; i < NDIM; ++i )
{
sizes[ i ] = array.dims()[ i ];
}
@@ -215,7 +251,7 @@ inline int verifyResults( ResultsMap< T, N > const & benchmarkResults )

std::cout << "### The benchmarks produced different results with arguments ";
std::cout << args[ 0 ];
-for( unsigned long i = 1 ; i < N ; ++i )
+for( unsigned long i = 1; i < N; ++i )
{
std::cout << ", " << args[ i ];
}
@@ -261,4 +297,3 @@ inline int verifyResults( ResultsMap< T, N > const & benchmarkResults )

} // namespace benchmarking
} // namespace LvArray
-/* *UNCRUSITIFY-ON* */
8 changes: 5 additions & 3 deletions benchmarks/benchmarkInnerProduct.cpp
@@ -133,7 +133,9 @@ INDEX_TYPE const SERIAL_SIZE = (2 << 20) + 573;
#if defined(USE_OPENMP)
INDEX_TYPE const OMP_SIZE = SERIAL_SIZE;
#endif
-#if defined(USE_CUDA)
+
+// The non Array benchmarks could be run without chai, but then what's the point.
+#if defined(USE_CUDA) && defined(USE_CHAI)
INDEX_TYPE const CUDA_SIZE = SERIAL_SIZE;
#endif

@@ -165,7 +167,7 @@ void registerBenchmarks()
#if defined(USE_OPENMP)
, std::make_tuple( OMP_SIZE, parallelHostPolicy {} )
#endif
-#if defined(USE_CUDA)
+#if defined(USE_CUDA) && defined(USE_CHAI)
, std::make_tuple( CUDA_SIZE, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
#endif
);
@@ -191,7 +193,7 @@ int main( int argc, char * * argv )
LVARRAY_LOG( "OMP problems of size ( " << LvArray::benchmarking::OMP_SIZE << " )." );
#endif

-#if defined(USE_CUDA)
+#if defined(USE_CUDA) && defined(USE_CHAI)
LVARRAY_LOG( "CUDA problems of size ( " << LvArray::benchmarking::CUDA_SIZE << " )." );
#endif

2 changes: 1 addition & 1 deletion benchmarks/benchmarkInnerProductKernels.cpp
@@ -124,7 +124,7 @@ template class InnerProductRAJA< serialPolicy >;
template class InnerProductRAJA< parallelHostPolicy >;
#endif

-#if defined(USE_CUDA)
+#if defined(USE_CUDA) && defined(USE_CHAI)
template class InnerProductRAJA< RAJA::cuda_exec< THREADS_PER_BLOCK > >;
#endif

6 changes: 3 additions & 3 deletions benchmarks/benchmarkMatrixMatrix.cpp
@@ -138,7 +138,7 @@ INDEX_TYPE const OMP_L = SERIAL_L;
INDEX_TYPE const OMP_M = SERIAL_M;
#endif

-#if defined(USE_CUDA)
+#if defined(USE_CUDA) && defined(USE_CHAI)
INDEX_TYPE const CUDA_N = SERIAL_N;
INDEX_TYPE const CUDA_L = SERIAL_L;
INDEX_TYPE const CUDA_M = SERIAL_M;
@@ -184,7 +184,7 @@ void registerBenchmarks()
, std::make_tuple( OMP_N, SERIAL_L, OMP_M, RAJA::PERM_IJ {}, parallelHostPolicy {} )
, std::make_tuple( OMP_N, SERIAL_L, OMP_M, RAJA::PERM_JI {}, parallelHostPolicy {} )
#endif
-#if defined(USE_CUDA)
+#if defined(USE_CUDA) && defined(USE_CHAI)
, std::make_tuple( CUDA_N, SERIAL_L, CUDA_M, RAJA::PERM_IJ {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
, std::make_tuple( CUDA_N, SERIAL_L, CUDA_M, RAJA::PERM_JI {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
#endif
Expand Down Expand Up @@ -212,7 +212,7 @@ int main( int argc, char * * argv )
LvArray::benchmarking::OMP_L << ", " << LvArray::benchmarking::OMP_M << " )." );
#endif

-#if defined(USE_CUDA)
+#if defined(USE_CUDA) && defined(USE_CHAI)
LVARRAY_LOG( "CUDA problems of size ( " << LvArray::benchmarking::CUDA_N << ", " <<
LvArray::benchmarking::CUDA_L << ", " << LvArray::benchmarking::CUDA_M << " )." );
#endif
2 changes: 1 addition & 1 deletion benchmarks/benchmarkMatrixMatrixKernels.cpp
@@ -222,7 +222,7 @@ template class MatrixMatrixRAJA< RAJA::PERM_JI, parallelHostPolicy >;

#endif

-#if defined(USE_CUDA)
+#if defined(USE_CUDA) && defined(USE_CHAI)

template class MatrixMatrixRAJA< RAJA::PERM_IJ, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
template class MatrixMatrixRAJA< RAJA::PERM_JI, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
4 changes: 2 additions & 2 deletions benchmarks/benchmarkMatrixVector.cpp
@@ -176,7 +176,7 @@ void registerBenchmarks()
, std::make_tuple( OMP_N, OMP_M, RAJA::PERM_IJ {}, parallelHostPolicy {} )
, std::make_tuple( OMP_N, OMP_M, RAJA::PERM_JI {}, parallelHostPolicy {} )
#endif
-#if defined(USE_CUDA)
+#if defined(USE_CUDA) && defined(USE_CHAI)
, std::make_tuple( CUDA_N, CUDA_M, RAJA::PERM_IJ {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
, std::make_tuple( CUDA_N, CUDA_M, RAJA::PERM_JI {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
#endif
Expand Down Expand Up @@ -204,7 +204,7 @@ int main( int argc, char * * argv )
LvArray::benchmarking::OMP_M << " )." );
#endif

-#if defined(USE_CUDA)
+#if defined(USE_CUDA) && defined(USE_CHAI)
LVARRAY_LOG( "CUDA problems of size ( " << LvArray::benchmarking::CUDA_N << ", " <<
LvArray::benchmarking::CUDA_M << " )." );
#endif
2 changes: 1 addition & 1 deletion benchmarks/benchmarkMatrixVectorKernels.cpp
@@ -189,7 +189,7 @@ template class MatrixVectorRAJA< RAJA::PERM_JI, parallelHostPolicy >;

#endif

-#if defined(USE_CUDA)
+#if defined(USE_CUDA) && defined(USE_CHAI)

template class MatrixVectorRAJA< RAJA::PERM_IJ, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
template class MatrixVectorRAJA< RAJA::PERM_JI, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
6 changes: 3 additions & 3 deletions benchmarks/benchmarkOuterProduct.cpp
@@ -136,7 +136,7 @@ INDEX_TYPE const OMP_N = (2 << 9) + 73;
INDEX_TYPE const OMP_M = (2 << 9) - 71;
#endif

-#if defined(USE_CUDA)
+#if defined(USE_CUDA) && defined(USE_CHAI)
INDEX_TYPE const CUDA_N = (2 << 9) + 73;
INDEX_TYPE const CUDA_M = (2 << 9) - 71;
#endif
@@ -180,7 +180,7 @@ void registerBenchmarks()
, std::make_tuple( OMP_N, OMP_M, RAJA::PERM_IJ {}, parallelHostPolicy {} )
, std::make_tuple( OMP_N, OMP_M, RAJA::PERM_JI {}, parallelHostPolicy {} )
#endif
-#if defined(USE_CUDA)
+#if defined(USE_CUDA) && defined(USE_CHAI)
, std::make_tuple( CUDA_N, CUDA_M, RAJA::PERM_IJ {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
, std::make_tuple( CUDA_N, CUDA_M, RAJA::PERM_JI {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
#endif
@@ -208,7 +208,7 @@ int main( int argc, char * * argv )
LvArray::benchmarking::OMP_M << " )." );
#endif

-#if defined(USE_CUDA)
+#if defined(USE_CUDA) && defined(USE_CHAI)
LVARRAY_LOG( "CUDA problems of size ( " << LvArray::benchmarking::CUDA_N << ", " <<
LvArray::benchmarking::CUDA_M << " )." );
#endif
2 changes: 1 addition & 1 deletion benchmarks/benchmarkOuterProductKernels.cpp
@@ -186,7 +186,7 @@ template class OuterProductRAJA< RAJA::PERM_IJ, parallelHostPolicy >;
template class OuterProductRAJA< RAJA::PERM_JI, parallelHostPolicy >;
#endif

-#if defined(USE_CUDA)
+#if defined(USE_CUDA) && defined(USE_CHAI)
template class OuterProductRAJA< RAJA::PERM_IJ, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
template class OuterProductRAJA< RAJA::PERM_JI, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
#endif