Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Addresses some Tribol requests. #164

Merged
merged 1 commit into from
Jun 19, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ if( ENABLE_CHAI )
set( lvarray_dependencies ${lvarray_dependencies} chai umpire )
endif()

if( ${ENABLE_MPI} )
if( ENABLE_MPI )
set( lvarray_dependencies ${lvarray_dependencies} mpi )
endif()

Expand Down
7 changes: 4 additions & 3 deletions benchmarks/benchmarkArray1DR2TensorMultiplication.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,8 @@ INDEX_TYPE const SERIAL_SIZE = (2 << 18) - 87;
INDEX_TYPE const OMP_SIZE = (2 << 22) - 87;
#endif

#if defined(USE_CUDA)
// The non Array benchmarks could be run without chai, but then what's the point.
#if defined(USE_CUDA) && defined(USE_CHAI)
constexpr INDEX_TYPE CUDA_SIZE = (2 << 24) - 87;
#endif

Expand Down Expand Up @@ -240,7 +241,7 @@ void registerBenchmarks()
, std::make_tuple( OMP_SIZE, RAJA::PERM_IJK {}, parallelHostPolicy {} )
, std::make_tuple( OMP_SIZE, RAJA::PERM_KJI {}, parallelHostPolicy {} )
#endif
#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
, std::make_tuple( CUDA_SIZE, RAJA::PERM_IJK {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
, std::make_tuple( CUDA_SIZE, RAJA::PERM_KJI {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
#endif
Expand All @@ -264,7 +265,7 @@ int main( int argc, char * * argv )
LVARRAY_LOG( "OMP problems of size ( " << LvArray::benchmarking::OMP_SIZE << ", 3, 3 )." );
#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
LVARRAY_LOG( "CUDA problems of size ( " << LvArray::benchmarking::CUDA_SIZE << ", 3, 3 )." );
#endif

Expand Down
10 changes: 5 additions & 5 deletions benchmarks/benchmarkArray1DR2TensorMultiplicationKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,10 @@ namespace benchmarking


template< typename VALUE_TYPE_CONST, int USD >
RAJA_INLINE LVARRAY_HOST_DEVICE constexpr
void R2TensorMultiply( LvArray::ArraySlice< VALUE_TYPE_CONST, 2, USD > const & a,
LvArray::ArraySlice< VALUE_TYPE_CONST, 2, USD > const & b,
LvArray::ArraySlice< VALUE_TYPE, 2, USD > const & c )
inline LVARRAY_HOST_DEVICE constexpr
corbett5 marked this conversation as resolved.
Show resolved Hide resolved
void R2TensorMultiply( LvArray::ArraySlice< VALUE_TYPE_CONST, 2, USD, INDEX_TYPE > const & a,
LvArray::ArraySlice< VALUE_TYPE_CONST, 2, USD, INDEX_TYPE > const & b,
LvArray::ArraySlice< VALUE_TYPE, 2, USD, INDEX_TYPE > const & c )
{ INNER_LOOP( a( j, l ), b( l, k ), c( j, k ) ) }


Expand Down Expand Up @@ -265,7 +265,7 @@ template class ArrayOfR2TensorsRAJA< RAJA::PERM_IJK, parallelHostPolicy >;
template class ArrayOfR2TensorsRAJA< RAJA::PERM_KJI, parallelHostPolicy >;
#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
template class ArrayOfR2TensorsRAJA< RAJA::PERM_IJK, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
template class ArrayOfR2TensorsRAJA< RAJA::PERM_KJI, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ class ArrayOfR2TensorsRAJA : private ArrayOfR2TensorsNative< PERMUTATION >
}

~ArrayOfR2TensorsRAJA()
{ this->m_c.move( chai::CPU ); }
{ this->m_c.move( MemorySpace::CPU ); }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this a move from a chai scope to an umpire scope?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's an enum I defined myself. I think camp and umpire both have similar features but this works fine for now.


void fortranView() const
{
Expand Down
53 changes: 0 additions & 53 deletions benchmarks/benchmarkCommon.hpp

This file was deleted.

73 changes: 54 additions & 19 deletions benchmarks/benchmarkHelpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@
* Free Software Foundation) version 2.1 dated February 1999.
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
/* *UNCRUSTIFY-OFF* */

#pragma once

// Source includes
#include "benchmarkCommon.hpp"
#include "../unitTests/testUtils.hpp"
#include "StringUtilities.hpp"

// System includes
#include <random>
Expand All @@ -30,8 +30,8 @@
#if defined(USE_CALIPER)

#include <caliper/cali.h>
#define LVARRAY_MARK_FUNCTION_TAG( name ) cali::Function __cali_ann##__LINE__( STRINGIZE_NX( name ) )
#define LVARRAY_MARK_FUNCTION_TAG_STRING( string ) cali::Function __cali_ann##__LINE__( ( string ).data() )
#define LVARRAY_MARK_FUNCTION_TAG( name ) cali::Function __cali_ann ## __LINE__( STRINGIZE_NX( name ) )
#define LVARRAY_MARK_FUNCTION_TAG_STRING( string ) cali::Function __cali_ann ## __LINE__( ( string ).data() )

#else

Expand All @@ -45,6 +45,12 @@ namespace LvArray

using namespace testing;


#if defined(USE_CHAI)
static_assert( std::is_same< DEFAULT_BUFFER< int >, NewChaiBuffer< int > >::value,
"The default buffer should be NewChaiBuffer when chai is enabled." );
#endif

namespace benchmarking
{

Expand All @@ -65,6 +71,36 @@ inline std::string typeToString( RAJA::PERM_KJI const & ) { return "RAJA::PERM_K

} // namespace internal

#define ACCESS_IJ( N, M, i, j ) M * i + j
#define ACCESS_JI( N, M, i, j ) N * j + i

#define ACCESS_IJK( N, M, P, i, j, k ) M * P * i + P * j + k
#define ACCESS_KJI( N, M, P, i, j, k ) M * N * k + N * j + i

using INDEX_TYPE = std::ptrdiff_t;

template< typename T, typename PERMUTATION >
using Array = LvArray::Array< T, getDimension( PERMUTATION {} ), PERMUTATION, INDEX_TYPE, DEFAULT_BUFFER >;

template< typename T, typename PERMUTATION >
using ArrayView = LvArray::ArrayView< T,
getDimension( PERMUTATION {} ),
getStrideOneDimension( PERMUTATION {} ),
INDEX_TYPE,
DEFAULT_BUFFER >;

template< typename T, typename PERMUTATION >
using ArraySlice = LvArray::ArraySlice< T,
getDimension( PERMUTATION {} ),
getStrideOneDimension( PERMUTATION {} ),
INDEX_TYPE >;

template< typename T, typename PERMUTATION >
using RajaView = RAJA::View< T,
RAJA::Layout< getDimension( PERMUTATION {} ),
INDEX_TYPE,
getStrideOneDimension( PERMUTATION {} ) >>;

template< typename ARG0 >
std::string typeListToString()
{ return internal::typeToString( ARG0 {} ); }
Expand All @@ -77,12 +113,12 @@ std::string typeListToString()
#define REGISTER_BENCHMARK( args, func ) \
{ \
::benchmark::RegisterBenchmark( STRINGIZE( func ), func ) \
->Args( args ) \
->UseRealTime() \
->ComputeStatistics( "min", []( std::vector< double > const & times ) \
{ return *std::min_element( times.begin(), times.end() ); } ) \
->ComputeStatistics( "max", []( std::vector< double > const & times ) \
{ return *std::max_element( times.begin(), times.end() ); } ); \
->Args( args ) \
->UseRealTime() \
->ComputeStatistics( "min", []( std::vector< double > const & times ) \
{ return *std::min_element( times.begin(), times.end() ); } ) \
->ComputeStatistics( "max", []( std::vector< double > const & times ) \
{ return *std::max_element( times.begin(), times.end() ); } ); \
}


Expand All @@ -91,12 +127,12 @@ std::string typeListToString()
std::string functionName = STRINGIZE( func ) "< "; \
functionName += typeListToString< __VA_ARGS__ >() + " >"; \
::benchmark::RegisterBenchmark( functionName.c_str(), func< __VA_ARGS__ > ) \
->Args( args ) \
->UseRealTime() \
->ComputeStatistics( "min", []( std::vector< double > const & times ) \
{ return *std::min_element( times.begin(), times.end() ); } ) \
->ComputeStatistics( "max", []( std::vector< double > const & times ) \
{ return *std::max_element( times.begin(), times.end() ); } ); \
->Args( args ) \
->UseRealTime() \
->ComputeStatistics( "min", []( std::vector< double > const & times ) \
{ return *std::min_element( times.begin(), times.end() ); } ) \
->ComputeStatistics( "max", []( std::vector< double > const & times ) \
{ return *std::max_element( times.begin(), times.end() ); } ); \
}


Expand Down Expand Up @@ -135,7 +171,7 @@ RajaView< T, PERMUTATION > makeRajaView( Array< T, PERMUTATION > const & array )
constexpr int NDIM = getDimension( PERMUTATION {} );
std::array< INDEX_TYPE, NDIM > sizes;

for( int i = 0 ; i < NDIM ; ++i )
for( int i = 0; i < NDIM; ++i )
{
sizes[ i ] = array.dims()[ i ];
}
Expand Down Expand Up @@ -215,7 +251,7 @@ inline int verifyResults( ResultsMap< T, N > const & benchmarkResults )

std::cout << "### The benchmarks produced different results with arguments ";
std::cout << args[ 0 ];
for( unsigned long i = 1 ; i < N ; ++i )
for( unsigned long i = 1; i < N; ++i )
{
std::cout << ", " << args[ i ];
}
Expand Down Expand Up @@ -261,4 +297,3 @@ inline int verifyResults( ResultsMap< T, N > const & benchmarkResults )

} // namespace benchmarking
} // namespace LvArray
/* *UNCRUSTIFY-ON* */
8 changes: 5 additions & 3 deletions benchmarks/benchmarkInnerProduct.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,9 @@ INDEX_TYPE const SERIAL_SIZE = (2 << 20) + 573;
#if defined(USE_OPENMP)
INDEX_TYPE const OMP_SIZE = SERIAL_SIZE;
#endif
#if defined(USE_CUDA)

// The non Array benchmarks could be run without chai, but then what's the point.
#if defined(USE_CUDA) && defined(USE_CHAI)
INDEX_TYPE const CUDA_SIZE = SERIAL_SIZE;
#endif

Expand Down Expand Up @@ -165,7 +167,7 @@ void registerBenchmarks()
#if defined(USE_OPENMP)
, std::make_tuple( OMP_SIZE, parallelHostPolicy {} )
#endif
#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
, std::make_tuple( CUDA_SIZE, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
#endif
);
Expand All @@ -191,7 +193,7 @@ int main( int argc, char * * argv )
LVARRAY_LOG( "OMP problems of size ( " << LvArray::benchmarking::OMP_SIZE << " )." );
#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
LVARRAY_LOG( "CUDA problems of size ( " << LvArray::benchmarking::CUDA_SIZE << " )." );
#endif

Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmarkInnerProductKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ template class InnerProductRAJA< serialPolicy >;
template class InnerProductRAJA< parallelHostPolicy >;
#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
template class InnerProductRAJA< RAJA::cuda_exec< THREADS_PER_BLOCK > >;
#endif

Expand Down
6 changes: 3 additions & 3 deletions benchmarks/benchmarkMatrixMatrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ INDEX_TYPE const OMP_L = SERIAL_L;
INDEX_TYPE const OMP_M = SERIAL_M;
#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
INDEX_TYPE const CUDA_N = SERIAL_N;
INDEX_TYPE const CUDA_L = SERIAL_L;
INDEX_TYPE const CUDA_M = SERIAL_M;
Expand Down Expand Up @@ -184,7 +184,7 @@ void registerBenchmarks()
, std::make_tuple( OMP_N, SERIAL_L, OMP_M, RAJA::PERM_IJ {}, parallelHostPolicy {} )
, std::make_tuple( OMP_N, SERIAL_L, OMP_M, RAJA::PERM_JI {}, parallelHostPolicy {} )
#endif
#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
, std::make_tuple( CUDA_N, SERIAL_L, CUDA_M, RAJA::PERM_IJ {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
, std::make_tuple( CUDA_N, SERIAL_L, CUDA_M, RAJA::PERM_JI {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
#endif
Expand Down Expand Up @@ -212,7 +212,7 @@ int main( int argc, char * * argv )
LvArray::benchmarking::OMP_L << ", " << LvArray::benchmarking::OMP_M << " )." );
#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
LVARRAY_LOG( "CUDA problems of size ( " << LvArray::benchmarking::CUDA_N << ", " <<
LvArray::benchmarking::CUDA_L << ", " << LvArray::benchmarking::CUDA_M << " )." );
#endif
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmarkMatrixMatrixKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ template class MatrixMatrixRAJA< RAJA::PERM_JI, parallelHostPolicy >;

#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)

template class MatrixMatrixRAJA< RAJA::PERM_IJ, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
template class MatrixMatrixRAJA< RAJA::PERM_JI, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/benchmarkMatrixVector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ void registerBenchmarks()
, std::make_tuple( OMP_N, OMP_M, RAJA::PERM_IJ {}, parallelHostPolicy {} )
, std::make_tuple( OMP_N, OMP_M, RAJA::PERM_JI {}, parallelHostPolicy {} )
#endif
#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
, std::make_tuple( CUDA_N, CUDA_M, RAJA::PERM_IJ {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
, std::make_tuple( CUDA_N, CUDA_M, RAJA::PERM_JI {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
#endif
Expand Down Expand Up @@ -204,7 +204,7 @@ int main( int argc, char * * argv )
LvArray::benchmarking::OMP_M << " )." );
#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
LVARRAY_LOG( "CUDA problems of size ( " << LvArray::benchmarking::CUDA_N << ", " <<
LvArray::benchmarking::CUDA_M << " )." );
#endif
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmarkMatrixVectorKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ template class MatrixVectorRAJA< RAJA::PERM_JI, parallelHostPolicy >;

#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)

template class MatrixVectorRAJA< RAJA::PERM_IJ, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
template class MatrixVectorRAJA< RAJA::PERM_JI, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
Expand Down
6 changes: 3 additions & 3 deletions benchmarks/benchmarkOuterProduct.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ INDEX_TYPE const OMP_N = (2 << 9) + 73;
INDEX_TYPE const OMP_M = (2 << 9) - 71;
#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
INDEX_TYPE const CUDA_N = (2 << 9) + 73;
INDEX_TYPE const CUDA_M = (2 << 9) - 71;
#endif
Expand Down Expand Up @@ -180,7 +180,7 @@ void registerBenchmarks()
, std::make_tuple( OMP_N, OMP_M, RAJA::PERM_IJ {}, parallelHostPolicy {} )
, std::make_tuple( OMP_N, OMP_M, RAJA::PERM_JI {}, parallelHostPolicy {} )
#endif
#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
, std::make_tuple( CUDA_N, CUDA_M, RAJA::PERM_IJ {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
, std::make_tuple( CUDA_N, CUDA_M, RAJA::PERM_JI {}, parallelDevicePolicy< THREADS_PER_BLOCK > {} )
#endif
Expand Down Expand Up @@ -208,7 +208,7 @@ int main( int argc, char * * argv )
LvArray::benchmarking::OMP_M << " )." );
#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
LVARRAY_LOG( "CUDA problems of size ( " << LvArray::benchmarking::CUDA_N << ", " <<
LvArray::benchmarking::CUDA_M << " )." );
#endif
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmarkOuterProductKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ template class OuterProductRAJA< RAJA::PERM_IJ, parallelHostPolicy >;
template class OuterProductRAJA< RAJA::PERM_JI, parallelHostPolicy >;
#endif

#if defined(USE_CUDA)
#if defined(USE_CUDA) && defined(USE_CHAI)
template class OuterProductRAJA< RAJA::PERM_IJ, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
template class OuterProductRAJA< RAJA::PERM_JI, RAJA::cuda_exec< THREADS_PER_BLOCK > >;
#endif
Expand Down
Loading