diff --git a/README b/README index de3e493fd..51a3c3612 100644 --- a/README +++ b/README @@ -259,6 +259,16 @@ options. compute node is determined by its unique hostname, and the number of STXs available on a compute node is provided by the libfabric library. + Team Environment variables: + + SHMEM_TEAMS_MAX (default: 10) + Sets the maximum number of available teams per PE, including the + predefined teams. The maximum supported value is 64. The value must + be the same across all PEs in SHMEMX_TEAM_WORLD. + + SHMEM_TEAM_SHARED_ONLY_SELF (default: off) + If defined, the predefined team, SHMEMX_TEAM_SHARED, will include only + the calling PE. Debugging Environment variables: diff --git a/bindings/shmem_bind_c.m4 b/bindings/shmem_bind_c.m4 index 7470c8a22..e72917126 100644 --- a/bindings/shmem_bind_c.m4 +++ b/bindings/shmem_bind_c.m4 @@ -96,6 +96,45 @@ $1(int, int, `SHM_INTERNAL_INT', `$2', `$3') $1(long, long, `SHM_INTERNAL_LONG', `$2', `$3') $1(longlong, long long, `SHM_INTERNAL_LONG_LONG', `$2', `$3')')dnl dnl +define(`SHMEM_BIND_C_COLL_AND_OR_XOR', dnl args: macro_name, op_name, op_const +`$1(uchar, unsigned char, `SHM_INTERNAL_UCHAR', `$2', `$3') +$1(short, short, `SHM_INTERNAL_SHORT', `$2', `$3') +$1(ushort, unsigned short, `SHM_INTERNAL_USHORT', `$2', `$3') +$1(int, int, `SHM_INTERNAL_INT', `$2', `$3') +$1(uint, unsigned int, `SHM_INTERNAL_UINT', `$2', `$3') +$1(long, long, `SHM_INTERNAL_LONG', `$2', `$3') +$1(ulong, unsigned long, `SHM_INTERNAL_ULONG', `$2', `$3') +$1(longlong, long long, `SHM_INTERNAL_LONG_LONG', `$2', `$3') +$1(ulonglong, unsigned long long, `SHM_INTERNAL_ULONG_LONG', `$2', `$3')')dnl +dnl +define(`SHMEM_BIND_C_COLL_MIN_MAX', dnl args: macro_name, op_name, op_const +`$1(short, short, `SHM_INTERNAL_SHORT', `$2', `$3') +$1(ushort, unsigned short, `SHM_INTERNAL_USHORT', `$2', `$3') +$1(int, int, `SHM_INTERNAL_INT', `$2', `$3') +$1(uint, unsigned int, `SHM_INTERNAL_UINT', `$2', `$3') +$1(long, long, `SHM_INTERNAL_LONG', `$2', `$3') +$1(ulong, 
unsigned long, `SHM_INTERNAL_ULONG', `$2', `$3') +$1(longlong, long long, `SHM_INTERNAL_LONG_LONG', `$2', `$3') +$1(ulonglong, unsigned long long, `SHM_INTERNAL_ULONG_LONG', `$2', `$3') +$1(float, float, `SHM_INTERNAL_FLOAT', `$2', `$3') +$1(double, double, `SHM_INTERNAL_DOUBLE', `$2', `$3') +$1(longdouble, long double, `SHM_INTERNAL_LONG_DOUBLE', `$2', `$3')')dnl +dnl +define(`SHMEM_BIND_C_COLL_SUM_PROD', dnl args: macro_name, op_name, op_const +`$1(short, short, `SHM_INTERNAL_SHORT', `$2', `$3') +$1(ushort, unsigned short, `SHM_INTERNAL_USHORT', `$2', `$3') +$1(int, int, `SHM_INTERNAL_INT', `$2', `$3') +$1(uint, unsigned int, `SHM_INTERNAL_UINT', `$2', `$3') +$1(long, long, `SHM_INTERNAL_LONG', `$2', `$3') +$1(ulong, unsigned long, `SHM_INTERNAL_ULONG', `$2', `$3') +$1(longlong, long long, `SHM_INTERNAL_LONG_LONG', `$2', `$3') +$1(ulonglong, unsigned long long, `SHM_INTERNAL_ULONG_LONG', `$2', `$3') +$1(float, float, `SHM_INTERNAL_FLOAT', `$2', `$3') +$1(double, double, `SHM_INTERNAL_DOUBLE', `$2', `$3') +$1(longdouble, long double, `SHM_INTERNAL_LONG_DOUBLE', `$2', `$3') +$1(complexd, double _Complex, `SHM_INTERNAL_DOUBLE_COMPLEX', `$2', `$3') +$1(complexf, float _Complex, `SHM_INTERNAL_FLOAT_COMPLEX', `$2', `$3')')dnl +dnl define(`SHMEM_BIND_C_COLL_FLOATS', dnl args: macro_name, op_name, op_const `$1(float, float, `SHM_INTERNAL_FLOAT', `$2', `$3') $1(double, double, `SHM_INTERNAL_DOUBLE', `$2', `$3') diff --git a/bindings/shmem_bind_c11.m4 b/bindings/shmem_bind_c11.m4 index ba4c4e2c5..5ea077b7c 100644 --- a/bindings/shmem_bind_c11.m4 +++ b/bindings/shmem_bind_c11.m4 @@ -65,3 +65,43 @@ $1(ushort, unsigned short)$2 $1(uint, unsigned int)$2 $1(ulong, unsigned long)$2 $1(ulonglong, unsigned long long)')dnl +dnl +define(`SHMEM_BIND_C11_COLL_AND_OR_XOR', dnl args: macro_name, end +`$1(uchar, unsigned char, `SHM_INTERNAL_UCHAR')$2 +$1(short, short, `SHM_INTERNAL_SHORT')$2 +$1(ushort, unsigned short, `SHM_INTERNAL_USHORT')$2 +$1(int, int, 
`SHM_INTERNAL_INT')$2 +$1(uint, unsigned int, `SHM_INTERNAL_UINT')$2 +$1(long, long, `SHM_INTERNAL_LONG')$2 +$1(ulong, unsigned long, `SHM_INTERNAL_ULONG')$2 +$1(longlong, long long, `SHM_INTERNAL_LONG_LONG')$2 +$1(ulonglong, unsigned long long, `SHM_INTERNAL_ULONG_LONG')')dnl +dnl +define(`SHMEM_BIND_C11_COLL_MIN_MAX', dnl args: macro_name, end +`$1(short, short, `SHM_INTERNAL_SHORT')$2 +$1(ushort, unsigned short, `SHM_INTERNAL_USHORT')$2 +$1(int, int, `SHM_INTERNAL_INT')$2 +$1(uint, unsigned int, `SHM_INTERNAL_UINT')$2 +$1(long, long, `SHM_INTERNAL_LONG')$2 +$1(ulong, unsigned long, `SHM_INTERNAL_ULONG')$2 +$1(longlong, long long, `SHM_INTERNAL_LONG_LONG')$2 +$1(ulonglong, unsigned long long, `SHM_INTERNAL_ULONG_LONG')$2 +$1(float, float, `SHM_INTERNAL_FLOAT')$2 +$1(double, double, `SHM_INTERNAL_DOUBLE')$2 +$1(longdouble, long double, `SHM_INTERNAL_LONG_DOUBLE')')dnl +dnl +define(`SHMEM_BIND_C11_COLL_SUM_PROD', dnl args: macro_name, end +`$1(short, short, `SHM_INTERNAL_SHORT')$2 +$1(ushort, unsigned short, `SHM_INTERNAL_USHORT')$2 +$1(int, int, `SHM_INTERNAL_INT')$2 +$1(uint, unsigned int, `SHM_INTERNAL_UINT')$2 +$1(long, long, `SHM_INTERNAL_LONG')$2 +$1(ulong, unsigned long, `SHM_INTERNAL_ULONG')$2 +$1(longlong, long long, `SHM_INTERNAL_LONG_LONG')$2 +$1(ulonglong, unsigned long long, `SHM_INTERNAL_ULONG_LONG')$2 +$1(float, float, `SHM_INTERNAL_FLOAT')$2 +$1(double, double, `SHM_INTERNAL_DOUBLE')$2 +$1(longdouble, long double, `SHM_INTERNAL_LONG_DOUBLE')$2 +$1(complexd, double _Complex, `SHM_INTERNAL_DOUBLE_COMPLEX')$2 +$1(complexf, float _Complex, `SHM_INTERNAL_FLOAT_COMPLEX')')dnl +dnl diff --git a/bindings/shmem_bind_cxx.m4 b/bindings/shmem_bind_cxx.m4 index 394c03f94..b245bd3cb 100644 --- a/bindings/shmem_bind_cxx.m4 +++ b/bindings/shmem_bind_cxx.m4 @@ -55,7 +55,7 @@ $1(uint, unsigned int, `SHM_INTERNAL_UINT')$2 $1(ulong, unsigned long, `SHM_INTERNAL_ULONG')$2 $1(ulonglong, unsigned long long, `SHM_INTERNAL_ULONG_LONG')')dnl dnl 
-define(`SHMEM_BIND_CXX_SYNC', dnl args: macro_name +define(`SHMEM_BIND_CXX_SYNC', dnl args: macro_name, end SHMEM_BIND_CXX_SYNC_EXTRAS($1,$2)dnl `$1(short, short)$2 $1(int, int)$2 @@ -66,8 +66,50 @@ $1(uint, unsigned int)$2 $1(ulong, unsigned long)$2 $1(ulonglong, unsigned long long)')dnl dnl +define(`SHMEM_BIND_CXX_COLL_AND_OR_XOR', dnl args: macro_name, end +`$1(uchar, unsigned char, `SHM_INTERNAL_UCHAR')$2 +$1(short, short, `SHM_INTERNAL_SHORT')$2 +$1(ushort, unsigned short, `SHM_INTERNAL_USHORT')$2 +$1(int, int, `SHM_INTERNAL_INT')$2 +$1(uint, unsigned int, `SHM_INTERNAL_UINT')$2 +$1(long, long, `SHM_INTERNAL_LONG')$2 +$1(ulong, unsigned long, `SHM_INTERNAL_ULONG')$2 +$1(longlong, long long, `SHM_INTERNAL_LONG_LONG')$2 +$1(ulonglong, unsigned long long, `SHM_INTERNAL_ULONG_LONG')')dnl +dnl +define(`SHMEM_BIND_CXX_COLL_MIN_MAX', dnl args: macro_name, end +`$1(short, short, `SHM_INTERNAL_SHORT')$2 +$1(ushort, unsigned short, `SHM_INTERNAL_USHORT')$2 +$1(int, int, `SHM_INTERNAL_INT')$2 +$1(uint, unsigned int, `SHM_INTERNAL_UINT')$2 +$1(long, long, `SHM_INTERNAL_LONG')$2 +$1(ulong, unsigned long, `SHM_INTERNAL_ULONG')$2 +$1(longlong, long long, `SHM_INTERNAL_LONG_LONG')$2 +$1(ulonglong, unsigned long long, `SHM_INTERNAL_ULONG_LONG')$2 +$1(float, float, `SHM_INTERNAL_FLOAT')$2 +$1(double, double, `SHM_INTERNAL_DOUBLE')$2 +$1(longdouble, long double, `SHM_INTERNAL_LONG_DOUBLE')')dnl +dnl +define(`SHMEM_BIND_CXX_COLL_SUM_PROD', dnl args: macro_name, end +`$1(short, short, `SHM_INTERNAL_SHORT')$2 +$1(ushort, unsigned short, `SHM_INTERNAL_USHORT')$2 +$1(int, int, `SHM_INTERNAL_INT')$2 +$1(uint, unsigned int, `SHM_INTERNAL_UINT')$2 +$1(long, long, `SHM_INTERNAL_LONG')$2 +$1(ulong, unsigned long, `SHM_INTERNAL_ULONG')$2 +$1(longlong, long long, `SHM_INTERNAL_LONG_LONG')$2 +$1(ulonglong, unsigned long long, `SHM_INTERNAL_ULONG_LONG')$2 +$1(float, float, `SHM_INTERNAL_FLOAT')$2 +$1(double, double, `SHM_INTERNAL_DOUBLE')$2 +$1(longdouble, long 
double, `SHM_INTERNAL_LONG_DOUBLE')$2 +$1(complexd, double _Complex, `SHM_INTERNAL_DOUBLE_COMPLEX')$2 +$1(complexf, float _Complex, `SHM_INTERNAL_FLOAT_COMPLEX')')dnl +dnl define(`SHMEM_CXX_DEFINE_FOR_RMA', `SHMEM_BIND_CXX_RMA(`$1')')dnl define(`SHMEM_CXX_DEFINE_FOR_AMO', `SHMEM_BIND_CXX_AMO(`$1')')dnl define(`SHMEM_CXX_DEFINE_FOR_EXTENDED_AMO', `SHMEM_BIND_CXX_EXTENDED_AMO(`$1')')dnl define(`SHMEM_CXX_DEFINE_FOR_BITWISE_AMO', `SHMEM_BIND_CXX_BITWISE_AMO(`$1')')dnl define(`SHMEM_CXX_DEFINE_FOR_SYNC', `SHMEM_BIND_CXX_SYNC(`$1')')dnl +define(`SHMEM_CXX_DEFINE_FOR_COLL_AND_OR_XOR', `SHMEM_BIND_CXX_COLL_AND_OR_XOR(`$1')')dnl +define(`SHMEM_CXX_DEFINE_FOR_COLL_MIN_MAX', `SHMEM_BIND_CXX_COLL_MIN_MAX(`$1')')dnl +define(`SHMEM_CXX_DEFINE_FOR_COLL_SUM_PROD', `SHMEM_BIND_CXX_COLL_SUM_PROD(`$1')')dnl diff --git a/configure.ac b/configure.ac index 32c01c743..38d411c25 100755 --- a/configure.ac +++ b/configure.ac @@ -208,6 +208,14 @@ AS_CASE([$enable_ofi_mr], AC_DEFINE([ENABLE_MR_SCALABLE], [1], [If defined, the OFI transport will use FI_MR_SCALABLE])], [AC_MSG_ERROR([Invalid OFI memory registration mode: $enable_ofi_mr])]) +AC_ARG_ENABLE([max-teams], + [AC_HELP_STRING([--enable-max-teams=NUMBER], + [Default value for the maximum number of teams allowed (default: 10)])]) + +AS_IF([test -z "$enable_max_teams"], + [AC_DEFINE([DEFAULT_TEAMS_MAX], [10], [Maximum number of teams (default)])], + [AC_DEFINE_UNQUOTED([DEFAULT_TEAMS_MAX], [$enable_max_teams], [Maximum number of teams (custom)])]) + AC_ARG_ENABLE([rpm-prefix], [AC_HELP_STRING([--enable-rpm-prefix], [Generate RPM spec file that supports an alternate installation prefix (default:disabled)])]) @@ -482,6 +490,8 @@ SHMEM_FIND_INT_TYPE([long], [$transport]) SHMEM_FIND_INT_TYPE([long long], [$transport]) SHMEM_FIND_INT_TYPE([ptrdiff_t], [$transport]) +SHMEM_FIND_UINT_TYPE([unsigned char], [$transport]) +SHMEM_FIND_UINT_TYPE([unsigned short], [$transport]) SHMEM_FIND_UINT_TYPE([unsigned int], [$transport]) 
SHMEM_FIND_UINT_TYPE([unsigned long], [$transport]) SHMEM_FIND_UINT_TYPE([unsigned long long], [$transport]) diff --git a/mpp/shmemx-def.h b/mpp/shmemx-def.h index 1342d5250..92f39a0e7 100644 --- a/mpp/shmemx-def.h +++ b/mpp/shmemx-def.h @@ -22,8 +22,29 @@ typedef struct { uint64_t target; } shmemx_pcntr_t; +/* Teams */ +typedef struct shmem_impl_team_t { + int dummy; +} * shmemx_team_t; + +typedef struct { + int num_contexts; +} shmemx_team_config_t; + +#if SHMEM_HAVE_ATTRIBUTE_VISIBILITY == 1 + __attribute__((visibility("default"))) extern shmemx_team_t SHMEMX_TEAM_WORLD; + __attribute__((visibility("default"))) extern shmemx_team_t SHMEMX_TEAM_SHARED; +#else + extern shmemx_team_t SHMEMX_TEAM_WORLD; + extern shmemx_team_t SHMEMX_TEAM_SHARED; +#endif + +#define SHMEMX_TEAM_INVALID NULL + #define SHMEMX_CTX_INVALID NULL +#define SHMEMX_TEAM_NUM_CONTEXTS (1l<<0) + #ifdef __cplusplus } #endif diff --git a/mpp/shmemx.h4 b/mpp/shmemx.h4 index 9cd9f5d39..dfe8338c1 100644 --- a/mpp/shmemx.h4 +++ b/mpp/shmemx.h4 @@ -175,6 +175,77 @@ static inline void shmemx_put_signal_nbi(shmem_ctx_t ctx, $2* dest, const $2* so }')dnl SHMEM_CXX_DEFINE_FOR_RMA(`SHMEM_CXX_PUT_SIGNAL_NBI') +/* Team Collective Routines */ +define(`SHMEM_CXX_BCAST', +`static inline int shmemx_broadcast(shmemx_team_t team, $2* dest, const $2* source, + size_t nelems, int PE_root) { + return shmemx_$1_broadcast(team, dest, source, nelems, PE_root); +}')dnl +SHMEM_CXX_DEFINE_FOR_RMA(`SHMEM_CXX_BCAST') + +define(`SHMEM_CXX_COLLECT', +`static inline int shmemx_collect(shmemx_team_t team, $2* dest, const $2* source, + size_t nelems) { + return shmemx_$1_collect(team, dest, source, nelems); +}')dnl +SHMEM_CXX_DEFINE_FOR_RMA(`SHMEM_CXX_COLLECT') + +define(`SHMEM_CXX_FCOLLECT', +`static inline int shmemx_fcollect(shmemx_team_t team, $2* dest, const $2* source, + size_t nelems) { + return shmemx_$1_fcollect(team, dest, source, nelems); +}')dnl +SHMEM_CXX_DEFINE_FOR_RMA(`SHMEM_CXX_FCOLLECT') + 
+define(`SHMEM_CXX_AND_REDUCE', +`static inline int shmemx_and_reduce(shmemx_team_t team, $2* dest, const $2* source, + size_t nreduce) { + return shmemx_$1_and_reduce(team, dest, source, nreduce); +}')dnl +SHMEM_CXX_DEFINE_FOR_COLL_AND_OR_XOR(`SHMEM_CXX_AND_REDUCE') + +define(`SHMEM_CXX_MIN_REDUCE', +`static inline int shmemx_min_reduce(shmemx_team_t team, $2* dest, const $2* source, + size_t nreduce) { + return shmemx_$1_min_reduce(team, dest, source, nreduce); +}')dnl +SHMEM_CXX_DEFINE_FOR_COLL_MIN_MAX(`SHMEM_CXX_MIN_REDUCE') + +define(`SHMEM_CXX_MAX_REDUCE', +`static inline int shmemx_max_reduce(shmemx_team_t team, $2* dest, const $2* source, + size_t nreduce) { + return shmemx_$1_max_reduce(team, dest, source, nreduce); +}')dnl +SHMEM_CXX_DEFINE_FOR_COLL_MIN_MAX(`SHMEM_CXX_MAX_REDUCE') + +define(`SHMEM_CXX_SUM_REDUCE', +`static inline int shmemx_sum_reduce(shmemx_team_t team, $2* dest, const $2* source, + size_t nreduce) { + return shmemx_$1_sum_reduce(team, dest, source, nreduce); +}')dnl +SHMEM_CXX_DEFINE_FOR_COLL_SUM_PROD(`SHMEM_CXX_SUM_REDUCE') + +define(`SHMEM_CXX_PROD_REDUCE', +`static inline int shmemx_prod_reduce(shmemx_team_t team, $2* dest, const $2* source, + size_t nreduce) { + return shmemx_$1_prod_reduce(team, dest, source, nreduce); +}')dnl +SHMEM_CXX_DEFINE_FOR_COLL_SUM_PROD(`SHMEM_CXX_PROD_REDUCE') + +define(`SHMEM_CXX_ALLTOALL', +`static inline int shmemx_alltoall(shmemx_team_t team, $2* dest, const $2* source, + size_t nelems) { + return shmemx_$1_alltoall(team, dest, source, nelems); +}')dnl +SHMEM_CXX_DEFINE_FOR_RMA(`SHMEM_CXX_ALLTOALL') + +define(`SHMEM_CXX_ALLTOALLS', +`static inline int shmemx_alltoalls(shmemx_team_t team, $2* dest, const $2* source, + ptrdiff_t dst, ptrdiff_t sst, size_t nelems) { + return shmemx_$1_alltoalls(team, dest, source, dst, sst, nelems); +}')dnl +SHMEM_CXX_DEFINE_FOR_RMA(`SHMEM_CXX_ALLTOALLS') + /* C11 Generic Macros */ #elif (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && 
!defined(SHMEM_INTERNAL_INCLUDE)) @@ -348,6 +419,79 @@ SHMEM_BIND_C11_RMA(`SHMEM_CTX_C11_GEN_PUT_SIGNAL_NBI', `, \') \ SHMEM_BIND_C11_RMA(`SHMEM_C11_GEN_PUT_SIGNAL_NBI', `, \') \ )(__VA_ARGS__) +/* Team Collective Routines */ +define(`SHMEM_C11_GEN_BCAST', ` $2*: shmemx_$1_broadcast')dnl +#define shmemx_broadcast(...) \ + _Generic(SHMEM_C11_TYPE_EVAL_PTR(SHMEM_C11_ARG1(__VA_ARGS__)), \ +SHMEM_BIND_C11_RMA(`SHMEM_C11_GEN_BCAST', `, \') \ + )(__VA_ARGS__) + +define(`SHMEM_C11_GEN_COLLECT', ` $2*: shmemx_$1_collect')dnl +#define shmemx_collect(...) \ + _Generic(SHMEM_C11_TYPE_EVAL_PTR(SHMEM_C11_ARG1(__VA_ARGS__)), \ +SHMEM_BIND_C11_RMA(`SHMEM_C11_GEN_COLLECT', `, \') \ + )(__VA_ARGS__) + +define(`SHMEM_C11_GEN_FCOLLECT', ` $2*: shmemx_$1_fcollect')dnl +#define shmemx_fcollect(...) \ + _Generic(SHMEM_C11_TYPE_EVAL_PTR(SHMEM_C11_ARG1(__VA_ARGS__)), \ +SHMEM_BIND_C11_RMA(`SHMEM_C11_GEN_FCOLLECT', `, \') \ + )(__VA_ARGS__) + +define(`SHMEM_C11_GEN_AND_REDUCE', ` $2*: shmemx_$1_and_reduce')dnl +#define shmemx_and_reduce(...) \ + _Generic(SHMEM_C11_TYPE_EVAL_PTR(SHMEM_C11_ARG1(__VA_ARGS__)), \ +SHMEM_BIND_C11_COLL_AND_OR_XOR(`SHMEM_C11_GEN_AND_REDUCE', `, \') \ + )(__VA_ARGS__) + +define(`SHMEM_C11_GEN_OR_REDUCE', ` $2*: shmemx_$1_or_reduce')dnl +#define shmemx_or_reduce(...) \ + _Generic(SHMEM_C11_TYPE_EVAL_PTR(SHMEM_C11_ARG1(__VA_ARGS__)), \ +SHMEM_BIND_C11_COLL_AND_OR_XOR(`SHMEM_C11_GEN_OR_REDUCE', `, \') \ + )(__VA_ARGS__) + +define(`SHMEM_C11_GEN_XOR_REDUCE', ` $2*: shmemx_$1_xor_reduce')dnl +#define shmemx_xor_reduce(...) \ + _Generic(SHMEM_C11_TYPE_EVAL_PTR(SHMEM_C11_ARG1(__VA_ARGS__)), \ +SHMEM_BIND_C11_COLL_AND_OR_XOR(`SHMEM_C11_GEN_XOR_REDUCE', `, \') \ + )(__VA_ARGS__) + +define(`SHMEM_C11_GEN_MIN_REDUCE', ` $2*: shmemx_$1_min_reduce')dnl +#define shmemx_min_reduce(...) 
\ + _Generic(SHMEM_C11_TYPE_EVAL_PTR(SHMEM_C11_ARG1(__VA_ARGS__)), \ +SHMEM_BIND_C11_COLL_MIN_MAX(`SHMEM_C11_GEN_MIN_REDUCE', `, \') \ + )(__VA_ARGS__) + +define(`SHMEM_C11_GEN_MAX_REDUCE', ` $2*: shmemx_$1_max_reduce')dnl +#define shmemx_max_reduce(...) \ + _Generic(SHMEM_C11_TYPE_EVAL_PTR(SHMEM_C11_ARG1(__VA_ARGS__)), \ +SHMEM_BIND_C11_COLL_MIN_MAX(`SHMEM_C11_GEN_MAX_REDUCE', `, \') \ + )(__VA_ARGS__) + +define(`SHMEM_C11_GEN_SUM_REDUCE', ` $2*: shmemx_$1_sum_reduce')dnl +#define shmemx_sum_reduce(...) \ + _Generic(SHMEM_C11_TYPE_EVAL_PTR(SHMEM_C11_ARG1(__VA_ARGS__)), \ +SHMEM_BIND_C11_COLL_SUM_PROD(`SHMEM_C11_GEN_SUM_REDUCE', `, \') \ + )(__VA_ARGS__) + +define(`SHMEM_C11_GEN_PROD_REDUCE', ` $2*: shmemx_$1_prod_reduce')dnl +#define shmemx_prod_reduce(...) \ + _Generic(SHMEM_C11_TYPE_EVAL_PTR(SHMEM_C11_ARG1(__VA_ARGS__)), \ +SHMEM_BIND_C11_COLL_SUM_PROD(`SHMEM_C11_GEN_PROD_REDUCE', `, \') \ + )(__VA_ARGS__) + +define(`SHMEM_C11_GEN_ALLTOALL', ` $2*: shmemx_$1_alltoall')dnl +#define shmemx_alltoall(...) \ + _Generic(SHMEM_C11_TYPE_EVAL_PTR(SHMEM_C11_ARG1(__VA_ARGS__)), \ +SHMEM_BIND_C11_RMA(`SHMEM_C11_GEN_ALLTOALL', `, \') \ + )(__VA_ARGS__) + +define(`SHMEM_C11_GEN_ALLTOALLS', ` $2*: shmemx_$1_alltoalls')dnl +#define shmemx_alltoalls(...) 
\ + _Generic(SHMEM_C11_TYPE_EVAL_PTR(SHMEM_C11_ARG1(__VA_ARGS__)), \ +SHMEM_BIND_C11_RMA(`SHMEM_C11_GEN_ALLTOALLS', `, \') \ + )(__VA_ARGS__) + #endif /* C11 */ #endif /* SHMEMX_H */ diff --git a/mpp/shmemx_c_func.h4 b/mpp/shmemx_c_func.h4 index 606de6675..b22b906ee 100644 --- a/mpp/shmemx_c_func.h4 +++ b/mpp/shmemx_c_func.h4 @@ -166,3 +166,68 @@ define(`SHMEM_C_CTX_PUT_N_SIGNAL_NBI', SHMEM_DECLARE_FOR_SIZES(`SHMEM_C_CTX_PUT_N_SIGNAL_NBI') SHMEM_C_CTX_PUT_N_SIGNAL_NBI(mem,1); + +/* Team Management Routines */ +SHMEM_FUNCTION_ATTRIBUTES int SHPRE()shmemx_team_my_pe(shmemx_team_t team); + +SHMEM_FUNCTION_ATTRIBUTES int SHPRE()shmemx_team_n_pes(shmemx_team_t team); + +SHMEM_FUNCTION_ATTRIBUTES void SHPRE()shmemx_team_get_config(shmemx_team_t team, shmemx_team_config_t *config); + +SHMEM_FUNCTION_ATTRIBUTES int SHPRE()shmemx_team_translate_pe(shmemx_team_t src_team, int src_pe, shmemx_team_t dest_team); + +SHMEM_FUNCTION_ATTRIBUTES int SHPRE()shmemx_team_split_strided(shmemx_team_t parent_team, int PE_start, int PE_stride, int PE_size, const shmemx_team_config_t *config, long config_mask, shmemx_team_t *new_team); + +SHMEM_FUNCTION_ATTRIBUTES int SHPRE()shmemx_team_split_2d(shmemx_team_t parent_team, int xrange, const shmemx_team_config_t *xaxis_config, long xaxis_mask, shmemx_team_t *xaxis_team, const shmemx_team_config_t *yaxis_config, long yaxis_mask, shmemx_team_t *yaxis_team); + +SHMEM_FUNCTION_ATTRIBUTES int SHPRE()shmemx_team_destroy(shmemx_team_t team); + +SHMEM_FUNCTION_ATTRIBUTES int SHPRE()shmemx_team_create_ctx(shmemx_team_t team, long options, shmem_ctx_t *ctx); + +SHMEM_FUNCTION_ATTRIBUTES int SHPRE()shmemx_ctx_get_team(shmem_ctx_t ctx, shmemx_team_t *team); + +SHMEM_FUNCTION_ATTRIBUTES int SHPRE()shmemx_sync(shmemx_team_t team); +SHMEM_FUNCTION_ATTRIBUTES int SHPRE()shmemx_team_sync(shmemx_team_t team); + +/* Team Collective Routines */ +define(`SHMEM_C_BCAST', +`SHMEM_FUNCTION_ATTRIBUTES int SHPRE()shmemx_$1_broadcast(shmemx_team_t team, $2 *dest, const 
$2 *source, size_t nelems, int PE_root)')dnl +SHMEM_DECLARE_FOR_RMA(`SHMEM_C_BCAST') +SHMEM_FUNCTION_ATTRIBUTES int SHPRE()shmemx_broadcastmem(shmemx_team_t team, void *dest, const void *source, size_t nelems, int PE_root); + +define(`SHMEM_C_COLLECT', +`SHMEM_FUNCTION_ATTRIBUTES int SHPRE()shmemx_$1_collect(shmemx_team_t team, $2 *dest, const $2 *source, size_t nelems)')dnl +SHMEM_DECLARE_FOR_RMA(`SHMEM_C_COLLECT') +SHMEM_FUNCTION_ATTRIBUTES int SHPRE()shmemx_collectmem(shmemx_team_t team, void *dest, const void *source, size_t nelems); + +define(`SHMEM_C_FCOLLECT', +`SHMEM_FUNCTION_ATTRIBUTES int SHPRE()shmemx_$1_fcollect(shmemx_team_t team, $2 *dest, const $2 *source, size_t nelems)')dnl +SHMEM_DECLARE_FOR_RMA(`SHMEM_C_FCOLLECT') +SHMEM_FUNCTION_ATTRIBUTES int SHPRE()shmemx_fcollectmem(shmemx_team_t team, void *dest, const void *source, size_t nelems); + +define(`SHMEM_C_REDUCE', +`SHMEM_FUNCTION_ATTRIBUTES int SHPRE()shmemx_$1_$4_reduce(shmemx_team_t team, $2 *dest, const $2 *source, size_t nreduce);')dnl +SHMEM_BIND_C_COLL_AND_OR_XOR(`SHMEM_C_REDUCE', `and') + +SHMEM_BIND_C_COLL_AND_OR_XOR(`SHMEM_C_REDUCE', `or') + +SHMEM_BIND_C_COLL_AND_OR_XOR(`SHMEM_C_REDUCE', `xor') + +SHMEM_BIND_C_COLL_MIN_MAX(`SHMEM_C_REDUCE', `min') + +SHMEM_BIND_C_COLL_MIN_MAX(`SHMEM_C_REDUCE', `max') + +SHMEM_BIND_C_COLL_SUM_PROD(`SHMEM_C_REDUCE', `sum') + +SHMEM_BIND_C_COLL_SUM_PROD(`SHMEM_C_REDUCE', `prod') + +define(`SHMEM_C_ALLTOALL', +`SHMEM_FUNCTION_ATTRIBUTES int SHPRE()shmemx_$1_alltoall(shmemx_team_t team, $2 *dest, const $2 *source, size_t nelems)')dnl +SHMEM_DECLARE_FOR_RMA(`SHMEM_C_ALLTOALL') +SHMEM_FUNCTION_ATTRIBUTES int SHPRE()shmemx_alltoallmem(shmemx_team_t team, void *dest, const void *source, size_t nelems); + +define(`SHMEM_C_ALLTOALLS', +`SHMEM_FUNCTION_ATTRIBUTES int SHPRE()shmemx_$1_alltoalls(shmemx_team_t team, $2 *dest, const $2 *source, ptrdiff_t dst, ptrdiff_t sst, size_t nelems)')dnl +SHMEM_DECLARE_FOR_RMA(`SHMEM_C_ALLTOALLS') +SHMEM_FUNCTION_ATTRIBUTES int 
SHPRE()shmemx_alltoallsmem(shmemx_team_t team, void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, size_t nelems); + diff --git a/src/Makefile.am b/src/Makefile.am index d37867728..d8c35c98d 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -18,7 +18,9 @@ GEN_BINDINGS = \ atomic_nbi_c.c \ collectives_c.c \ data_c.c \ - synchronization_c.c + synchronization_c.c \ + teams_c.c + if HAVE_FORTRAN GEN_BINDINGS += \ @@ -61,7 +63,9 @@ libsma_la_SOURCES = \ contexts_c.c \ contexts.c \ perf_counters_c.c \ - backtrace.c + backtrace.c \ + shmem_team.c \ + shmem_team.h nodist_libsma_la_SOURCES = \ $(GEN_BINDINGS) @@ -168,7 +172,8 @@ EXTRA_DIST = shmem_compiler_script.in \ collectives_f.c4 \ data_c.c4 \ data_f.c4 \ - synchronization_c.c4 + synchronization_c.c4 \ + teams_c.c4 do_subst = sed -e 's|[@]PERL[@]|$(PERL)|g' \ -e 's|[@]WRAPPER_COMPILER_CC[@]|$(WRAPPER_COMPILER_CC)|g' \ diff --git a/src/atomic_c.c4 b/src/atomic_c.c4 index cce1fa799..7822b59f1 100644 --- a/src/atomic_c.c4 +++ b/src/atomic_c.c4 @@ -28,6 +28,7 @@ include(shmem_bind_c.m4)dnl #include "shmem.h" #include "shmem_internal.h" #include "shmem_comm.h" +#include "shmem_team.h" #ifdef ENABLE_PROFILING #include "pshmem.h" @@ -526,6 +527,7 @@ SHMEM_DEFINE_FOR_BITWISE_AMO(SHMEM_DEF_XOR) /* Function prototype for v1.4 routines with contexts: */ #define SHMEM_FUNC_PROTOTYPE(TYPE, OP, ...) 
\ shmem_ctx_##TYPE##_atomic_##OP(shmem_ctx_t ctx, __VA_ARGS__) { \ + pe = shmem_internal_team_pe(((shmem_transport_ctx_t *) ctx)->team, pe); SHMEM_DEFINE_FOR_EXTENDED_AMO(SHMEM_DEF_FETCH) SHMEM_DEFINE_FOR_EXTENDED_AMO(SHMEM_DEF_SET) diff --git a/src/atomic_nbi_c.c4 b/src/atomic_nbi_c.c4 index 66053ed5e..59dd0178e 100644 --- a/src/atomic_nbi_c.c4 +++ b/src/atomic_nbi_c.c4 @@ -28,6 +28,7 @@ include(shmem_bind_c.m4)dnl #include "shmem.h" #include "shmem_internal.h" #include "shmem_comm.h" +#include "shmem_team.h" #ifdef ENABLE_PROFILING #include "pshmem.h" @@ -241,10 +242,12 @@ SHMEM_DEFINE_FOR_BITWISE_AMO(SHMEM_DEF_FETCH_XOR_NBI) /* Function prototype for v1.4 routines with contexts: */ #define SHMEM_FUNC_PROTOTYPE(TYPE, OP, ...) \ shmem_ctx_##TYPE##_atomic_##OP(shmem_ctx_t ctx, __VA_ARGS__) { \ + pe = shmem_internal_team_pe(((shmem_transport_ctx_t *) ctx)->team, pe); /* Function prototype for v1.4 routines with contexts: */ #define SHMEMX_FUNC_PROTOTYPE(TYPE, OP, ...) \ shmemx_ctx_##TYPE##_atomic_##OP(shmem_ctx_t ctx, __VA_ARGS__) { \ + pe = shmem_internal_team_pe(((shmem_transport_ctx_t *) ctx)->team, pe); SHMEM_DEFINE_FOR_EXTENDED_AMO(SHMEM_DEF_FETCH_NBI) SHMEM_DEFINE_FOR_AMO(SHMEM_DEF_COMPARE_SWAP_NBI) diff --git a/src/collectives.c b/src/collectives.c index 6ecf743cd..869eaf779 100644 --- a/src/collectives.c +++ b/src/collectives.c @@ -94,13 +94,12 @@ shmem_internal_build_kary_tree(int radix, int PE_start, int stride, /* Circulator iterator for PE active sets */ static inline int -shmem_internal_circular_iter_next(int curr, int PE_start, int logPE_stride, int PE_size) +shmem_internal_circular_iter_next(int curr, int PE_start, int PE_stride, int PE_size) { - const int stride = 1 << logPE_stride; - const int last = PE_start + (stride * (PE_size - 1)); + const int last = PE_start + (PE_stride * (PE_size - 1)); int next; - next = curr + stride; + next = curr + PE_stride; if (next > last) next = PE_start; @@ -245,10 +244,9 @@ shmem_internal_collectives_init(void) * 
*****************************************/ void -shmem_internal_sync_linear(int PE_start, int logPE_stride, int PE_size, long *pSync) +shmem_internal_sync_linear(int PE_start, int PE_stride, int PE_size, long *pSync) { long zero = 0, one = 1; - int stride = 1 << logPE_stride; /* need 1 slot */ shmem_internal_assert(SHMEM_BARRIER_SYNC_SIZE >= 1); @@ -265,9 +263,9 @@ shmem_internal_sync_linear(int PE_start, int logPE_stride, int PE_size, long *pS SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); /* Send acks down psync tree */ - for (pe = PE_start + stride, i = 1 ; + for (pe = PE_start + PE_stride, i = 1 ; i < PE_size ; - i++, pe += stride) { + i++, pe += PE_stride) { shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), pe); } @@ -289,10 +287,9 @@ shmem_internal_sync_linear(int PE_start, int logPE_stride, int PE_size, long *pS void -shmem_internal_sync_tree(int PE_start, int logPE_stride, int PE_size, long *pSync) +shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync) { long zero = 0, one = 1; - int stride = 1 << logPE_stride; int parent, num_children, *children; /* need 1 slot */ @@ -305,7 +302,7 @@ shmem_internal_sync_tree(int PE_start, int logPE_stride, int PE_size, long *pSyn children = full_tree_children; } else { children = alloca(sizeof(int) * tree_radix); - shmem_internal_build_kary_tree(tree_radix, PE_start, stride, PE_size, + shmem_internal_build_kary_tree(tree_radix, PE_start, PE_stride, PE_size, 0, &parent, &num_children, children); } @@ -371,12 +368,11 @@ shmem_internal_sync_tree(int PE_start, int logPE_stride, int PE_size, long *pSyn void -shmem_internal_sync_dissem(int PE_start, int logPE_stride, int PE_size, long *pSync) +shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync) { int one = 1, neg_one = -1; - int stride = 1 << logPE_stride; int distance, to, i; - int coll_rank = (shmem_internal_my_pe - PE_start) / stride; + int coll_rank = (shmem_internal_my_pe - PE_start) / PE_stride; int 
*pSync_ints = (int*) pSync; /* need log2(num_procs) int slots. max_num_procs is @@ -390,7 +386,7 @@ shmem_internal_sync_dissem(int PE_start, int logPE_stride, int PE_size, long *pS for (i = 0, distance = 1 ; distance < PE_size ; ++i, distance <<= 1) { to = ((coll_rank + distance) % PE_size); - to = PE_start + (to * stride); + to = PE_start + (to * PE_stride); shmem_internal_atomic(SHMEM_CTX_DEFAULT, &pSync_ints[i], &one, sizeof(int), to, SHM_INTERNAL_SUM, SHM_INTERNAL_INT); @@ -418,12 +414,11 @@ shmem_internal_sync_dissem(int PE_start, int logPE_stride, int PE_size, long *pS *****************************************/ void shmem_internal_bcast_linear(void *target, const void *source, size_t len, - int PE_root, int PE_start, int logPE_stride, int PE_size, + int PE_root, int PE_start, int PE_stride, int PE_size, long *pSync, int complete) { long zero = 0, one = 1; - int stride = 1 << logPE_stride; - int real_root = PE_start + PE_root * stride; + int real_root = PE_start + PE_root * PE_stride; long completion = 0; /* need 1 slot */ @@ -435,7 +430,7 @@ shmem_internal_bcast_linear(void *target, const void *source, size_t len, int i, pe; /* send data to all peers */ - for (pe = PE_start,i=0; i < PE_size; pe += stride, i++) { + for (pe = PE_start,i=0; i < PE_size; pe += PE_stride, i++) { if (pe == shmem_internal_my_pe) continue; shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, source, len, pe, &completion); } @@ -444,7 +439,7 @@ shmem_internal_bcast_linear(void *target, const void *source, size_t len, shmem_internal_fence(SHMEM_CTX_DEFAULT); /* send completion ack to all peers */ - for (pe = PE_start,i=0; i < PE_size; pe += stride, i++) { + for (pe = PE_start,i=0; i < PE_size; pe += PE_stride, i++) { if (pe == shmem_internal_my_pe) continue; shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(long), pe); } @@ -479,11 +474,10 @@ shmem_internal_bcast_linear(void *target, const void *source, size_t len, void shmem_internal_bcast_tree(void *target, const void 
*source, size_t len, - int PE_root, int PE_start, int logPE_stride, int PE_size, + int PE_root, int PE_start, int PE_stride, int PE_size, long *pSync, int complete) { long zero = 0, one = 1; - int stride = 1 << logPE_stride; long completion = 0; int parent, num_children, *children; const void *send_buf = source; @@ -500,7 +494,7 @@ shmem_internal_bcast_tree(void *target, const void *source, size_t len, children = full_tree_children; } else { children = alloca(sizeof(int) * tree_radix); - shmem_internal_build_kary_tree(tree_radix, PE_start, stride, PE_size, + shmem_internal_build_kary_tree(tree_radix, PE_start, PE_stride, PE_size, PE_root, &parent, &num_children, children); } @@ -572,12 +566,12 @@ shmem_internal_bcast_tree(void *target, const void *source, size_t len, * *****************************************/ void -shmem_internal_op_to_all_linear(void *target, const void *source, int count, int type_size, - int PE_start, int logPE_stride, int PE_size, +shmem_internal_op_to_all_linear(void *target, const void *source, size_t count, size_t type_size, + int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, shm_internal_op_t op, shm_internal_datatype_t datatype) { - int stride = 1 << logPE_stride; + long zero = 0, one = 1; long completion = 0; @@ -597,9 +591,9 @@ shmem_internal_op_to_all_linear(void *target, const void *source, int count, int shmem_internal_quiet(SHMEM_CTX_DEFAULT); /* let everyone know that it's safe to send to us */ - for (pe = PE_start + stride, i = 1 ; + for (pe = PE_start + PE_stride, i = 1 ; i < PE_size ; - i++, pe += stride) { + i++, pe += PE_stride) { shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), pe); } @@ -629,8 +623,8 @@ shmem_internal_op_to_all_linear(void *target, const void *source, int count, int } /* broadcast out */ - shmem_internal_bcast(target, target, count * type_size, 0, PE_start, - logPE_stride, PE_size, pSync + 2, 0); + shmem_internal_bcast(target, target, count * type_size, 0, + PE_start, 
PE_stride, PE_size, pSync + 2, 0); } @@ -638,16 +632,15 @@ shmem_internal_op_to_all_linear(void *target, const void *source, int count, int (count_)/(npes_) + ((id_) < (count_) % (_npes)) void -shmem_internal_op_to_all_ring(void *target, const void *source, int count, int type_size, - int PE_start, int logPE_stride, int PE_size, +shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, size_t type_size, + int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, shm_internal_op_t op, shm_internal_datatype_t datatype) { - int stride = 1 << logPE_stride; - int group_rank = (shmem_internal_my_pe - PE_start) / stride; + int group_rank = (shmem_internal_my_pe - PE_start) / PE_stride; long zero = 0, one = 1; - int peer = PE_start + ((group_rank + 1) % PE_size) * stride; + int peer = PE_start + ((group_rank + 1) % PE_size) * PE_stride; int free_source = 0; /* One slot for reduce-scatter and another for the allgather */ @@ -667,13 +660,13 @@ shmem_internal_op_to_all_ring(void *target, const void *source, int count, int t void *tmp = malloc(count * type_size); if (NULL == tmp) - RAISE_ERROR_MSG("Unable to allocate %db temporary buffer\n", count*type_size); + RAISE_ERROR_MSG("Unable to allocate %zub temporary buffer\n", count*type_size); memcpy(tmp, target, count*type_size); free_source = 1; source = tmp; - shmem_internal_sync(PE_start, logPE_stride, PE_size, pSync + 2); + shmem_internal_sync(PE_start, PE_stride, PE_size, pSync + 2); } /* Perform reduce-scatter: @@ -685,8 +678,8 @@ shmem_internal_op_to_all_ring(void *target, const void *source, int count, int t * corresponding to its PE id + 1. 
*/ for (int i = 0; i < PE_size - 1; i++) { - int chunk_in = (group_rank - i - 1 + PE_size) % PE_size; - int chunk_out = (group_rank - i + PE_size) % PE_size; + size_t chunk_in = (group_rank - i - 1 + PE_size) % PE_size; + size_t chunk_out = (group_rank - i + PE_size) % PE_size; /* Evenly distribute extra elements across first count % PE_size chunks */ size_t chunk_in_extra = chunk_in < count % PE_size; @@ -730,7 +723,7 @@ shmem_internal_op_to_all_ring(void *target, const void *source, int count, int t * around the ring until all PEs have all chunks. */ for (int i = 0; i < PE_size - 1; i++) { - int chunk_out = (group_rank + 1 - i + PE_size) % PE_size; + size_t chunk_out = (group_rank + 1 - i + PE_size) % PE_size; size_t chunk_out_extra = chunk_out < count % PE_size; size_t chunk_out_count = count/PE_size + chunk_out_extra; size_t chunk_out_disp = chunk_out_extra ? @@ -759,12 +752,11 @@ shmem_internal_op_to_all_ring(void *target, const void *source, int count, int t void -shmem_internal_op_to_all_tree(void *target, const void *source, int count, int type_size, - int PE_start, int logPE_stride, int PE_size, +shmem_internal_op_to_all_tree(void *target, const void *source, size_t count, size_t type_size, + int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, shm_internal_op_t op, shm_internal_datatype_t datatype) { - int stride = 1 << logPE_stride; long zero = 0, one = 1; long completion = 0; int parent, num_children, *children; @@ -788,7 +780,7 @@ shmem_internal_op_to_all_tree(void *target, const void *source, int count, int t children = full_tree_children; } else { children = alloca(sizeof(int) * tree_radix); - shmem_internal_build_kary_tree(tree_radix, PE_start, stride, PE_size, + shmem_internal_build_kary_tree(tree_radix, PE_start, PE_stride, PE_size, 0, &parent, &num_children, children); } @@ -838,21 +830,20 @@ shmem_internal_op_to_all_tree(void *target, const void *source, int count, int t /* broadcast out */ shmem_internal_bcast(target, target, 
count * type_size, 0, PE_start, - logPE_stride, PE_size, pSync + 2, 0); + PE_stride, PE_size, pSync + 2, 0); } void -shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, int count, int type_size, - int PE_start, int logPE_stride, int PE_size, +shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t count, size_t type_size, + int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, shm_internal_op_t op, shm_internal_datatype_t datatype) { - int stride = 1 << logPE_stride; - int my_id = ((shmem_internal_my_pe - PE_start) / stride); + int my_id = ((shmem_internal_my_pe - PE_start) / PE_stride); int log2_proc = 1, pow2_proc = 2; int i = PE_size >> 1; - int wrk_size = type_size*count; + size_t wrk_size = type_size*count; void * const current_target = malloc(wrk_size); long completion = 0; long * pSync_extra_peer = pSync + SHMEM_REDUCE_SYNC_SIZE - 2; @@ -884,7 +875,7 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, int count, if (current_target) memcpy(current_target, (void *) source, wrk_size); else - RAISE_ERROR_MSG("Failed to allocate current_target (count=%d, type_size=%d, size=%dB)\n", + RAISE_ERROR_MSG("Failed to allocate current_target (count=%zu, type_size=%zu, size=%zuB)\n", count, type_size, wrk_size); /* Algorithm: reduce N number of PE's into a power of two recursive @@ -899,7 +890,7 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, int count, /* extra peer exchange: grab information from extra_peer so its part of * pairwise exchange */ if (my_id >= pow2_proc) { - int peer = (my_id - pow2_proc) * stride + PE_start; + int peer = (my_id - pow2_proc) * PE_stride + PE_start; /* Wait for target ready, required when source and target overlap */ SHMEM_WAIT_UNTIL(pSync_extra_peer, SHMEM_CMP_EQ, ps_target_ready); @@ -914,7 +905,7 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, int count, } else { if (my_id < PE_size - pow2_proc) { - int peer = (my_id + 
pow2_proc) * stride + PE_start; + int peer = (my_id + pow2_proc) * PE_stride + PE_start; shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync_extra_peer, &ps_target_ready, sizeof(long), peer); SHMEM_WAIT_UNTIL(pSync_extra_peer, SHMEM_CMP_EQ, ps_data_ready); @@ -927,7 +918,7 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, int count, for (i = 0; i < log2_proc; i++) { long *step_psync = &pSync[i]; - int peer = (my_id ^ (1 << i)) * stride + PE_start; + int peer = (my_id ^ (1 << i)) * PE_stride + PE_start; if (shmem_internal_my_pe < peer) { shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, step_psync, &ps_target_ready, @@ -960,7 +951,7 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, int count, /* update extra peer with the final result from the pairwise exchange */ if (my_id < PE_size - pow2_proc) { - int peer = (my_id + pow2_proc) * stride + PE_start; + int peer = (my_id + pow2_proc) * PE_stride + PE_start; shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, current_target, wrk_size, peer, &completion); @@ -987,9 +978,8 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, int count, *****************************************/ void shmem_internal_collect_linear(void *target, const void *source, size_t len, - int PE_start, int logPE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync) { - int stride = 1 << logPE_stride; size_t my_offset; long tmp[2]; int peer, start_pe, i; @@ -997,8 +987,8 @@ shmem_internal_collect_linear(void *target, const void *source, size_t len, /* Need 2 for lengths and barrier for completion */ shmem_internal_assert(SHMEM_COLLECT_SYNC_SIZE >= 2 + SHMEM_BARRIER_SYNC_SIZE); - DEBUG_MSG("target=%p, source=%p, len=%zd, PE_Start=%d, logPE_stride=%d, PE_size=%d, pSync=%p\n", - target, source, len, PE_start, logPE_stride, PE_size, (void*) pSync); + DEBUG_MSG("target=%p, source=%p, len=%zd, PE_Start=%d, PE_stride=%d, PE_size=%d, pSync=%p\n", + target, source, 
len, PE_start, PE_stride, PE_size, (void*) pSync); if (PE_size == 1) { if (target != source) memcpy(target, source, len); @@ -1010,7 +1000,7 @@ shmem_internal_collect_linear(void *target, const void *source, size_t len, my_offset = 0; tmp[0] = (long) len; /* FIXME: Potential truncation of size_t into long */ tmp[1] = 1; /* FIXME: Packing flag with data relies on byte ordering */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, tmp, 2 * sizeof(long), PE_start + stride); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, tmp, 2 * sizeof(long), PE_start + PE_stride); } else { /* wait for send data */ @@ -1018,17 +1008,17 @@ shmem_internal_collect_linear(void *target, const void *source, size_t len, my_offset = pSync[0]; /* Not the last guy, so send offset to next PE */ - if (shmem_internal_my_pe < PE_start + stride * (PE_size - 1)) { + if (shmem_internal_my_pe < PE_start + PE_stride * (PE_size - 1)) { tmp[0] = (long) (my_offset + len); tmp[1] = 1; shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, tmp, 2 * sizeof(long), - shmem_internal_my_pe + stride); + shmem_internal_my_pe + PE_stride); } } /* Send data round-robin, ending with my PE */ start_pe = shmem_internal_circular_iter_next(shmem_internal_my_pe, - PE_start, logPE_stride, + PE_start, PE_stride, PE_size); peer = start_pe; do { @@ -1036,11 +1026,11 @@ shmem_internal_collect_linear(void *target, const void *source, size_t len, shmem_internal_put_nbi(SHMEM_CTX_DEFAULT, ((uint8_t *) target) + my_offset, source, len, peer); } - peer = shmem_internal_circular_iter_next(peer, PE_start, logPE_stride, + peer = shmem_internal_circular_iter_next(peer, PE_start, PE_stride, PE_size); } while (peer != start_pe); - shmem_internal_barrier(PE_start, logPE_stride, PE_size, &pSync[2]); + shmem_internal_barrier(PE_start, PE_stride, PE_size, &pSync[2]); pSync[0] = SHMEM_SYNC_VALUE; pSync[1] = SHMEM_SYNC_VALUE; @@ -1057,10 +1047,9 @@ shmem_internal_collect_linear(void *target, const void *source, size_t len, 
*****************************************/ void shmem_internal_fcollect_linear(void *target, const void *source, size_t len, - int PE_start, int logPE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync) { long tmp = 1; - int stride = 1 << logPE_stride; long completion = 0; /* need 1 slot, plus bcast */ @@ -1083,7 +1072,7 @@ shmem_internal_fcollect_linear(void *target, const void *source, size_t len, SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } else { /* Push data into the target */ - size_t offset = ((shmem_internal_my_pe - PE_start) / stride) * len; + size_t offset = ((shmem_internal_my_pe - PE_start) / PE_stride) * len; shmem_internal_put_nb(SHMEM_CTX_DEFAULT, (char*) target + offset, source, len, PE_start, &completion); shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); @@ -1096,7 +1085,7 @@ shmem_internal_fcollect_linear(void *target, const void *source, size_t len, PE_start, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); } - shmem_internal_bcast(target, target, len * PE_size, 0, PE_start, logPE_stride, + shmem_internal_bcast(target, target, len * PE_size, 0, PE_start, PE_stride, PE_size, pSync + 1, 0); } @@ -1110,14 +1099,13 @@ shmem_internal_fcollect_linear(void *target, const void *source, size_t len, */ void shmem_internal_fcollect_ring(void *target, const void *source, size_t len, - int PE_start, int logPE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync) { - int stride = 1 << logPE_stride; int i; /* my_id is the index in a theoretical 0...N-1 array of participating tasks */ - int my_id = ((shmem_internal_my_pe - PE_start) / stride); - int next_proc = PE_start + ((my_id + 1) % PE_size) * stride; + int my_id = ((shmem_internal_my_pe - PE_start) / PE_stride); + int next_proc = PE_start + ((my_id + 1) % PE_size) * PE_stride; long completion = 0; long zero = 0, one = 1; @@ -1167,10 +1155,9 @@ shmem_internal_fcollect_ring(void *target, const void *source, size_t len, */ void 
shmem_internal_fcollect_recdbl(void *target, const void *source, size_t len, - int PE_start, int logPE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync) { - int stride = 1 << logPE_stride; - int my_id = ((shmem_internal_my_pe - PE_start) / stride); + int my_id = ((shmem_internal_my_pe - PE_start) / PE_stride); int i; long completion = 0; size_t curr_offset; @@ -1196,7 +1183,7 @@ shmem_internal_fcollect_recdbl(void *target, const void *source, size_t len, for (i = 0, distance = 0x1 ; distance < PE_size ; i++, distance <<= 1) { int peer = my_id ^ distance; - int real_peer = PE_start + (peer * stride); + int real_peer = PE_start + (peer * PE_stride); /* send data to peer */ shmem_internal_put_nb(SHMEM_CTX_DEFAULT, (char*) target + curr_offset, (char*) target + curr_offset, @@ -1225,10 +1212,9 @@ shmem_internal_fcollect_recdbl(void *target, const void *source, size_t len, void shmem_internal_alltoall(void *dest, const void *source, size_t len, - int PE_start, int logPE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync) { - const int stride = 1 << logPE_stride; - const int my_as_rank = (shmem_internal_my_pe - PE_start) / stride; + const int my_as_rank = (shmem_internal_my_pe - PE_start) / PE_stride; const void *dest_ptr = (uint8_t *) dest + my_as_rank * len; int peer, start_pe, i; @@ -1239,19 +1225,19 @@ shmem_internal_alltoall(void *dest, const void *source, size_t len, /* Send data round-robin, ending with my PE */ start_pe = shmem_internal_circular_iter_next(shmem_internal_my_pe, - PE_start, logPE_stride, + PE_start, PE_stride, PE_size); peer = start_pe; do { - int peer_as_rank = (peer - PE_start) / stride; /* Peer's index in active set */ + int peer_as_rank = (peer - PE_start) / PE_stride; /* Peer's index in active set */ shmem_internal_put_nbi(SHMEM_CTX_DEFAULT, (void *) dest_ptr, (uint8_t *) source + peer_as_rank * len, len, peer); - peer = shmem_internal_circular_iter_next(peer, 
PE_start, logPE_stride, + peer = shmem_internal_circular_iter_next(peer, PE_start, PE_stride, PE_size); } while (peer != start_pe); - shmem_internal_barrier(PE_start, logPE_stride, PE_size, pSync); + shmem_internal_barrier(PE_start, PE_stride, PE_size, pSync); for (i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i++) pSync[i] = SHMEM_SYNC_VALUE; @@ -1261,10 +1247,9 @@ shmem_internal_alltoall(void *dest, const void *source, size_t len, void shmem_internal_alltoalls(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, size_t elem_size, size_t nelems, - int PE_start, int logPE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync) { - const int stride = 1 << logPE_stride; - const int my_as_rank = (shmem_internal_my_pe - PE_start) / stride; + const int my_as_rank = (shmem_internal_my_pe - PE_start) / PE_stride; const void *dest_base = (uint8_t *) dest + my_as_rank * nelems * dst * elem_size; int peer, start_pe, i; @@ -1283,12 +1268,12 @@ shmem_internal_alltoalls(void *dest, const void *source, ptrdiff_t dst, /* Send data round-robin, ending with my PE */ start_pe = shmem_internal_circular_iter_next(shmem_internal_my_pe, - PE_start, logPE_stride, + PE_start, PE_stride, PE_size); peer = start_pe; do { size_t i; - int peer_as_rank = (peer - PE_start) / stride; /* Peer's index in active set */ + int peer_as_rank = (peer - PE_start) / PE_stride; /* Peer's index in active set */ uint8_t *dest_ptr = (uint8_t *) dest_base; uint8_t *source_ptr = (uint8_t *) source + peer_as_rank * nelems * sst * elem_size; @@ -1299,11 +1284,11 @@ shmem_internal_alltoalls(void *dest, const void *source, ptrdiff_t dst, source_ptr += sst * elem_size; dest_ptr += dst * elem_size; } - peer = shmem_internal_circular_iter_next(peer, PE_start, logPE_stride, + peer = shmem_internal_circular_iter_next(peer, PE_start, PE_stride, PE_size); } while (peer != start_pe); - shmem_internal_barrier(PE_start, logPE_stride, PE_size, pSync); + shmem_internal_barrier(PE_start, 
PE_stride, PE_size, pSync); for (i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i++) pSync[i] = SHMEM_SYNC_VALUE; diff --git a/src/collectives_c.c4 b/src/collectives_c.c4 index 97aa82f6e..33deec166 100644 --- a/src/collectives_c.c4 +++ b/src/collectives_c.c4 @@ -31,6 +31,7 @@ include(shmem_bind_c.m4)dnl #include "shmem_internal.h" #include "shmem_comm.h" #include "shmem_collectives.h" +#include "shmem_team.h" #ifdef ENABLE_PROFILING #include "pshmem.h" @@ -65,6 +66,15 @@ SHMEM_BIND_C_COLL_INTS(`SHMEM_PROF_DEF_TO_ALL', `prod', `SHM_INTERNAL_PROD') SHMEM_BIND_C_COLL_FLOATS(`SHMEM_PROF_DEF_TO_ALL', `prod', `SHM_INTERNAL_PROD') SHMEM_BIND_C_COLL_CMPLX(`SHMEM_PROF_DEF_TO_ALL', `prod', `SHM_INTERNAL_PROD') +define(`SHMEM_PROF_DEF_BCAST', +`#pragma weak shmemx_$1_broadcast = pshmemx_$1_broadcast +#define shmemx_$1_broadcast pshmemx_$1_broadcast')dnl +dnl +SHMEM_BIND_C_RMA(`SHMEM_PROF_DEF_BCAST') + +#pragma weak shmemx_broadcastmem = pshmemx_broadcastmem +#define shmemx_broadcastmem pshmemx_broadcastmem + #pragma weak shmem_broadcast32 = pshmem_broadcast32 #define shmem_broadcast32 pshmem_broadcast32 #pragma weak shmem_broadcast64 = pshmem_broadcast64 @@ -75,21 +85,57 @@ SHMEM_BIND_C_COLL_CMPLX(`SHMEM_PROF_DEF_TO_ALL', `prod', `SHM_INTERNAL_PROD') #pragma weak shmem_collect64 = pshmem_collect64 #define shmem_collect64 pshmem_collect64 +define(`SHMEM_PROF_DEF_COLLECT', +`#pragma weak shmemx_$1_collect = pshmemx_$1_collect +#define shmemx_$1_collect pshmemx_$1_collect')dnl +dnl +SHMEM_BIND_C_RMA(`SHMEM_PROF_DEF_COLLECT') + +#pragma weak shmemx_collectmem = pshmemx_collectmem +#define shmemx_collectmem pshmemx_collectmem + #pragma weak shmem_fcollect32 = pshmem_fcollect32 #define shmem_fcollect32 pshmem_fcollect32 #pragma weak shmem_fcollect64 = pshmem_fcollect64 #define shmem_fcollect64 pshmem_fcollect64 +define(`SHMEM_PROF_DEF_FCOLLECT', +`#pragma weak shmemx_$1_fcollect = pshmemx_$1_fcollect +#define shmemx_$1_fcollect pshmemx_$1_fcollect')dnl +dnl 
+SHMEM_BIND_C_RMA(`SHMEM_PROF_DEF_FCOLLECT') + +#pragma weak shmemx_fcollectmem = pshmemx_fcollectmem +#define shmemx_fcollectmem pshmemx_fcollectmem + #pragma weak shmem_alltoall32 = pshmem_alltoall32 #define shmem_alltoall32 pshmem_alltoall32 #pragma weak shmem_alltoall64 = pshmem_alltoall64 #define shmem_alltoall64 pshmem_alltoall64 +define(`SHMEM_PROF_DEF_ALLTOALL', +`#pragma weak shmemx_$1_alltoall = pshmemx_$1_alltoall +#define shmemx_$1_alltoall pshmemx_$1_alltoall')dnl +dnl +SHMEM_BIND_C_RMA(`SHMEM_PROF_DEF_ALLTOALL') + +#pragma weak shmemx_alltoallmem = pshmemx_alltoallmem +#define shmemx_alltoallmem pshmemx_alltoallmem + #pragma weak shmem_alltoalls32 = pshmem_alltoalls32 #define shmem_alltoalls32 pshmem_alltoalls32 #pragma weak shmem_alltoalls64 = pshmem_alltoalls64 #define shmem_alltoalls64 pshmem_alltoalls64 +define(`SHMEM_PROF_DEF_ALLTOALLS', +`#pragma weak shmemx_$1_alltoalls = pshmemx_$1_alltoalls +#define shmemx_$1_alltoalls pshmemx_$1_alltoalls')dnl +dnl +SHMEM_BIND_C_RMA(`SHMEM_PROF_DEF_ALLTOALLS') + +#pragma weak shmemx_alltoallsmem = pshmemx_alltoallsmem +#define shmemx_alltoallsmem pshmemx_alltoallsmem + #endif /* ENABLE_PROFILING */ void SHMEM_FUNCTION_ATTRIBUTES @@ -105,10 +151,10 @@ void SHMEM_FUNCTION_ATTRIBUTES shmem_barrier(int PE_start, int logPE_stride, int PE_size, long *pSync) { SHMEM_ERR_CHECK_INITIALIZED(); - SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, logPE_stride, PE_size); + SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, 1 << logPE_stride, PE_size); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long)*SHMEM_BARRIER_SYNC_SIZE); - shmem_internal_barrier(PE_start, logPE_stride, PE_size, pSync); + shmem_internal_barrier(PE_start, 1 << logPE_stride, PE_size, pSync); } @@ -120,17 +166,43 @@ shmem_sync_all(void) shmem_internal_sync_all(); } - void SHMEM_FUNCTION_ATTRIBUTES shmem_sync(int PE_start, int logPE_stride, int PE_size, long *pSync) { SHMEM_ERR_CHECK_INITIALIZED(); - SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, logPE_stride, PE_size); + 
SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, 1 << logPE_stride, PE_size); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long)*SHMEM_BARRIER_SYNC_SIZE); - shmem_internal_sync(PE_start, logPE_stride, PE_size, pSync); + shmem_internal_sync(PE_start, 1 << logPE_stride, PE_size, pSync); +} + +/* Team-based Collective Routines */ + +int SHMEM_FUNCTION_ATTRIBUTES +shmemx_sync(shmemx_team_t team) +{ + SHMEM_ERR_CHECK_INITIALIZED(); + SHMEM_ERR_CHECK_TEAM_VALID(team); + + shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; + long *psync = shmem_internal_team_choose_psync(myteam, SYNC); + shmem_internal_sync(myteam->start, myteam->stride, myteam->size, psync); + shmem_internal_team_release_psyncs(myteam, SYNC); + return 0; } +int SHMEM_FUNCTION_ATTRIBUTES +shmemx_team_sync(shmemx_team_t team) +{ + SHMEM_ERR_CHECK_INITIALIZED(); + SHMEM_ERR_CHECK_TEAM_VALID(team); + + shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; + long *psync = shmem_internal_team_choose_psync(myteam, SYNC); + shmem_internal_sync(myteam->start, myteam->stride, myteam->size, psync); + shmem_internal_team_release_psyncs(myteam, SYNC); + return 0; +} #define SHMEM_DEF_TO_ALL(STYPE,TYPE,ITYPE,SOP,IOP) \ void SHMEM_FUNCTION_ATTRIBUTES \ @@ -141,7 +213,8 @@ shmem_sync(int PE_start, int logPE_stride, int PE_size, long *pSync) long *pSync) \ { \ SHMEM_ERR_CHECK_INITIALIZED(); \ - SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, logPE_stride, PE_size); \ + SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, 1 << logPE_stride, \ + PE_size); \ SHMEM_ERR_CHECK_NON_NEGATIVE(nreduce); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)*nreduce); \ SHMEM_ERR_CHECK_SYMMETRIC(source, sizeof(TYPE)*nreduce); \ @@ -150,10 +223,30 @@ shmem_sync(int PE_start, int logPE_stride, int PE_size, long *pSync) SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long)*SHMEM_REDUCE_SYNC_SIZE); \ \ shmem_internal_op_to_all(target, source, nreduce, sizeof(TYPE), \ - PE_start, logPE_stride, PE_size, \ + PE_start, 1 << logPE_stride, PE_size, \ pWrk, pSync, IOP, ITYPE); \ 
} +#define SHMEM_DEF_REDUCE(STYPE,TYPE,ITYPE,SOP,IOP) \ + int SHMEM_FUNCTION_ATTRIBUTES \ + shmemx_##STYPE##_##SOP##_reduce(shmemx_team_t team, TYPE *dest, \ + const TYPE *source, \ + size_t nreduce) \ + { \ + SHMEM_ERR_CHECK_INITIALIZED(); \ + SHMEM_ERR_CHECK_TEAM_VALID(team); \ + SHMEM_ERR_CHECK_SYMMETRIC(dest, sizeof(TYPE)*nreduce); \ + SHMEM_ERR_CHECK_SYMMETRIC(source, sizeof(TYPE)*nreduce); \ + TYPE *pWrk = NULL; \ + \ + shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ + long *psync = shmem_internal_team_choose_psync(myteam, REDUCE); \ + shmem_internal_op_to_all(dest, source, nreduce, sizeof(TYPE), \ + myteam->start, myteam->stride, myteam->size, pWrk, \ + psync, IOP, ITYPE); \ + shmem_internal_team_release_psyncs(myteam, REDUCE); \ + return 0; \ + } SHMEM_BIND_C_COLL_INTS(`SHMEM_DEF_TO_ALL', `and', `SHM_INTERNAL_BAND') SHMEM_BIND_C_COLL_INTS(`SHMEM_DEF_TO_ALL', `or', `SHM_INTERNAL_BOR') SHMEM_BIND_C_COLL_INTS(`SHMEM_DEF_TO_ALL', `xor', `SHM_INTERNAL_BXOR') @@ -170,6 +263,14 @@ SHMEM_BIND_C_COLL_INTS(`SHMEM_DEF_TO_ALL', `prod', `SHM_INTERNAL_PROD') SHMEM_BIND_C_COLL_FLOATS(`SHMEM_DEF_TO_ALL', `prod', `SHM_INTERNAL_PROD') SHMEM_BIND_C_COLL_CMPLX(`SHMEM_DEF_TO_ALL', `prod', `SHM_INTERNAL_PROD') +SHMEM_BIND_C_COLL_AND_OR_XOR(`SHMEM_DEF_REDUCE', `and', `SHM_INTERNAL_BAND') +SHMEM_BIND_C_COLL_AND_OR_XOR(`SHMEM_DEF_REDUCE', `or', `SHM_INTERNAL_BOR') +SHMEM_BIND_C_COLL_AND_OR_XOR(`SHMEM_DEF_REDUCE', `xor', `SHM_INTERNAL_BXOR') +SHMEM_BIND_C_COLL_SUM_PROD(`SHMEM_DEF_REDUCE', `sum', `SHM_INTERNAL_SUM') +SHMEM_BIND_C_COLL_SUM_PROD(`SHMEM_DEF_REDUCE', `prod', `SHM_INTERNAL_PROD') +SHMEM_BIND_C_COLL_MIN_MAX(`SHMEM_DEF_REDUCE', `min', `SHM_INTERNAL_MIN') +SHMEM_BIND_C_COLL_MIN_MAX(`SHMEM_DEF_REDUCE', `max', `SHM_INTERNAL_MAX') + void SHMEM_FUNCTION_ATTRIBUTES shmem_broadcast32(void *target, const void *source, size_t nlong, int PE_root, int PE_start, int logPE_stride, int PE_size, @@ -177,13 +278,13 @@ shmem_broadcast32(void *target, const void *source, 
size_t nlong, { SHMEM_ERR_CHECK_INITIALIZED(); SHMEM_ERR_CHECK_PE(PE_root); - SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, logPE_stride, PE_size); + SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, 1 << logPE_stride, PE_size); SHMEM_ERR_CHECK_SYMMETRIC(target, nlong * 4); SHMEM_ERR_CHECK_SYMMETRIC(source, nlong * 4); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long)*SHMEM_BCAST_SYNC_SIZE); shmem_internal_bcast(target, source, nlong * 4, - PE_root, PE_start, logPE_stride, PE_size, + PE_root, PE_start, 1 << logPE_stride, PE_size, pSync, 1); } @@ -195,29 +296,70 @@ shmem_broadcast64(void *target, const void *source, size_t nlong, { SHMEM_ERR_CHECK_INITIALIZED(); SHMEM_ERR_CHECK_PE(PE_root); - SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, logPE_stride, PE_size); + SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, 1 << logPE_stride, PE_size); SHMEM_ERR_CHECK_SYMMETRIC(target, nlong * 8); SHMEM_ERR_CHECK_SYMMETRIC(source, nlong * 8); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long)*SHMEM_BCAST_SYNC_SIZE); shmem_internal_bcast(target, source, nlong * 8, - PE_root, PE_start, logPE_stride, PE_size, + PE_root, PE_start, 1 << logPE_stride, PE_size, pSync, 1); } +int SHMEM_FUNCTION_ATTRIBUTES +shmemx_broadcastmem(shmemx_team_t team, void *dest, const void *source, + size_t nelems, int PE_root) +{ + SHMEM_ERR_CHECK_INITIALIZED(); + SHMEM_ERR_CHECK_PE(PE_root); + SHMEM_ERR_CHECK_TEAM_VALID(team); + SHMEM_ERR_CHECK_SYMMETRIC(dest, nelems); + SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); + + shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; + long *psync = shmem_internal_team_choose_psync(myteam, BCAST); + shmem_internal_bcast(dest, source, nelems, PE_root, myteam->start, + myteam->stride, myteam->size, + psync, 1); + shmem_internal_team_release_psyncs(myteam, BCAST); + return 0; +} + +#define SHMEM_DEF_BCAST(STYPE,TYPE) \ + int SHMEM_FUNCTION_ATTRIBUTES \ + shmemx_##STYPE##_broadcast(shmemx_team_t team, TYPE *dest, \ + const TYPE *source, size_t nelems, \ + int PE_root) \ + { \ + SHMEM_ERR_CHECK_INITIALIZED(); \ + 
SHMEM_ERR_CHECK_PE(PE_root); \ + SHMEM_ERR_CHECK_TEAM_VALID(team); \ + SHMEM_ERR_CHECK_SYMMETRIC(dest, nelems * sizeof(TYPE)); \ + SHMEM_ERR_CHECK_SYMMETRIC(source, nelems * sizeof(TYPE)); \ + \ + shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ + long *psync = shmem_internal_team_choose_psync(myteam, BCAST); \ + shmem_internal_bcast(dest, source, nelems * sizeof(TYPE), \ + PE_root, myteam->start, myteam->stride, \ + myteam->size, psync, 1); \ + shmem_internal_team_release_psyncs(myteam, BCAST); \ + return 0; \ + } + +SHMEM_BIND_C_RMA(`SHMEM_DEF_BCAST') void SHMEM_FUNCTION_ATTRIBUTES shmem_collect32(void *target, const void *source, size_t nlong, int PE_start, int logPE_stride, int PE_size, long *pSync) { SHMEM_ERR_CHECK_INITIALIZED(); - SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, logPE_stride, PE_size); + SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, 1 << logPE_stride, PE_size); SHMEM_ERR_CHECK_SYMMETRIC(target, nlong * 4); SHMEM_ERR_CHECK_SYMMETRIC(source, nlong * 4); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_COLLECT_SYNC_SIZE); shmem_internal_collect(target, source, nlong * 4, - PE_start, logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync); } @@ -226,28 +368,66 @@ shmem_collect64(void *target, const void *source, size_t nlong, int PE_start, int logPE_stride, int PE_size, long *pSync) { SHMEM_ERR_CHECK_INITIALIZED(); - SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, logPE_stride, PE_size); + SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, 1 << logPE_stride, PE_size); SHMEM_ERR_CHECK_SYMMETRIC(target, nlong * 8); SHMEM_ERR_CHECK_SYMMETRIC(source, nlong * 8); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_COLLECT_SYNC_SIZE); shmem_internal_collect(target, source, nlong * 8, - PE_start, logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync); } +#define SHMEM_DEF_COLLECT(STYPE,TYPE) \ + int SHMEM_FUNCTION_ATTRIBUTES \ + shmemx_##STYPE##_collect(shmemx_team_t team, TYPE *dest, \ + const TYPE *source, size_t nelems) \ + { \ 
+ SHMEM_ERR_CHECK_INITIALIZED(); \ + SHMEM_ERR_CHECK_TEAM_VALID(team); \ + SHMEM_ERR_CHECK_SYMMETRIC(dest, nelems * sizeof(TYPE)); \ + SHMEM_ERR_CHECK_SYMMETRIC(source, nelems * sizeof(TYPE)); \ + \ + shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ + long *psync = shmem_internal_team_choose_psync(myteam, \ + COLLECT); \ + shmem_internal_collect(dest, source, nelems * sizeof(TYPE), \ + myteam->start, myteam->stride, \ + myteam->size, psync); \ + shmem_internal_team_release_psyncs(myteam, COLLECT); \ + return 0; \ + } + +SHMEM_BIND_C_RMA(`SHMEM_DEF_COLLECT') + +int SHMEM_FUNCTION_ATTRIBUTES +shmemx_collectmem(shmemx_team_t team, void *dest, const void *source, + size_t nelems) +{ + SHMEM_ERR_CHECK_INITIALIZED(); + SHMEM_ERR_CHECK_TEAM_VALID(team); + SHMEM_ERR_CHECK_SYMMETRIC(dest, nelems); + SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); + + shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; + long *psync = shmem_internal_team_choose_psync(myteam, COLLECT); + shmem_internal_collect(dest, source, nelems, myteam->start, + myteam->stride, myteam->size, psync); + shmem_internal_team_release_psyncs(myteam, COLLECT); + return 0; +} void SHMEM_FUNCTION_ATTRIBUTES shmem_fcollect32(void *target, const void *source, size_t nlong, int PE_start, int logPE_stride, int PE_size, long *pSync) { SHMEM_ERR_CHECK_INITIALIZED(); - SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, logPE_stride, PE_size); + SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, 1 << logPE_stride, PE_size); SHMEM_ERR_CHECK_SYMMETRIC(target, nlong * 4); SHMEM_ERR_CHECK_SYMMETRIC(source, nlong * 4); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_COLLECT_SYNC_SIZE); shmem_internal_fcollect(target, source, nlong * 4, - PE_start, logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync); } @@ -256,28 +436,66 @@ shmem_fcollect64(void *target, const void *source, size_t nlong, int PE_start, int logPE_stride, int PE_size, long *pSync) { SHMEM_ERR_CHECK_INITIALIZED(); - 
SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, logPE_stride, PE_size); + SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, 1 << logPE_stride, PE_size); SHMEM_ERR_CHECK_SYMMETRIC(target, nlong * 8); SHMEM_ERR_CHECK_SYMMETRIC(source, nlong * 8); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_COLLECT_SYNC_SIZE); shmem_internal_fcollect(target, source, nlong * 8, - PE_start, logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync); } +#define SHMEM_DEF_FCOLLECT(STYPE,TYPE) \ + int SHMEM_FUNCTION_ATTRIBUTES \ + shmemx_##STYPE##_fcollect(shmemx_team_t team, TYPE *dest, \ + const TYPE *source, size_t nelems) \ + { \ + SHMEM_ERR_CHECK_INITIALIZED(); \ + SHMEM_ERR_CHECK_TEAM_VALID(team); \ + SHMEM_ERR_CHECK_SYMMETRIC(dest, nelems * sizeof(TYPE)); \ + SHMEM_ERR_CHECK_SYMMETRIC(source, nelems * sizeof(TYPE)); \ + \ + shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ + long *psync = shmem_internal_team_choose_psync(myteam, \ + COLLECT); \ + shmem_internal_fcollect(dest, source, nelems * sizeof(TYPE), \ + myteam->start, myteam->stride, \ + myteam->size, psync); \ + shmem_internal_team_release_psyncs(myteam, COLLECT); \ + return 0; \ + } + +SHMEM_BIND_C_RMA(`SHMEM_DEF_FCOLLECT') + +int SHMEM_FUNCTION_ATTRIBUTES +shmemx_fcollectmem(shmemx_team_t team, void *dest, const void *source, + size_t nelems) +{ + SHMEM_ERR_CHECK_INITIALIZED(); + SHMEM_ERR_CHECK_TEAM_VALID(team); + SHMEM_ERR_CHECK_SYMMETRIC(dest, nelems); + SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); + + shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; + long *psync = shmem_internal_team_choose_psync(myteam, COLLECT); + shmem_internal_fcollect(dest, source, nelems, myteam->start, + myteam->stride, myteam->size, psync); + shmem_internal_team_release_psyncs(myteam, COLLECT); + return 0; +} void SHMEM_FUNCTION_ATTRIBUTES shmem_alltoall32(void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, int PE_size, long *pSync) { SHMEM_ERR_CHECK_INITIALIZED(); - 
SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, logPE_stride, PE_size); + SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, 1 << logPE_stride, PE_size); SHMEM_ERR_CHECK_SYMMETRIC(dest, nelems * 4); SHMEM_ERR_CHECK_SYMMETRIC(source, nelems * 4); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_ALLTOALL_SYNC_SIZE); shmem_internal_alltoall(dest, source, nelems * 4, - PE_start, logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync); } @@ -286,13 +504,52 @@ shmem_alltoall64(void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, int PE_size, long *pSync) { SHMEM_ERR_CHECK_INITIALIZED(); - SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, logPE_stride, PE_size); + SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, 1 << logPE_stride, PE_size); SHMEM_ERR_CHECK_SYMMETRIC(dest, nelems * 8); SHMEM_ERR_CHECK_SYMMETRIC(source, nelems * 8); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_ALLTOALL_SYNC_SIZE); shmem_internal_alltoall(dest, source, nelems * 8, - PE_start, logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync); +} + +#define SHMEM_DEF_ALLTOALL(STYPE,TYPE) \ + int SHMEM_FUNCTION_ATTRIBUTES \ + shmemx_##STYPE##_alltoall(shmemx_team_t team, TYPE *dest, \ + const TYPE *source, size_t nelems) \ + { \ + SHMEM_ERR_CHECK_INITIALIZED(); \ + SHMEM_ERR_CHECK_TEAM_VALID(team); \ + SHMEM_ERR_CHECK_SYMMETRIC(dest, nelems * sizeof(TYPE)); \ + SHMEM_ERR_CHECK_SYMMETRIC(source, nelems * sizeof(TYPE)); \ + \ + shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ + long *psync = shmem_internal_team_choose_psync(myteam, \ + ALLTOALL); \ + shmem_internal_alltoall(dest, source, nelems * sizeof(TYPE), \ + myteam->start, myteam->stride, \ + myteam->size, psync); \ + shmem_internal_team_release_psyncs(myteam, ALLTOALL); \ + return 0; \ + } + +SHMEM_BIND_C_RMA(`SHMEM_DEF_ALLTOALL') + +int SHMEM_FUNCTION_ATTRIBUTES +shmemx_alltoallmem(shmemx_team_t team, void *dest, const void *source, + size_t nelems) +{ + SHMEM_ERR_CHECK_INITIALIZED(); + 
SHMEM_ERR_CHECK_TEAM_VALID(team); + SHMEM_ERR_CHECK_SYMMETRIC(dest, nelems); + SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); + + shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; + long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL); + shmem_internal_alltoall(dest, source, nelems, myteam->start, + myteam->stride, myteam->size, psync); + shmem_internal_team_release_psyncs(myteam, ALLTOALL); + return 0; } @@ -304,13 +561,13 @@ shmem_alltoalls32(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, SHMEM_ERR_CHECK_INITIALIZED(); SHMEM_ERR_CHECK_POSITIVE(sst); SHMEM_ERR_CHECK_POSITIVE(dst); - SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, logPE_stride, PE_size); + SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, 1 << logPE_stride, PE_size); SHMEM_ERR_CHECK_SYMMETRIC(dest, 4 * ((nelems-1) * dst + 1)); SHMEM_ERR_CHECK_SYMMETRIC(source, 4 * ((nelems-1) * sst + 1)); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_ALLTOALL_SYNC_SIZE); shmem_internal_alltoalls(dest, source, dst, sst, 4, nelems, PE_start, - logPE_stride, PE_size, pSync); + 1 << logPE_stride, PE_size, pSync); } @@ -322,11 +579,51 @@ shmem_alltoalls64(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, SHMEM_ERR_CHECK_INITIALIZED(); SHMEM_ERR_CHECK_POSITIVE(sst); SHMEM_ERR_CHECK_POSITIVE(dst); - SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, logPE_stride, PE_size); + SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, 1 << logPE_stride, PE_size); SHMEM_ERR_CHECK_SYMMETRIC(dest, 8 * ((nelems-1) * dst + 1)); SHMEM_ERR_CHECK_SYMMETRIC(source, 8 * ((nelems-1) * sst + 1)); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_ALLTOALL_SYNC_SIZE); shmem_internal_alltoalls(dest, source, dst, sst, 8, nelems, PE_start, - logPE_stride, PE_size, pSync); + 1 << logPE_stride, PE_size, pSync); +} + +#define SHMEM_DEF_ALLTOALLS(STYPE,TYPE) \ + int SHMEM_FUNCTION_ATTRIBUTES \ + shmemx_##STYPE##_alltoalls(shmemx_team_t team, TYPE *dest, \ + const TYPE *source, ptrdiff_t dst, \ + ptrdiff_t sst, size_t nelems) \ + { \ + 
SHMEM_ERR_CHECK_INITIALIZED(); \ + SHMEM_ERR_CHECK_TEAM_VALID(team); \ + SHMEM_ERR_CHECK_SYMMETRIC(dest, nelems * sizeof(TYPE)); \ + SHMEM_ERR_CHECK_SYMMETRIC(source, nelems * sizeof(TYPE)); \ + \ + shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ + long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL); \ + shmem_internal_alltoalls(dest, source, dst, sst, sizeof(TYPE), \ + nelems, myteam->start, myteam->stride, \ + myteam->size, psync); \ + shmem_internal_team_release_psyncs(myteam, ALLTOALL); \ + return 0; \ + } + +SHMEM_BIND_C_RMA(`SHMEM_DEF_ALLTOALLS') + +int SHMEM_FUNCTION_ATTRIBUTES +shmemx_alltoallsmem(shmemx_team_t team, void *dest, const void *source, + ptrdiff_t dst, ptrdiff_t sst, size_t nelems) +{ + SHMEM_ERR_CHECK_INITIALIZED(); + SHMEM_ERR_CHECK_TEAM_VALID(team); + SHMEM_ERR_CHECK_SYMMETRIC(dest, nelems); + SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); + + shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; + long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL); + shmem_internal_alltoalls(dest, source, dst, sst, 1, nelems, + myteam->start, myteam->stride, myteam->size, + psync); + shmem_internal_team_release_psyncs(myteam, ALLTOALL); + return 0; } diff --git a/src/collectives_f.c4 b/src/collectives_f.c4 index 81e99021c..39270601a 100644 --- a/src/collectives_f.c4 +++ b/src/collectives_f.c4 @@ -60,13 +60,13 @@ FC_SHMEM_BARRIER(fortran_integer_t *PE_start, { long *pSync_c; SHMEM_ERR_CHECK_INITIALIZED(); - SHMEM_ERR_CHECK_ACTIVE_SET(*PE_start, *logPE_stride, *PE_size); + SHMEM_ERR_CHECK_ACTIVE_SET(*PE_start, 1 << *logPE_stride, *PE_size); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long)*SHMEM_BARRIER_SYNC_SIZE); /* SHMEM_BARRIER_SYNC_SIZE is defined to allow this cast */ pSync_c = (long*) pSync; - shmem_internal_barrier(*PE_start, *logPE_stride, *PE_size, pSync_c); + shmem_internal_barrier(*PE_start, 1 << *logPE_stride, *PE_size, pSync_c); } @@ -89,7 +89,8 @@ FC_SHMEM_BARRIER(fortran_integer_t *PE_start, { \ 
long *pSync_c; \ SHMEM_ERR_CHECK_INITIALIZED(); \ - SHMEM_ERR_CHECK_ACTIVE_SET(*PE_start, *logPE_stride, *PE_size); \ + SHMEM_ERR_CHECK_ACTIVE_SET(*PE_start, 1 << *logPE_stride, \ + *PE_size); \ SHMEM_ERR_CHECK_NON_NEGATIVE(*nreduce); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE) * *nreduce); \ SHMEM_ERR_CHECK_SYMMETRIC(source, sizeof(TYPE) * *nreduce); \ @@ -103,7 +104,7 @@ FC_SHMEM_BARRIER(fortran_integer_t *PE_start, pSync_c = (long*) pSync; \ \ shmem_internal_op_to_all(target, source, *nreduce, SIZE, \ - *PE_start, *logPE_stride, *PE_size, \ + *PE_start, 1 << *logPE_stride, *PE_size, \ pWrk, pSync_c, IOP, ITYPE); \ } @@ -145,7 +146,8 @@ SHMEM_BIND_F_COLL_COMPS(`SHMEM_WRAP_TO_ALL', `prod', `SHM_INTERNAL_PROD') { \ long *pSync_c; \ SHMEM_ERR_CHECK_INITIALIZED(); \ - SHMEM_ERR_CHECK_ACTIVE_SET(*PE_start, *logPE_stride, *PE_size); \ + SHMEM_ERR_CHECK_ACTIVE_SET(*PE_start, 1 << *logPE_stride, \ + *PE_size); \ SHMEM_ERR_CHECK_NON_NEGATIVE(*nelems); \ SHMEM_ERR_CHECK_SYMMETRIC(target, *nelems * SIZE); \ SHMEM_ERR_CHECK_SYMMETRIC(source, *nelems * SIZE); \ @@ -155,7 +157,7 @@ SHMEM_BIND_F_COLL_COMPS(`SHMEM_WRAP_TO_ALL', `prod', `SHM_INTERNAL_PROD') pSync_c = (long*) pSync; \ \ shmem_internal_collect(target, source, *nelems * SIZE, *PE_start, \ - *logPE_stride, *PE_size, pSync_c); \ + 1 << *logPE_stride, *PE_size, pSync_c); \ } define(`SHMEM_WRAP_COLLECT', @@ -180,7 +182,8 @@ SHMEM_BIND_F_COLL_SIZES(`SHMEM_WRAP_COLLECT') { \ long *pSync_c; \ SHMEM_ERR_CHECK_INITIALIZED(); \ - SHMEM_ERR_CHECK_ACTIVE_SET(*PE_start, *logPE_stride, *PE_size); \ + SHMEM_ERR_CHECK_ACTIVE_SET(*PE_start, 1 << *logPE_stride, \ + *PE_size); \ SHMEM_ERR_CHECK_NON_NEGATIVE(*nelems); \ SHMEM_ERR_CHECK_SYMMETRIC(target, *nelems * SIZE); \ SHMEM_ERR_CHECK_SYMMETRIC(source, *nelems * SIZE); \ @@ -190,7 +193,7 @@ SHMEM_BIND_F_COLL_SIZES(`SHMEM_WRAP_COLLECT') pSync_c = (long*) pSync; \ \ shmem_internal_fcollect(target, source, *nelems * SIZE, *PE_start, \ - *logPE_stride, *PE_size, pSync_c); \ + 1 << 
*logPE_stride, *PE_size, pSync_c); \ } define(`SHMEM_WRAP_FCOLLECT', @@ -217,7 +220,8 @@ SHMEM_BIND_F_COLL_SIZES(`SHMEM_WRAP_FCOLLECT') { \ long *pSync_c; \ SHMEM_ERR_CHECK_INITIALIZED(); \ - SHMEM_ERR_CHECK_ACTIVE_SET(*PE_start, *logPE_stride, *PE_size); \ + SHMEM_ERR_CHECK_ACTIVE_SET(*PE_start, 1 << *logPE_stride, \ + *PE_size); \ SHMEM_ERR_CHECK_PE(*PE_root); \ SHMEM_ERR_CHECK_NON_NEGATIVE(*nelems); \ SHMEM_ERR_CHECK_SYMMETRIC(target, SIZE * *nelems); \ @@ -228,8 +232,8 @@ SHMEM_BIND_F_COLL_SIZES(`SHMEM_WRAP_FCOLLECT') pSync_c = (long*) pSync; \ \ shmem_internal_bcast(target, source, *nelems * SIZE, \ - *PE_root, *PE_start, *logPE_stride, *PE_size, \ - pSync_c, 1); \ + *PE_root, *PE_start, 1 << *logPE_stride, \ + *PE_size, pSync_c, 1); \ } define(`SHMEM_WRAP_BROADCAST', @@ -257,7 +261,7 @@ FC_SHMEM_ALLTOALL32(void *target, { long *pSync_c; SHMEM_ERR_CHECK_INITIALIZED(); - SHMEM_ERR_CHECK_ACTIVE_SET(*PE_start, *logPE_stride, *PE_size); + SHMEM_ERR_CHECK_ACTIVE_SET(*PE_start, 1 << *logPE_stride, *PE_size); SHMEM_ERR_CHECK_NON_NEGATIVE(*nelems); SHMEM_ERR_CHECK_SYMMETRIC(target, *nelems * 4); SHMEM_ERR_CHECK_SYMMETRIC(source, *nelems * 4); @@ -267,7 +271,7 @@ FC_SHMEM_ALLTOALL32(void *target, pSync_c = (long*) pSync; shmem_internal_alltoall(target, source, *nelems * 4, *PE_start, - *logPE_stride, *PE_size, pSync_c); + 1 << *logPE_stride, *PE_size, pSync_c); } @@ -291,7 +295,7 @@ FC_SHMEM_ALLTOALL64(void *target, { long *pSync_c; SHMEM_ERR_CHECK_INITIALIZED(); - SHMEM_ERR_CHECK_ACTIVE_SET(*PE_start, *logPE_stride, *PE_size); + SHMEM_ERR_CHECK_ACTIVE_SET(*PE_start, 1 << *logPE_stride, *PE_size); SHMEM_ERR_CHECK_NON_NEGATIVE(*nelems); SHMEM_ERR_CHECK_SYMMETRIC(target, *nelems * 8); SHMEM_ERR_CHECK_SYMMETRIC(source, *nelems * 8); @@ -301,7 +305,7 @@ FC_SHMEM_ALLTOALL64(void *target, pSync_c = (long*) pSync; shmem_internal_alltoall(target, source, *nelems * 8, *PE_start, - *logPE_stride, *PE_size, pSync_c); + 1 << *logPE_stride, *PE_size, pSync_c); } @@ -331,7 +335,7 @@ 
FC_SHMEM_ALLTOALLS32(void *target, SHMEM_ERR_CHECK_INITIALIZED(); SHMEM_ERR_CHECK_POSITIVE(*sst); SHMEM_ERR_CHECK_POSITIVE(*dst); - SHMEM_ERR_CHECK_ACTIVE_SET(*PE_start, *logPE_stride, *PE_size); + SHMEM_ERR_CHECK_ACTIVE_SET(*PE_start, 1 << *logPE_stride, *PE_size); SHMEM_ERR_CHECK_NON_NEGATIVE(*nelems); SHMEM_ERR_CHECK_SYMMETRIC(target, 4 * ((*nelems-1) * *dst + 1)); SHMEM_ERR_CHECK_SYMMETRIC(source, 4 * ((*nelems-1) * *sst + 1)); @@ -341,7 +345,7 @@ FC_SHMEM_ALLTOALLS32(void *target, pSync_c = (long*) pSync; shmem_internal_alltoalls(target, source, *dst, *sst, 4, *nelems, - *PE_start, *logPE_stride, *PE_size, pSync_c); + *PE_start, 1 << *logPE_stride, *PE_size, pSync_c); } @@ -371,7 +375,7 @@ FC_SHMEM_ALLTOALLS64(void *target, SHMEM_ERR_CHECK_INITIALIZED(); SHMEM_ERR_CHECK_POSITIVE(*sst); SHMEM_ERR_CHECK_POSITIVE(*dst); - SHMEM_ERR_CHECK_ACTIVE_SET(*PE_start, *logPE_stride, *PE_size); + SHMEM_ERR_CHECK_ACTIVE_SET(*PE_start, 1 << *logPE_stride, *PE_size); SHMEM_ERR_CHECK_NON_NEGATIVE(*nelems); SHMEM_ERR_CHECK_SYMMETRIC(target, 8 * ((*nelems-1) * *dst + 1)); SHMEM_ERR_CHECK_SYMMETRIC(source, 8 * ((*nelems-1) * *sst + 1)); @@ -381,5 +385,5 @@ FC_SHMEM_ALLTOALLS64(void *target, pSync_c = (long*) pSync; shmem_internal_alltoalls(target, source, *dst, *sst, 8, *nelems, - *PE_start, *logPE_stride, *PE_size, pSync_c); + *PE_start, 1 << *logPE_stride, *PE_size, pSync_c); } diff --git a/src/contexts_c.c b/src/contexts_c.c index 040536edb..2d8c65ce5 100644 --- a/src/contexts_c.c +++ b/src/contexts_c.c @@ -20,6 +20,7 @@ #include "shmem_internal.h" #include "transport.h" #include "shmem_synchronization.h" +#include "shmem_team.h" #ifdef ENABLE_PROFILING #include "pshmem.h" @@ -41,7 +42,7 @@ shmem_ctx_create(long options, shmem_ctx_t *ctx) { SHMEM_ERR_CHECK_INITIALIZED(); - int ret = shmem_transport_ctx_create(options, (shmem_transport_ctx_t **) ctx); + int ret = shmem_transport_ctx_create(&shmem_internal_team_world, options, (shmem_transport_ctx_t **) ctx); 
SHMEM_ERR_CHECK_NULL(ctx, 0); diff --git a/src/data_c.c4 b/src/data_c.c4 index 5279b6e0c..305c03bf6 100644 --- a/src/data_c.c4 +++ b/src/data_c.c4 @@ -30,6 +30,7 @@ include(shmem_bind_c.m4)dnl #include "shmemx.h" #include "shmem_internal.h" #include "shmem_synchronization.h" +#include "shmem_team.h" #ifdef ENABLE_PROFILING #include "pshmem.h" @@ -573,11 +574,13 @@ SHMEM_DEF_PUT_N_SIGNAL_NBI(`mem', `1') #undef SHMEM_FUNC_PROTOTYPE #undef SHMEMX_FUNC_PROTOTYPE -#define SHMEM_FUNC_PROTOTYPE(FUNCNAME, ...) \ - shmem_ctx_##FUNCNAME(shmem_ctx_t ctx, __VA_ARGS__) { +#define SHMEM_FUNC_PROTOTYPE(FUNCNAME, ...) \ + shmem_ctx_##FUNCNAME(shmem_ctx_t ctx, __VA_ARGS__) { \ + pe = shmem_internal_team_pe(((shmem_transport_ctx_t *) ctx)->team, pe); -#define SHMEMX_FUNC_PROTOTYPE(FUNCNAME, ...) \ - shmemx_ctx_##FUNCNAME(shmem_ctx_t ctx, __VA_ARGS__) { +#define SHMEMX_FUNC_PROTOTYPE(FUNCNAME, ...) \ + shmemx_ctx_##FUNCNAME(shmem_ctx_t ctx, __VA_ARGS__) { \ + pe = shmem_internal_team_pe(((shmem_transport_ctx_t *) ctx)->team, pe); SHMEM_DEFINE_FOR_RMA(`SHMEM_DEF_P') SHMEM_DEFINE_FOR_RMA(`SHMEM_DEF_G') diff --git a/src/init.c b/src/init.c index 32eb8fd53..5127e7c2d 100644 --- a/src/init.c +++ b/src/init.c @@ -36,6 +36,7 @@ #include "shmem_comm.h" #include "runtime.h" #include "build_info.h" +#include "shmem_team.h" #if defined(ENABLE_REMOTE_VIRTUAL_ADDRESSING) && defined(__linux__) #include @@ -111,6 +112,9 @@ shmem_internal_shutdown(void) shmem_internal_barrier_all(); shmem_internal_finalized = 1; + + shmem_internal_team_fini(); + shmem_transport_fini(); #ifdef USE_XPMEM @@ -168,6 +172,7 @@ shmem_internal_init(int tl_requested, int *tl_provided) int cma_initialized = 0; #endif int randr_initialized = 0; + int teams_initialized = 0; int enable_node_ranks = 0; /* Parse environment variables into shmem_internal_params */ @@ -190,6 +195,9 @@ shmem_internal_init(int tl_requested, int *tl_provided) enable_node_ranks = (shmem_internal_params.OFI_STX_AUTO) ? 
1 : 0; #endif + if (!shmem_internal_params.TEAM_SHARED_ONLY_SELF) + enable_node_ranks = 1; + ret = shmem_runtime_init(enable_node_ranks); if (0 != ret) { fprintf(stderr, "ERROR: runtime init failed: %d\n", ret); @@ -431,6 +439,13 @@ shmem_internal_init(int tl_requested, int *tl_provided) goto cleanup; } + ret = shmem_internal_team_init(); + if (ret != 0) { + RETURN_ERROR_MSG("Initialization of teams failed (%d)\n", ret); + goto cleanup; + } + teams_initialized = 1; + shmem_internal_randr_init(); randr_initialized = 1; @@ -463,6 +478,10 @@ shmem_internal_init(int tl_requested, int *tl_provided) shmem_internal_randr_fini(); } + if (teams_initialized) { + shmem_internal_team_fini(); + } + if (NULL != shmem_internal_data_base) { shmem_internal_symmetric_fini(); } diff --git a/src/shmem_collectives.h b/src/shmem_collectives.h index 2fcf827ba..6409c5178 100644 --- a/src/shmem_collectives.h +++ b/src/shmem_collectives.h @@ -40,13 +40,13 @@ extern coll_type_t shmem_internal_reduce_type; extern coll_type_t shmem_internal_collect_type; extern coll_type_t shmem_internal_fcollect_type; -void shmem_internal_sync_linear(int PE_start, int logPE_stride, int PE_size, long *pSync); -void shmem_internal_sync_tree(int PE_start, int logPE_stride, int PE_size, long *pSync); -void shmem_internal_sync_dissem(int PE_start, int logPE_stride, int PE_size, long *pSync); +void shmem_internal_sync_linear(int PE_start, int PE_stride, int PE_size, long *pSync); +void shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync); +void shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync); static inline void -shmem_internal_sync(int PE_start, int logPE_stride, int PE_size, long *pSync) +shmem_internal_sync(int PE_start, int PE_stride, int PE_size, long *pSync) { if (shmem_internal_params.BARRIERS_FLUSH) { fflush(stdout); @@ -58,19 +58,19 @@ shmem_internal_sync(int PE_start, int logPE_stride, int PE_size, long *pSync) switch 
(shmem_internal_barrier_type) { case AUTO: if (PE_size < shmem_internal_params.COLL_CROSSOVER) { - shmem_internal_sync_linear(PE_start, logPE_stride, PE_size, pSync); + shmem_internal_sync_linear(PE_start, PE_stride, PE_size, pSync); } else { - shmem_internal_sync_tree(PE_start, logPE_stride, PE_size, pSync); + shmem_internal_sync_tree(PE_start, PE_stride, PE_size, pSync); } break; case LINEAR: - shmem_internal_sync_linear(PE_start, logPE_stride, PE_size, pSync); + shmem_internal_sync_linear(PE_start, PE_stride, PE_size, pSync); break; case TREE: - shmem_internal_sync_tree(PE_start, logPE_stride, PE_size, pSync); + shmem_internal_sync_tree(PE_start, PE_stride, PE_size, pSync); break; case DISSEM: - shmem_internal_sync_dissem(PE_start, logPE_stride, PE_size, pSync); + shmem_internal_sync_dissem(PE_start, PE_stride, PE_size, pSync); break; default: RAISE_ERROR_MSG("Illegal barrier/sync type (%d)\n", @@ -87,16 +87,16 @@ static inline void shmem_internal_sync_all(void) { - shmem_internal_sync(0, 0, shmem_internal_num_pes, shmem_internal_sync_all_psync); + shmem_internal_sync(0, 1, shmem_internal_num_pes, shmem_internal_sync_all_psync); } static inline void -shmem_internal_barrier(int PE_start, int logPE_stride, int PE_size, long *pSync) +shmem_internal_barrier(int PE_start, int PE_stride, int PE_size, long *pSync) { shmem_internal_quiet(SHMEM_CTX_DEFAULT); - shmem_internal_sync(PE_start, logPE_stride, PE_size, pSync); + shmem_internal_sync(PE_start, PE_stride, PE_size, pSync); } @@ -105,40 +105,40 @@ void shmem_internal_barrier_all(void) { shmem_internal_quiet(SHMEM_CTX_DEFAULT); - shmem_internal_sync(0, 0, shmem_internal_num_pes, shmem_internal_barrier_all_psync); + shmem_internal_sync(0, 1, shmem_internal_num_pes, shmem_internal_barrier_all_psync); } void shmem_internal_bcast_linear(void *target, const void *source, size_t len, - int PE_root, int PE_start, int logPE_stride, int PE_size, + int PE_root, int PE_start, int PE_stride, int PE_size, long *pSync, int 
complete); void shmem_internal_bcast_tree(void *target, const void *source, size_t len, - int PE_root, int PE_start, int logPE_stride, int PE_size, + int PE_root, int PE_start, int PE_stride, int PE_size, long *pSync, int complete); static inline void shmem_internal_bcast(void *target, const void *source, size_t len, - int PE_root, int PE_start, int logPE_stride, int PE_size, + int PE_root, int PE_start, int PE_stride, int PE_size, long *pSync, int complete) { switch (shmem_internal_bcast_type) { case AUTO: if (PE_size < shmem_internal_params.COLL_CROSSOVER) { shmem_internal_bcast_linear(target, source, len, PE_root, PE_start, - logPE_stride, PE_size, pSync, complete); + PE_stride, PE_size, pSync, complete); } else { shmem_internal_bcast_tree(target, source, len, PE_root, PE_start, - logPE_stride, PE_size, pSync, complete); + PE_stride, PE_size, pSync, complete); } break; case LINEAR: shmem_internal_bcast_linear(target, source, len, PE_root, PE_start, - logPE_stride, PE_size, pSync, complete); + PE_stride, PE_size, pSync, complete); break; case TREE: shmem_internal_bcast_tree(target, source, len, PE_root, PE_start, - logPE_stride, PE_size, pSync, complete); + PE_stride, PE_size, pSync, complete); break; default: RAISE_ERROR_MSG("Illegal broadcast type (%d)\n", @@ -147,28 +147,28 @@ shmem_internal_bcast(void *target, const void *source, size_t len, } -void shmem_internal_op_to_all_linear(void *target, const void *source, int count, int type_size, - int PE_start, int logPE_stride, int PE_size, +void shmem_internal_op_to_all_linear(void *target, const void *source, size_t count, size_t type_size, + int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, shm_internal_op_t op, shm_internal_datatype_t datatype); -void shmem_internal_op_to_all_ring(void *target, const void *source, int count, int type_size, - int PE_start, int logPE_stride, int PE_size, +void shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, size_t type_size, + int 
PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, shm_internal_op_t op, shm_internal_datatype_t datatype); -void shmem_internal_op_to_all_tree(void *target, const void *source, int count, int type_size, - int PE_start, int logPE_stride, int PE_size, +void shmem_internal_op_to_all_tree(void *target, const void *source, size_t count, size_t type_size, + int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, shm_internal_op_t op, shm_internal_datatype_t datatype); -void shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, int count, int type_size, - int PE_start, int logPE_stride, int PE_size, +void shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t count, size_t type_size, + int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, shm_internal_op_t op, shm_internal_datatype_t datatype); static inline void -shmem_internal_op_to_all(void *target, const void *source, int count, - int type_size, int PE_start, int logPE_stride, +shmem_internal_op_to_all(void *target, const void *source, size_t count, + size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, shm_internal_op_t op, shm_internal_datatype_t datatype) @@ -180,21 +180,21 @@ shmem_internal_op_to_all(void *target, const void *source, int count, if (shmem_transport_atomic_supported(op, datatype)) { if (PE_size < shmem_internal_params.COLL_CROSSOVER) { shmem_internal_op_to_all_linear(target, source, count, type_size, - PE_start, logPE_stride, PE_size, + PE_start, PE_stride, PE_size, pWrk, pSync, op, datatype); } else { shmem_internal_op_to_all_tree(target, source, count, type_size, - PE_start, logPE_stride, PE_size, + PE_start, PE_stride, PE_size, pWrk, pSync, op, datatype); } } else { if (count * type_size < shmem_internal_params.COLL_SIZE_CROSSOVER) shmem_internal_op_to_all_recdbl_sw(target, source, count, type_size, - PE_start, logPE_stride, PE_size, + PE_start, PE_stride, PE_size, pWrk, pSync, op, datatype); 
else shmem_internal_op_to_all_ring(target, source, count, type_size, - PE_start, logPE_stride, PE_size, + PE_start, PE_stride, PE_size, pWrk, pSync, op, datatype); } @@ -202,33 +202,33 @@ shmem_internal_op_to_all(void *target, const void *source, int count, case LINEAR: if (shmem_transport_atomic_supported(op, datatype)) { shmem_internal_op_to_all_linear(target, source, count, type_size, - PE_start, logPE_stride, PE_size, + PE_start, PE_stride, PE_size, pWrk, pSync, op, datatype); } else { shmem_internal_op_to_all_recdbl_sw(target, source, count, type_size, - PE_start, logPE_stride, PE_size, + PE_start, PE_stride, PE_size, pWrk, pSync, op, datatype); } break; case RING: shmem_internal_op_to_all_ring(target, source, count, type_size, - PE_start, logPE_stride, PE_size, + PE_start, PE_stride, PE_size, pWrk, pSync, op, datatype); break; case TREE: if (shmem_transport_atomic_supported(op, datatype)) { shmem_internal_op_to_all_tree(target, source, count, type_size, - PE_start, logPE_stride, PE_size, + PE_start, PE_stride, PE_size, pWrk, pSync, op, datatype); } else { shmem_internal_op_to_all_recdbl_sw(target, source, count, type_size, - PE_start, logPE_stride, PE_size, + PE_start, PE_stride, PE_size, pWrk, pSync, op, datatype); } break; case RECDBL: shmem_internal_op_to_all_recdbl_sw(target, source, count, type_size, - PE_start, logPE_stride, PE_size, + PE_start, PE_stride, PE_size, pWrk, pSync, op, datatype); break; default: @@ -239,20 +239,20 @@ shmem_internal_op_to_all(void *target, const void *source, int count, void shmem_internal_collect_linear(void *target, const void *source, size_t len, - int PE_start, int logPE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync); static inline void shmem_internal_collect(void *target, const void *source, size_t len, - int PE_start, int logPE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync) { switch (shmem_internal_collect_type) { case AUTO: - 
shmem_internal_collect_linear(target, source, len, PE_start, logPE_stride, + shmem_internal_collect_linear(target, source, len, PE_start, PE_stride, PE_size, pSync); break; case LINEAR: - shmem_internal_collect_linear(target, source, len, PE_start, logPE_stride, + shmem_internal_collect_linear(target, source, len, PE_start, PE_stride, PE_size, pSync); break; default: @@ -263,36 +263,36 @@ shmem_internal_collect(void *target, const void *source, size_t len, void shmem_internal_fcollect_linear(void *target, const void *source, size_t len, - int PE_start, int logPE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync); void shmem_internal_fcollect_ring(void *target, const void *source, size_t len, - int PE_start, int logPE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync); void shmem_internal_fcollect_recdbl(void *target, const void *source, size_t len, - int PE_start, int logPE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync); static inline void shmem_internal_fcollect(void *target, const void *source, size_t len, - int PE_start, int logPE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync) { switch (shmem_internal_fcollect_type) { case AUTO: - shmem_internal_fcollect_ring(target, source, len, PE_start, logPE_stride, + shmem_internal_fcollect_ring(target, source, len, PE_start, PE_stride, PE_size, pSync); break; case LINEAR: - shmem_internal_fcollect_linear(target, source, len, PE_start, logPE_stride, + shmem_internal_fcollect_linear(target, source, len, PE_start, PE_stride, PE_size, pSync); break; case RING: - shmem_internal_fcollect_ring(target, source, len, PE_start, logPE_stride, + shmem_internal_fcollect_ring(target, source, len, PE_start, PE_stride, PE_size, pSync); break; case RECDBL: if (0 == (PE_size & (PE_size - 1))) { - shmem_internal_fcollect_recdbl(target, source, len, PE_start, logPE_stride, + 
shmem_internal_fcollect_recdbl(target, source, len, PE_start, PE_stride, PE_size, pSync); } else { - shmem_internal_fcollect_ring(target, source, len, PE_start, logPE_stride, + shmem_internal_fcollect_ring(target, source, len, PE_start, PE_stride, PE_size, pSync); } break; @@ -304,9 +304,9 @@ shmem_internal_fcollect(void *target, const void *source, size_t len, void shmem_internal_alltoall(void *dest, const void *source, size_t len, - int PE_start, int logPE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync); void shmem_internal_alltoalls(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, size_t elem_size, size_t nelems, - int PE_start, int logPE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync); #endif diff --git a/src/shmem_env_defs.h b/src/shmem_env_defs.h index ae23487cb..f062ac53c 100644 --- a/src/shmem_env_defs.h +++ b/src/shmem_env_defs.h @@ -70,6 +70,11 @@ SHMEM_INTERNAL_ENV_DEF(FCOLLECT_ALGORITHM, string, "auto", SHMEM_INTERNAL_ENV_CA SHMEM_INTERNAL_ENV_DEF(BARRIERS_FLUSH, bool, false, SHMEM_INTERNAL_ENV_CAT_COLLECTIVES, "Flush stdout and stderr on barrier") +SHMEM_INTERNAL_ENV_DEF(TEAMS_MAX, long, DEFAULT_TEAMS_MAX, SHMEM_INTERNAL_ENV_CAT_OTHER, + "Maximum number of teams per PE") +SHMEM_INTERNAL_ENV_DEF(TEAM_SHARED_ONLY_SELF, bool, false, SHMEM_INTERNAL_ENV_CAT_OTHER, + "Include only the self PE in SHMEM_TEAM_SHARED") + #ifdef USE_CMA SHMEM_INTERNAL_ENV_DEF(CMA_PUT_MAX, size, 8*1024, SHMEM_INTERNAL_ENV_CAT_INTRANODE, "Size below which to use CMA for puts") diff --git a/src/shmem_internal.h b/src/shmem_internal.h index 130fa2aa6..61c0d21d5 100644 --- a/src/shmem_internal.h +++ b/src/shmem_internal.h @@ -194,24 +194,30 @@ extern unsigned int shmem_internal_rand_seed; } \ } while (0) -#define SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, logPE_stride, PE_size) \ +#define SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, PE_stride, PE_size) \ do { \ - int shmem_err_check_active_stride = 
1 << logPE_stride; \ - if (PE_start < 0 || logPE_stride < 0 || PE_size < 0 || \ - PE_start + (PE_size - 1) * shmem_err_check_active_stride > shmem_internal_num_pes) { \ - fprintf(stderr, "ERROR: %s(): Invalid active set (PE_start = %d, logPE_stride = %d, PE_size = %d)\n", \ - __func__, PE_start, logPE_stride, PE_size); \ + if (PE_start < 0 || (PE_stride) < 1 || PE_size < 0 || \ + PE_start + ((PE_size - 1) * (PE_stride)) > shmem_internal_num_pes) { \ + fprintf(stderr, "ERROR: %s(): Invalid active set (PE_start = %d, PE_stride = %d, PE_size = %d)\n", \ + __func__, PE_start, (PE_stride), PE_size); \ shmem_runtime_abort(100, PACKAGE_NAME " exited in error"); \ } \ if (! (shmem_internal_my_pe >= PE_start && \ - shmem_internal_my_pe <= PE_start + (PE_size-1) * shmem_err_check_active_stride && \ - (shmem_internal_my_pe - PE_start) % shmem_err_check_active_stride == 0)) { \ + shmem_internal_my_pe <= PE_start + ((PE_size-1) * (PE_stride)) && \ + (shmem_internal_my_pe - PE_start) % (PE_stride) == 0)) { \ fprintf(stderr, "ERROR: %s(): Calling PE (%d) is not a member of the active set\n", \ __func__, shmem_internal_my_pe); \ shmem_runtime_abort(100, PACKAGE_NAME " exited in error"); \ } \ } while (0) +#define SHMEM_ERR_CHECK_TEAM_VALID(team) \ + do { \ + if (team == SHMEMX_TEAM_INVALID) { \ + RAISE_ERROR_STR("Invalid team argument"); \ + } \ + } while (0) + #define SHMEM_ERR_CHECK_PE(pe) \ do { \ if ((pe) < 0 || (pe) >= shmem_internal_num_pes) { \ @@ -301,7 +307,8 @@ extern unsigned int shmem_internal_rand_seed; #define SHMEM_ERR_CHECK_INITIALIZED() #define SHMEM_ERR_CHECK_POSITIVE(arg) #define SHMEM_ERR_CHECK_NON_NEGATIVE(arg) -#define SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, logPE_stride, PE_size) +#define SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, PE_stride, PE_size) +#define SHMEM_ERR_CHECK_TEAM_VALID(team) #define SHMEM_ERR_CHECK_PE(pe) #define SHMEM_ERR_CHECK_SYMMETRIC(ptr, len) #define SHMEM_ERR_CHECK_SYMMETRIC_HEAP(ptr) @@ -422,6 +429,19 @@ int 
shmem_internal_collectives_init(void); void *shmem_internal_shmalloc(size_t size); void* shmem_internal_get_next(intptr_t incr); +void dlfree(void*); + +static inline void shmem_internal_free(void *ptr) +{ + /* It's fine to call dlfree with NULL, but better to avoid unnecessarily + * taking the mutex in the threaded case. */ + if (ptr != NULL) { + SHMEM_MUTEX_LOCK(shmem_internal_mutex_alloc); + dlfree(ptr); + SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); + } +} + /* Query PEs reachable using shared memory */ static inline int shmem_internal_get_shr_rank(int pe) { @@ -445,7 +465,8 @@ static inline int shmem_internal_get_shr_size(void) #endif } -static inline double shmem_internal_wtime(void) { +static inline double shmem_internal_wtime(void) +{ double wtime = 0.0; #ifdef HAVE_CLOCK_GETTIME @@ -476,4 +497,81 @@ void shmem_util_backtrace(void); extern uint64_t (*shmem_internal_gettid_fn)(void); extern void shmem_internal_register_gettid(uint64_t (*gettid_fn)(void)); +static inline +void shmem_internal_bit_set(unsigned char *ptr, size_t size, size_t index) +{ + shmem_internal_assert(size > 0 && (index < size * CHAR_BIT)); + + size_t which_byte = index / size; + ptr[which_byte] |= (1 << (index % CHAR_BIT)); + + return; +} + +static inline +void shmem_internal_bit_clear(unsigned char *ptr, size_t size, size_t index) +{ + shmem_internal_assert(size > 0 && (index < size * CHAR_BIT)); + + size_t which_byte = index / size; + ptr[which_byte] &= ~(1 << (index % CHAR_BIT)); + + return; +} + +static inline +unsigned char shmem_internal_bit_fetch(unsigned char *ptr, size_t index) +{ + return (ptr[index / CHAR_BIT] >> index) & 1; +} + +static inline +size_t shmem_internal_bit_1st_nonzero(const unsigned char *ptr, const size_t size) +{ + /* The following ignores endianess: */ + for(size_t i = 0; i < size; i++) { + unsigned char bit_val = ptr[i]; + for (size_t j = 0; bit_val && j < CHAR_BIT; j++) { + if (bit_val & 1) return i * CHAR_BIT + j; + bit_val >>= 1; + } + } + + return -1; 
+} + +/* Create a bit string of the format AAAAAAAA.BBBBBBBB into str for the byte + * array passed via ptr. */ +static inline +void shmem_internal_bit_to_string(char *str, size_t str_size, + unsigned char *ptr, size_t ptr_size) +{ + size_t off = 0; + + for (size_t i = 0; i < ptr_size; i++) { + for (size_t j = 0; j < CHAR_BIT; j++) { + off += snprintf(str+off, str_size-off, "%s", + (ptr[i] & (1 << (CHAR_BIT-1-j))) ? "1" : "0"); + if (off >= str_size) return; + } + if (i < ptr_size - 1) { + off += snprintf(str+off, str_size-off, "."); + if (off >= str_size) return; + } + } +} + +/* Return -1 if `global_pe` is not in the given active set. + * If `global_pe` is in the active set, return the PE index within this set. */ +static inline +int shmem_internal_pe_in_active_set(int global_pe, int PE_start, int PE_stride, int PE_size) +{ + int n = (global_pe - PE_start) / PE_stride; + if (global_pe < PE_start || (global_pe - PE_start) % PE_stride || n >= PE_size) + return -1; + else { + return n; + } +} + #endif diff --git a/src/shmem_internal_op.h b/src/shmem_internal_op.h index 74eb7722d..13f0efba4 100644 --- a/src/shmem_internal_op.h +++ b/src/shmem_internal_op.h @@ -87,6 +87,30 @@ FUNC_OP_CREATE(short, short, and, shmem_internal_and_op) FUNC_OP_CREATE(short, short, or, shmem_internal_or_op) FUNC_OP_CREATE(short, short, xor, shmem_internal_xor_op) +FUNC_OP_CREATE(ushort, unsigned short, max, shmem_internal_max_op) +FUNC_OP_CREATE(ushort, unsigned short, min, shmem_internal_min_op) +FUNC_OP_CREATE(ushort, unsigned short, sum, shmem_internal_sum_op) +FUNC_OP_CREATE(ushort, unsigned short, prod, shmem_internal_prod_op) +FUNC_OP_CREATE(ushort, unsigned short, and, shmem_internal_and_op) +FUNC_OP_CREATE(ushort, unsigned short, or, shmem_internal_or_op) +FUNC_OP_CREATE(ushort, unsigned short, xor, shmem_internal_xor_op) + +FUNC_OP_CREATE(uint, unsigned int, max, shmem_internal_max_op) +FUNC_OP_CREATE(uint, unsigned int, min, shmem_internal_min_op) +FUNC_OP_CREATE(uint, unsigned 
int, sum, shmem_internal_sum_op) +FUNC_OP_CREATE(uint, unsigned int, prod, shmem_internal_prod_op) +FUNC_OP_CREATE(uint, unsigned int, and, shmem_internal_and_op) +FUNC_OP_CREATE(uint, unsigned int, or, shmem_internal_or_op) +FUNC_OP_CREATE(uint, unsigned int, xor, shmem_internal_xor_op) + +FUNC_OP_CREATE(ulong, unsigned long, max, shmem_internal_max_op) +FUNC_OP_CREATE(ulong, unsigned long, min, shmem_internal_min_op) +FUNC_OP_CREATE(ulong, unsigned long, sum, shmem_internal_sum_op) +FUNC_OP_CREATE(ulong, unsigned long, prod, shmem_internal_prod_op) +FUNC_OP_CREATE(ulong, unsigned long, and, shmem_internal_and_op) +FUNC_OP_CREATE(ulong, unsigned long, or, shmem_internal_or_op) +FUNC_OP_CREATE(ulong, unsigned long, xor, shmem_internal_xor_op) + FUNC_OP_CREATE(int8, int8_t, max, shmem_internal_max_op) FUNC_OP_CREATE(int8, int8_t, min, shmem_internal_min_op) FUNC_OP_CREATE(int8, int8_t, sum, shmem_internal_sum_op) @@ -95,6 +119,10 @@ FUNC_OP_CREATE(int8, int8_t, and, shmem_internal_and_op) FUNC_OP_CREATE(int8, int8_t, or, shmem_internal_or_op) FUNC_OP_CREATE(int8, int8_t, xor, shmem_internal_xor_op) +FUNC_OP_CREATE(uchar, unsigned char, and, shmem_internal_and_op) +FUNC_OP_CREATE(uchar, unsigned char, or, shmem_internal_or_op) +FUNC_OP_CREATE(uchar, unsigned char, xor, shmem_internal_xor_op) + #define REDUCE_LOCAL_DTYPE_CASE_FP(dtype, dtype_name, c_type) \ case dtype: \ switch(op) { \ @@ -158,6 +186,23 @@ FUNC_OP_CREATE(int8, int8_t, xor, shmem_internal_xor_op) } \ break; +#define REDUCE_LOCAL_DTYPE_CASE_AND_OR_XOR(dtype, dtype_name, c_type) \ + case dtype: \ + switch(op) { \ + case SHM_INTERNAL_BAND: \ + shmem_op_##dtype_name##_and_func((c_type *) in, (c_type *) inout, count); \ + break; \ + case SHM_INTERNAL_BOR: \ + shmem_op_##dtype_name##_or_func((c_type *) in, (c_type *) inout, count); \ + break; \ + case SHM_INTERNAL_BXOR: \ + shmem_op_##dtype_name##_xor_func((c_type *) in, (c_type *) inout, count); \ + break; \ + default: \ + RAISE_ERROR_STR("unsupported 
reduction on " # c_type); \ + } \ + break; + static inline void shmem_internal_reduce_local(shm_internal_op_t op, shm_internal_datatype_t datatype, int count, void *in, void *inout) { @@ -170,7 +215,11 @@ static inline void shmem_internal_reduce_local(shm_internal_op_t op, REDUCE_LOCAL_DTYPE_CASE_INT(SHM_INTERNAL_INT64, int64, int64_t); REDUCE_LOCAL_DTYPE_CASE_INT(SHM_INTERNAL_INT32, int32, int32_t); REDUCE_LOCAL_DTYPE_CASE_INT(SHM_INTERNAL_SHORT, short, short); + REDUCE_LOCAL_DTYPE_CASE_INT(SHM_INTERNAL_USHORT, ushort, unsigned short); REDUCE_LOCAL_DTYPE_CASE_INT(SHM_INTERNAL_SIGNED_BYTE, int8, int8_t); + REDUCE_LOCAL_DTYPE_CASE_INT(SHM_INTERNAL_UINT, uint, unsigned int); + REDUCE_LOCAL_DTYPE_CASE_INT(SHM_INTERNAL_ULONG, ulong, unsigned long); + REDUCE_LOCAL_DTYPE_CASE_AND_OR_XOR(SHM_INTERNAL_UCHAR, uchar, unsigned char); default: RAISE_ERROR_MSG("invalid data type (%d)", (int) datatype); diff --git a/src/shmem_team.c b/src/shmem_team.c new file mode 100644 index 000000000..28fd8e6ce --- /dev/null +++ b/src/shmem_team.c @@ -0,0 +1,440 @@ +/* -*- C -*- + * + * Copyright (c) 2019 Intel Corporation. All rights reserved. + * This software is available to you under the BSD license. + * + * This file is part of the Sandia OpenSHMEM software package. For license + * information, see the LICENSE file in the top level directory of the + * distribution. 
+ * + */ + + +#include "config.h" + +#include "shmemx.h" +#include "shmem_team.h" +#include "shmem_collectives.h" +#include "shmem_remote_pointer.h" + +#include + +#define SHMEM_TEAM_WORLD_INDEX 0 +#define SHMEM_TEAM_SHARED_INDEX 1 +#define SHMEM_TEAMS_MIN 2 + +#define N_PSYNC_BYTES 8 +#define PSYNC_CHUNK_SIZE (N_PSYNCS_PER_TEAM * SHMEM_SYNC_SIZE) + + +shmem_internal_team_t shmem_internal_team_world; +shmemx_team_t SHMEMX_TEAM_WORLD = (shmemx_team_t) &shmem_internal_team_world; + +shmem_internal_team_t shmem_internal_team_shared; +shmemx_team_t SHMEMX_TEAM_SHARED = (shmemx_team_t) &shmem_internal_team_shared; + +shmem_internal_team_t **shmem_internal_team_pool; +long *shmem_internal_psync_pool; +long *shmem_internal_psync_barrier_pool; +static unsigned char *psync_pool_avail; +static unsigned char *psync_pool_avail_reduced; + + +/* Checks whether a PE has a consistent stride given (start, stride, size). + * This function is useful within a loop across PE IDs, and sets 'start', + * 'stride' and 'size' accordingly upon exiting the loop. It also assumes + * 'start' and 'stride' are initialized to a negative number and 'size' to 0. + * If an inconsistent stride is found, returns -1. 
*/
+static inline
+int check_for_linear_stride(int pe, int *start, int *stride, int *size)
+{
+    if (*start < 0) {
+        /* First PE seen: it becomes the start of the sequence. */
+        *start = pe;
+        (*size)++;
+    } else if (*stride < 0) {
+        /* Second PE seen: its distance from 'start' fixes the stride. */
+        *stride = pe - *start;
+        (*size)++;
+    } else if (pe != *start + *size * *stride) {
+        /* 'pe' must be exactly the next element of <start, stride, size>.
+         * Only testing (pe - start) % stride would accept a PE that skips
+         * elements (e.g. 0, 1, 4), and the resulting triplet would then
+         * describe PEs that were never inserted. */
+        RAISE_WARN_MSG("Detected non-uniform stride inserting PE %d into <%d, %d, %d>\n",
+                       pe, *start, *stride, *size);
+        return -1;
+    } else {
+        (*size)++;
+    }
+    return 0;
+}
+
+/* Team Management Routines */
+
+/* Sets up the two predefined teams, the team pool and the pSync pools.
+ * Returns 0 on success and a nonzero value on failure. */
+int shmem_internal_team_init(void)
+{
+
+    /* Initialize SHMEM_TEAM_WORLD */
+    shmem_internal_team_world.psync_idx = SHMEM_TEAM_WORLD_INDEX;
+    shmem_internal_team_world.start = 0;
+    shmem_internal_team_world.stride = 1;
+    shmem_internal_team_world.size = shmem_internal_num_pes;
+    shmem_internal_team_world.my_pe = shmem_internal_my_pe;
+    shmem_internal_team_world.config_mask = 0;
+    shmem_internal_team_world.contexts_len = 0;
+    memset(&shmem_internal_team_world.config, 0, sizeof(shmemx_team_config_t));
+    for (size_t i = 0; i < N_PSYNCS_PER_TEAM; i++)
+        shmem_internal_team_world.psync_avail[i] = 1;
+    SHMEMX_TEAM_WORLD = (shmemx_team_t) &shmem_internal_team_world;
+
+    /* Initialize SHMEM_TEAM_SHARED */
+    shmem_internal_team_shared.psync_idx = SHMEM_TEAM_SHARED_INDEX;
+    /* NOTE(review): this stores the world PE number, whereas teams created by
+     * shmem_internal_team_split_strided store the team-relative index in
+     * my_pe -- confirm which one is intended for SHMEM_TEAM_SHARED. */
+    shmem_internal_team_shared.my_pe = shmem_internal_my_pe;
+    shmem_internal_team_shared.config_mask = 0;
+    shmem_internal_team_shared.contexts_len = 0;
+    memset(&shmem_internal_team_shared.config, 0, sizeof(shmemx_team_config_t));
+    for (size_t i = 0; i < N_PSYNCS_PER_TEAM; i++)
+        shmem_internal_team_shared.psync_avail[i] = 1;
+    SHMEMX_TEAM_SHARED = (shmemx_team_t) &shmem_internal_team_shared;
+
+    if (shmem_internal_params.TEAM_SHARED_ONLY_SELF) {
+        shmem_internal_team_shared.start = shmem_internal_my_pe;
+        shmem_internal_team_shared.stride = 1;
+        shmem_internal_team_shared.size = 1;
+    } else { /* Search for on-node peer PEs while checking for a consistent stride */
+        int start = -1, stride = -1, size = 0;
+
+        for (int pe = 0; pe < shmem_internal_num_pes;
pe++) {
+            /* A NULL pointer means 'pe' is not directly reachable; skip it. */
+            void *ret_ptr = shmem_internal_ptr(shmem_internal_heap_base, pe);
+            if (ret_ptr == NULL) continue;
+
+            int ret = check_for_linear_stride(pe, &start, &stride, &size);
+            if (ret < 0) {
+                /* Peers cannot be described by one triplet: fall back to a
+                 * team that only contains the calling PE. */
+                start = shmem_internal_my_pe;
+                stride = 1;
+                size = 1;
+                break;
+            }
+        }
+        shmem_internal_assertp(size > 0 && size <= shmem_runtime_get_node_size());
+
+        shmem_internal_team_shared.start = start;
+        /* stride is still -1 when only a single PE was found */
+        shmem_internal_team_shared.stride = (stride == -1) ? 1 : stride;
+        shmem_internal_team_shared.size = size;
+
+        DEBUG_MSG("SHMEM_TEAM_SHARED: start=%d, stride=%d, size=%d\n",
+                  shmem_internal_team_shared.start, shmem_internal_team_shared.stride,
+                  shmem_internal_team_shared.size);
+    }
+
+    if (shmem_internal_params.TEAMS_MAX > N_PSYNC_BYTES * CHAR_BIT) {
+        RETURN_ERROR_MSG("Requested %ld teams, but only %d are supported\n",
+                         shmem_internal_params.TEAMS_MAX, N_PSYNC_BYTES * CHAR_BIT);
+        return 1;
+    }
+
+    if (shmem_internal_params.TEAMS_MAX < SHMEM_TEAMS_MIN)
+        shmem_internal_params.TEAMS_MAX = SHMEM_TEAMS_MIN;
+
+    shmem_internal_team_pool = malloc(shmem_internal_params.TEAMS_MAX *
+                                      sizeof(shmem_internal_team_t*));
+    /* The pool is written to immediately below, so fail instead of
+     * dereferencing a NULL pointer when the allocation fails. */
+    if (NULL == shmem_internal_team_pool) return -1;
+
+    for (long i = 0; i < shmem_internal_params.TEAMS_MAX; i++) {
+        shmem_internal_team_pool[i] = NULL;
+    }
+    shmem_internal_team_pool[SHMEM_TEAM_WORLD_INDEX] = &shmem_internal_team_world;
+    shmem_internal_team_pool[SHMEM_TEAM_SHARED_INDEX] = &shmem_internal_team_shared;
+
+    /* Allocate pSync pool, each with the maximum possible size requirement */
+    /* Create two pSyncs per team for back-to-back collectives and one for barriers.
+     * Array organization:
+     *
+     * [ (world) (shared) (team 1) (team 2) ...  (world) (shared) (team 1) (team 2) ... ]
+     *  <----------- groups 1 & 2-------------->|<------------- group 3 ---------------->
+     *  <--- (bcast, collect, reduce, etc.) 
--->|<------ (barriers and syncs) ---------->
+     * */
+    long psync_len = shmem_internal_params.TEAMS_MAX * (PSYNC_CHUNK_SIZE + SHMEM_SYNC_SIZE);
+    shmem_internal_psync_pool = shmem_internal_shmalloc(sizeof(long) * psync_len);
+    if (NULL == shmem_internal_psync_pool) return -1;
+
+    for (long i = 0; i < psync_len; i++) {
+        shmem_internal_psync_pool[i] = SHMEM_SYNC_VALUE;
+    }
+
+    /* Convenience pointer to the group-3 pSync array (for barriers and syncs): */
+    shmem_internal_psync_barrier_pool = &shmem_internal_psync_pool[PSYNC_CHUNK_SIZE *
+                                                                    shmem_internal_params.TEAMS_MAX];
+
+    /* One allocation holds two bitmaps of N_PSYNC_BYTES each: the local
+     * availability bits followed by the buffer that receives the reduction. */
+    psync_pool_avail = shmem_internal_shmalloc(2 * N_PSYNC_BYTES);
+    if (NULL == psync_pool_avail) return -1;
+    psync_pool_avail_reduced = &psync_pool_avail[N_PSYNC_BYTES];
+
+    /* Initialize the psync bits to 1, making all slots available: */
+    memset(psync_pool_avail, 0, 2 * N_PSYNC_BYTES);
+    for (size_t i = 0; i < (size_t) shmem_internal_params.TEAMS_MAX; i++) {
+        shmem_internal_bit_set(psync_pool_avail, N_PSYNC_BYTES, i);
+    }
+
+    /* Set the bits for SHMEM_TEAM_WORLD and SHMEM_TEAM_SHARED to 0: */
+    shmem_internal_bit_clear(psync_pool_avail, N_PSYNC_BYTES, SHMEM_TEAM_WORLD_INDEX);
+    shmem_internal_bit_clear(psync_pool_avail, N_PSYNC_BYTES, SHMEM_TEAM_SHARED_INDEX);
+
+    return 0;
+}
+
+/* Destroys every team still registered in the team pool and releases the
+ * team pool and the pSync pools. */
+void shmem_internal_team_fini(void)
+{
+    /* Destroy all undestroyed teams */
+    for (long i = 0; i < shmem_internal_params.TEAMS_MAX; i++) {
+        if (shmem_internal_team_pool[i] != NULL)
+            shmem_internal_team_destroy(shmem_internal_team_pool[i]);
+    }
+
+    free(shmem_internal_team_pool);
+    shmem_internal_free(shmem_internal_psync_pool);
+    shmem_internal_free(psync_pool_avail);
+
+    return;
+}
+
+/* Translates team-relative PE number 'src_pe' of 'src_team' into the
+ * corresponding PE number in 'dest_team'.  Returns -1 when either team is
+ * SHMEMX_TEAM_INVALID, when 'src_pe' is not a valid index into 'src_team',
+ * or when the PE is not a member of 'dest_team'. */
+int shmem_internal_team_translate_pe(shmem_internal_team_t *src_team, int src_pe,
+                                     shmem_internal_team_t *dest_team)
+{
+    int src_pe_world, dest_pe = -1;
+
+    if (src_team == SHMEMX_TEAM_INVALID || dest_team == SHMEMX_TEAM_INVALID)
+        return -1;
+
+    /* Valid team-relative PE numbers are 0 .. size-1 */
+    if (src_pe < 0 || src_pe >= src_team->size)
+        return -1;
+
+    src_pe_world = src_team->start + src_pe * src_team->stride;
+
+    
shmem_internal_assert(src_pe_world >= src_team->start && src_pe_world < shmem_internal_num_pes);
+
+    dest_pe = shmem_internal_pe_in_active_set(src_pe_world, dest_team->start, dest_team->stride,
+                                              dest_team->size);
+
+    return dest_pe;
+}
+
+/* Creates a new team of PE_size PEs out of parent_team.  PE_start is a
+ * PE number relative to parent_team; PE_stride is used as given for the
+ * stride of the new team.  Every PE of parent_team has to call this routine
+ * because it ends with a barrier over the parent team.  On PEs that are not
+ * part of the new team, *new_team is SHMEMX_TEAM_INVALID.
+ * Returns 0 on success (and for an invalid parent team), -1 when the
+ * arguments do not describe a valid set of PEs. */
+int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE_start, int PE_stride,
+                                      int PE_size, const shmemx_team_config_t *config, long config_mask,
+                                      shmem_internal_team_t **new_team)
+{
+
+    *new_team = SHMEMX_TEAM_INVALID;
+
+    if (parent_team == SHMEMX_TEAM_INVALID) {
+        return 0;
+    }
+
+    if (PE_start < 0 || PE_start >= parent_team->size ||
+        PE_size <= 0 || PE_size > parent_team->size ||
+        PE_stride < 1) {
+        RAISE_WARN_MSG("Invalid : child <%d, %d, %d>, parent <%d, %d, %d>\n",
+                       PE_start, PE_stride, PE_size,
+                       parent_team->start, parent_team->stride, parent_team->size);
+        return -1;
+    }
+
+    /* Only compute the global PE range once the arguments are known to be sane */
+    int global_PE_start = shmem_internal_team_pe(parent_team, PE_start);
+    int global_PE_end = global_PE_start + PE_stride * (PE_size -1);
+
+    if (global_PE_start >= shmem_internal_num_pes ||
+        global_PE_end >= shmem_internal_num_pes) {
+        RAISE_WARN_MSG("Starting PE (%d) or ending PE (%d) is invalid\n",
+                       global_PE_start, global_PE_end);
+        return -1;
+    }
+
+    /* -1 when the calling PE is not a member of the new team */
+    int my_pe = shmem_internal_pe_in_active_set(shmem_internal_my_pe,
+                                                global_PE_start, PE_stride, PE_size);
+
+    long *psync = shmem_internal_team_choose_psync(parent_team, REDUCE);
+    shmem_internal_team_t *myteam = NULL;
+
+    if (my_pe != -1) {
+        char bit_str[SHMEM_INTERNAL_DIAG_STRLEN];
+
+        myteam = calloc(1, sizeof(shmem_internal_team_t));
+        /* The other members are about to enter a reduction with this PE, so
+         * an allocation failure cannot be reported with an early return. */
+        if (myteam == NULL)
+            RAISE_ERROR_STR("Out of memory when allocating a new team");
+
+        myteam->my_pe = my_pe;
+        myteam->start = global_PE_start;
+        myteam->stride = PE_stride;
+        myteam->size = PE_size;
+        if (config) {
+            myteam->config = *config;
+            myteam->config_mask = config_mask;
+        }
+        myteam->contexts_len = 0;
+        myteam->psync_idx = -1;
+
+        /* Bitwise AND of the availability bitmaps of all members: a bit
+         * that is still set denotes a slot that is free on every member. */
+        shmem_internal_op_to_all(psync_pool_avail_reduced,
+                                 psync_pool_avail, N_PSYNC_BYTES, 1,
+                                 myteam->start, PE_stride, PE_size, NULL,
+                                 psync, SHM_INTERNAL_BAND, 
SHM_INTERNAL_UCHAR);
+
+        /* We cannot release the psync here, because this reduction may not
+         * have been performed on the entire parent team. */
+
+        /* Dump the local availability bitmap ... */
+        shmem_internal_bit_to_string(bit_str, SHMEM_INTERNAL_DIAG_STRLEN,
+                                     psync_pool_avail, N_PSYNC_BYTES);
+        DEBUG_MSG("My pSyncs [ %s ]\n", bit_str);
+
+        /* Select the least significant nonzero bit, which corresponds to an available pSync. */
+        myteam->psync_idx = shmem_internal_bit_1st_nonzero(psync_pool_avail_reduced, N_PSYNC_BYTES);
+
+        /* ... and the bitmap that was reduced over all members of the new team */
+        shmem_internal_bit_to_string(bit_str, SHMEM_INTERNAL_DIAG_STRLEN,
+                                     psync_pool_avail_reduced, N_PSYNC_BYTES);
+        DEBUG_MSG("All pSyncs [ %s ], allocated %d\n", bit_str,
+                  myteam->psync_idx);
+
+        if (myteam->psync_idx == -1 || myteam->psync_idx >= shmem_internal_params.TEAMS_MAX) {
+            RAISE_WARN_MSG("No more teams available (max = %ld), try increasing SHMEM_TEAMS_MAX\n",
+                           shmem_internal_params.TEAMS_MAX);
+            myteam->psync_idx = -1;
+        } else {
+            /* Set the selected psync bit to 0, reserving that slot */
+            shmem_internal_bit_clear(psync_pool_avail, N_PSYNC_BYTES, myteam->psync_idx);
+
+            for (size_t i = 0; i < N_PSYNCS_PER_TEAM; i++)
+                myteam->psync_avail[i] = 1;
+
+            *new_team = myteam;
+
+            shmem_internal_team_pool[myteam->psync_idx] = *new_team;
+        }
+    }
+
+    psync = shmem_internal_team_choose_psync(parent_team, SYNC);
+
+    shmem_internal_barrier(parent_team->start, parent_team->stride, parent_team->size, psync);
+
+    shmem_internal_team_release_psyncs(parent_team, SYNC);
+
+    if (my_pe >= 0 && myteam != NULL && myteam->psync_idx == -1) {
+        RAISE_ERROR_MSG("Team split strided failed: child <%d, %d, %d>, parent <%d, %d, %d>\n",
+                        global_PE_start, PE_stride, PE_size,
+                        parent_team->start, parent_team->stride, parent_team->size);
+        /* RAISE_ERROR_MSG is expected to terminate the program; the explicit
+         * return keeps the result defined should it ever return. */
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Splits parent_team into a two-dimensional grid that is xrange PEs wide.
+ * On return, *xaxis_team and *yaxis_team refer to the teams created along
+ * the x and the y dimension.  Every PE of parent_team has to call this
+ * routine.  Returns 0 on success. */
+int shmem_internal_team_split_2d(shmem_internal_team_t *parent_team, int xrange,
+                                 const shmemx_team_config_t *xaxis_config, long xaxis_mask,
+                                 shmem_internal_team_t **xaxis_team, const shmemx_team_config_t *yaxis_config,
+                                 long yaxis_mask, 
shmem_internal_team_t **yaxis_team)
+{
+    *xaxis_team = SHMEMX_TEAM_INVALID;
+    *yaxis_team = SHMEMX_TEAM_INVALID;
+
+    /* Nothing can be split off an invalid team (and it must not be dereferenced) */
+    if (parent_team == SHMEMX_TEAM_INVALID)
+        return -1;
+
+    /* xrange is used as a divisor below */
+    if (xrange < 1) {
+        RAISE_WARN_MSG("Invalid xrange (%d), it must be a positive integer\n", xrange);
+        return -1;
+    }
+
+    const int parent_stride = parent_team->stride;
+    const int parent_size = parent_team->size;
+
+    /* A row cannot be wider than the parent team */
+    if (xrange > parent_size)
+        xrange = parent_size;
+
+    /* Integer ceiling of parent_size / xrange */
+    const int num_xteams = (parent_size + xrange - 1) / xrange;
+    const int num_yteams = xrange;
+
+    /* shmem_internal_team_split_strided expects the start PE as an index
+     * relative to the parent team and translates it to a world PE itself,
+     * whereas the stride it is given is used as-is. */
+    int start = 0;
+    int ret = 0;
+
+    for (int i = 0; i < num_xteams; i++) {
+        /* The last row is shorter when xrange does not divide the parent size */
+        int xsize = (i == num_xteams - 1 && parent_size % xrange) ? parent_size % xrange : xrange;
+        shmem_internal_team_t *my_xteam = SHMEMX_TEAM_INVALID;
+
+        ret = shmem_internal_team_split_strided(parent_team, start, parent_stride,
+                                                xsize, xaxis_config, xaxis_mask, &my_xteam);
+        if (ret) {
+            RAISE_ERROR_MSG("Creation of x-axis team %d of %d failed\n", i, num_xteams);
+        }
+
+        /* Every split resets its output to SHMEMX_TEAM_INVALID on PEs that
+         * are not members, so only keep the team this PE belongs to. */
+        if (my_xteam != SHMEMX_TEAM_INVALID)
+            *xaxis_team = my_xteam;
+
+        start += xrange;
+    }
+
+    start = 0;
+
+    for (int i = 0; i < num_yteams; i++) {
+        /* The first 'remainder' columns hold one PE more than the others */
+        int remainder = parent_size % xrange;
+        int yrange = parent_size / xrange;
+        int ysize = (remainder && i < remainder) ? yrange + 1 : yrange;
+        shmem_internal_team_t *my_yteam = SHMEMX_TEAM_INVALID;
+
+        ret = shmem_internal_team_split_strided(parent_team, start, xrange*parent_stride,
+                                                ysize, yaxis_config, yaxis_mask, &my_yteam);
+        if (ret) {
+            RAISE_ERROR_MSG("Creation of y-axis team %d of %d failed\n", i, num_yteams);
+        }
+
+        if (my_yteam != SHMEMX_TEAM_INVALID)
+            *yaxis_team = my_yteam;
+
+        start += 1;
+    }
+
+    long *psync = shmem_internal_team_choose_psync(parent_team, SYNC);
+
+    shmem_internal_barrier(parent_team->start, parent_stride, parent_size, psync);
+
+    shmem_internal_team_release_psyncs(parent_team, SYNC);
+
+    return 0;
+}
+
+/* Releases the pSync slot of 'team', destroys the contexts that still exist
+ * on it and frees the team unless it is one of the predefined teams.
+ * Returns -1 for SHMEMX_TEAM_INVALID and 0 otherwise. */
+int shmem_internal_team_destroy(shmem_internal_team_t *team)
+{
+
+    if (team == SHMEMX_TEAM_INVALID) {
+        return -1;
+    } else if (shmem_internal_bit_fetch(psync_pool_avail, team->psync_idx)) {
+        /* A set bit marks a free slot, so this team does not own one */
+        RAISE_ERROR_STR("Destroying a team without an active pSync");
+    } else {
+        shmem_internal_bit_set(psync_pool_avail, N_PSYNC_BYTES, team->psync_idx);
+    }
+
+    /* Destroy all undestroyed shareable contexts on this team */
+    for (size_t i = 0; i < team->contexts_len; i++) {
+        if (team->contexts[i] != NULL) {
+            if (team->contexts[i]->options & SHMEM_CTX_PRIVATE)
+                RAISE_WARN_MSG("Destroying 
team with unfreed private context (%zu)\n", i);
+            shmem_transport_quiet(team->contexts[i]);
+            /* Destroy the context of *this* team, not the one that happens
+             * to occupy the same slot on SHMEM_TEAM_WORLD. */
+            shmem_transport_ctx_destroy(team->contexts[i]);
+        }
+    }
+    shmem_internal_team_pool[team->psync_idx] = NULL;
+    free(team->contexts);
+    /* The predefined teams outlive this call, do not leave a dangling array behind */
+    team->contexts = NULL;
+    team->contexts_len = 0;
+
+    if (team != &shmem_internal_team_world && team != &shmem_internal_team_shared) {
+        free(team);
+    }
+
+    return 0;
+}
+
+/* Returns a psync from the given team that can be safely used for the
+ * specified collective operation. */
+long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_internal_team_op_t op)
+{
+
+    switch (op) {
+        case SYNC:
+            return &shmem_internal_psync_barrier_pool[team->psync_idx * SHMEM_SYNC_SIZE];
+
+        default:
+            /* Each team owns one chunk of PSYNC_CHUNK_SIZE longs that holds
+             * N_PSYNCS_PER_TEAM psyncs of SHMEM_SYNC_SIZE longs each.  Hand
+             * out the first one that is not in use. */
+            for (int i = 0; i < N_PSYNCS_PER_TEAM; i++) {
+                if (team->psync_avail[i]) {
+                    team->psync_avail[i] = 0;
+                    return &shmem_internal_psync_pool[team->psync_idx * PSYNC_CHUNK_SIZE +
+                                                      i * SHMEM_SYNC_SIZE];
+                }
+            }
+
+            /* All psyncs are in use: synchronize the team on its barrier
+             * psync, after which all of them can be used again. */
+            size_t barrier_idx = team->psync_idx * SHMEM_SYNC_SIZE;
+            shmem_internal_sync(team->start, team->stride, team->size,
+                                &shmem_internal_psync_barrier_pool[barrier_idx]);
+
+            for (int i = 0; i < N_PSYNCS_PER_TEAM; i++) {
+                team->psync_avail[i] = 1;
+            }
+            team->psync_avail[0] = 0;
+
+            /* First psync of this team's chunk */
+            return &shmem_internal_psync_pool[team->psync_idx * PSYNC_CHUNK_SIZE];
+    }
+}
+
+/* Marks all collective psyncs of 'team' as available again when 'op' is
+ * SYNC; does nothing for any other operation. */
+void shmem_internal_team_release_psyncs(shmem_internal_team_t *team, shmem_internal_team_op_t op)
+{
+    switch (op) {
+        case SYNC:
+            for (size_t i = 0; i < N_PSYNCS_PER_TEAM; i++) {
+                team->psync_avail[i] = 1;
+            }
+            break;
+        default:
+            break;
+    }
+
+    return;
+}
diff --git a/src/shmem_team.h b/src/shmem_team.h
new file mode 100644
index 000000000..1d58a9f14
--- /dev/null
+++ b/src/shmem_team.h
@@ -0,0 +1,83 @@
+/* -*- C -*-
+ *
+ * Copyright (c) 2019 Intel Corporation. All rights reserved.
+ * This software is available to you under the BSD license.
+ *
+ * This file is part of the Sandia OpenSHMEM software package. For license
+ * information, see the LICENSE file in the top level directory of the
+ * distribution.
+ *
+ */
+
+#ifndef SHMEM_TEAM_H
+#define SHMEM_TEAM_H
+
+#include "shmemx.h"
+#include "transport.h"
+#include "uthash.h"
+
+/* Number of psyncs each team owns for collectives other than barrier/sync */
+#define N_PSYNCS_PER_TEAM 2
+
+/* Internal representation of a team: a strided set of world PEs together
+ * with the psync slot and the contexts that belong to it. */
+struct shmem_internal_team_t {
+    int my_pe;                               /* number of the calling PE as recorded when the team was set up */
+    int start, stride, size;                 /* member i is world PE start + i * stride, 0 <= i < size */
+    int psync_idx;                           /* slot in the team pool and the psync pools; -1 while unassigned */
+    int psync_avail[N_PSYNCS_PER_TEAM];      /* 1 when the corresponding collective psync is free */
+    shmemx_team_config_t config;             /* configuration supplied at team creation */
+    long config_mask;                        /* mask supplied together with the configuration */
+    size_t contexts_len;                     /* number of entries in contexts */
+    struct shmem_transport_ctx_t **contexts; /* contexts created on this team; NULL entries are free slots */
+};
+typedef struct shmem_internal_team_t shmem_internal_team_t;
+
+extern shmem_internal_team_t shmem_internal_team_world;
+extern shmem_internal_team_t shmem_internal_team_shared;
+
+/* Kind of collective a psync is requested for; SYNC selects the team's
+ * barrier psync, every other value one of its collective psyncs. */
+enum shmem_internal_team_op_t {
+    SYNC = 0,
+    BCAST,
+    REDUCE,
+    COLLECT,
+    ALLTOALL
+};
+typedef enum shmem_internal_team_op_t shmem_internal_team_op_t;
+
+/* Team Management Routines */
+
+int shmem_internal_team_init(void);
+
+void shmem_internal_team_fini(void);
+
+int shmem_internal_team_my_pe(shmem_internal_team_t *team);
+
+int shmem_internal_team_n_pes(shmem_internal_team_t *team);
+
+void shmem_internal_team_get_config(shmem_internal_team_t *team, shmemx_team_config_t *config);
+
+int shmem_internal_team_translate_pe(shmem_internal_team_t *src_team, int src_pe, shmem_internal_team_t *dest_team);
+
+int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE_start, int PE_stride,
+                                      int PE_size, const shmemx_team_config_t *config, long config_mask,
+                                      shmem_internal_team_t **new_team);
+
+int shmem_internal_team_split_2d(shmem_internal_team_t *parent_team, int xrange,
+                                 const shmemx_team_config_t *xaxis_config, long xaxis_mask, shmem_internal_team_t **xaxis_team,
+                                 const shmemx_team_config_t *yaxis_config, long yaxis_mask, shmem_internal_team_t **yaxis_team);
+
+int shmem_internal_team_destroy(shmem_internal_team_t *team);
+
+int shmem_internal_team_create_ctx(shmem_internal_team_t *team, long options, shmem_ctx_t *ctx);
+
+int shmem_internal_ctx_get_team(shmem_ctx_t ctx, shmem_internal_team_t **team);
+
+long * 
shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_internal_team_op_t op); + +void shmem_internal_team_release_psyncs(shmem_internal_team_t *team, shmem_internal_team_op_t op); + +static inline +int shmem_internal_team_pe(shmem_internal_team_t *team, int pe) +{ + return team->start + team->stride * pe; +} + +#endif diff --git a/src/symmetric_heap_c.c b/src/symmetric_heap_c.c index e6e5b745b..6961b21da 100644 --- a/src/symmetric_heap_c.c +++ b/src/symmetric_heap_c.c @@ -317,13 +317,7 @@ shmem_free(void *ptr) shmem_internal_barrier_all(); - /* It's fine to call dlfree with NULL, but better to avoid unnecessarily - * taking the mutex in the threaded case. */ - if (ptr != NULL) { - SHMEM_MUTEX_LOCK(shmem_internal_mutex_alloc); - dlfree(ptr); - SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - } + shmem_internal_free(ptr); } diff --git a/src/teams_c.c4 b/src/teams_c.c4 new file mode 100644 index 000000000..764ffb21e --- /dev/null +++ b/src/teams_c.c4 @@ -0,0 +1,131 @@ +/* -*- C -*- + * + * Copyright (c) 2019 Intel Corporation. All rights reserved. + * This software is available to you under the BSD license. + * + * This file is part of the Sandia OpenSHMEM software package. For license + * information, see the LICENSE file in the top level directory of the + * distribution. 
+ *
+ */
+
+#include "config.h"
+
+#define SHMEM_INTERNAL_INCLUDE
+#include "shmem.h"
+#include "shmemx.h"
+
+#include "shmem_team.h"
+
+/* Team Management Routines */
+
+/* Returns the calling PE's number recorded in 'team', or -1 when 'team' is
+ * SHMEMX_TEAM_INVALID. */
+int SHMEM_FUNCTION_ATTRIBUTES
+shmemx_team_my_pe(shmemx_team_t team)
+{
+    SHMEM_ERR_CHECK_INITIALIZED();
+
+    if (team == SHMEMX_TEAM_INVALID)
+        return -1;
+
+    shmem_internal_team_t *teamp = (shmem_internal_team_t *)team;
+    return teamp->my_pe;
+}
+
+/* Returns the number of PEs in 'team', or -1 when 'team' is
+ * SHMEMX_TEAM_INVALID. */
+int SHMEM_FUNCTION_ATTRIBUTES
+shmemx_team_n_pes(shmemx_team_t team)
+{
+    SHMEM_ERR_CHECK_INITIALIZED();
+
+    if (team == SHMEMX_TEAM_INVALID)
+        return -1;
+
+    shmem_internal_team_t *teamp = (shmem_internal_team_t *)team;
+    return teamp->size;
+}
+
+/* Copies the configuration stored in 'team' into '*config'; '*config' is
+ * left untouched when 'team' is SHMEMX_TEAM_INVALID. */
+void SHMEM_FUNCTION_ATTRIBUTES
+shmemx_team_get_config(shmemx_team_t team, shmemx_team_config_t *config)
+{
+    SHMEM_ERR_CHECK_INITIALIZED();
+
+    if (team != SHMEMX_TEAM_INVALID)
+        *config = ((shmem_internal_team_t *)team)->config;
+}
+
+/* Thin wrapper: forwards to shmem_internal_team_translate_pe. */
+int SHMEM_FUNCTION_ATTRIBUTES
+shmemx_team_translate_pe(shmemx_team_t src_team, int src_pe, shmemx_team_t dest_team)
+{
+    SHMEM_ERR_CHECK_INITIALIZED();
+
+    shmem_internal_team_t *src = (shmem_internal_team_t *)src_team;
+    shmem_internal_team_t *dest = (shmem_internal_team_t *)dest_team;
+
+    return shmem_internal_team_translate_pe(src, src_pe, dest);
+}
+
+/* Thin wrapper: forwards to shmem_internal_team_split_strided. */
+int SHMEM_FUNCTION_ATTRIBUTES
+shmemx_team_split_strided(shmemx_team_t parent_team, int PE_start,
+                          int PE_stride, int PE_size, const shmemx_team_config_t
+                          *config, long config_mask, shmemx_team_t *new_team)
+{
+    SHMEM_ERR_CHECK_INITIALIZED();
+
+    shmem_internal_team_t *parent = (shmem_internal_team_t *)parent_team;
+    shmem_internal_team_t **child = (shmem_internal_team_t **)new_team;
+
+    return shmem_internal_team_split_strided(parent, PE_start, PE_stride, PE_size,
+                                             config, config_mask, child);
+}
+
+/* Thin wrapper: forwards to shmem_internal_team_split_2d. */
+int SHMEM_FUNCTION_ATTRIBUTES
+shmemx_team_split_2d(shmemx_team_t parent_team, int xrange,
+                     const shmemx_team_config_t *xaxis_config, long xaxis_mask,
+                     shmemx_team_t *xaxis_team, const shmemx_team_config_t *yaxis_config,
+                     long yaxis_mask, shmemx_team_t *yaxis_team)
+{
+    SHMEM_ERR_CHECK_INITIALIZED();
+
+    return shmem_internal_team_split_2d((shmem_internal_team_t *)parent_team,
+                                        
xrange, xaxis_config, xaxis_mask, + (shmem_internal_team_t **)xaxis_team, + yaxis_config, yaxis_mask, + (shmem_internal_team_t **)yaxis_team); +} + +int SHMEM_FUNCTION_ATTRIBUTES +shmemx_team_destroy(shmemx_team_t team) +{ + SHMEM_ERR_CHECK_INITIALIZED(); + + if ((shmem_internal_team_t *)team == &shmem_internal_team_world || + (shmem_internal_team_t *)team == &shmem_internal_team_shared) + RAISE_ERROR_STR("Cannot destroy a pre-defined team"); + + return shmem_internal_team_destroy((shmem_internal_team_t *)team); +} + +int SHMEM_FUNCTION_ATTRIBUTES +shmemx_team_create_ctx(shmemx_team_t team, long options, shmem_ctx_t *ctx) +{ + SHMEM_ERR_CHECK_INITIALIZED(); + + if (team == SHMEMX_TEAM_INVALID) + return -1; + + int ret = shmem_transport_ctx_create((shmem_internal_team_t *) team, + options, (shmem_transport_ctx_t **) ctx); + return ret; +} + +int SHMEM_FUNCTION_ATTRIBUTES +shmemx_ctx_get_team(shmem_ctx_t ctx, shmemx_team_t *team) +{ + SHMEM_ERR_CHECK_INITIALIZED(); + + if (ctx == SHMEMX_CTX_INVALID) { + *team = SHMEMX_TEAM_INVALID; + return -1; + } + + shmem_transport_ctx_t *ctxp = (shmem_transport_ctx_t *)ctx; + *team = (shmemx_team_t) ctxp->team; + return 0; +} diff --git a/src/transport_none.h b/src/transport_none.h index 2d8aaa47b..07ab34930 100644 --- a/src/transport_none.h +++ b/src/transport_none.h @@ -39,8 +39,12 @@ #define SHM_INTERNAL_ULONG_LONG DTYPE_UNSIGNED_LONG_LONG #define SHM_INTERNAL_SIZE_T DTYPE_SIZE_T #define SHM_INTERNAL_PTRDIFF_T DTYPE_PTRDIFF_T +#define SHM_INTERNAL_UINT8 -11 +#define SHM_INTERNAL_UINT16 -12 #define SHM_INTERNAL_UINT32 -13 #define SHM_INTERNAL_UINT64 -14 +#define SHM_INTERNAL_UCHAR DTYPE_UNSIGNED_CHAR +#define SHM_INTERNAL_USHORT DTYPE_UNSIGNED_SHORT /* Operations */ #define SHM_INTERNAL_BAND -1 @@ -55,7 +59,8 @@ typedef int shm_internal_datatype_t; typedef int shm_internal_op_t; typedef int shmem_transport_ct_t; -struct shmem_transport_ctx_t{ int dummy; }; +struct shmem_transport_ctx_t{ long options; + struct 
shmem_internal_team_t *team;}; typedef struct shmem_transport_ctx_t shmem_transport_ctx_t; @@ -89,7 +94,7 @@ shmem_transport_probe(void) static inline int -shmem_transport_ctx_create(long options, shmem_transport_ctx_t **ctx) +shmem_transport_ctx_create(struct shmem_internal_team_t *team, long options, shmem_transport_ctx_t **ctx) { *ctx = NULL; return 0; diff --git a/src/transport_ofi.c b/src/transport_ofi.c index 8f99d4593..3047c1c4f 100644 --- a/src/transport_ofi.c +++ b/src/transport_ofi.c @@ -137,8 +137,6 @@ struct shmem_internal_tid shmem_transport_ofi_gettid(void) static struct fabric_info shmem_transport_ofi_info = {0}; -static shmem_transport_ctx_t** shmem_transport_ofi_contexts = NULL; -static size_t shmem_transport_ofi_contexts_len = 0; static size_t shmem_transport_ofi_grow_size = 128; #define SHMEM_TRANSPORT_CTX_DEFAULT_ID -1 @@ -1514,6 +1512,8 @@ int shmem_transport_startup(void) shmem_transport_ofi_stx_pool[i].is_private = 0; } + shmem_transport_ctx_default.team = &shmem_internal_team_world; + ret = shmem_transport_ofi_ctx_init(&shmem_transport_ctx_default, SHMEM_TRANSPORT_CTX_DEFAULT_ID); if (ret != 0) return ret; @@ -1529,32 +1529,34 @@ int shmem_transport_startup(void) return 0; } -int shmem_transport_ctx_create(long options, shmem_transport_ctx_t **ctx) +int shmem_transport_ctx_create(struct shmem_internal_team_t *team, long options, shmem_transport_ctx_t **ctx) { - SHMEM_MUTEX_LOCK(shmem_transport_ofi_lock); - int ret; size_t id; + if (team == NULL) + RAISE_ERROR_STR("Context creation occured on a NULL team"); + + SHMEM_MUTEX_LOCK(shmem_transport_ofi_lock); + /* Look for an open slot in the contexts array */ - for (id = 0; id < shmem_transport_ofi_contexts_len; id++) - if (shmem_transport_ofi_contexts[id] == NULL) break; + for (id = 0; id < team->contexts_len; id++) + if (team->contexts[id] == NULL) break; /* If none found, grow the array */ - if (id >= shmem_transport_ofi_contexts_len) { - id = shmem_transport_ofi_contexts_len; + if (id >= 
team->contexts_len) { + id = team->contexts_len; - size_t i = shmem_transport_ofi_contexts_len; - shmem_transport_ofi_contexts_len += shmem_transport_ofi_grow_size; - shmem_transport_ofi_contexts = realloc(shmem_transport_ofi_contexts, - shmem_transport_ofi_contexts_len * sizeof(shmem_transport_ctx_t*)); + size_t i = team->contexts_len; + team->contexts_len += shmem_transport_ofi_grow_size; + team->contexts = realloc(team->contexts, team->contexts_len * sizeof(shmem_transport_ctx_t*)); - for ( ; i < shmem_transport_ofi_contexts_len; i++) - shmem_transport_ofi_contexts[i] = NULL; - - if (shmem_transport_ofi_contexts == NULL) { + if (team->contexts == NULL) { RAISE_ERROR_STR("Out of memory when allocating OFI ctx array"); } + + for ( ; i < team->contexts_len; i++) + team->contexts[i] = NULL; } shmem_transport_ctx_t *ctxp = malloc(sizeof(shmem_transport_ctx_t)); @@ -1573,12 +1575,14 @@ int shmem_transport_ctx_create(long options, shmem_transport_ctx_t **ctx) ctxp->stx_idx = -1; ctxp->options = options; + ctxp->team = team; + ret = shmem_transport_ofi_ctx_init(ctxp, id); if (ret) { shmem_transport_ctx_destroy(ctxp); } else { - shmem_transport_ofi_contexts[id] = ctxp; + team->contexts[id] = ctxp; *ctx = ctxp; } @@ -1591,6 +1595,9 @@ void shmem_transport_ctx_destroy(shmem_transport_ctx_t *ctx) { int ret; + if (ctx == NULL) + return; + if(shmem_internal_params.DEBUG) { SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); if (ctx->bounce_buffers) SHMEM_TRANSPORT_OFI_CTX_BB_LOCK(ctx); @@ -1670,7 +1677,7 @@ void shmem_transport_ctx_destroy(shmem_transport_ctx_t *ctx) if (ctx->id >= 0) { SHMEM_MUTEX_LOCK(shmem_transport_ofi_lock); - shmem_transport_ofi_contexts[ctx->id] = NULL; + ctx->team->contexts[ctx->id] = NULL; SHMEM_MUTEX_UNLOCK(shmem_transport_ofi_lock); free(ctx); } @@ -1685,20 +1692,8 @@ int shmem_transport_fini(void) shmem_transport_ofi_stx_kvs_t* e; int stx_len = 0; - /* Free all shareable contexts. 
This performs a quiet on each context, - * ensuring all operations have completed before proceeding with shutdown. */ - - for (size_t i = 0; i < shmem_transport_ofi_contexts_len; ++i) { - if (shmem_transport_ofi_contexts[i]) { - if (shmem_transport_ofi_is_private(shmem_transport_ofi_contexts[i]->options)) - RAISE_WARN_MSG("Shutting down with unfreed private context (%zu)\n", i); - shmem_transport_quiet(shmem_transport_ofi_contexts[i]); - shmem_transport_ctx_destroy(shmem_transport_ofi_contexts[i]); - } - } - - if (shmem_transport_ofi_contexts) free(shmem_transport_ofi_contexts); - + /* The default context is not inserted into the list of contexts on + * SHMEM_TEAM_WORLD, so it must be destroyed here */ shmem_transport_quiet(&shmem_transport_ctx_default); shmem_transport_ctx_destroy(&shmem_transport_ctx_default); diff --git a/src/transport_ofi.h b/src/transport_ofi.h index d449127c0..c70c7b07a 100644 --- a/src/transport_ofi.h +++ b/src/transport_ofi.h @@ -30,6 +30,7 @@ #include "shmem_free_list.h" #include "shmem_internal.h" #include "shmem_atomic.h" +#include "shmem_team.h" #include @@ -201,8 +202,12 @@ typedef enum fi_op shm_internal_op_t; #define SHM_INTERNAL_ULONG_LONG DTYPE_UNSIGNED_LONG_LONG #define SHM_INTERNAL_SIZE_T DTYPE_SIZE_T #define SHM_INTERNAL_PTRDIFF_T DTYPE_PTRDIFF_T +#define SHM_INTERNAL_UINT8 FI_UINT8 +#define SHM_INTERNAL_UINT16 FI_UINT16 #define SHM_INTERNAL_UINT32 FI_UINT32 #define SHM_INTERNAL_UINT64 FI_UINT64 +#define SHM_INTERNAL_UCHAR DTYPE_UNSIGNED_CHAR +#define SHM_INTERNAL_USHORT DTYPE_UNSIGNED_SHORT /* Operations */ #define SHM_INTERNAL_BAND FI_BAND @@ -277,6 +282,7 @@ struct shmem_transport_ctx_t { shmem_free_list_t *bounce_buffers; int stx_idx; struct shmem_internal_tid tid; + struct shmem_internal_team_t *team; }; typedef struct shmem_transport_ctx_t shmem_transport_ctx_t; @@ -340,7 +346,7 @@ void shmem_transport_probe(void) return; } -int shmem_transport_ctx_create(long options, shmem_transport_ctx_t **ctx); +int 
shmem_transport_ctx_create(struct shmem_internal_team_t *team, long options, shmem_transport_ctx_t **ctx); void shmem_transport_ctx_destroy(shmem_transport_ctx_t *ctx); int shmem_transport_init(void); diff --git a/src/transport_portals4.c b/src/transport_portals4.c index 588e3bc73..b525eadc1 100644 --- a/src/transport_portals4.c +++ b/src/transport_portals4.c @@ -120,8 +120,6 @@ shmem_internal_mutex_t shmem_internal_mutex_ptl4_frag; shmem_internal_mutex_t shmem_internal_mutex_ptl4_event_slots; #endif -static shmem_transport_ctx_t** shmem_transport_portals4_contexts = NULL; -static size_t shmem_transport_portals4_contexts_len = 0; static size_t shmem_transport_portals4_grow_size = 128; #define SHMEM_TRANSPORT_CTX_DEFAULT_ID -1 @@ -220,32 +218,34 @@ shmem_transport_ctx_init(shmem_transport_ctx_t *ctx, long options, int id) } int -shmem_transport_ctx_create(long options, shmem_transport_ctx_t **ctx) +shmem_transport_ctx_create(struct shmem_internal_team_t *team, long options, shmem_transport_ctx_t **ctx) { int ret; size_t id; + if (team == NULL) + RAISE_ERROR_STR("Context creation occured on a NULL team"); + SHMEM_MUTEX_LOCK(shmem_internal_mutex_ptl4_ctx); /* Look for an open slot in the contexts array */ - for (id = 0; id < shmem_transport_portals4_contexts_len; id++) - if (shmem_transport_portals4_contexts[id] == NULL) break; + for (id = 0; id < team->contexts_len; id++) + if (team->contexts[id] == NULL) break; /* If none found, grow the array */ - if (id >= shmem_transport_portals4_contexts_len) { - id = shmem_transport_portals4_contexts_len; - - size_t i = shmem_transport_portals4_contexts_len; - shmem_transport_portals4_contexts_len += shmem_transport_portals4_grow_size; - shmem_transport_portals4_contexts = realloc(shmem_transport_portals4_contexts, - shmem_transport_portals4_contexts_len * sizeof(shmem_transport_ctx_t*)); + if (id >= team->contexts_len) { + id = team->contexts_len; - for ( ; i < shmem_transport_portals4_contexts_len; i++) - 
shmem_transport_portals4_contexts[i] = NULL; + size_t i = team->contexts_len; + team->contexts_len += shmem_transport_portals4_grow_size; + team->contexts = realloc(team->contexts, team->contexts_len * sizeof(shmem_transport_ctx_t*)); - if (shmem_transport_portals4_contexts == NULL) { + if (team->contexts == NULL) { RAISE_ERROR_STR("Error: out of memory when allocating ctx array"); } + + for ( ; i < team->contexts_len; i++) + team->contexts[i] = NULL; } *ctx = malloc(sizeof(shmem_transport_ctx_t)); @@ -263,9 +263,11 @@ shmem_transport_ctx_create(long options, shmem_transport_ctx_t **ctx) free(*ctx); *ctx = NULL; } else { - shmem_transport_portals4_contexts[id] = *ctx; + team->contexts[id] = *ctx; } + (*ctx)->team = team; + SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_ptl4_ctx); return ret; @@ -282,7 +284,7 @@ shmem_transport_ctx_destroy(shmem_transport_ctx_t *ctx) if (ctx->id >= 0) { SHMEM_MUTEX_LOCK(shmem_internal_mutex_ptl4_ctx); - shmem_transport_portals4_contexts[ctx->id] = NULL; + ctx->team->contexts[ctx->id] = NULL; SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_ptl4_ctx); free(ctx); } @@ -711,6 +713,8 @@ shmem_transport_startup(void) SHMEMX_CTX_BOUNCE_BUFFER, SHMEM_TRANSPORT_CTX_DEFAULT_ID); + shmem_transport_ctx_default.team = &shmem_internal_team_world; + cleanup: if (NULL != pe_map) free(pe_map); return ret; @@ -720,26 +724,11 @@ shmem_transport_startup(void) int shmem_transport_fini(void) { - size_t i; - /* synchronize the atomic cache, if there is one */ shmem_transport_syncmem(); - /* Free all contexts. This performs a quiet on each context, ensuring all - * operations have completed before proceeding with shutdown. 
*/ - - for (i = 0; i < shmem_transport_portals4_contexts_len; ++i) { - if (shmem_transport_portals4_contexts[i]) { - if (shmem_transport_portals4_contexts[i]->options & SHMEM_CTX_PRIVATE) - RAISE_WARN_MSG("Shutting down with unfreed private context (%zu)\n", i); - shmem_transport_quiet(shmem_transport_portals4_contexts[i]); - shmem_transport_ctx_destroy(shmem_transport_portals4_contexts[i]); - } - } - - if (shmem_transport_portals4_contexts) - free(shmem_transport_portals4_contexts); - + /* The default context is not inserted into the list of contexts on + * SHMEM_TEAM_WORLD, so it must be destroyed here */ shmem_transport_quiet(&shmem_transport_ctx_default); shmem_transport_ctx_destroy(&shmem_transport_ctx_default); diff --git a/src/transport_portals4.h b/src/transport_portals4.h index c9f05032c..fbabd6a57 100644 --- a/src/transport_portals4.h +++ b/src/transport_portals4.h @@ -25,6 +25,7 @@ #include "shmem_free_list.h" #include "shmem_internal.h" #include "shmem_atomic.h" +#include "shmem_team.h" #ifndef MIN #define MIN(a,b) (((a)<(b))?(a):(b)) @@ -53,8 +54,12 @@ typedef ptl_op_t shm_internal_op_t; #define SHM_INTERNAL_ULONG_LONG DTYPE_UNSIGNED_LONG_LONG #define SHM_INTERNAL_SIZE_T DTYPE_SIZE_T #define SHM_INTERNAL_PTRDIFF_T DTYPE_PTRDIFF_T +#define SHM_INTERNAL_UINT8 PTL_UINT8_T +#define SHM_INTERNAL_UINT16 PTL_UINT16_T #define SHM_INTERNAL_UINT32 PTL_UINT32_T #define SHM_INTERNAL_UINT64 PTL_UINT64_T +#define SHM_INTERNAL_UCHAR DTYPE_UNSIGNED_CHAR +#define SHM_INTERNAL_USHORT DTYPE_UNSIGNED_SHORT #define SHM_INTERNAL_BAND PTL_BAND #define SHM_INTERNAL_BOR PTL_BOR @@ -162,11 +167,12 @@ struct shmem_transport_ctx_t { * event arrival. This race can cause early exit from quiet. 
*/ shmem_internal_cntr_t pending_put_cntr; shmem_internal_cntr_t pending_get_cntr; + struct shmem_internal_team_t *team; }; typedef struct shmem_transport_ctx_t shmem_transport_ctx_t; extern shmem_transport_ctx_t shmem_transport_ctx_default; -int shmem_transport_ctx_create(long options, shmem_transport_ctx_t **ctx); +int shmem_transport_ctx_create(struct shmem_internal_team_t *team, long options, shmem_transport_ctx_t **ctx); void shmem_transport_ctx_destroy(shmem_transport_ctx_t *ctx); /* diff --git a/test/shmemx/Makefile.am b/test/shmemx/Makefile.am index d10323cb1..985430eb1 100644 --- a/test/shmemx/Makefile.am +++ b/test/shmemx/Makefile.am @@ -38,7 +38,20 @@ check_PROGRAMS += \ shmemx_test_any \ shmemx_test_some \ c11_test_shmemx_wait_until \ - c11_test_shmemx_test + c11_test_shmemx_test \ + shmemx_team_split_strided \ + shmemx_team_translate \ + shmemx_team_reuse_teams \ + shmemx_team_sync \ + shmemx_team_broadcast \ + shmemx_team_collect \ + shmemx_team_collect_active_set \ + shmemx_team_alltoall \ + shmemx_team_alltoalls \ + shmemx_team_shared \ + shmemx_team_b2b_collectives \ + c11_shmemx_team_collective_types \ + c11_shmemx_team_reduce if HAVE_PTHREADS check_PROGRAMS += \ diff --git a/test/shmemx/c11_shmemx_team_collective_types.c b/test/shmemx/c11_shmemx_team_collective_types.c new file mode 100644 index 000000000..cfddff027 --- /dev/null +++ b/test/shmemx/c11_shmemx_team_collective_types.c @@ -0,0 +1,282 @@ +/* + * This test program is derived from a unit test created by Nick Park. + * The original unit test is a work of the U.S. Government and is not subject + * to copyright protection in the United States. Foreign copyrights may + * apply. + * + * Copyright (c) 2019 Intel Corporation. All rights reserved. 
+ * This software is available to you under the BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#define MAX_NPES 32
+
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+
+enum op { BCAST = 0, COLLECT, FCOLLECT, ALLTOALL, ALLTOALLS };
+/* Run collective OP on TYPE buffers over SHMEMX_TEAM_WORLD; set rc on bad data. */
+#define TEST_SHMEM_COLLECTIVE(OP, TYPE) \
+    do { \
+        static TYPE src[MAX_NPES]; \
+        static TYPE dest[MAX_NPES*MAX_NPES]; \
+        /* every source element holds the contributing PE's id */ \
+        for (int i = 0; i < MAX_NPES; i++) { \
+            src[i] = (TYPE)mype; \
+        } \
+        \
+        switch (OP) { \
+            case BCAST: \
+                shmemx_broadcast(SHMEMX_TEAM_WORLD, dest, src, MAX_NPES, npes-1); \
+                break; \
+            case COLLECT: \
+                shmemx_collect(SHMEMX_TEAM_WORLD, dest, src, MAX_NPES); \
+                break; \
+            case FCOLLECT: \
+                shmemx_fcollect(SHMEMX_TEAM_WORLD, dest, src, MAX_NPES); \
+                break; \
+            case ALLTOALL: \
+                shmemx_alltoall(SHMEMX_TEAM_WORLD, dest, src, 1); \
+                break; \
+            case ALLTOALLS: \
+                shmemx_alltoalls(SHMEMX_TEAM_WORLD, dest, src, 1, 1, 1); \
+                break; \
+            default: \
+                printf("Invalid operation (%d)\n", OP); \
+                shmem_global_exit(1); \
+        } \
+        shmem_barrier_all(); \
+        /* check dest against the value each collective should deliver */ \
+        switch (OP) { \
+            case BCAST: \
+                if (mype != npes-1) { \
+                    for (int i = 0; i < MAX_NPES; i++) { \
+                        if (dest[i] != (TYPE)(npes-1)) { \
+                            printf("PE %i received incorrect value with " \
+                                   "TEST_SHMEM_COLLECTIVE(%s, %s)\n", mype, \
+                                   #OP, #TYPE); \
+                            rc = EXIT_FAILURE; \
+                        } \
+                    } \
+                } \
+                break; \
+            case COLLECT: \
+                for (int i = 0; i < MAX_NPES*npes; i++) { \
+                    if (dest[i] != (TYPE)(i / MAX_NPES)) { \
+                        printf("PE %i received incorrect value with " \
+                               "TEST_SHMEM_COLLECTIVE(%s, %s)\n", mype, \
+                               #OP, #TYPE); \
+                        rc = EXIT_FAILURE; \
+                    } \
+                } \
+                break; \
+            case FCOLLECT: \
+                for (int i = 0; i < MAX_NPES*npes; i++) { \
+                    if (dest[i] != (TYPE)(i / MAX_NPES)) { \
+                        printf("PE %i received incorrect value with " \
+                               "TEST_SHMEM_COLLECTIVE(%s, %s)\n", mype, \
+                               #OP, #TYPE); \
+                        rc = EXIT_FAILURE; \
+                    } \
+                } \
+                break; \
+            case ALLTOALL: \
+                for (int i = 0; i < npes; i++) { \
+                    if (dest[i] != (TYPE)i) { \
+                        printf("PE %i received incorrect value with " \
+                               "TEST_SHMEM_COLLECTIVE(%s, %s)\n", mype, \
+                               #OP, #TYPE); \
+                        rc = EXIT_FAILURE; \
+                    } \
+                } \
+                break; \
+            case ALLTOALLS: \
+                for (int i = 0; i < npes; i++) { \
+                    if (dest[i] != (TYPE)i) { \
+                        printf("PE %i received incorrect value with " \
+                               "TEST_SHMEM_COLLECTIVE(%s, %s)\n", mype, \
+                               #OP, #TYPE); \
+                        rc = EXIT_FAILURE; \
+                    } \
+                } \
+                break; \
+            default: \
+                printf("Invalid operation (%d)\n", OP); \
+                shmem_global_exit(1); \
+        } \
+    } while (0)
+
+#else
+#define TEST_SHMEM_COLLECTIVE(OP, TYPE)
+
+#endif
+
+int main(void) {
+    shmem_init();
+
+    int rc = EXIT_SUCCESS;
+
+    const int mype = shmem_my_pe();
+    const int npes = shmem_n_pes();
+    /* the static buffers are sized for MAX_NPES PEs: skip (not fail) larger runs */
+    if (npes > MAX_NPES) {
+        if (mype == 0)
+            fprintf(stderr, "ERR - Requires at most %d PEs\n", MAX_NPES);
+        shmem_finalize();
+        return 0;
+    }
+
+    TEST_SHMEM_COLLECTIVE(BCAST, float);
+    TEST_SHMEM_COLLECTIVE(BCAST, double);
+    TEST_SHMEM_COLLECTIVE(BCAST, long double);
+    TEST_SHMEM_COLLECTIVE(BCAST, char);
+    TEST_SHMEM_COLLECTIVE(BCAST, signed char);
+    TEST_SHMEM_COLLECTIVE(BCAST, short);
+    TEST_SHMEM_COLLECTIVE(BCAST, int);
+    TEST_SHMEM_COLLECTIVE(BCAST, long);
+    TEST_SHMEM_COLLECTIVE(BCAST, long long);
+    TEST_SHMEM_COLLECTIVE(BCAST, unsigned char);
+    TEST_SHMEM_COLLECTIVE(BCAST, unsigned short);
+    TEST_SHMEM_COLLECTIVE(BCAST, unsigned int);
+    TEST_SHMEM_COLLECTIVE(BCAST, unsigned long);
+    TEST_SHMEM_COLLECTIVE(BCAST, unsigned long long);
+    TEST_SHMEM_COLLECTIVE(BCAST, int8_t);
+    TEST_SHMEM_COLLECTIVE(BCAST, int16_t);
+    TEST_SHMEM_COLLECTIVE(BCAST, int32_t);
+    TEST_SHMEM_COLLECTIVE(BCAST, int64_t);
+    TEST_SHMEM_COLLECTIVE(BCAST, uint8_t);
+    TEST_SHMEM_COLLECTIVE(BCAST, uint16_t);
+    TEST_SHMEM_COLLECTIVE(BCAST, uint32_t);
+    TEST_SHMEM_COLLECTIVE(BCAST, uint64_t);
+    TEST_SHMEM_COLLECTIVE(BCAST, size_t);
+    TEST_SHMEM_COLLECTIVE(BCAST, ptrdiff_t);
+
+    TEST_SHMEM_COLLECTIVE(COLLECT, float);
+    TEST_SHMEM_COLLECTIVE(COLLECT, double);
+    TEST_SHMEM_COLLECTIVE(COLLECT, long double);
+    TEST_SHMEM_COLLECTIVE(COLLECT, char);
+    TEST_SHMEM_COLLECTIVE(COLLECT,
signed char); + TEST_SHMEM_COLLECTIVE(COLLECT, short); + TEST_SHMEM_COLLECTIVE(COLLECT, int); + TEST_SHMEM_COLLECTIVE(COLLECT, long); + TEST_SHMEM_COLLECTIVE(COLLECT, long long); + TEST_SHMEM_COLLECTIVE(COLLECT, unsigned char); + TEST_SHMEM_COLLECTIVE(COLLECT, unsigned short); + TEST_SHMEM_COLLECTIVE(COLLECT, unsigned int); + TEST_SHMEM_COLLECTIVE(COLLECT, unsigned long); + TEST_SHMEM_COLLECTIVE(COLLECT, unsigned long long); + TEST_SHMEM_COLLECTIVE(COLLECT, int8_t); + TEST_SHMEM_COLLECTIVE(COLLECT, int16_t); + TEST_SHMEM_COLLECTIVE(COLLECT, int32_t); + TEST_SHMEM_COLLECTIVE(COLLECT, int64_t); + TEST_SHMEM_COLLECTIVE(COLLECT, uint8_t); + TEST_SHMEM_COLLECTIVE(COLLECT, uint16_t); + TEST_SHMEM_COLLECTIVE(COLLECT, uint32_t); + TEST_SHMEM_COLLECTIVE(COLLECT, uint64_t); + TEST_SHMEM_COLLECTIVE(COLLECT, size_t); + TEST_SHMEM_COLLECTIVE(COLLECT, ptrdiff_t); + + TEST_SHMEM_COLLECTIVE(FCOLLECT, float); + TEST_SHMEM_COLLECTIVE(FCOLLECT, double); + TEST_SHMEM_COLLECTIVE(FCOLLECT, long double); + TEST_SHMEM_COLLECTIVE(FCOLLECT, char); + TEST_SHMEM_COLLECTIVE(FCOLLECT, signed char); + TEST_SHMEM_COLLECTIVE(FCOLLECT, short); + TEST_SHMEM_COLLECTIVE(FCOLLECT, int); + TEST_SHMEM_COLLECTIVE(FCOLLECT, long); + TEST_SHMEM_COLLECTIVE(FCOLLECT, long long); + TEST_SHMEM_COLLECTIVE(FCOLLECT, unsigned char); + TEST_SHMEM_COLLECTIVE(FCOLLECT, unsigned short); + TEST_SHMEM_COLLECTIVE(FCOLLECT, unsigned int); + TEST_SHMEM_COLLECTIVE(FCOLLECT, unsigned long); + TEST_SHMEM_COLLECTIVE(FCOLLECT, unsigned long long); + TEST_SHMEM_COLLECTIVE(FCOLLECT, int8_t); + TEST_SHMEM_COLLECTIVE(FCOLLECT, int16_t); + TEST_SHMEM_COLLECTIVE(FCOLLECT, int32_t); + TEST_SHMEM_COLLECTIVE(FCOLLECT, int64_t); + TEST_SHMEM_COLLECTIVE(FCOLLECT, uint8_t); + TEST_SHMEM_COLLECTIVE(FCOLLECT, uint16_t); + TEST_SHMEM_COLLECTIVE(FCOLLECT, uint32_t); + TEST_SHMEM_COLLECTIVE(FCOLLECT, uint64_t); + TEST_SHMEM_COLLECTIVE(FCOLLECT, size_t); + TEST_SHMEM_COLLECTIVE(FCOLLECT, ptrdiff_t); + + TEST_SHMEM_COLLECTIVE(ALLTOALL, float); + 
TEST_SHMEM_COLLECTIVE(ALLTOALL, double); + TEST_SHMEM_COLLECTIVE(ALLTOALL, long double); + TEST_SHMEM_COLLECTIVE(ALLTOALL, char); + TEST_SHMEM_COLLECTIVE(ALLTOALL, signed char); + TEST_SHMEM_COLLECTIVE(ALLTOALL, short); + TEST_SHMEM_COLLECTIVE(ALLTOALL, int); + TEST_SHMEM_COLLECTIVE(ALLTOALL, long); + TEST_SHMEM_COLLECTIVE(ALLTOALL, long long); + TEST_SHMEM_COLLECTIVE(ALLTOALL, unsigned char); + TEST_SHMEM_COLLECTIVE(ALLTOALL, unsigned short); + TEST_SHMEM_COLLECTIVE(ALLTOALL, unsigned int); + TEST_SHMEM_COLLECTIVE(ALLTOALL, unsigned long); + TEST_SHMEM_COLLECTIVE(ALLTOALL, unsigned long long); + TEST_SHMEM_COLLECTIVE(ALLTOALL, int8_t); + TEST_SHMEM_COLLECTIVE(ALLTOALL, int16_t); + TEST_SHMEM_COLLECTIVE(ALLTOALL, int32_t); + TEST_SHMEM_COLLECTIVE(ALLTOALL, int64_t); + TEST_SHMEM_COLLECTIVE(ALLTOALL, uint8_t); + TEST_SHMEM_COLLECTIVE(ALLTOALL, uint16_t); + TEST_SHMEM_COLLECTIVE(ALLTOALL, uint32_t); + TEST_SHMEM_COLLECTIVE(ALLTOALL, uint64_t); + TEST_SHMEM_COLLECTIVE(ALLTOALL, size_t); + TEST_SHMEM_COLLECTIVE(ALLTOALL, ptrdiff_t); + + TEST_SHMEM_COLLECTIVE(ALLTOALLS, float); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, double); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, long double); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, char); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, signed char); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, short); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, int); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, long); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, long long); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, unsigned char); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, unsigned short); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, unsigned int); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, unsigned long); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, unsigned long long); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, int8_t); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, int16_t); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, int32_t); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, int64_t); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, uint8_t); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, uint16_t); + 
TEST_SHMEM_COLLECTIVE(ALLTOALLS, uint32_t); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, uint64_t); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, size_t); + TEST_SHMEM_COLLECTIVE(ALLTOALLS, ptrdiff_t); + + shmem_finalize(); + return rc; +} diff --git a/test/shmemx/c11_shmemx_team_reduce.c b/test/shmemx/c11_shmemx_team_reduce.c new file mode 100644 index 000000000..3ad721b4d --- /dev/null +++ b/test/shmemx/c11_shmemx_team_reduce.c @@ -0,0 +1,259 @@ +/* + * This test program is derived from a unit test created by Nick Park. + * The original unit test is a work of the U.S. Government and is not subject + * to copyright protection in the United States. Foreign copyrights may + * apply. + * + * Copyright (c) 2019 Intel Corporation. All rights reserved. + * This software is available to you under the BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define MAX_NPES 32
+
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+
+enum op { and = 0, or, xor, max, min, sum, prod };
+
+const double FLOATING_POINT_TOLERANCE = 1e-6;
+/* Call shmemx_<OP>_reduce on npes elements over SHMEMX_TEAM_WORLD; result code goes to ret. */
+#define REDUCTION(OP) \
+    do { ret = shmemx_##OP##_reduce(SHMEMX_TEAM_WORLD, dest, src, npes); } while (0)
+/* True for the real and complex floating-point types listed, false for any other type. */
+#define is_floating_point(X) _Generic((X), \
+              float: true, \
+             double: true, \
+        long double: true, \
+     float _Complex: true, \
+    double _Complex: true, \
+            default: false \
+)
+/* Fill src with ones so that every reduction has a closed-form expected result. */
+#define INIT_SRC_BUFFER(TYPE) \
+    do { \
+        for (int i = 0; i < MAX_NPES; i++) { \
+            src[i] = (TYPE)1ULL; \
+        } \
+    } while (0)
+/* Exact comparison of the first npes elements of dest against CORRECT_VAL. */
+#define CHECK_DEST_BUFFER(OP, TYPE, CORRECT_VAL) \
+    do { \
+        for (int i = 0; i < npes; i++) { \
+            if (dest[i] != (TYPE)(CORRECT_VAL)) { \
+                printf("PE %i received incorrect value with " \
+                       "TEST_SHMEM_REDUCE(%s, %s)\n", mype, #OP, #TYPE); \
+                rc = EXIT_FAILURE; \
+            } \
+        } \
+    } while (0)
+/* Compare real and imaginary parts separately, each within TOLERANCE of CORRECT_VAL. */
+#define CHECK_DEST_BUFFER_FP(OP, TYPE, CORRECT_VAL, TOLERANCE) \
+    do { \
+        for (int i = 0; i < npes; i++) { \
+            if (fabsl(creal(dest[i]) - creal((TYPE)(CORRECT_VAL))) > (TOLERANCE)) { \
+                printf("PE %i received incorrect real value with " \
+                       "TEST_SHMEM_REDUCE(%s, %s)\n", mype, #OP, #TYPE); \
+                rc = EXIT_FAILURE; \
+            } \
+            if (fabsl(cimag(dest[i]) - cimag((TYPE)(CORRECT_VAL))) > (TOLERANCE)) { \
+                printf("PE %i received incorrect imaginary value with " \
+                       "TEST_SHMEM_REDUCE(%s, %s)\n", mype, #OP, #TYPE); \
+                rc = EXIT_FAILURE; \
+            } \
+        } \
+    } while (0)
+/* Reduce TYPE ones with OP across all PEs and validate dest; sets rc on any failure. */
+#define TEST_SHMEM_REDUCE(OP, TYPE) \
+    do { \
+        static TYPE src[MAX_NPES]; \
+        static TYPE dest[MAX_NPES]; \
+        int ret; \
+        const bool floating_point_val = is_floating_point((TYPE)0); \
+        \
+        INIT_SRC_BUFFER(TYPE); \
+        \
+        REDUCTION(OP); \
+        \
+        if (ret != 0) { \
+            printf("Reduction returned non-zero value (%i) on PE (%i) with " \
+                   "TEST_SHMEM_REDUCE(%s, %s)\n", ret, mype, #OP, #TYPE); \
+            rc = EXIT_FAILURE; \
+        } \
+        \
+        shmem_barrier_all(); \
+        /* every PE contributes 1, so the expected value depends only on OP and npes */ \
+        switch (OP) { \
+            case and: \
+                CHECK_DEST_BUFFER(OP, TYPE, 1ULL); \
+                break; \
+            case or: \
+                CHECK_DEST_BUFFER(OP, TYPE, 1ULL); \
+                break; \
+            case xor: \
+                CHECK_DEST_BUFFER(OP, TYPE, (TYPE)(npes % 2 ? 1ULL : 0ULL)); \
+                break; \
+            case max: \
+                if (floating_point_val) \
+                    CHECK_DEST_BUFFER_FP(OP, TYPE, 1ULL, FLOATING_POINT_TOLERANCE); \
+                else \
+                    CHECK_DEST_BUFFER(OP, TYPE, 1ULL); \
+                break; \
+            case min: \
+                if (floating_point_val) \
+                    CHECK_DEST_BUFFER_FP(OP, TYPE, 1ULL, FLOATING_POINT_TOLERANCE); \
+                else \
+                    CHECK_DEST_BUFFER(OP, TYPE, 1ULL); \
+                break; \
+            case sum: \
+                if (floating_point_val) \
+                    CHECK_DEST_BUFFER_FP(OP, TYPE, npes, FLOATING_POINT_TOLERANCE); \
+                else \
+                    CHECK_DEST_BUFFER(OP, TYPE, npes); \
+                break; \
+            case prod: \
+                if (floating_point_val) \
+                    CHECK_DEST_BUFFER_FP(OP, TYPE, 1ULL, FLOATING_POINT_TOLERANCE); \
+                else \
+                    CHECK_DEST_BUFFER(OP, TYPE, 1ULL); \
+                break; \
+            default: \
+                printf("Invalid operation (%d)\n", OP); \
+                shmem_global_exit(1); \
+        } \
+    } while (0)
+
+#else
+#define TEST_SHMEM_REDUCE(OP, TYPE)
+#endif
+
+
+int main(void) {
+
+    shmem_init();
+
+    int rc = EXIT_SUCCESS;
+
+    const int mype = shmem_my_pe();
+    const int npes = shmem_n_pes();
+    /* src/dest hold MAX_NPES elements and npes elements are reduced per call */
+    if (npes > MAX_NPES) {
+        if (mype == 0)
+            fprintf(stderr, "ERR - Requires at most %d PEs\n", MAX_NPES);
+        shmem_global_exit(1);
+    }
+
+    TEST_SHMEM_REDUCE(and, unsigned char);
+    TEST_SHMEM_REDUCE(and, short);
+    TEST_SHMEM_REDUCE(and, unsigned short);
+    TEST_SHMEM_REDUCE(and, int);
+    TEST_SHMEM_REDUCE(and, unsigned int);
+    TEST_SHMEM_REDUCE(and, long);
+    TEST_SHMEM_REDUCE(and, unsigned long);
+    TEST_SHMEM_REDUCE(and, long long);
+    TEST_SHMEM_REDUCE(and, unsigned long long);
+
+    TEST_SHMEM_REDUCE(or, unsigned char);
+    TEST_SHMEM_REDUCE(or, short);
+    TEST_SHMEM_REDUCE(or, unsigned short);
+    TEST_SHMEM_REDUCE(or, int);
+    TEST_SHMEM_REDUCE(or, unsigned int);
+    TEST_SHMEM_REDUCE(or, long);
+    TEST_SHMEM_REDUCE(or, unsigned long);
+    TEST_SHMEM_REDUCE(or, long
long); + TEST_SHMEM_REDUCE(or, unsigned long long); + + TEST_SHMEM_REDUCE(xor, unsigned char); + TEST_SHMEM_REDUCE(xor, short); + TEST_SHMEM_REDUCE(xor, unsigned short); + TEST_SHMEM_REDUCE(xor, int); + TEST_SHMEM_REDUCE(xor, unsigned int); + TEST_SHMEM_REDUCE(xor, long); + TEST_SHMEM_REDUCE(xor, unsigned long); + TEST_SHMEM_REDUCE(xor, long long); + TEST_SHMEM_REDUCE(xor, unsigned long long); + + TEST_SHMEM_REDUCE(max, short); + TEST_SHMEM_REDUCE(max, unsigned short); + TEST_SHMEM_REDUCE(max, int); + TEST_SHMEM_REDUCE(max, unsigned int); + TEST_SHMEM_REDUCE(max, long); + TEST_SHMEM_REDUCE(max, unsigned long); + TEST_SHMEM_REDUCE(max, long long); + TEST_SHMEM_REDUCE(max, unsigned long long); + TEST_SHMEM_REDUCE(max, float); + TEST_SHMEM_REDUCE(max, double); + TEST_SHMEM_REDUCE(max, long double); + + TEST_SHMEM_REDUCE(min, short); + TEST_SHMEM_REDUCE(min, unsigned short); + TEST_SHMEM_REDUCE(min, int); + TEST_SHMEM_REDUCE(min, unsigned int); + TEST_SHMEM_REDUCE(min, long); + TEST_SHMEM_REDUCE(min, unsigned long); + TEST_SHMEM_REDUCE(min, long long); + TEST_SHMEM_REDUCE(min, unsigned long long); + TEST_SHMEM_REDUCE(min, float); + TEST_SHMEM_REDUCE(min, double); + TEST_SHMEM_REDUCE(min, long double); + + TEST_SHMEM_REDUCE(sum, short); + TEST_SHMEM_REDUCE(sum, unsigned short); + TEST_SHMEM_REDUCE(sum, int); + TEST_SHMEM_REDUCE(sum, unsigned int); + TEST_SHMEM_REDUCE(sum, long); + TEST_SHMEM_REDUCE(sum, unsigned long); + TEST_SHMEM_REDUCE(sum, long long); + TEST_SHMEM_REDUCE(sum, unsigned long long); + TEST_SHMEM_REDUCE(sum, float); + TEST_SHMEM_REDUCE(sum, double); + TEST_SHMEM_REDUCE(sum, long double); + TEST_SHMEM_REDUCE(sum, double _Complex); + TEST_SHMEM_REDUCE(sum, float _Complex); + + TEST_SHMEM_REDUCE(prod, short); + TEST_SHMEM_REDUCE(prod, unsigned short); + TEST_SHMEM_REDUCE(prod, int); + TEST_SHMEM_REDUCE(prod, unsigned int); + TEST_SHMEM_REDUCE(prod, long); + TEST_SHMEM_REDUCE(prod, unsigned long); + TEST_SHMEM_REDUCE(prod, long long); + 
TEST_SHMEM_REDUCE(prod, unsigned long long);
+    TEST_SHMEM_REDUCE(prod, float);
+    TEST_SHMEM_REDUCE(prod, double);
+    TEST_SHMEM_REDUCE(prod, long double);
+    TEST_SHMEM_REDUCE(prod, double _Complex);
+    TEST_SHMEM_REDUCE(prod, float _Complex);
+
+    shmem_finalize();
+    return rc;
+}
diff --git a/test/shmemx/shmemx_team_alltoall.c b/test/shmemx/shmemx_team_alltoall.c
new file mode 100644
index 000000000..516f5ccb5
--- /dev/null
+++ b/test/shmemx/shmemx_team_alltoall.c
@@ -0,0 +1,54 @@
+/*
+ * This test program is derived from an example program in the
+ * OpenSHMEM specification.
+ */
+
+#include
+#include
+#include
+#include
+
+/* All-to-all exchange of `count` int64 values per PE pair on
+ * SHMEMX_TEAM_WORLD.  Returns nonzero when any received element is wrong,
+ * so a data error fails the test instead of only being printed. */
+int main(void)
+{
+    shmem_init();
+    int me = shmem_my_pe();
+    int npes = shmem_n_pes();
+    int errors = 0;
+
+    const int count = 2;
+    int64_t* dest = (int64_t*) shmem_malloc(count * npes * sizeof(int64_t));
+    int64_t* source = (int64_t*) shmem_malloc(count * npes * sizeof(int64_t));
+
+    /* assign source values */
+    for (int pe = 0; pe < npes; pe++) {
+        for (int i = 0; i < count; i++) {
+            source[(pe * count) + i] = me + pe;
+            dest[(pe * count) + i] = 9999;
+        }
+    }
+    /* wait for all PEs to initialize source/dest */
+    shmemx_team_sync(SHMEMX_TEAM_WORLD);
+
+    /* alltoall on all PEs */
+    shmemx_int64_alltoall(SHMEMX_TEAM_WORLD, dest, source, count);
+
+    /* verify results: block `pe` of dest was sent by PE `pe`, which stored
+     * (pe + me) in the block it addressed to this PE */
+    for (int pe = 0; pe < npes; pe++) {
+        for (int i = 0; i < count; i++) {
+            if (dest[(pe * count) + i] != pe + me) {
+                printf("[%d] ERROR: dest[%d]=%" PRId64 ", should be %d\n",
+                       me, (pe * count) + i, dest[(pe * count) + i], pe + me);
+                errors++;
+            }
+        }
+    }
+
+    shmem_free(dest);
+    shmem_free(source);
+    shmem_finalize();
+    return errors != 0;
+}
diff --git a/test/shmemx/shmemx_team_alltoalls.c b/test/shmemx/shmemx_team_alltoalls.c
new file mode 100644
index 000000000..64b591fd1
--- /dev/null
+++ b/test/shmemx/shmemx_team_alltoalls.c
@@ -0,0 +1,51 @@
+/*
+ * This test program is derived from an example program in the
+ * OpenSHMEM specification.
+ */
+
+#include
+#include
+#include
+#include
+/* Strided all-to-all on SHMEMX_TEAM_WORLD; returns nonzero if any received element is wrong. */
+int main(void)
+{
+    shmem_init();
+    int me = shmem_my_pe();
+    int npes = shmem_n_pes();
+    int errors = 0;
+    const int count = 2;
+    const ptrdiff_t dst = 2;
+    const ptrdiff_t sst = 3;
+    int64_t* dest = (int64_t*) shmem_malloc(count * dst * npes * sizeof(int64_t));
+    int64_t* source = (int64_t*) shmem_malloc(count * sst * npes * sizeof(int64_t));
+
+    /* assign source values */
+    for (int pe = 0; pe < npes; pe++) {
+        for (int i = 0; i < count; i++) {
+            source[sst * ((pe * count) + i)] = me + pe;
+            dest[dst * ((pe * count) + i)] = 9999;
+        }
+    }
+    /* wait for all PEs to initialize source/dest */
+    shmemx_team_sync(SHMEMX_TEAM_WORLD);
+
+    /* alltoalls on all PEs */
+    shmemx_int64_alltoalls(SHMEMX_TEAM_WORLD, dest, source, dst, sst, count);
+    /* verify results; only every dst-th element of dest is written and checked */
+    for (int pe = 0; pe < npes; pe++) {
+        for (int i = 0; i < count; i++) {
+            int j = dst * ((pe * count) + i);
+            if (dest[j] != pe + me) {
+                printf("[%d] ERROR: dest[%d]=%" PRId64 ", should be %d\n",
+                       me, j, dest[j], pe + me);
+                errors++;
+            }
+        }
+    }
+
+    shmem_free(dest);
+    shmem_free(source);
+    shmem_finalize();
+    return errors != 0;
+}
diff --git a/test/shmemx/shmemx_team_b2b_collectives.c b/test/shmemx/shmemx_team_b2b_collectives.c
new file mode 100644
index 000000000..d3beb45cb
--- /dev/null
+++ b/test/shmemx/shmemx_team_b2b_collectives.c
@@ -0,0 +1,80 @@
+/* Copyright (c) 2019 Intel Corporation. All rights reserved.
+ * This software is available to you under the BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include
+#include
+#include
+#include
+
+#define NITERS 100
+#define NELEMS 10
+/* Call ROUTINE(args) NITERS times back to back, summing its return codes into errors. */
+#define TEST_B2B_COLLECTIVE(NAME, ROUTINE, ...) \
+    do { \
+        if (me == 0) printf("%s... ", NAME); \
+        for (int i = 0; i < NITERS; i++) { \
+            errors += ROUTINE(__VA_ARGS__); \
+        } \
+        error_check(&errors, &total_errors, NAME, me); \
+    } while (0)
+/* Report pass/fail for one routine, fold *errors into *total_errors and reset *errors. */
+static void error_check(int *errors, int *total_errors, const char *routine, int me) {
+    if (*errors == 0) {
+        if (me == 0) printf("passed.\n");
+    } else {
+        printf("%s error on PE %d\n", routine, me);
+        *total_errors += *errors;
+        *errors = 0;
+    }
+}
+
+int main(void)
+{
+    int errors = 0, total_errors = 0;
+    shmem_init();
+    int me = shmem_my_pe();
+    /* collect, fcollect and alltoall(s) move NELEMS elements per PE, so both
+     * buffers need NELEMS * npes elements, not just NELEMS */
+    size_t nelems = (size_t)NELEMS * (size_t)shmem_n_pes();
+    long *dest = shmem_malloc(nelems * sizeof(long));
+    long *src = shmem_malloc(nelems * sizeof(long));
+
+    for (size_t i = 0; i < nelems; i++) {
+        src[i] = me;
+    }
+
+    TEST_B2B_COLLECTIVE("broadcast", shmemx_long_broadcast, SHMEMX_TEAM_WORLD, dest, src, NELEMS, 0);
+    TEST_B2B_COLLECTIVE("reduce", shmemx_long_sum_reduce, SHMEMX_TEAM_WORLD, dest, src, NELEMS);
+    TEST_B2B_COLLECTIVE("collect", shmemx_long_collect, SHMEMX_TEAM_WORLD, dest, src, NELEMS);
+    TEST_B2B_COLLECTIVE("fcollect", shmemx_long_fcollect, SHMEMX_TEAM_WORLD, dest, src, NELEMS);
+    TEST_B2B_COLLECTIVE("alltoall", shmemx_long_alltoall, SHMEMX_TEAM_WORLD, dest, src, NELEMS);
+    TEST_B2B_COLLECTIVE("alltoalls", shmemx_long_alltoalls, SHMEMX_TEAM_WORLD, dest, src, 1, 1, NELEMS);
+
+    shmem_free(dest);
+    shmem_free(src);
+    shmem_finalize();
+    return total_errors;
+}
diff --git a/test/shmemx/shmemx_team_broadcast.c b/test/shmemx/shmemx_team_broadcast.c
new file mode 100644
index 000000000..1a813b641
--- /dev/null
+++ b/test/shmemx/shmemx_team_broadcast.c
@@ -0,0 +1,33 @@
+/*
+ * This test program is derived from an example program in the
+ * OpenSHMEM specification.
+ */
+
+#include
+#include
+#include
+#include
+/* Broadcast four longs from PE 0 over SHMEMX_TEAM_WORLD; every other PE verifies dest. */
+int main(void)
+{
+    static long source[4], dest[4];
+
+    shmem_init();
+    int me = shmem_my_pe();
+
+    if (me == 0)
+        for (int i = 0; i < 4; i++)
+            source[i] = i;
+
+    shmemx_long_broadcast(SHMEMX_TEAM_WORLD, dest, source, 4, 0);
+
+    printf("%d: %ld, %ld, %ld, %ld\n", me, dest[0], dest[1], dest[2], dest[3]);
+
+    if (me != 0)
+        for (int i = 0; i < 4; i++)
+            if (dest[i] != i)
+                shmem_global_exit(1);
+
+    shmem_finalize();
+    return 0;
+}
diff --git a/test/shmemx/shmemx_team_collect.c b/test/shmemx/shmemx_team_collect.c
new file mode 100644
index 000000000..cbd1ebd0b
--- /dev/null
+++ b/test/shmemx/shmemx_team_collect.c
@@ -0,0 +1,47 @@
+/*
+ * This test program is derived from an example program in the
+ * OpenSHMEM specification.
+ */
+
+#include
+#include
+#include
+#include
+/* PE p contributes p+1 ints; after the collect every PE must hold 0..total_nelem-1. */
+int main(void)
+{
+    static long lock = 0;
+    shmem_init();
+    int me = shmem_my_pe();
+    int npes = shmem_n_pes();
+    int my_nelem = me + 1; /* linearly increasing number of elements with PE */
+    int total_nelem = (npes * (npes + 1)) / 2;
+
+    int *source = (int *)shmem_malloc(npes * sizeof(int)); /* symmetric alloc */
+    int *dest = (int *)shmem_malloc(total_nelem * sizeof(int));
+
+    for (int i = 0; i < my_nelem; i++)
+        source[i] = (me * (me + 1)) / 2 + i;
+    for (int i = 0; i < total_nelem; i++)
+        dest[i] = -9999;
+
+    /* Wait for all PEs to initialize source/dest: */
+    shmemx_team_sync(SHMEMX_TEAM_WORLD);
+    shmemx_int_collect(SHMEMX_TEAM_WORLD, dest, source, my_nelem);
+
+    shmem_set_lock(&lock); /* Lock prevents interleaving printfs */
+    printf("%d: %d", me, dest[0]);
+    for (int i = 1; i < total_nelem; i++)
+        printf(", %d", dest[i]);
+    printf("\n");
+    shmem_clear_lock(&lock);
+
+    for (int i = 0; i < total_nelem; i++)
+        if (dest[i] != i)
+            shmem_global_exit(1);
+
+    shmem_free(dest);   /* release the symmetric buffers, as the alltoall tests do */
+    shmem_free(source);
+    shmem_finalize();
+    return 0;
+}
diff --git a/test/shmemx/shmemx_team_collect_active_set.c b/test/shmemx/shmemx_team_collect_active_set.c
new file mode 100644
index 000000000..590d8a18b
--- /dev/null
+++ b/test/shmemx/shmemx_team_collect_active_set.c
@@ -0,0 +1,118 @@
+/*
+ *  Copyright (c) 2019 Intel Corporation. All rights reserved.
+ *  This software is available to you under the BSD license below:
+ *
+ *      Redistribution and use in source and binary forms, with or
+ *      without modification, are permitted provided that the following
+ *      conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#define MAX_NPES 32
+
+int64_t src[MAX_NPES];
+int64_t dst[MAX_NPES*MAX_NPES];
+/* Collect over a team that shrinks by one PE per iteration; returns nonzero on bad data. */
+int main(void)
+{
+    int i, me, npes;
+    int errors = 0;
+
+    shmem_init();
+
+    me = shmem_my_pe();
+    npes = shmem_n_pes();
+
+    if (npes > MAX_NPES) {
+        if (me == 0)
+            printf("Test requires at most %d PEs\n", MAX_NPES);
+        shmem_finalize();
+        return 0;
+    }
+
+    for (i = 0; i < MAX_NPES; i++)
+        src[i] = me;
+
+    for (i = 0; i < MAX_NPES*MAX_NPES; i++)
+        dst[i] = -1;
+
+    if (me == 0)
+        printf("Shrinking team size test\n");
+
+    shmemx_team_t old_team, new_team;
+    shmemx_team_split_strided(SHMEMX_TEAM_WORLD, 0, 1, npes, NULL, 0, &old_team);
+
+    /* A total of npes-1 tests are performed, where the active set in each test
+     * includes PEs i..npes-1, and each PE contributes PE ID elements.  The size
+     * of the team decreases by 1 each iteration.  */
+    for (i = 1; i < npes; i++) {
+        int j, k;
+        int idx = 0;
+
+        if (me == i)
+            printf(" + team size %d\n", shmemx_team_n_pes(old_team)-1);
+
+        shmemx_team_split_strided(old_team, 1, 1, shmemx_team_n_pes(old_team)-1, NULL, 0, &new_team);
+
+        if (new_team != SHMEMX_TEAM_INVALID) {
+            shmemx_int64_collect(new_team, dst, src, me);
+
+            /* Validate destination buffer data */
+            for (j = 0; j < npes - i; j++) {
+                for (k = 0; k < i+j; k++, idx++) {
+                    if (dst[idx] != i+j) {
+                        printf("%d: Expected dst[%d] = %d, got dst[%d] = %"PRId64", iteration %d\n",
+                               me, idx, i+j, idx, dst[idx], i);
+                        errors++;
+                    }
+                }
+            }
+
+            /* Validate unused destination buffer */
+            for ( ; idx < MAX_NPES*MAX_NPES; idx++) {
+                if (dst[idx] != -1) {
+                    printf("%d: Expected dst[%d] = %d, got dst[%d] = %"PRId64", iteration %d\n",
+                           me, idx, -1, idx, dst[idx], i);
+                    errors++;
+                }
+            }
+
+            /* Reset for next iteration */
+            for (j = 0; j < MAX_NPES*MAX_NPES; j++)
+                dst[j] = -1;
+
+            shmemx_sync(new_team);
+        }
+
+        shmemx_team_destroy(old_team); /* release the parent before its handle is overwritten */
+        old_team = new_team;
+    }
+
+    shmemx_team_destroy(old_team);
+    shmem_finalize();
+
+    return errors != 0;
+}
diff --git a/test/shmemx/shmemx_team_reuse_teams.c b/test/shmemx/shmemx_team_reuse_teams.c
new file mode 100644
index 000000000..33f3b6120
--- /dev/null
+++ b/test/shmemx/shmemx_team_reuse_teams.c
@@ -0,0 +1,70 @@
+/*
+ *  Copyright (c) 2019 Intel Corporation. All rights reserved.
+ *  This software is available to you under the BSD license below:
+ *
+ *      Redistribution and use in source and binary forms, with or
+ *      without modification, are permitted provided that the following
+ *      conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include
+#include
+#include
+
+/* Repeatedly split a shrinking team and destroy its parent; returns nonzero if a split fails. */
+int main(void)
+{
+    int i, me, npes;
+    int ret = 0, errors = 0;
+
+    shmem_init();
+
+    me = shmem_my_pe();
+    npes = shmem_n_pes();
+
+    if (me == 0)
+        printf("Reuse teams test\n");
+
+    shmemx_team_t old_team, new_team;
+    ret = shmemx_team_split_strided(SHMEMX_TEAM_WORLD, 0, 1, npes, NULL, 0, &old_team);
+    if (ret) ++errors;
+
+    /* A total of npes-1 iterations are performed, where the active set in iteration i
+     * includes PEs i..npes-1.  The size of the team decreases by 1 each iteration.  */
+    for (i = 1; i < npes; i++) {
+
+        if (me == i) {
+            printf("%3d: creating new team (start, stride, size): %3d, %3d, %3d\n", me,
+                   shmemx_team_translate_pe(old_team, 1, SHMEMX_TEAM_WORLD), 1, shmemx_team_n_pes(old_team)-1);
+        }
+        /* PEs already dropped from the team pass SHMEMX_TEAM_INVALID as parent; only members' results count */
+        ret = shmemx_team_split_strided(old_team, 1, 1, shmemx_team_n_pes(old_team)-1, NULL, 0, &new_team);
+        if (old_team != SHMEMX_TEAM_INVALID && ret) ++errors;
+
+        shmemx_team_destroy(old_team);
+        old_team = new_team;
+    }
+
+    shmemx_team_destroy(old_team);
+    shmem_finalize();
+
+    return errors != 0;
+}
diff --git a/test/shmemx/shmemx_team_shared.c b/test/shmemx/shmemx_team_shared.c
new file mode 100644
index 000000000..6cb99fafa
--- /dev/null
+++ b/test/shmemx/shmemx_team_shared.c
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2019 Intel Corporation. All rights reserved.
+ * This software is available to you under the BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include + + +int main(void) /* Lists which world PEs translate into SHMEMX_TEAM_SHARED and exits with code 1 if that count differs from shmemx_team_n_pes(SHMEMX_TEAM_SHARED); exits with code 2 if the scratch array cannot be allocated. */ +{ + static long lock = 0; + + shmem_init(); + int me = shmem_my_pe(); + int npes = shmem_n_pes(); + + int team_shared_npes = shmemx_team_n_pes(SHMEMX_TEAM_SHARED); + + int *peers = malloc(team_shared_npes * sizeof(int)); + int num_peers = 0; + if (peers == NULL) shmem_global_exit(2); /* allocation failed: abort before taking the lock or writing through peers */ + /* Print the team members on SHMEMX_TEAM_SHARED */ + /* Use a lock for cleaner output */ + shmem_set_lock(&lock); + + printf("[PE: %d] SHMEM_TEAM_SHARED peers: { ", me); + for (int i = 0; i < npes; i++) { + if (shmemx_team_translate_pe(SHMEMX_TEAM_WORLD, i, + SHMEMX_TEAM_SHARED) != -1) { + if (num_peers++ < team_shared_npes) peers[num_peers - 1] = i; /* count every match, but never store past the team_shared_npes slots that were allocated */ + printf("%d ", i); + } + } + + printf("} (num_peers: %d)\n", num_peers); + + fflush(NULL); + + shmem_clear_lock(&lock); + + if (num_peers != team_shared_npes) { /* translated member count disagrees with the reported team size */ + shmem_global_exit(1); + } + + free(peers); + shmem_finalize(); + return 0; +} + diff --git a/test/shmemx/shmemx_team_split_strided.c b/test/shmemx/shmemx_team_split_strided.c new file mode 100644 index 000000000..b4629221d --- /dev/null +++ b/test/shmemx/shmemx_team_split_strided.c @@ -0,0 +1,53 @@ +/* + * This test program is derived from an example program in the + * OpenSHMEM specification.
+ */ + +#include +#include +#include + +int main(int argc, char *argv[]) /* Splits SHMEMX_TEAM_WORLD with arguments 0, 2, (npes+1)/2 and checks each PE's view of the resulting team; failures leave through shmem_global_exit with codes 2 to 4. argc and argv are not used. */ +{ + int rank, npes; + int t_pe, t_size; + int ret; + shmemx_team_t new_team; + shmemx_team_config_t *config; + + shmem_init(); + config = NULL; + rank = shmem_my_pe(); + npes = shmem_n_pes(); + + if (npes < 2) { /* too few PEs: report on stderr, finalize and return 0 */ + fprintf(stderr, "ERR - Requires at least 2 PEs\n"); + shmem_finalize(); + return 0; + } + + ret = shmemx_team_split_strided(SHMEMX_TEAM_WORLD, 0, 2, (npes + 1) / 2, + config, 0, &new_team); + + if (ret != 0) { /* the split call itself reported failure */ + shmem_global_exit(2); + } + + t_size = shmemx_team_n_pes(new_team); /* both queries run before the validity check so their results can be verified for non-members too */ + t_pe = shmemx_team_my_pe(new_team); + + if (new_team != SHMEMX_TEAM_INVALID) { /* valid handle: rank must be even, team index must be rank/2, team size must be (npes+1)/2 */ + if ((rank % 2 != 0) || (rank / 2 != t_pe) || ((npes + 1) / 2 != t_size)) { + shmem_global_exit(3); + } + } else { /* invalid handle: rank must be odd and both queries must have returned -1 */ + if ((rank % 2 == 0) || (t_pe != -1) || (t_size != -1)) { + shmem_global_exit(4); + } + } + + + shmem_finalize(); + return 0; +} + diff --git a/test/shmemx/shmemx_team_sync.c b/test/shmemx/shmemx_team_sync.c new file mode 100644 index 000000000..d42c9af64 --- /dev/null +++ b/test/shmemx/shmemx_team_sync.c @@ -0,0 +1,70 @@ +/* + * This test program is derived from an example program in the + * OpenSHMEM specification.
+ */ + +#include +#include +#include + +#if !(defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) /* before C11, map shmem_p onto the int-specific routine */ +#define shmem_p shmem_int_p +#endif + +int main(void) /* Builds a "twos" team and a "threes" team from SHMEMX_TEAM_WORLD, has every member put a marker value into x on the next member of its team, then checks the final value of x on each PE; mismatches leave through shmem_global_exit with codes 1 to 3. */ +{ + static int x = 10101; /* put target; 10101 is the value expected on PEs that receive no put */ + + shmemx_team_t twos_team = SHMEMX_TEAM_INVALID; + shmemx_team_t threes_team = SHMEMX_TEAM_INVALID; + shmemx_team_config_t *config; + + shmem_init(); + config = NULL; + int me = shmem_my_pe(); + int npes = shmem_n_pes(); + + if (npes > 2) /* otherwise twos_team keeps its SHMEMX_TEAM_INVALID initializer; the return value of the split is discarded */ + shmemx_team_split_strided(SHMEMX_TEAM_WORLD, 2, 2, (npes-1) / 2, config, + 0, &twos_team); + + if (npes > 3) /* otherwise threes_team keeps its SHMEMX_TEAM_INVALID initializer; the return value of the split is discarded */ + shmemx_team_split_strided(SHMEMX_TEAM_WORLD, 3, 3, (npes-1) / 3, config, + 0, &threes_team); + + int my_pe_twos = shmemx_team_my_pe(twos_team); /* the four queries run unconditionally, also on invalid handles; their results are only used inside the validity checks below */ + int my_pe_threes = shmemx_team_my_pe(threes_team); + int npes_twos = shmemx_team_n_pes(twos_team); + int npes_threes = shmemx_team_n_pes(threes_team); + + if (twos_team != SHMEMX_TEAM_INVALID) { + /* put the value 2 to the next team member in a circular fashion */ + shmem_p(&x, 2, shmemx_team_translate_pe(twos_team, (my_pe_twos + 1) % + npes_twos, SHMEMX_TEAM_WORLD)); + shmem_quiet(); + shmemx_sync(twos_team); + } + + shmemx_sync(SHMEMX_TEAM_WORLD); /* NOTE(review): presumably keeps the twos-phase puts ahead of the threes-phase puts on PEs that belong to both teams - confirm */ + + if (threes_team != SHMEMX_TEAM_INVALID) { + /* put the value 3 to the next team member in a circular fashion */ + shmem_p(&x, 3, shmemx_team_translate_pe(threes_team, (my_pe_threes + 1) % + npes_threes, SHMEMX_TEAM_WORLD)); + shmem_quiet(); + shmemx_sync(threes_team); + } + + if (me && me % 3 == 0) { /* multiples of 3 are tested first, so a nonzero PE divisible by both 2 and 3 must hold 3 */ + if (x != 3) shmem_global_exit(3); + } + else if (me && me % 2 == 0) { /* remaining nonzero even PEs must hold 2 */ + if (x != 2) shmem_global_exit(2); + } + else if (x != 10101) { /* every other PE, including PE 0, must still hold the initial value */ + shmem_global_exit(1); + } + + shmem_finalize(); + return 0; +} diff --git a/test/shmemx/shmemx_team_translate.c b/test/shmemx/shmemx_team_translate.c new file mode 100644 index 000000000..9e462ec30 --- /dev/null +++ b/test/shmemx/shmemx_team_translate.c @@ -0,0 +1,47 @@ +/* + * This test program is derived from an example program in the + * OpenSHMEM specification.
+ */ + +#include +#include +#include + +int main(int argc, char *argv[]) /* Splits SHMEMX_TEAM_WORLD with arguments 0, 2, npes/2 and checks that translating this PE's index in the new team back to SHMEMX_TEAM_WORLD gives its own rank; mismatches leave through shmem_global_exit with code 2 or 3. argc and argv are not used. */ +{ + int rank, npes; + int t_pe; + int t_global; + shmemx_team_t new_team; + shmemx_team_config_t *config; + + shmem_init(); + config = NULL; + rank = shmem_my_pe(); + npes = shmem_n_pes(); + + if (npes < 2) { /* too few PEs: report on stderr, finalize and return 0 */ + fprintf(stderr, "ERR - Requires > 1 PEs\n"); + shmem_finalize(); + return 0; + } + + shmemx_team_split_strided(SHMEMX_TEAM_WORLD, 0, 2, npes / 2, config, 0, + &new_team); /* NOTE(review): the return value is discarded and new_team has no initializer, so the checks below rely on the call always assigning new_team - confirm */ + + t_pe = shmemx_team_my_pe(new_team); /* queried before the validity check; the result feeds the translation in both branches */ + t_global = shmemx_team_translate_pe(new_team, t_pe, SHMEMX_TEAM_WORLD); + + if (new_team != SHMEMX_TEAM_INVALID) { /* valid handle: the round trip must give back this PE's world rank */ + if (t_global != rank) { + shmem_global_exit(2); + } + } else { /* invalid handle: the translation must have returned -1 */ + if (t_global != -1) { + shmem_global_exit(3); + } + } + + shmem_finalize(); + return 0; +}