diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml
index d7c21394ca486f..2f25e3b0c47dc0 100644
--- a/.github/workflows/libcxx-build-and-test.yaml
+++ b/.github/workflows/libcxx-build-and-test.yaml
@@ -153,6 +153,7 @@ jobs:
           'generic-no-wide-characters',
           'generic-no-rtti',
           'generic-optimized-speed',
+          'generic-pstl-openmp',
           'generic-static',
       # TODO Find a better place for the benchmark and bootstrapping builds to live. They're either very expensive
       # or don't provide much value since the benchmark run results are too noisy on the bots.
diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 4b927017f8c2ac..c1dec6bf54ab8e 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -301,10 +301,11 @@ option(LIBCXX_HAS_EXTERNAL_THREAD_API
     This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)

 if (LIBCXX_ENABLE_THREADS)
-  set(LIBCXX_PSTL_BACKEND "std_thread" CACHE STRING "Which PSTL backend to use")
+  set(LIBCXX_PSTL_BACKEND_DEFAULT "std_thread")
 else()
-  set(LIBCXX_PSTL_BACKEND "serial" CACHE STRING "Which PSTL backend to use")
+  set(LIBCXX_PSTL_BACKEND_DEFAULT "serial")
 endif()
+set(LIBCXX_PSTL_BACKEND "${LIBCXX_PSTL_BACKEND_DEFAULT}" CACHE STRING
+  "Select the PSTL backend to use. Valid values are serial, std_thread, libdispatch, openmp. Default: ${LIBCXX_PSTL_BACKEND_DEFAULT}")

 # Misc options ----------------------------------------------------------------
 # FIXME: Turn -pedantic back ON. It is currently off because it warns
@@ -571,6 +572,11 @@ function(cxx_add_basic_build_flags target)
     endif()
   endif()
   target_compile_options(${target} PUBLIC "${LIBCXX_ADDITIONAL_COMPILE_FLAGS}")
+
+  # If the PSTL backend depends on OpenMP, we must enable the OpenMP toolchain
+  if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+    target_add_compile_flags_if_supported(${target} PUBLIC -fopenmp)
+  endif()
 endfunction()

 # Exception flags =============================================================
@@ -800,6 +806,8 @@ elseif(LIBCXX_PSTL_BACKEND STREQUAL "std_thread")
   config_define(1 _LIBCPP_PSTL_BACKEND_STD_THREAD)
 elseif(LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
   config_define(1 _LIBCPP_PSTL_BACKEND_LIBDISPATCH)
+elseif (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
+  config_define(1 _LIBCPP_PSTL_BACKEND_OPENMP)
 else()
-  message(FATAL_ERROR "LIBCXX_PSTL_BACKEND is set to ${LIBCXX_PSTL_BACKEND}, which is not a valid backend. Valid backends are: serial, std_thread and libdispatch")
+  message(FATAL_ERROR "LIBCXX_PSTL_BACKEND is set to ${LIBCXX_PSTL_BACKEND}, which is not a valid backend. Valid backends are: serial, std_thread, libdispatch and openmp")
diff --git a/libcxx/cmake/caches/Generic-pstl-openmp.cmake b/libcxx/cmake/caches/Generic-pstl-openmp.cmake
new file mode 100644
index 00000000000000..f3ff4f3b57fd21
--- /dev/null
+++ b/libcxx/cmake/caches/Generic-pstl-openmp.cmake
@@ -0,0 +1 @@
+set(LIBCXX_PSTL_BACKEND openmp CACHE STRING "")
diff --git a/libcxx/docs/BuildingLibcxx.rst b/libcxx/docs/BuildingLibcxx.rst
index e425b9dadfe7d1..5727005e24fbde 100644
--- a/libcxx/docs/BuildingLibcxx.rst
+++ b/libcxx/docs/BuildingLibcxx.rst
@@ -424,6 +424,17 @@ libc++ Feature Options
   provided, this header will be included by the library, replacing the
   default assertion handler.

+.. option:: LIBCXX_PSTL_BACKEND:STRING
+
+  **Default**: ``"std_thread"`` when ``LIBCXX_ENABLE_THREADS=ON``, otherwise ``"serial"``
+
+  **Values**: ``serial``, ``std_thread``, ``libdispatch``, ``openmp``
+
+  Select the desired backend for C++ parallel algorithms. All four options can
+  target multi-core CPU architectures, and ``openmp`` can additionally target
+  GPU architectures. The ``openmp`` backend requires OpenMP version 4.5 or
+  later.
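+
+  For example, a build that selects the OpenMP backend could be configured
+  roughly as follows (a sketch; the generator and the list of enabled
+  runtimes are assumptions and depend on your setup):
+
+  .. code-block:: bash
+
+    $ cmake -G Ninja -S runtimes -B build \
+            -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" \
+            -DLIBCXX_PSTL_BACKEND=openmp
+    $ ninja -C build cxx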
+
 libc++ ABI Feature Options
 --------------------------
diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index df08875c13beae..f1e7b19ead5798 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -364,6 +364,110 @@ Unpoisoning may not be an option, if (for example) you are not maintaining the a
 * You are using allocator, which does not call destructor during deallocation.
 * You are aware that memory allocated with an allocator may be accessed, even when unused by container.

+Offloading C++ Parallel Algorithms to GPUs
+------------------------------------------
+
+Experimental support for GPU offloading has been added to ``libc++``. The
+implementation uses OpenMP target offloading to leverage GPU compute resources.
+The OpenMP PSTL backend can target both NVIDIA and AMD GPUs.
+However, the implementation only supports contiguous iterators, such as
+iterators for ``std::vector`` or ``std::array``.
+To enable the OpenMP offloading backend, it must be selected with
+``LIBCXX_PSTL_BACKEND=openmp`` when installing ``libc++``. Further, when
+compiling a program, the user must specify the command line options
+``-fopenmp -fexperimental-library``. To install LLVM with OpenMP offloading
+enabled, please read
+`the LLVM OpenMP FAQ. <https://openmp.llvm.org/SupportAndFAQ.html>`_
+You may also want to visit
+`the OpenMP offloading command-line argument reference. <https://openmp.llvm.org/CommandLineArgumentReference.html#offload-command-line-arguments>`_
+
+Example
+~~~~~~~
+
+The following is an example of offloading vector addition to a GPU using our
+standard library extension. It implements the classic ``axpy`` operation from
+BLAS, which overwrites the vector ``y`` with ``y = a*x + y``. Thus ``y.begin()``
+is used both as an input and as an output iterator in this example.

+.. code-block:: cpp
+
+   #include <algorithm>
+   #include <execution>
+   #include <vector>
+
+   template <typename T1, typename T2, typename T3>
+   void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
+     std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
+                    y.begin(), [=](T2 xi, T3 yi) { return a * xi + yi; });
+   }
+
+The execution policy ``std::execution::par_unseq`` states that the algorithm's
+execution may be parallelized, vectorized, and migrated across threads. This is
+the only execution mode that is safe to offload to GPUs; for all other
+execution modes the algorithms will execute on the CPU.
+Special attention must be paid to the lambda captures when enabling GPU
+offloading. If the lambda captures by reference, the user must manually map the
+variables to the device. Capturing by reference, the above example could be
+implemented in the following way.
+
+.. code-block:: cpp
+
+   template <typename T1, typename T2, typename T3>
+   void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
+   #pragma omp target data map(to : a)
+     std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
+                    y.begin(), [&](T2 xi, T3 yi) { return a * xi + yi; });
+   }
+
+However, if unified shared memory (USM) is enabled, no additional data mapping
+is necessary when capturing by reference.
+
+Compiling functions for GPUs with OpenMP
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The C++ standard assumes that all accesses to memory happen inside a single
+address space. However, discrete GPU systems have distinct address spaces. A
+single address space can be emulated if your system supports unified shared
+memory. However, many discrete GPU systems do not, and in those cases it is
+important to pass device function pointers to the parallel algorithms.
+Below is an example of how the OpenMP ``declare target`` directive with the
+``indirect`` clause can be used to mark that a function should be compiled for
+both host and device.
+
+.. code-block:: cpp
+
+   #include <cassert>
+   #include <cmath>
+   #include <execution>
+   #include <numeric>
+   #include <vector>
+
+   // This function computes the squared difference of two floating-point
+   // numbers
+   float squared(float a, float b) { return a * a - 2.0f * a * b + b * b; }
+
+   // Declare that the function must be compiled for both host and device
+   #pragma omp declare target indirect to(squared)
+
+   int main() {
+     std::vector<float> a(100, 1.0);
+     std::vector<float> b(100, 1.25);
+
+     // Pass the host function pointer to the parallel algorithm and let OpenMP
+     // translate it to the device function pointer internally
+     float sum =
+         std::transform_reduce(std::execution::par_unseq, a.begin(), a.end(),
+                               b.begin(), 0.0f, std::plus{}, squared);
+
+     // Validate that the result is approximately 6.25
+     assert(std::abs(sum - 6.25f) < 1e-10);
+     return 0;
+   }
+
+Without unified shared memory, the above example would not work if the host
+function pointer ``squared`` were passed to the parallel algorithm without the
+``declare target indirect`` annotation.
+
+Important notes about exception handling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+GPU architectures do not support exception handling. With current versions of
+Clang, a program that contains exceptions in offloaded code regions will
+compile, but the program will terminate if an exception is thrown on the
+device. This does not conform to the C++ standard, and exception handling on
+GPUs will hopefully be better supported in future releases of LLVM.
+
 Platform specific behavior
 ==========================
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 8d0ffd6ed725bd..f641abc38bb816 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -579,6 +579,7 @@ set(files
   __pstl/backend_fwd.h
   __pstl/backends/default.h
   __pstl/backends/libdispatch.h
+  __pstl/backends/openmp.h
   __pstl/backends/serial.h
   __pstl/backends/std_thread.h
   __pstl/cpu_algos/any_of.h
diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index 89a14609ee3f92..00693aeb919cb4 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -35,6 +35,7 @@
 #cmakedefine _LIBCPP_PSTL_BACKEND_SERIAL
 #cmakedefine _LIBCPP_PSTL_BACKEND_STD_THREAD
 #cmakedefine _LIBCPP_PSTL_BACKEND_LIBDISPATCH
+#cmakedefine _LIBCPP_PSTL_BACKEND_OPENMP

 // Hardening.
 #cmakedefine _LIBCPP_HARDENING_MODE_DEFAULT @_LIBCPP_HARDENING_MODE_DEFAULT@
diff --git a/libcxx/include/__pstl/backend.h b/libcxx/include/__pstl/backend.h
index 86d9f28c77fa8c..cb47501c19fc88 100644
--- a/libcxx/include/__pstl/backend.h
+++ b/libcxx/include/__pstl/backend.h
@@ -28,6 +28,10 @@ _LIBCPP_PUSH_MACROS
 #elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
 #  include <__pstl/backends/default.h>
 #  include <__pstl/backends/libdispatch.h>
+#elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+#  include <__pstl/backends/default.h>
+#  include <__pstl/backends/openmp.h>
+#  include <__pstl/backends/std_thread.h>
 #endif

 _LIBCPP_POP_MACROS
diff --git a/libcxx/include/__pstl/backend_fwd.h b/libcxx/include/__pstl/backend_fwd.h
index 32c5da576fb3c0..ed08d45206a8b8 100644
--- a/libcxx/include/__pstl/backend_fwd.h
+++ b/libcxx/include/__pstl/backend_fwd.h
@@ -47,6 +47,7 @@ struct __backend_configuration;

 struct __default_backend_tag;
 struct __libdispatch_backend_tag;
+struct __openmp_backend_tag;
 struct __serial_backend_tag;
 struct __std_thread_backend_tag;

@@ -56,6 +57,9 @@ using __current_configuration = __backend_configuration<__serial_backend_tag, __
 using __current_configuration = __backend_configuration<__std_thread_backend_tag, __default_backend_tag>;
 #elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
 using __current_configuration = __backend_configuration<__libdispatch_backend_tag, __default_backend_tag>;
+#elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
+using __current_configuration =
+    __backend_configuration<__openmp_backend_tag, __std_thread_backend_tag, __default_backend_tag>;
 #else
 // ...New vendors can add parallel backends here...
diff --git a/libcxx/include/__pstl/backends/openmp.h b/libcxx/include/__pstl/backends/openmp.h
new file mode 100644
index 00000000000000..67ae172730d6f2
--- /dev/null
+++ b/libcxx/include/__pstl/backends/openmp.h
@@ -0,0 +1,511 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___PSTL_BACKENDS_OPENMP_H
+#define _LIBCPP___PSTL_BACKENDS_OPENMP_H
+
+// Combined OpenMP CPU and GPU Backend
+// ===================================
+// Contrary to the CPU backends found in ./cpu_backends/, the OpenMP backend
+// can target both CPUs and GPUs. The OpenMP standard defines that when
+// offloading code to an accelerator, the compiler must generate fallback code
+// for execution on the host. The backend therefore works as a CPU backend if
+// no targeted accelerator is available at execution time. The target regions
+// can also be compiled directly for a CPU architecture, for instance by adding
+// the command-line option `-fopenmp-targets=x86_64-pc-linux-gnu` in Clang.
+//
+// When is an Algorithm Offloaded?
+// -------------------------------
+// Only parallel algorithms with the parallel unsequenced execution policy are
+// offloaded to the device. We cannot offload parallel algorithms with a
+// parallel execution policy to GPUs because invocations executing in the same
+// thread "are indeterminately sequenced with respect to each other", which we
+// cannot guarantee on a GPU.
+//
+// The standard draft states that "the semantics [...] allow the implementation
+// to fall back to sequential execution if the system cannot parallelize an
+// algorithm invocation". If it is not deemed safe to offload the parallel
+// algorithm to the device, we first fall back to a parallel unsequenced
+// implementation from ./cpu_backends. The CPU implementation may then fall
+// back to sequential execution. In that way we strive to achieve the best
+// possible performance.
+//
+// Further, "it is the caller's responsibility to ensure that the invocation
+// does not introduce data races or deadlocks."
+//
+// Implicit Assumptions
+// --------------------
+// If the user provides a function pointer as an argument to a parallel
+// algorithm, it is assumed to be a device pointer, as there is currently no
+// way to check whether a host or a device pointer was passed.
+//
+// Mapping Clauses
+// ---------------
+// In some of the parallel algorithms, the user is allowed to provide the same
+// iterator as input and output. The order of the maps matters because OpenMP
+// keeps a reference counter of which variables have been mapped to the device.
+// Thereby, a variable is only copied to the device if its reference counter is
+// incremented from zero, and it is only copied back to the host when the
+// reference counter is decremented to zero again.
+// This allows nesting mapped regions, for instance in recursive functions,
+// without enforcing a lot of unnecessary data movement.
+// Therefore, `pragma omp target data map(to:...)` must be used before
+// `pragma omp target data map(alloc:...)`. Conversely, the maps with map
+// modifier `release` must be placed before the maps with map modifier `from`
+// when transferring the result from the device to the host.
+//
+// Example: Assume `a` and `b` are pointers to the same array.
+// ``` C++
+// #pragma omp target enter data map(alloc:a[0:n])
+// // The reference counter is incremented from 0 to 1. a is not copied to the
+// // device because of the `alloc` map modifier.
+// #pragma omp target enter data map(to:b[0:n])
+// // The reference counter is incremented from 1 to 2. b is not copied because
+// // the reference counter is positive. Therefore b (and a) are uninitialized
+// // on the device.
+// ```
+//
+// Exceptions
+// ----------
+// Currently, GPU architectures do not handle exceptions. OpenMP target regions
+// are allowed to contain try/catch statements and throw expressions in Clang,
+// but if a throw expression is reached, it will terminate the program. That
+// does not conform to the C++ standard.
+//
+// [This document](https://eel.is/c++draft/algorithms.parallel) has been used
+// as reference for these considerations.
+
+#include <__algorithm/unwrap_iter.h>
+#include <__config>
+#include <__functional/operations.h>
+#include <__iterator/iterator_traits.h>
+#include <__iterator/wrap_iter.h>
+#include <__pstl/backend_fwd.h>
+#include <__pstl/dispatch.h>
+#include <__type_traits/desugars_to.h>
+#include <__type_traits/is_arithmetic.h>
+#include <__type_traits/is_trivially_copyable.h>
+#include <__type_traits/remove_cvref.h>
+#include <__utility/empty.h>
+#include <__utility/forward.h>
+#include <__utility/move.h>
+#include <cstddef>
+#include <optional>
+
+#if !defined(_OPENMP)
+#  error "Trying to use the OpenMP PSTL backend, but OpenMP is not enabled. Did you compile with -fopenmp?"
+#elif (defined(_OPENMP) && _OPENMP < 201511)
+#  error \
+      "OpenMP target offloading has been supported since OpenMP version 4.5 (201511). Please use a more recent version of OpenMP."
+#endif
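+
+// As an illustration of the version guard above (a sketch, not part of the
+// backend): `_OPENMP` encodes the year and month of the supported standard as
+// `yyyymm`, so OpenMP 4.5 corresponds to the value 201511. User code can
+// perform the same check:
+// ``` C++
+// #ifdef _OPENMP
+// static_assert(_OPENMP >= 201511, "OpenMP 4.5 (201511) or later is required");
+// #endif
+// ```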
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+namespace __pstl {
+
+// The following functions can be used to map contiguous array sections to and
+// from the device. For now, they are simple overlays of the OpenMP pragmas,
+// but they should be updated when adding support for other iterator types.
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_to([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target enter data map(to : __p[0 : __len])
+}
+
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_from([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target exit data map(from : __p[0 : __len])
+}
+
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_alloc([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target enter data map(alloc : __p[0 : __len])
+}
+
+template <class _Iterator, class _DifferenceType>
+_LIBCPP_HIDE_FROM_ABI void
+__omp_map_release([[maybe_unused]] const _Iterator __p, [[maybe_unused]] const _DifferenceType __len) noexcept {
+  static_assert(__libcpp_is_contiguous_iterator<_Iterator>::value);
+#pragma omp target exit data map(release : __p[0 : __len])
+}
+
+//
+// fill
+//
+template <class _Tp, class _DifferenceType, class _Up>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_fill(_Tp* __out1, _DifferenceType __n, const _Up& __value) noexcept {
+  __pstl::__omp_map_alloc(__out1, __n);
+#pragma omp target teams distribute parallel for
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__out1 + __i) = __value;
+  __pstl::__omp_map_from(__out1, __n);
+  return __out1 + __n;
+}
+
+template <>
+struct __fill<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy, class _ForwardIterator, class _Tp>
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__empty>
+  operator()(_Policy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Tp const& __value) const noexcept {
+    using _ValueType = typename iterator_traits<_ForwardIterator>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_trivially_copyable_v<_ValueType> &&
+                  is_trivially_copyable_v<_Tp>) {
+      __pstl::__omp_fill(std::__unwrap_iter(__first), __last - __first, __value);
+      return __empty{};
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__fill, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(std::forward<_Policy>(__policy), std::move(__first), std::move(__last), __value);
+    }
+  }
+};
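+
+// A minimal usage sketch for the specialization above (an illustration, not
+// part of the backend): std::fill with the parallel unsequenced policy on a
+// contiguous range of a trivially copyable type takes the __omp_fill path;
+// any other iterator or element type falls back to the next backend in the
+// configuration.
+// ``` C++
+// #include <algorithm>
+// #include <execution>
+// #include <vector>
+//
+// int main() {
+//   std::vector<int> v(1 << 20);
+//   // Offloaded when an OpenMP device is available; runs on the host
+//   // otherwise.
+//   std::fill(std::execution::par_unseq, v.begin(), v.end(), 42);
+// }
+// ```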
+
+//
+// find_if
+//
+template <class _Tp, class _DifferenceType, class _Predicate>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_find_if(_Tp* __first, _DifferenceType __n, _Predicate __pred) noexcept {
+  __pstl::__omp_map_to(__first, __n);
+  _DifferenceType __idx = __n;
+#pragma omp target teams distribute parallel for reduction(min : __idx)
+  for (_DifferenceType __i = 0; __i < __n; ++__i) {
+    if (__pred(*(__first + __i))) {
+      __idx = (__i < __idx) ? __i : __idx;
+    }
+  }
+  __pstl::__omp_map_release(__first, __n);
+  return __first + __idx;
+}
+
+template <>
+struct __find_if<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy, class _ForwardIterator, class _Predicate>
+  _LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
+  operator()(_Policy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) const noexcept {
+    using _ValueType = typename iterator_traits<_ForwardIterator>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_trivially_copyable_v<_ValueType>) {
+      return std::__rewrap_iter(__first, __pstl::__omp_find_if(std::__unwrap_iter(__first), __last - __first, __pred));
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__find_if, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(std::forward<_Policy>(__policy), std::move(__first), std::move(__last), std::move(__pred));
+    }
+  }
+};
+
+//
+// for_each
+//
+template <class _Tp, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_for_each(_Tp* __inout1, _DifferenceType __n, _Function __f) noexcept {
+  __pstl::__omp_map_to(__inout1, __n);
+#pragma omp target teams distribute parallel for
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(*(__inout1 + __i));
+  __pstl::__omp_map_from(__inout1, __n);
+  return __inout1 + __n;
+}
+
+template <>
+struct __for_each<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy, class _ForwardIterator, class _Functor>
+  _LIBCPP_HIDE_FROM_ABI optional<__empty>
+  operator()(_Policy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) const noexcept {
+    using _ValueType = typename iterator_traits<_ForwardIterator>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_trivially_copyable_v<_ValueType>) {
+      __pstl::__omp_for_each(std::__unwrap_iter(__first), __last - __first, std::move(__func));
+      return __empty{};
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__for_each, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(std::forward<_Policy>(__policy), std::move(__first), std::move(__last), std::move(__func));
+    }
+  }
+};
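+
+// A usage sketch for the for_each path above (an illustration, not part of
+// the backend). As the documentation notes, variables captured by reference
+// must be mapped to the device manually, so this sketch captures by value.
+// ``` C++
+// #include <algorithm>
+// #include <execution>
+// #include <vector>
+//
+// int main() {
+//   std::vector<float> v(100, 1.0f);
+//   const float scale = 2.0f;
+//   std::for_each(std::execution::par_unseq, v.begin(), v.end(),
+//                 [=](float& x) { x *= scale; });
+// }
+// ```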
+
+//
+// transform
+//
+template <class _Tp, class _DifferenceType, class _Up, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Tp* __omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __out1, _Function __f) noexcept {
+  // The order of the following maps matters, as we wish to move the data. If
+  // they were placed in the reverse order, and __in1 equals __out1, then we
+  // would allocate the buffer on the device without copying the data.
+  __pstl::__omp_map_to(__in1, __n);
+  __pstl::__omp_map_alloc(__out1, __n);
+#pragma omp target teams distribute parallel for
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__out1 + __i) = __f(*(__in1 + __i));
+  // The order of the following two maps matters, since the user could legally
+  // overwrite __in1. The "release" map modifier decreases the reference
+  // counter by one, and "from" only moves the data to the host when the
+  // reference counter is decremented to zero.
+  __pstl::__omp_map_release(__in1, __n);
+  __pstl::__omp_map_from(__out1, __n);
+  return __out1 + __n;
+}
+
+template <>
+struct __transform<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation>
+  _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator>
+  operator()(_Policy&& __policy,
+             _ForwardIterator __first,
+             _ForwardIterator __last,
+             _ForwardOutIterator __outit,
+             _UnaryOperation __op) const noexcept {
+    using _ValueType = typename iterator_traits<_ForwardIterator>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator>::value &&
+                  __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value && is_trivially_copyable_v<_ValueType>) {
+      return std::__rewrap_iter(
+          __outit,
+          __pstl::__omp_transform(
+              std::__unwrap_iter(__first), __last - __first, std::__unwrap_iter(__outit), std::move(__op)));
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__transform, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(
+          std::forward<_Policy>(__policy), std::move(__first), std::move(__last), std::move(__outit), std::move(__op));
+    }
+  }
+};
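+
+// A sketch of the aliasing case the comments above are about (an
+// illustration, not part of the backend): the output range may legally alias
+// the input range, and the to-before-alloc and release-before-from orderings
+// keep the data movement correct in that case.
+// ``` C++
+// #include <algorithm>
+// #include <execution>
+// #include <vector>
+//
+// int main() {
+//   std::vector<float> v(100, 2.0f);
+//   // In-place transform: input and output iterators alias.
+//   std::transform(std::execution::par_unseq, v.begin(), v.end(), v.begin(),
+//                  [](float x) { return 2.0f * x; });
+// }
+// ```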
+
+//
+// transform_binary
+//
+template <class _Tp, class _DifferenceType, class _Up, class _Vp, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Tp*
+__omp_transform(_Tp* __in1, _DifferenceType __n, _Up* __in2, _Vp* __out1, _Function __f) noexcept {
+  // The order of the following maps matters, as we wish to move the data. If
+  // they were placed in the reverse order, and __out1 equals __in1 or __in2,
+  // then we would allocate one of the buffers on the device without copying
+  // the data.
+  __pstl::__omp_map_to(__in1, __n);
+  __pstl::__omp_map_to(__in2, __n);
+  __pstl::__omp_map_alloc(__out1, __n);
+#pragma omp target teams distribute parallel for
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    *(__out1 + __i) = __f(*(__in1 + __i), *(__in2 + __i));
+  // The order of the following three maps matters, since the user could
+  // legally overwrite either of the inputs if __out1 equals __in1 or __in2.
+  // The "release" map modifier decreases the reference counter by one, and
+  // "from" only moves the data from the device when the reference counter is
+  // decremented to zero.
+  __pstl::__omp_map_release(__in1, __n);
+  __pstl::__omp_map_release(__in2, __n);
+  __pstl::__omp_map_from(__out1, __n);
+  return __out1 + __n;
+}
+
+template <>
+struct __transform_binary<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy,
+            class _ForwardIterator1,
+            class _ForwardIterator2,
+            class _ForwardOutIterator,
+            class _BinaryOperation>
+  _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator>
+  operator()(_Policy&& __policy,
+             _ForwardIterator1 __first1,
+             _ForwardIterator1 __last1,
+             _ForwardIterator2 __first2,
+             _ForwardOutIterator __outit,
+             _BinaryOperation __op) const noexcept {
+    using _ValueType1 = typename iterator_traits<_ForwardIterator1>::value_type;
+    using _ValueType2 = typename iterator_traits<_ForwardIterator2>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+                  __libcpp_is_contiguous_iterator<_ForwardIterator2>::value &&
+                  __libcpp_is_contiguous_iterator<_ForwardOutIterator>::value && is_trivially_copyable_v<_ValueType1> &&
+                  is_trivially_copyable_v<_ValueType2>) {
+      return std::__rewrap_iter(
+          __outit,
+          __pstl::__omp_transform(
+              std::__unwrap_iter(__first1),
+              __last1 - __first1,
+              std::__unwrap_iter(__first2),
+              std::__unwrap_iter(__outit),
+              std::move(__op)));
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__transform_binary, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(
+          std::forward<_Policy>(__policy),
+          std::move(__first1),
+          std::move(__last1),
+          std::move(__first2),
+          std::move(__outit),
+          std::move(__op));
+    }
+  }
+};
+
+//
+// transform_reduce
+//
+#define _LIBCPP_PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                           \
+  template <class _Iterator, class _DifferenceType, class _Tp, class _BinaryOperationType, class _UnaryOperation>   \
+  _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                                 \
+      _Iterator __first,                                                                                            \
+      _DifferenceType __n,                                                                                          \
+      _Tp __init,                                                                                                   \
+      std_op<_BinaryOperationType> __reduce,                                                                        \
+      _UnaryOperation __transform) noexcept {                                                                       \
+    __pstl::__omp_map_to(__first, __n);                                                                             \
+_PSTL_PRAGMA(omp target teams distribute parallel for reduction(omp_op:__init))                                     \
+    for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                 \
+      __init = __reduce(__init, __transform(*(__first + __i)));                                                     \
+    __pstl::__omp_map_release(__first, __n);                                                                        \
+    return __init;                                                                                                  \
+  }
+
+#define _LIBCPP_PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)                                                           \
+  template <class _Iterator1,                                                                                       \
+            class _Iterator2,                                                                                       \
+            class _DifferenceType,                                                                                  \
+            class _Tp,                                                                                              \
+            class _BinaryOperationType,                                                                             \
+            class _UnaryOperation>                                                                                  \
+  _LIBCPP_HIDE_FROM_ABI _Tp __omp_transform_reduce(                                                                 \
+      _Iterator1 __first1,                                                                                          \
+      _Iterator2 __first2,                                                                                          \
+      _DifferenceType __n,                                                                                          \
+      _Tp __init,                                                                                                   \
+      std_op<_BinaryOperationType> __reduce,                                                                        \
+      _UnaryOperation __transform) noexcept {                                                                       \
+    __pstl::__omp_map_to(__first1, __n);                                                                            \
+    __pstl::__omp_map_to(__first2, __n);                                                                            \
+_PSTL_PRAGMA(omp target teams distribute parallel for reduction(omp_op:__init))                                     \
+    for (_DifferenceType __i = 0; __i < __n; ++__i)                                                                 \
+      __init = __reduce(__init, __transform(*(__first1 + __i), *(__first2 + __i)));                                 \
+    __pstl::__omp_map_release(__first1, __n);                                                                       \
+    __pstl::__omp_map_release(__first2, __n);                                                                       \
+    return __init;                                                                                                  \
+  }
+
+#define _LIBCPP_PSTL_OMP_SIMD_REDUCTION(omp_op, std_op)                                                             \
+  _LIBCPP_PSTL_OMP_SIMD_1_REDUCTION(omp_op, std_op)                                                                 \
+  _LIBCPP_PSTL_OMP_SIMD_2_REDUCTION(omp_op, std_op)
+
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(+, std::plus)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(-, std::minus)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(*, std::multiplies)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(&&, std::logical_and)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(||, std::logical_or)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(&, std::bit_and)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(|, std::bit_or)
+_LIBCPP_PSTL_OMP_SIMD_REDUCTION(^, std::bit_xor)
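+
+// A usage sketch for the reductions instantiated above (an illustration, not
+// part of the backend): a dot product maps onto the two-iterator overload
+// with std::plus, one of the supported reduction operations.
+// ``` C++
+// #include <execution>
+// #include <functional>
+// #include <numeric>
+// #include <vector>
+//
+// int main() {
+//   std::vector<float> a(100, 1.0f), b(100, 2.0f);
+//   float dot = std::transform_reduce(std::execution::par_unseq, a.begin(),
+//                                     a.end(), b.begin(), 0.0f, std::plus{},
+//                                     std::multiplies{});
+//   return dot == 200.0f ? 0 : 1;
+// }
+// ```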
+
+// Determine whether a reduction is supported by the OpenMP backend
+template <class _Reduction, class _Tp, class _Up>
+struct __is_supported_reduction : std::false_type {};
+
+#define _LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(func)                                                                   \
+  template <class _Tp>                                                                                              \
+  struct __is_supported_reduction<func<_Tp>, _Tp, _Tp> : true_type {};                                              \
+  template <class _Tp, class _Up>                                                                                   \
+  struct __is_supported_reduction<func<>, _Tp, _Up> : true_type {};
+
+// __is_trivial_plus_operation already exists
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::plus)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::minus)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::multiplies)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::logical_and)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::logical_or)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::bit_and)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::bit_or)
+_LIBCPP_PSTL_IS_SUPPORTED_REDUCTION(std::bit_xor)
+
+template <>
+struct __transform_reduce<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy, class _ForwardIterator, class _Tp, class _Reduction, class _Transform>
+  _LIBCPP_HIDE_FROM_ABI optional<_Tp>
+  operator()(_Policy&& __policy,
+             _ForwardIterator __first,
+             _ForwardIterator __last,
+             _Tp __init,
+             _Reduction __reduce,
+             _Transform __transform) const noexcept {
+    using _ValueType = typename iterator_traits<_ForwardIterator>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator>::value && is_arithmetic_v<_Tp> &&
+                  __is_supported_reduction<_Reduction, _Tp, _Tp>::value && is_trivially_copyable_v<_ValueType>) {
+      return __pstl::__omp_transform_reduce(
+          std::__unwrap_iter(__first), __last - __first, __init, std::move(__reduce), std::move(__transform));
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__transform_reduce, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(
+          std::forward<_Policy>(__policy),
+          std::move(__first),
+          std::move(__last),
+          std::move(__init),
+          std::move(__reduce),
+          std::move(__transform));
+    }
+  }
+};
+
+//
+// transform_reduce_binary
+//
+template <>
+struct __transform_reduce_binary<__openmp_backend_tag, execution::parallel_unsequenced_policy> {
+  template <class _Policy,
+            class _ForwardIterator1,
+            class _ForwardIterator2,
+            class _Tp,
+            class _Reduction,
+            class _Transform>
+  _LIBCPP_HIDE_FROM_ABI optional<_Tp> operator()(
+      _Policy&& __policy,
+      _ForwardIterator1 __first1,
+      _ForwardIterator1 __last1,
+      _ForwardIterator2 __first2,
+      _Tp __init,
+      _Reduction __reduce,
+      _Transform __transform) const noexcept {
+    using _ValueType1 = typename iterator_traits<_ForwardIterator1>::value_type;
+    using _ValueType2 = typename iterator_traits<_ForwardIterator2>::value_type;
+    if constexpr (__libcpp_is_contiguous_iterator<_ForwardIterator1>::value &&
+                  __libcpp_is_contiguous_iterator<_ForwardIterator2>::value && is_arithmetic_v<_Tp> &&
+                  __is_supported_reduction<_Reduction, _Tp, _Tp>::value && is_trivially_copyable_v<_ValueType1> &&
+                  is_trivially_copyable_v<_ValueType2>) {
+      return __pstl::__omp_transform_reduce(
+          std::__unwrap_iter(__first1),
+          std::__unwrap_iter(__first2),
+          __last1 - __first1,
+          std::move(__init),
+          std::move(__reduce),
+          std::move(__transform));
+    } else {
+      using _Backends = __backends_after<__current_configuration, __openmp_backend_tag>;
+      using _Fallback = __dispatch<__pstl::__transform_reduce_binary, _Backends, __remove_cvref_t<_Policy>>;
+      return _Fallback{}(
+          std::forward<_Policy>(__policy),
+          std::move(__first1),
+          std::move(__last1),
+          std::move(__first2),
+          std::move(__init),
+          std::move(__reduce),
+          std::move(__transform));
+    }
+  }
+};
+
+} // namespace __pstl
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___PSTL_BACKENDS_OPENMP_H
diff --git a/libcxx/include/__pstl/dispatch.h b/libcxx/include/__pstl/dispatch.h
index 5e903f7524fe9b..c984c22456120c 100644
--- a/libcxx/include/__pstl/dispatch.h
+++ b/libcxx/include/__pstl/dispatch.h
@@ -58,6 +58,21 @@ struct __find_first_implemented<_Algorithm, __backend_configuration<_B1, _Bn...>

 template