Adding Separate OpenMP Offloading Backend to libcxx/include/__algorithm/pstl_backends #66968

Open · wants to merge 3 commits into base: main
1 change: 1 addition & 0 deletions .github/workflows/libcxx-build-and-test.yaml
@@ -153,6 +153,7 @@ jobs:
'generic-no-wide-characters',
'generic-no-rtti',
'generic-optimized-speed',
'generic-pstl-openmp',
'generic-static',
# TODO Find a better place for the benchmark and bootstrapping builds to live. They're either very expensive
# or don't provide much value since the benchmark run results are too noise on the bots.
12 changes: 10 additions & 2 deletions libcxx/CMakeLists.txt
@@ -301,10 +301,11 @@ option(LIBCXX_HAS_EXTERNAL_THREAD_API
This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)

if (LIBCXX_ENABLE_THREADS)
set(LIBCXX_PSTL_BACKEND "std_thread" CACHE STRING "Which PSTL backend to use")
set(LIBCXX_PSTL_BACKEND_DEFAULT "std_thread")
else()
set(LIBCXX_PSTL_BACKEND "serial" CACHE STRING "Which PSTL backend to use")
set(LIBCXX_PSTL_BACKEND_DEFAULT "serial")
endif()
set(LIBCXX_PSTL_BACKEND "${LIBCXX_PSTL_BACKEND_DEFAULT}" CACHE STRING "Select the PSTL backend to use. Valid values are serial, std-thread, libdispatch, openmp. Default: ${LIBCXX_PSTL_BACKEND_DEFAULT}")

# Misc options ----------------------------------------------------------------
# FIXME: Turn -pedantic back ON. It is currently off because it warns
@@ -571,6 +572,11 @@ function(cxx_add_basic_build_flags target)
endif()
endif()
target_compile_options(${target} PUBLIC "${LIBCXX_ADDITIONAL_COMPILE_FLAGS}")

# If the PSTL backend depends on OpenMP, we must enable the OpenMP toolchain
if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
target_add_compile_flags_if_supported(${target} PUBLIC -fopenmp)
endif()
endfunction()

# Exception flags =============================================================
@@ -800,6 +806,8 @@ elseif(LIBCXX_PSTL_BACKEND STREQUAL "std_thread")
config_define(1 _LIBCPP_PSTL_BACKEND_STD_THREAD)
elseif(LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
config_define(1 _LIBCPP_PSTL_BACKEND_LIBDISPATCH)
elseif (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
config_define(1 _LIBCPP_PSTL_BACKEND_OPENMP)
else()
message(FATAL_ERROR "LIBCXX_PSTL_BACKEND is set to ${LIBCXX_PSTL_BACKEND}, which is not a valid backend.
Valid backends are: serial, std_thread and libdispatch")
1 change: 1 addition & 0 deletions libcxx/cmake/caches/Generic-pstl-openmp.cmake
@@ -0,0 +1 @@
set(LIBCXX_PSTL_BACKEND openmp CACHE STRING "")
11 changes: 11 additions & 0 deletions libcxx/docs/BuildingLibcxx.rst
@@ -424,6 +424,17 @@ libc++ Feature Options
provided, this header will be included by the library, replacing the
default assertion handler.

.. option:: LIBCXX_PSTL_BACKEND:STRING

**Default**:: ``"serial"``

**Values**:: ``serial``, ``std-thread``, ``libdispatch``, ``openmp``

Select the desired backend for C++ parallel algorithms. All four options can
target multi-core CPU architectures, and ``openmp`` can additionally target
GPU architectures. The ``openmp`` backend requires OpenMP version 4.5 or
later.


libc++ ABI Feature Options
--------------------------
104 changes: 104 additions & 0 deletions libcxx/docs/UsingLibcxx.rst
@@ -364,6 +364,110 @@ Unpoisoning may not be an option, if (for example) you are not maintaining the a
* You are using allocator, which does not call destructor during deallocation.
* You are aware that memory allocated with an allocator may be accessed, even when unused by container.

Offloading C++ Parallel Algorithms to GPUs
------------------------------------------

Experimental support for GPU offloading has been added to ``libc++``. The
implementation uses OpenMP target offloading to leverage GPU compute resources.
The OpenMP PSTL backend can target both NVIDIA and AMD GPUs.
However, the implementation only supports contiguous iterators, such as
iterators for ``std::vector`` or ``std::array``.
To enable the OpenMP offloading backend, select it with
``LIBCXX_PSTL_BACKEND=openmp`` when installing ``libc++``. Further, when
compiling a program, the user must specify the command-line options
``-fopenmp -fexperimental-library``. To install LLVM with OpenMP offloading
enabled, please read
`the LLVM OpenMP FAQ. <https://openmp.llvm.org/SupportAndFAQ.html>`_
You may also want to visit
`the OpenMP offloading command-line argument reference. <https://openmp.llvm.org/CommandLineArgumentReference.html#offload-command-line-arguments>`_

Example
~~~~~~~

The following is an example of offloading vector addition to a GPU using our
standard library extension. It implements the classical vector addition from
BLAS that overwrites the vector ``y`` with ``y=a*x+y``. Thus ``y.begin()`` is
used both as an input and an output iterator in this example.

.. code-block:: cpp

  #include <algorithm>
  #include <execution>
  #include <vector>

  template <typename T1, typename T2, typename T3>
  void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
    std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
                   y.begin(), [=](T2 xi, T3 yi) { return a * xi + yi; });
  }
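
A self-contained driver for this function might look as follows. This is only a
sketch for illustration; the compile command in the comment is an assumption,
and the exact flags depend on your toolchain and target GPU (see the links
above).

.. code-block:: cpp

  // Hypothetical build line, assuming a libc++ installation configured with
  // the OpenMP PSTL backend:
  //   clang++ -stdlib=libc++ -fopenmp -fexperimental-library axpy.cpp
  #include <algorithm>
  #include <cassert>
  #include <execution>
  #include <vector>

  template <typename T1, typename T2, typename T3>
  void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
    std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
                   y.begin(), [=](T2 xi, T3 yi) { return a * xi + yi; });
  }

  int main() {
    std::vector<float> x(1000, 1.0f);
    std::vector<float> y(1000, 2.0f);
    axpy(2.0f, x, y);
    // Each element of y becomes 2 * 1 + 2 = 4.
    assert(y[0] == 4.0f);
    return 0;
  }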

The execution policy ``std::execution::par_unseq`` states that the algorithm's
execution may be parallelized, vectorized, and migrated across threads. This is
the only execution mode that is safe to offload to GPUs; for all other
execution modes the algorithms will execute on the CPU.
Special attention must be paid to the lambda captures when enabling GPU
offloading. If the lambda captures by reference, the user must manually map the
captured variables to the device. With a by-reference capture, the above
example could be implemented in the following way.

.. code-block:: cpp

  template <typename T1, typename T2, typename T3>
  void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
  #pragma omp target data map(to : a)
    std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
                   y.begin(), [&](T2 xi, T3 yi) { return a * xi + yi; });
  }

However, if unified shared memory (USM) is enabled, no additional data mapping
is necessary when capturing by reference.
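
As a sketch only, and assuming the offload target actually supports OpenMP's
unified shared memory requirement, the by-reference version could then drop
the explicit mapping entirely:

.. code-block:: cpp

  // Assumption: the offload target honors the OpenMP 5.0 directive below;
  // on targets without unified shared memory this program is non-conforming.
  #pragma omp requires unified_shared_memory

  template <typename T1, typename T2, typename T3>
  void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
    // With a single shared address space, the by-reference capture of `a`
    // needs no `map` clause.
    std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
                   y.begin(), [&](T2 xi, T3 yi) { return a * xi + yi; });
  }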

Compiling functions for GPUs with OpenMP
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The C++ standard assumes that all memory accesses happen within a single address
space. However, discrete GPU systems have distinct address spaces. A single
address space can be emulated if your system supports unified shared memory.
However, many discrete GPU systems do not, and in those cases it is important to
pass device function pointers to the parallel algorithms. Below is an example of
how the OpenMP ``declare target`` directive with the ``indirect`` clause can be
used to mark that a function should be compiled for both host and device.

.. code-block:: cpp

  #include <algorithm>
  #include <cassert>
  #include <cmath>
  #include <execution>
  #include <functional>
  #include <numeric>
  #include <vector>

  // This function computes the squared difference of two floating points
  float squared(float a, float b) { return a * a - 2.0f * a * b + b * b; }

  // Declare that the function must be compiled for both host and device
  #pragma omp declare target indirect to(squared)

  int main() {
    std::vector<float> a(100, 1.0);
    std::vector<float> b(100, 1.25);

    // Pass the host function pointer to the parallel algorithm and let OpenMP
    // translate it to the device function pointer internally
    float sum =
        std::transform_reduce(std::execution::par_unseq, a.begin(), a.end(),
                              b.begin(), 0.0f, std::plus{}, squared);

    // Validate that the result is approximately 6.25
    assert(std::abs(sum - 6.25f) < 1e-10);
    return 0;
  }

Without unified shared memory, passing the host function pointer ``squared`` to
the parallel algorithm will not work unless the function has been declared with
``declare target indirect`` as shown above.

Important notes about exception handling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

GPU architectures do not support exception handling. With current versions of
Clang, a program that contains exceptions in offloaded code regions will
compile, but it will terminate if an exception is thrown on the device. This
does not conform to the C++ standard, and exception handling on GPUs will
hopefully be better supported in future releases of LLVM.
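
A minimal sketch of what this means in practice, assuming the current Clang
behavior described above: the following compiles, but if the ``throw`` were
ever executed on the device the program would terminate instead of unwinding.

.. code-block:: cpp

  #include <algorithm>
  #include <execution>
  #include <stdexcept>
  #include <vector>

  int main() {
    std::vector<int> v(100, 0);
    std::for_each(std::execution::par_unseq, v.begin(), v.end(), [](int &x) {
      if (x < 0) // never true here; shown only to illustrate the limitation
        throw std::runtime_error("exceptions are not supported on the GPU");
      x += 1;
    });
    return 0;
  }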

Platform specific behavior
==========================

1 change: 1 addition & 0 deletions libcxx/include/CMakeLists.txt
@@ -579,6 +579,7 @@ set(files
__pstl/backend_fwd.h
__pstl/backends/default.h
__pstl/backends/libdispatch.h
__pstl/backends/openmp.h
__pstl/backends/serial.h
__pstl/backends/std_thread.h
__pstl/cpu_algos/any_of.h
80 changes: 80 additions & 0 deletions libcxx/include/__algorithm/ranges_find_last.h
@@ -0,0 +1,80 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCPP___ALGORITHM_RANGES_FIND_LAST_H
#define _LIBCPP___ALGORITHM_RANGES_FIND_LAST_H

#include <__config>
#include <__functional/identity.h>
#include <__functional/invoke.h>
#include <__functional/ranges_operations.h>
#include <__iterator/concepts.h>
#include <__iterator/projected.h>
#include <__ranges/access.h>
#include <__ranges/concepts.h>
#include <__ranges/dangling.h>
#include <__utility/forward.h>
#include <__utility/move.h>
#include <optional>

#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
#endif

_LIBCPP_PUSH_MACROS
#include <__undef_macros>

#if _LIBCPP_STD_VER >= 20

_LIBCPP_BEGIN_NAMESPACE_STD

namespace ranges {

namespace __find_last {

struct __fn {
  template <forward_iterator _Ip, sentinel_for<_Ip> _Sp, class _Tp, class _Proj = identity>
    requires indirect_binary_predicate<ranges::equal_to, projected<_Ip, _Proj>, const _Tp*>
  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr subrange<_Ip>
  operator()(_Ip __first, _Sp __last, const _Tp& __value, _Proj __proj = {}) const {
    std::optional<_Ip> __found;
    for (; __first != __last; ++__first) {
      if (std::invoke(__proj, *__first) == __value) {
        __found = __first;
      }
    }
    if (!__found)
      return {__first, __first};
    return {*__found, std::ranges::next(*__found, __last)};
  }

  template <forward_range _Rp, class _Tp, class _Proj = identity>
    requires indirect_binary_predicate<ranges::equal_to, projected<iterator_t<_Rp>, _Proj>, const _Tp*>
  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr borrowed_subrange_t<_Rp>
  operator()(_Rp&& __r, const _Tp& __value, _Proj __proj = {}) const {
    return this->operator()(ranges::begin(__r), ranges::end(__r), __value, std::ref(__proj));
  }
};

} // namespace __find_last

inline namespace __cpo {
inline constexpr auto find_last = __find_last::__fn{};
} // namespace __cpo

} // namespace ranges

_LIBCPP_END_NAMESPACE_STD

#endif // _LIBCPP_STD_VER >= 20

_LIBCPP_POP_MACROS

#endif // _LIBCPP___ALGORITHM_RANGES_FIND_LAST_H
81 changes: 81 additions & 0 deletions libcxx/include/__algorithm/ranges_find_last_if.h
@@ -0,0 +1,81 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCPP___ALGORITHM_RANGES_FIND_LAST_IF_H
#define _LIBCPP___ALGORITHM_RANGES_FIND_LAST_IF_H

#include <__config>
#include <__functional/identity.h>
#include <__functional/invoke.h>
#include <__functional/ranges_operations.h>
#include <__iterator/concepts.h>
#include <__iterator/projected.h>
#include <__ranges/access.h>
#include <__ranges/concepts.h>
#include <__ranges/dangling.h>
#include <__utility/forward.h>
#include <__utility/move.h>
#include <optional>

#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
#endif

_LIBCPP_PUSH_MACROS
#include <__undef_macros>

#if _LIBCPP_STD_VER >= 20

_LIBCPP_BEGIN_NAMESPACE_STD

namespace ranges {

namespace __find_last_if {

struct __fn {
  template <forward_iterator _Ip,
            sentinel_for<_Ip> _Sp,
            class _Proj = identity,
            indirect_unary_predicate<projected<_Ip, _Proj>> _Pred>
  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr subrange<_Ip>
  operator()(_Ip __first, _Sp __last, _Pred __pred, _Proj __proj = {}) const {
    std::optional<_Ip> __found;
    for (; __first != __last; ++__first) {
      if (std::invoke(__pred, std::invoke(__proj, *__first))) {
        __found = __first;
      }
    }
    if (!__found)
      return {__first, __first};
    return {*__found, std::ranges::next(*__found, __last)};
  }

  template <forward_range _Rp,
            class _Proj = identity,
            indirect_unary_predicate<projected<iterator_t<_Rp>, _Proj>> _Pred>
  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr borrowed_subrange_t<_Rp>
  operator()(_Rp&& __r, _Pred __pred, _Proj __proj = {}) const {
    return this->operator()(ranges::begin(__r), ranges::end(__r), std::ref(__pred), std::ref(__proj));
  }
};

} // namespace __find_last_if

inline namespace __cpo {
inline constexpr auto find_last_if = __find_last_if::__fn{};
} // namespace __cpo

} // namespace ranges

_LIBCPP_END_NAMESPACE_STD

#endif // _LIBCPP_STD_VER >= 20

_LIBCPP_POP_MACROS

#endif // _LIBCPP___ALGORITHM_RANGES_FIND_LAST_IF_H