diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index f0410d549bd..f9a6ed3cb86 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -36,6 +36,8 @@ #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" #include "parallel_backend_sycl_reduce.h" +#include "parallel_backend_sycl_merge.h" +#include "parallel_backend_sycl_merge_sort.h" #include "execution_sycl_defs.h" #include "sycl_iterator.h" #include "unseq_backend_sycl.h" @@ -199,15 +201,6 @@ class __find_or_kernel; template class __scan_propagate_kernel; -template -class __sort_leaf_kernel; - -template -class __sort_global_kernel; - -template -class __sort_copy_back_kernel; - template class __scan_single_wg_kernel; @@ -1415,346 +1408,6 @@ struct __partial_merge_kernel } }; -//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges -//to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: -// 0 1 1 2 3 -// ------------------ -// |---> -// 0 | 0 | 1 1 1 1 -// | | -// 0 | 0 | 1 1 1 1 -// | ----------> -// 2 | 0 0 0 0 | 1 -// | ----> -// 3 | 0 0 0 0 0 | -template -auto -__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, _Index __i_elem, _Index1 __n1, _Index2 __n2, - _Compare __comp) -{ - _Index1 __start1 = 0; - _Index2 __start2 = 0; - if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed - { - auto __q = __i_elem; //diagonal index - auto __n_diag = ::std::min<_Index2>(__q, __n1); //diagonal size - - //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 1, 1] - oneapi::dpl::counting_iterator<_Index> __diag_it(0); - auto __res = ::std::lower_bound(__diag_it, __diag_it + __n_diag, 1/*value to find*/, - [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const auto& __value) mutable - { - auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); - return __zero_or_one < __value; - }); - __start1 = *__res; - __start2 = __q - *__res; - } - else - { - auto __q = __i_elem - __n2; //diagonal index - auto __n_diag = ::std::min<_Index1>(__n1 - __q, __n2); //diagonal size - - //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 
1, 1] - oneapi::dpl::counting_iterator<_Index> __diag_it(0); - auto __res = ::std::lower_bound(__diag_it, __diag_it + __n_diag, 1/*value to find*/, - [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable - { - auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); - return __zero_or_one < __value; - }); - - __start1 = __q + *__res; - __start2 = __n2 - *__res; - } - return std::make_pair(__start1, __start2); -} - -// Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing -// to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2) -template -void -__serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index1 __start1, _Index2 __start2, - _Index3 __start3, ::std::uint8_t __chunk, _Size1 __n1, _Size2 __n2, _Compare __comp) -{ - if (__start1 >= __n1) - { - //copying a residual of the second seq - const auto __n = ::std::min<_Index2>(__n2 - __start2, __chunk); - for (::std::uint8_t __i = 0; __i < __n; ++__i) - __rng3[__start3 + __i] = __rng2[__start2 + __i]; - } - else if (__start2 >= __n2) - { - //copying a residual of the first seq - const auto __n = ::std::min<_Index1>(__n1 - __start1, __chunk); - for (::std::uint8_t __i = 0; __i < __n; ++__i) - __rng3[__start3 + __i] = __rng1[__start1 + __i]; - } - else - { - ::std::uint8_t __n = __chunk; - for (::std::uint8_t __i = 0; __i < __n && __start1 < __n1 && __start2 < __n2; ++__i) - { - const auto& __val1 = __rng1[__start1]; - const auto& __val2 = __rng2[__start2]; - if (__comp(__val2, __val1)) - { - __rng3[__start3 + __i] = __val2; - if (++__start2 == __n2) - { - //copying a residual of the first seq - for (++__i; __i < __n && __start1 < __n1; ++__i, ++__start1) - __rng3[__start3 + __i] = __rng1[__start1]; - } - } - else - { - __rng3[__start3 + __i] = __val1; - if (++__start1 == __n1) - { - //copying a residual of the second seq - for (++__i; __i < __n && __start2 < __n2; ++__i, ++__start2) - __rng3[__start3 + __i] = __rng2[__start2]; - } - } - } - } -} - -// Please see the comment for __parallel_for_submitter for optional kernel name explanation -template -struct __parallel_merge_submitter; - -template -struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_Name...>> -{ - template - auto - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const - { - using _Size = oneapi::dpl::__internal::__difference_t<_Range3>; - - auto __n1 = __rng1.size(); - auto __n2 = __rng2.size(); - - assert(__n1 > 0 || __n2 > 0); - - _PRINT_INFO_IN_DEBUG_MODE(__exec); - - const ::std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 
128 : 4; - const _Size __n = __n1 + __n2; - const _Size __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - - auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_Name...>(sycl::range(__steps), [=](sycl::item __item_id) { - - const _IdType __i_elem = __item_id.get_linear_id() * __chunk; - const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); - }); - }); - return __future(__event); - } -}; - -template -class __merge_kernel_name; - -template -auto -__parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, - _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) -{ - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - - const auto __n = __rng1.size() + __rng2.size(); - if (__n <= std::numeric_limits<::std::uint32_t>::max()) - { - using _wi_index_type = ::std::uint32_t; - using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _wi_index_type>>; - return __parallel_merge_submitter<_wi_index_type, _MergeKernel>()( - ::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2), - ::std::forward<_Range3>(__rng3), __comp); - } - else - { - using _wi_index_type = ::std::uint64_t; - using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _wi_index_type>>; - return __parallel_merge_submitter<_wi_index_type, _MergeKernel>()( - ::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2), - ::std::forward<_Range3>(__rng3), __comp); - } -} - -//----------------------------------------------------------------------- -// parallel_sort: general implementation -//----------------------------------------------------------------------- -struct __leaf_sort_kernel -{ - template - void - operator()(const _Acc& __acc, const _Size1 __start, const _Size1 __end, _Compare __comp) const - { - for (_Size1 i = __start; i < __end; ++i) - { - for (_Size1 j = __start + 1; j < __start + __end - i; ++j) - { - // forwarding references allow binding of internal tuple of references with rvalue - auto&& __first_item = __acc[j - 1]; - auto&& __second_item = __acc[j]; - if (__comp(__second_item, __first_item)) - { - using ::std::swap; - swap(__first_item, __second_item); - } - } - } - } -}; - -// Please see the comment for __parallel_for_submitter for optional kernel name explanation -template -struct __parallel_sort_submitter; - -template -struct __parallel_sort_submitter<_IdType, __internal::__optional_kernel_name<_LeafSortName...>, - __internal::__optional_kernel_name<_GlobalSortName...>, - __internal::__optional_kernel_name<_CopyBackName...>> -{ - template - auto - operator()(_BackendTag, _ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp) const - { - using _Tp = oneapi::dpl::__internal::__value_t<_Range>; - using _Size = oneapi::dpl::__internal::__difference_t<_Range>; - - const ::std::size_t __n = __rng.size(); - assert(__n > 1); - - const bool __is_cpu = __exec.queue().get_device().is_cpu(); - const ::std::uint32_t __leaf = __is_cpu ? 16 : 4; - _Size __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __leaf); - - // 1. 
Perform sorting of the leaves of the merge sort tree - sycl::event __event1 = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng); - __cgh.parallel_for<_LeafSortName...>(sycl::range(__steps), [=](sycl::item __item_id) - { - const _IdType __i_elem = __item_id.get_linear_id() * __leaf; - __leaf_sort_kernel()(__rng, __i_elem, std::min<_IdType>(__i_elem + __leaf, __n), __comp); - }); - }); - - // 2. Merge sorting - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, _Tp> __temp_buf(__exec, __n); - auto __temp = __temp_buf.get_buffer(); - bool __data_in_temp = false; - _IdType __n_sorted = __leaf; - const ::std::uint32_t __chunk = __is_cpu ? 32 : 4; - __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - - const ::std::size_t __n_power2 = oneapi::dpl::__internal::__dpl_bit_ceil(__n); - const ::std::int64_t __n_iter = ::std::log2(__n_power2) - ::std::log2(__leaf); - for (::std::int64_t __i = 0; __i < __n_iter; ++__i) - { - __event1 = __exec.queue().submit([&, __n_sorted, __data_in_temp](sycl::handler& __cgh) { - __cgh.depends_on(__event1); - - oneapi::dpl::__ranges::__require_access(__cgh, __rng); - sycl::accessor __dst(__temp, __cgh, sycl::read_write, sycl::no_init); - - __cgh.parallel_for<_GlobalSortName...>(sycl::range(__steps), [=](sycl::item __item_id) - { - const _IdType __i_elem = __item_id.get_linear_id() * __chunk; - const auto __i_elem_local = __i_elem % (__n_sorted * 2); - - const auto __offset = ::std::min<_IdType>((__i_elem / (__n_sorted * 2)) * (__n_sorted * 2), __n); - const auto __n1 = ::std::min<_IdType>(__offset + __n_sorted, __n) - __offset; - const auto __n2 = ::std::min<_IdType>(__offset + __n1 + __n_sorted, __n) - (__offset + __n1); - - if (__data_in_temp) - { - const auto& __rng1 = oneapi::dpl::__ranges::drop_view_simple(__dst, __offset); - const auto& __rng2 = oneapi::dpl::__ranges::drop_view_simple(__dst, __offset + __n1); - - const auto start = __find_start_point(__rng1, __rng2, __i_elem_local, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __rng/*__rng3*/, start.first, start.second, __i_elem, __chunk, __n1, __n2, __comp); - } - else - { - const auto& __rng1 = oneapi::dpl::__ranges::drop_view_simple(__rng, __offset); - const auto& __rng2 = oneapi::dpl::__ranges::drop_view_simple(__rng, __offset + __n1); - - const auto start = __find_start_point(__rng1, __rng2, __i_elem_local, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __dst/*__rng3*/, start.first, start.second, __i_elem, __chunk, __n1, __n2, __comp); - } - }); - }); - __n_sorted *= 2; - __data_in_temp = !__data_in_temp; - } - - // 3. 
If the data remained in the temporary buffer then copy it back - if (__data_in_temp) - { - __event1 = __exec.queue().submit([&](sycl::handler& __cgh) { - __cgh.depends_on(__event1); - oneapi::dpl::__ranges::__require_access(__cgh, __rng); - auto __temp_acc = __temp.template get_access(__cgh); - // We cannot use __cgh.copy here because of zip_iterator usage - __cgh.parallel_for<_CopyBackName...>(sycl::range(__n), [=](sycl::item __item_id) { - const _IdType __idx = __item_id.get_linear_id(); - __rng[__idx] = __temp_acc[__idx]; - }); - }); - } - - return __future(__event1); - } -}; - -template -auto -__parallel_sort_impl(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range&& __rng, - _Compare __comp) -{ - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - - const auto __n = __rng.size(); - if (__n <= std::numeric_limits<::std::uint32_t>::max()) - { - using _wi_index_type = ::std::uint32_t; - using _LeafSortKernel = - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__sort_leaf_kernel<_CustomName, _wi_index_type>>; - using _GlobalSortKernel = - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__sort_global_kernel<_CustomName, _wi_index_type>>; - using _CopyBackKernel = - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__sort_copy_back_kernel<_CustomName, _wi_index_type>>; - return __parallel_sort_submitter<_wi_index_type, _LeafSortKernel, _GlobalSortKernel, _CopyBackKernel>()( - oneapi::dpl::__internal::__device_backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), - ::std::forward<_Range>(__rng), __comp); - } - else - { - using _wi_index_type = ::std::uint64_t; - using _LeafSortKernel = - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__sort_leaf_kernel<_CustomName, _wi_index_type>>; - using _GlobalSortKernel = - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__sort_global_kernel<_CustomName, _wi_index_type>>; - using _CopyBackKernel = - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__sort_copy_back_kernel<_CustomName, _wi_index_type>>; - return __parallel_sort_submitter<_wi_index_type, _LeafSortKernel, _GlobalSortKernel, _CopyBackKernel>()( - oneapi::dpl::__internal::__device_backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), - ::std::forward<_Range>(__rng), __comp); - } -} - // Please see the comment for __parallel_for_submitter for optional kernel name explanation template struct __parallel_partial_sort_submitter; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h new file mode 100644 index 00000000000..9e63f6e1236 --- /dev/null +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -0,0 +1,215 @@ +// -*- C++ -*- +//===-- parallel_backend_sycl_merge.h --------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
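// --- Editorial illustration (not part of this patch) --------------------------
// The header introduced below implements Merge Path partitioning: each work-item
// is assigned one diagonal of the (n1 x n2) merge matrix (diagonal index =
// linear id * chunk), and a binary search along that diagonal yields a pair
// (start1, start2) from which a short serial merge can run independently of all
// other work-items. A minimal host-side sketch of that diagonal search, assuming
// plain std::vector inputs and illustrative names (find_start_point_sketch is
// not an oneDPL API):
#include <algorithm>
#include <cstddef>
#include <functional>
#include <iostream>
#include <utility>
#include <vector>

template <typename T, typename Compare>
std::pair<std::size_t, std::size_t>
find_start_point_sketch(const std::vector<T>& a, const std::vector<T>& b, std::size_t diag, Compare comp)
{
    // Restrict the search to the part of the diagonal that lies inside the matrix.
    std::size_t lo = diag > b.size() ? diag - b.size() : 0;
    std::size_t hi = std::min(diag, a.size());
    while (lo < hi)
    {
        const std::size_t mid = lo + (hi - lo) / 2;
        // Entries along a diagonal form a 0...0 1...1 pattern; find the first 1.
        if (comp(b[diag - mid - 1], a[mid]))
            hi = mid;
        else
            lo = mid + 1;
    }
    return {lo, diag - lo}; // elements of a and b consumed before this diagonal
}

int main()
{
    // The example sequences from the merge-matrix comment below.
    const std::vector<int> a{0, 1, 1, 2, 3}, b{0, 0, 2, 3};
    for (std::size_t d = 0; d <= a.size() + b.size(); ++d)
    {
        const auto [s1, s2] = find_start_point_sketch(a, b, d, std::less<int>{});
        std::cout << "diagonal " << d << " -> (" << s1 << ", " << s2 << ")\n";
    }
    return 0;
}
// ------------------------------------------------------------------------------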
+// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H +#define _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H + +#include // std::numeric_limits +#include // assert +#include // std::uint8_t, ... +#include // std::make_pair, std::forward +#include // std::min, std::lower_bound + +#include "sycl_defs.h" +#include "parallel_backend_sycl_utils.h" + +namespace oneapi +{ +namespace dpl +{ +namespace __par_backend_hetero +{ + +//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges +//to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: +// 0 1 1 2 3 +// ------------------ +// |---> +// 0 | 0 | 1 1 1 1 +// | | +// 0 | 0 | 1 1 1 1 +// | ----------> +// 2 | 0 0 0 0 | 1 +// | ----> +// 3 | 0 0 0 0 0 | +template +auto +__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, _Index __i_elem, _Index1 __n1, _Index2 __n2, + _Compare __comp) +{ + _Index1 __start1 = 0; + _Index2 __start2 = 0; + if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed + { + auto __q = __i_elem; //diagonal index + auto __n_diag = std::min<_Index2>(__q, __n1); //diagonal size + + //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 1, 1] + oneapi::dpl::counting_iterator<_Index> __diag_it(0); + auto __res = + std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, + [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const auto& __value) mutable { + auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); + return __zero_or_one < __value; + }); + __start1 = *__res; + __start2 = __q - *__res; + } + else + { + auto __q = __i_elem - __n2; //diagonal index + auto __n_diag = std::min<_Index1>(__n1 - __q, __n2); //diagonal size + + //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 
1, 1] + oneapi::dpl::counting_iterator<_Index> __diag_it(0); + auto __res = + std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, + [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable { + auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); + return __zero_or_one < __value; + }); + + __start1 = __q + *__res; + __start2 = __n2 - *__res; + } + return std::make_pair(__start1, __start2); +} + +// Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing +// to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2) +template +void +__serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index1 __start1, _Index2 __start2, + _Index3 __start3, std::uint8_t __chunk, _Size1 __n1, _Size2 __n2, _Compare __comp) +{ + if (__start1 >= __n1) + { + //copying a residual of the second seq + const auto __n = std::min<_Index2>(__n2 - __start2, __chunk); + for (std::uint8_t __i = 0; __i < __n; ++__i) + __rng3[__start3 + __i] = __rng2[__start2 + __i]; + } + else if (__start2 >= __n2) + { + //copying a residual of the first seq + const auto __n = std::min<_Index1>(__n1 - __start1, __chunk); + for (std::uint8_t __i = 0; __i < __n; ++__i) + __rng3[__start3 + __i] = __rng1[__start1 + __i]; + } + else + { + std::uint8_t __n = __chunk; + for (std::uint8_t __i = 0; __i < __n && __start1 < __n1 && __start2 < __n2; ++__i) + { + const auto& __val1 = __rng1[__start1]; + const auto& __val2 = __rng2[__start2]; + if (__comp(__val2, __val1)) + { + __rng3[__start3 + __i] = __val2; + if (++__start2 == __n2) + { + //copying a residual of the first seq + for (++__i; __i < __n && __start1 < __n1; ++__i, ++__start1) + __rng3[__start3 + __i] = __rng1[__start1]; + } + } + else + { + __rng3[__start3 + __i] = __val1; + if (++__start1 == __n1) + { + //copying a residual of the second seq + for (++__i; __i < __n && __start2 < __n2; ++__i, ++__start2) + __rng3[__start3 + __i] = __rng2[__start2]; + } + } + } + } +} + +// Please see the comment for __parallel_for_submitter for optional kernel name explanation +template +struct __parallel_merge_submitter; + +template +struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_Name...>> +{ + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + using _Size = oneapi::dpl::__internal::__difference_t<_Range3>; + + auto __n1 = __rng1.size(); + auto __n2 = __rng2.size(); + + assert(__n1 > 0 || __n2 > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 
128 : 4; + const _Size __n = __n1 + __n2; + const _Size __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + + auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + __cgh.parallel_for<_Name...>(sycl::range(__steps), [=](sycl::item __item_id) { + const _IdType __i_elem = __item_id.get_linear_id() * __chunk; + const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); + }); + }); + return __future(__event); + } +}; + +template +class __merge_kernel_name; + +template +auto +__parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, + _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) +{ + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + const auto __n = __rng1.size() + __rng2.size(); + if (__n <= std::numeric_limits::max()) + { + using _wi_index_type = std::uint32_t; + using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _wi_index_type>>; + return __parallel_merge_submitter<_wi_index_type, _MergeKernel>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + else + { + using _wi_index_type = std::uint64_t; + using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _wi_index_type>>; + return __parallel_merge_submitter<_wi_index_type, _MergeKernel>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } +} + +} // namespace __par_backend_hetero +} // namespace dpl +} // namespace oneapi + +#endif // _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h new file mode 100644 index 00000000000..c5988187c6f --- /dev/null +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -0,0 +1,211 @@ +// -*- C++ -*- +//===-- parallel_backend_sycl_merge_sort.h --------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_SORT_H +#define _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_SORT_H + +#include // std::log2 +#include // std::numeric_limits +#include // assert +#include // std::swap +#include // std::uint32_t, ... 
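// --- Editorial illustration (not part of this patch) --------------------------
// The submitter defined below is a bottom-up merge sort: each work-item first
// sorts a small "leaf" in place, then ~log2(n/leaf) global passes merge pairs of
// sorted runs whose length doubles every pass, ping-ponging the data between the
// user range and a temporary buffer; a final copy-back kernel runs only if the
// last pass left the result in the temporary buffer. A sequential host-side
// sketch of that control flow, with illustrative names and std::sort/std::merge
// standing in for the leaf-sort and Merge Path kernels:
#include <algorithm>
#include <cstddef>
#include <vector>

template <typename T, typename Compare>
void
bottom_up_merge_sort_sketch(std::vector<T>& data, Compare comp)
{
    const std::size_t n = data.size();
    const std::size_t leaf = 4; // the device code picks 16 on CPU devices, 4 otherwise

    // 1. Sort every leaf of the merge-sort tree.
    for (std::size_t i = 0; i < n; i += leaf)
        std::sort(data.begin() + i, data.begin() + std::min(i + leaf, n), comp);

    // 2. Merge runs of doubling length, ping-ponging between two buffers.
    std::vector<T> temp(n);
    bool data_in_temp = false;
    for (std::size_t n_sorted = leaf; n_sorted < n; n_sorted *= 2)
    {
        const std::vector<T>& src = data_in_temp ? temp : data;
        std::vector<T>& dst = data_in_temp ? data : temp;
        for (std::size_t offset = 0; offset < n; offset += 2 * n_sorted)
        {
            const std::size_t mid = std::min(offset + n_sorted, n);
            const std::size_t end = std::min(offset + 2 * n_sorted, n);
            std::merge(src.begin() + offset, src.begin() + mid,
                       src.begin() + mid, src.begin() + end,
                       dst.begin() + offset, comp);
        }
        data_in_temp = !data_in_temp;
    }

    // 3. Copy back if the result ended up in the temporary buffer.
    if (data_in_temp)
        data = temp;
}
// ------------------------------------------------------------------------------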
+#include // std::min + +#include "sycl_defs.h" +#include "parallel_backend_sycl_utils.h" +#include "parallel_backend_sycl_merge.h" + +namespace oneapi +{ +namespace dpl +{ +namespace __par_backend_hetero +{ + +struct __leaf_sort_kernel +{ + template + void + operator()(const _Acc& __acc, const _Size1 __start, const _Size1 __end, _Compare __comp) const + { + for (_Size1 i = __start; i < __end; ++i) + { + for (_Size1 j = __start + 1; j < __start + __end - i; ++j) + { + // forwarding references allow binding of internal tuple of references with rvalue + auto&& __first_item = __acc[j - 1]; + auto&& __second_item = __acc[j]; + if (__comp(__second_item, __first_item)) + { + using std::swap; + swap(__first_item, __second_item); + } + } + } + } +}; + +// Please see the comment for __parallel_for_submitter for optional kernel name explanation +template +struct __parallel_sort_submitter; + +template +struct __parallel_sort_submitter<_IdType, __internal::__optional_kernel_name<_LeafSortName...>, + __internal::__optional_kernel_name<_GlobalSortName...>, + __internal::__optional_kernel_name<_CopyBackName...>> +{ + template + auto + operator()(_BackendTag, _ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp) const + { + using _Tp = oneapi::dpl::__internal::__value_t<_Range>; + using _Size = oneapi::dpl::__internal::__difference_t<_Range>; + + const std::size_t __n = __rng.size(); + assert(__n > 1); + + const bool __is_cpu = __exec.queue().get_device().is_cpu(); + const std::uint32_t __leaf = __is_cpu ? 16 : 4; + _Size __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __leaf); + + // 1. Perform sorting of the leaves of the merge sort tree + sycl::event __event1 = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng); + __cgh.parallel_for<_LeafSortName...>(sycl::range(__steps), [=](sycl::item __item_id) { + const _IdType __i_elem = __item_id.get_linear_id() * __leaf; + __leaf_sort_kernel()(__rng, __i_elem, std::min<_IdType>(__i_elem + __leaf, __n), __comp); + }); + }); + + // 2. Merge sorting + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, _Tp> __temp_buf(__exec, __n); + auto __temp = __temp_buf.get_buffer(); + bool __data_in_temp = false; + _IdType __n_sorted = __leaf; + const std::uint32_t __chunk = __is_cpu ? 
32 : 4; + __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + + const std::size_t __n_power2 = oneapi::dpl::__internal::__dpl_bit_ceil(__n); + const std::int64_t __n_iter = std::log2(__n_power2) - std::log2(__leaf); + for (std::int64_t __i = 0; __i < __n_iter; ++__i) + { + __event1 = __exec.queue().submit([&, __n_sorted, __data_in_temp](sycl::handler& __cgh) { + __cgh.depends_on(__event1); + + oneapi::dpl::__ranges::__require_access(__cgh, __rng); + sycl::accessor __dst(__temp, __cgh, sycl::read_write, sycl::no_init); + + __cgh.parallel_for<_GlobalSortName...>( + sycl::range(__steps), [=](sycl::item __item_id) { + const _IdType __i_elem = __item_id.get_linear_id() * __chunk; + const auto __i_elem_local = __i_elem % (__n_sorted * 2); + + const auto __offset = std::min<_IdType>((__i_elem / (__n_sorted * 2)) * (__n_sorted * 2), __n); + const auto __n1 = std::min<_IdType>(__offset + __n_sorted, __n) - __offset; + const auto __n2 = std::min<_IdType>(__offset + __n1 + __n_sorted, __n) - (__offset + __n1); + + if (__data_in_temp) + { + const auto& __rng1 = oneapi::dpl::__ranges::drop_view_simple(__dst, __offset); + const auto& __rng2 = oneapi::dpl::__ranges::drop_view_simple(__dst, __offset + __n1); + + const auto start = __find_start_point(__rng1, __rng2, __i_elem_local, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng /*__rng3*/, start.first, start.second, __i_elem, + __chunk, __n1, __n2, __comp); + } + else + { + const auto& __rng1 = oneapi::dpl::__ranges::drop_view_simple(__rng, __offset); + const auto& __rng2 = oneapi::dpl::__ranges::drop_view_simple(__rng, __offset + __n1); + + const auto start = __find_start_point(__rng1, __rng2, __i_elem_local, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __dst /*__rng3*/, start.first, start.second, __i_elem, + __chunk, __n1, __n2, __comp); + } + }); + }); + __n_sorted *= 2; + __data_in_temp = !__data_in_temp; + } + + // 3. 
If the data remained in the temporary buffer then copy it back + if (__data_in_temp) + { + __event1 = __exec.queue().submit([&](sycl::handler& __cgh) { + __cgh.depends_on(__event1); + oneapi::dpl::__ranges::__require_access(__cgh, __rng); + auto __temp_acc = __temp.template get_access(__cgh); + // We cannot use __cgh.copy here because of zip_iterator usage + __cgh.parallel_for<_CopyBackName...>(sycl::range(__n), [=](sycl::item __item_id) { + const _IdType __idx = __item_id.get_linear_id(); + __rng[__idx] = __temp_acc[__idx]; + }); + }); + } + + return __future(__event1); + } +}; + +template +class __sort_leaf_kernel; + +template +class __sort_global_kernel; + +template +class __sort_copy_back_kernel; + +template +auto +__parallel_sort_impl(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range&& __rng, + _Compare __comp) +{ + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + const auto __n = __rng.size(); + if (__n <= std::numeric_limits::max()) + { + using _wi_index_type = std::uint32_t; + using _LeafSortKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __sort_leaf_kernel<_CustomName, _wi_index_type>>; + using _GlobalSortKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __sort_global_kernel<_CustomName, _wi_index_type>>; + using _CopyBackKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __sort_copy_back_kernel<_CustomName, _wi_index_type>>; + return __parallel_sort_submitter<_wi_index_type, _LeafSortKernel, _GlobalSortKernel, _CopyBackKernel>()( + oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range>(__rng), __comp); + } + else + { + using _wi_index_type = std::uint64_t; + using _LeafSortKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __sort_leaf_kernel<_CustomName, _wi_index_type>>; + using _GlobalSortKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __sort_global_kernel<_CustomName, _wi_index_type>>; + using _CopyBackKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __sort_copy_back_kernel<_CustomName, _wi_index_type>>; + return __parallel_sort_submitter<_wi_index_type, _LeafSortKernel, _GlobalSortKernel, _CopyBackKernel>()( + oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range>(__rng), __comp); + } +} + +} // namespace __par_backend_hetero +} // namespace dpl +} // namespace oneapi + +#endif // _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_SORT_H
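For context, both new headers are internal backend components: they back the public oneapi::dpl::merge and oneapi::dpl::sort algorithms when those are called with a SYCL device policy, and the uint32_t/uint64_t branches above only select the work-item index type (and distinct kernel names) based on the total element count. Below is a minimal usage sketch, assuming a DPC++/SYCL toolchain and the default device; note that oneapi::dpl::sort may also dispatch to a radix-sort specialization for some arithmetic types instead of the merge-sort path touched by this patch.

#include <oneapi/dpl/algorithm>
#include <oneapi/dpl/execution>
#include <oneapi/dpl/iterator>

#include <sycl/sycl.hpp>

#include <vector>

int main()
{
    std::vector<int> a{0, 1, 1, 2, 3}, b{0, 0, 2, 3}; // already-sorted inputs to merge
    std::vector<int> merged(a.size() + b.size());
    std::vector<int> c{3, 1, 2, 0, 2, 1};             // unsorted input to sort
    {
        sycl::buffer<int> buf_a(a.data(), sycl::range<1>(a.size()));
        sycl::buffer<int> buf_b(b.data(), sycl::range<1>(b.size()));
        sycl::buffer<int> buf_merged(merged.data(), sycl::range<1>(merged.size()));
        sycl::buffer<int> buf_c(c.data(), sycl::range<1>(c.size()));

        // Merging two sorted buffers goes through __parallel_merge
        // (parallel_backend_sycl_merge.h) for device execution policies.
        oneapi::dpl::merge(oneapi::dpl::execution::dpcpp_default,
                           oneapi::dpl::begin(buf_a), oneapi::dpl::end(buf_a),
                           oneapi::dpl::begin(buf_b), oneapi::dpl::end(buf_b),
                           oneapi::dpl::begin(buf_merged));

        // General device sort; when no radix-sort specialization applies, this is
        // the merge sort implemented in parallel_backend_sycl_merge_sort.h.
        oneapi::dpl::sort(oneapi::dpl::execution::dpcpp_default,
                          oneapi::dpl::begin(buf_c), oneapi::dpl::end(buf_c));
    } // buffer destruction synchronizes results back into 'merged' and 'c'

    return 0;
}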