-
Notifications
You must be signed in to change notification settings - Fork 11.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[libc++] Optimize std::min_element #100616
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-libcxx Author: vaibhav (mrdaybird) ChangesThis PR provides an alternate implementation of std::min_element which is faster in most cases and can leverage clang's auto-vectorization capabilities. The algorithm is inspired by Argmin in SIMD. You can refer to it for reason of choosing the algorithm. The algorithm performs min-reduction and std::find in blocks, to get the global minimum element. Since, clang can auto-vectorize min-reduction and soon may also auto-vectorize std::find(#88385), the new algorithm can partially be auto-vectorized, and in the future may get completely auto-vectorized. You can see the benchmark results in this repo
I would encourage the reviewers to perform the benchmark on other systems as well. Full diff: https://github.com/llvm/llvm-project/pull/100616.diff 3 Files Affected:
diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt
index d96ccc1e49f66..952777d5346f7 100644
--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/benchmarks/CMakeLists.txt
@@ -121,6 +121,7 @@ set(BENCHMARK_TESTS
algorithms/make_heap_then_sort_heap.bench.cpp
algorithms/min.bench.cpp
algorithms/minmax.bench.cpp
+ algorithms/min_element.bench.cpp
algorithms/min_max_element.bench.cpp
algorithms/mismatch.bench.cpp
algorithms/pop_heap.bench.cpp
diff --git a/libcxx/benchmarks/min_element.bench.cpp b/libcxx/benchmarks/min_element.bench.cpp
new file mode 100644
index 0000000000000..ed769aeeea7d2
--- /dev/null
+++ b/libcxx/benchmarks/min_element.bench.cpp
@@ -0,0 +1,48 @@
+#include <vector>
+#include <algorithm>
+#include <limits>
+
+#include <benchmark/benchmark.h>
+#include <random>
+
+template<typename T>
+static void BM_stdmin_element_decreasing(benchmark::State &state){
+ std::vector<T> v(state.range(0));
+ T start = std::numeric_limits<T>::max();
+ T end = std::numeric_limits<T>::min();
+
+ for(size_t i = 0; i < v.size(); i++)
+ v[i] = ((start != end) ? start-- : end);
+
+ for(auto _ : state){
+ benchmark::DoNotOptimize(v);
+ benchmark::DoNotOptimize(std::min_element(v.begin(), v.end()));
+ }
+}
+
+BENCHMARK(BM_stdmin_element_decreasing<char>)
+ ->DenseRange(1, 8)->Range(32, 128)->Range(256, 4096)->DenseRange(5000, 10000, 1000)
+ ->Range(1<<14, 1<<16)->Arg(70000);
+BENCHMARK(BM_stdmin_element_decreasing<short>)
+ ->DenseRange(1, 8)->Range(32, 128)->Range(256, 4096)->DenseRange(5000, 10000, 1000)
+ ->Range(1<<14, 1<<16)->Arg(70000);
+BENCHMARK(BM_stdmin_element_decreasing<int>)
+ ->DenseRange(1, 8)->Range(32, 128)->Range(256, 4096)->DenseRange(5000, 10000, 1000)
+ ->Range(1<<14, 1<<16)->Arg(70000);
+BENCHMARK(BM_stdmin_element_decreasing<long long>)
+ ->DenseRange(1, 8)->Range(32, 128)->Range(256, 4096)->DenseRange(5000, 10000, 1000)
+ ->Range(1<<14, 1<<16)->Arg(70000);
+BENCHMARK(BM_stdmin_element_decreasing<unsigned char>)
+ ->DenseRange(1, 8)->Range(32, 128)->Range(256, 4096)->DenseRange(5000, 10000, 1000)
+ ->Range(1<<14, 1<<16)->Arg(70000);
+BENCHMARK(BM_stdmin_element_decreasing<unsigned short>)
+ ->DenseRange(1, 8)->Range(32, 128)->Range(256, 4096)->DenseRange(5000, 10000, 1000)
+ ->Range(1<<14, 1<<16)->Arg(70000);
+BENCHMARK(BM_stdmin_element_decreasing<unsigned int>)
+ ->DenseRange(1, 8)->Range(32, 128)->Range(256, 4096)->DenseRange(5000, 10000, 1000)
+ ->Range(1<<14, 1<<16)->Arg(70000);
+BENCHMARK(BM_stdmin_element_decreasing<unsigned long long>)
+ ->DenseRange(1, 8)->Range(32, 128)->Range(256, 4096)->DenseRange(5000, 10000, 1000)
+ ->Range(1<<14, 1<<16)->Arg(70000);
+
+BENCHMARK_MAIN();
diff --git a/libcxx/include/__algorithm/min_element.h b/libcxx/include/__algorithm/min_element.h
index 65f3594d630ce..891a4afae3ea1 100644
--- a/libcxx/include/__algorithm/min_element.h
+++ b/libcxx/include/__algorithm/min_element.h
@@ -11,6 +11,7 @@
#include <__algorithm/comp.h>
#include <__algorithm/comp_ref_type.h>
+#include <__algorithm/iterator_operations.h>
#include <__config>
#include <__functional/identity.h>
#include <__functional/invoke.h>
@@ -33,12 +34,56 @@ __min_element(_Iter __first, _Sent __last, _Comp __comp, _Proj& __proj) {
if (__first == __last)
return __first;
- _Iter __i = __first;
- while (++__i != __last)
- if (std::__invoke(__comp, std::__invoke(__proj, *__i), std::__invoke(__proj, *__first)))
- __first = __i;
+ const size_t __n = static_cast<size_t>(std::distance(__first, __last));
- return __first;
+ if (__n <= 64) {
+ _Iter __i = __first;
+ while (++__i != __last)
+ if (std::__invoke(__comp, std::__invoke(__proj, *__i), std::__invoke(__proj, *__first)))
+ __first = __i;
+ return __first;
+ }
+
+ size_t __block_size = 256;
+
+ size_t __n_blocked = __n - (__n % __block_size);
+ _Iter __block_start = __first, __block_end = __first;
+
+ typedef typename std::iterator_traits<_Iter>::value_type value_type;
+ value_type __min_val = std::invoke(__proj, *__first);
+
+ _Iter __curr = __first;
+ for (size_t __i = 0; __i < __n_blocked; __i += __block_size) {
+ _Iter __start = __curr;
+ value_type __block_min = __min_val;
+ for (size_t j = 0; j < __block_size; j++) {
+ if (std::__invoke(__comp, std::__invoke(__proj, *__curr), __block_min)) {
+ __block_min = *__curr;
+ }
+ __curr++;
+ }
+ if (std::invoke(__comp, __block_min, __min_val)) {
+ __min_val = __block_min;
+ __block_start = __start;
+ __block_end = __curr;
+ }
+ }
+
+ value_type __epilogue_min = __min_val;
+ _Iter __epilogue_start = __curr;
+ while (__curr != __last) {
+ if (std::__invoke(__comp, std::__invoke(__proj, *__curr), __epilogue_min)) {
+ __epilogue_min = *__curr;
+ }
+ __curr++;
+ }
+ if (std::__invoke(__comp, __epilogue_min, __min_val)) {
+ __min_val = __epilogue_min;
+ __block_start = __epilogue_start;
+ __block_end = __last;
+ }
+
+ return find(__block_start, __block_end, __min_val);
}
template <class _Comp, class _Iter, class _Sent>
|
Pinging @philnik777, if you are interested in reviewing this, because #84663 was the inspiration for this. |
✅ With the latest revision this PR passed the C/C++ code formatter. |
a997ee5
to
01c71b4
Compare
74bad7e
to
6aae7b1
Compare
[libc++] Fix formatting and move min_element.bench.cpp [libc++] Use __invoke instead of invoke [libc++] Fix build issues with find
[libcxx] revert block_size logic [libc++] Replace std::find with impl
f8e6131
to
bbfccb3
Compare
This PR provides an alternate implementation of std::min_element which is faster in most cases and can leverage clang's auto-vectorization capabilities.
This algorithm is inspired by Argmin in SIMD. You can refer to it for the reason for choosing the algorithm.
The algorithm performs min-reduction and std::find in blocks, to get the global minimum element. Since clang can auto-vectorize min-reduction and soon may also auto-vectorize std::find(#88385), the new algorithm can partially be auto-vectorized, and in the future may get completely auto-vectorized.
You can see the benchmark results in this repo.
Gist of benchmark-
I would encourage the reviewers to perform the benchmark on other systems as well.