-
Notifications
You must be signed in to change notification settings - Fork 0
/
Utils.h
114 lines (95 loc) · 3.38 KB
/
Utils.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#pragma once
#include <omp.h>
#include <cstdint>
#include <random>
#include <thread>
#include <immintrin.h>
constexpr const uint32_t kCacheLineSize = 64;
constexpr const uint32_t kRamPageBytes = 4096;
static const uint32_t nSysCpus = std::thread::hardware_concurrency();
typedef unsigned __int128 uint128;
constexpr const uint128 kHashBase =
(uint128(244) * uint128(1000*1000*1000) * uint128(1000*1000*1000) + uint128(903443422803031898ULL)) * uint128(1000*1000*1000) * uint128(1000*1000*1000)
+ uint128(471395581046679967ULL);
template<typename T, typename U> constexpr T DivUp(const T a, const U b) {
return (a + T(b) - 1) / T(b);
}
inline uint64_t hash64(uint64_t key) {
key = (~key) + (key << 21); // key = (key << 21) - key - 1;
key = key ^ (key >> 24);
key = (key + (key << 3)) + (key << 8); // key * 265
key = key ^ (key >> 14);
key = (key + (key << 2)) + (key << 4); // key * 21
key = key ^ (key >> 28);
key = key + (key << 31);
return key;
}
namespace detail {
template <typename F>
struct FinalAction {
FinalAction(F&& f) : clean_{std::move(f)} {}
~FinalAction() {
if (enabled_) clean_();
}
void Disable() { enabled_ = false; };
private:
F clean_;
bool enabled_{true};
};
} // namespace detail
template <typename F>
detail::FinalAction<F> Finally(F&& f) {
return detail::FinalAction<F>(std::move(f));
}
inline std::mt19937_64 GetSeededRandom() {
unsigned long long seed;
while(!_rdrand64_step(&seed));
std::mt19937_64 rng(seed);
return rng;
}
template <typename T>
void ParallelShuffle(T* data, const size_t count) {
const uint32_t nThreads = std::max<int64_t>(1, std::min<int64_t>(omp_get_max_threads(), count/3));
std::atomic_flag* syncs = static_cast<std::atomic_flag*>(malloc(count * sizeof(std::atomic_flag)));
auto clean_syncs = Finally([&]() { free(syncs); });
#pragma omp parallel for schedule(static, kCacheLineSize / sizeof(std::atomic_flag))
for (size_t i = 0; i < count; i++) {
new (syncs + i) std::atomic_flag ATOMIC_FLAG_INIT;
}
const size_t nPerThread = (count + nThreads - 1) / nThreads;
// The number of threads here is important and must not default to whatever else
#pragma omp parallel for num_threads(nThreads)
for (size_t i = 0; i < nThreads; i++) {
std::mt19937_64 rng = GetSeededRandom();
std::uniform_int_distribution<size_t> dist(0, count - 1);
const size_t iFirst = nPerThread * i;
const size_t iLimit = std::min(nPerThread + iFirst, count);
if (iLimit <= iFirst) {
continue;
}
for (size_t j = iFirst; j < iLimit; j++) {
const size_t fellow = dist(rng);
if (fellow == j) {
continue;
}
const size_t sync1 = std::min(j, fellow);
const size_t sync2 = j ^ fellow ^ sync1;
while (syncs[sync1].test_and_set(std::memory_order_acq_rel)) {
while (syncs[sync1].test(std::memory_order_relaxed)); // keep it hot in cache
}
while (syncs[sync2].test_and_set(std::memory_order_acq_rel)) {
while (syncs[sync2].test(std::memory_order_relaxed)); // keep it hot in cache
}
std::swap(data[sync1], data[sync2]);
syncs[sync2].clear(std::memory_order_release);
syncs[sync1].clear(std::memory_order_release);
}
}
}
namespace std {
template<> struct hash<pair<uint128, uint128>> {
uint128 operator()(const pair<uint128, uint128>& v) const {
return v.first * 1949 + v.second * 2011;
}
};
} // namespace std