-
Notifications
You must be signed in to change notification settings - Fork 4
/
merge.cu
108 lines (83 loc) · 2.53 KB
/
merge.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
// Sean Baxter's GTC talk companion source.
// http://nvlabs.github.io/moderngpu/
#include "common.cuh"
template<int NT, int VT, typename T>
__global__ void KernelMergeSimple(const T* a_global, int aCount,
const T* b_global, int bCount, T* merged_global) {
const int NV = NT * VT;
__shared__ T data_shared[NT * VT + 1];
int tid = threadIdx.x;
// Load aCount elements and NV - aCount elements from b_global.
b_global -= aCount;
T x[VT];
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
if(index < aCount) x[i] = a_global[index];
else x[i] = b_global[index];
}
// Store all elements to shared memory.
#pragma unroll
for(int i = 0; i < VT; ++i)
data_shared[NT * i + tid] = x[i];
__syncthreads();
// Each thread searches for its Merge Path partition.
int diag = VT * tid;
int mp = MergePath(data_shared, aCount, data_shared + aCount, bCount,
diag, SearchBoundsLower);
// Sequentially merge into register starting from the partition.
int a = mp;
int b = aCount + diag - a;
#pragma unroll
for(int i = 0; i < VT; ++i) {
bool p;
if(b >= NV) p = true;
else if(a >= aCount) p = false;
else p = !(data_shared[b] < data_shared[a]);
x[i] = p ? data_shared[a++] : data_shared[b++];
}
__syncthreads();
// The merged data is now in thread order in register. Transpose through
// shared memory and store to DRAM.
#pragma unroll
for(int i = 0; i < VT; ++i)
data_shared[VT * tid + i] = x[i];
__syncthreads();
#pragma unroll
for(int i = 0; i < VT; ++i)
merged_global[NT * i + tid] = data_shared[NT * i + tid];
}
int main(int argc, char** argv) {
const int NT = 128;
const int VT = 7;
const int NV = NT * VT;
int aCount = NV / 2;
int bCount = NV - aCount;
// Generate random sorted arrays to merge.
std::vector<int> aHost(aCount), bHost(bCount);
for(int i = 0; i < aCount; ++i)
aHost[i] = rand() % 100;
for(int i = 0; i < bCount; ++i)
bHost[i] = rand() % 100;
std::sort(aHost.begin(), aHost.end());
std::sort(bHost.begin(), bHost.end());
int* a_global, *b_global;
cudaMalloc2(&a_global, aHost);
cudaMalloc2(&b_global, bHost);
int* merged_global;
cudaMalloc2(&merged_global, NV);
KernelMergeSimple<NT, VT><<<1, NT>>>(a_global, aCount, b_global, bCount,
merged_global);
std::vector<int> mergedHost(NV);
copyDtoH(&mergedHost[0], merged_global, NV);
cudaFree(a_global);
cudaFree(b_global);
cudaFree(merged_global);
for(int tid = 0; tid < NT; ++tid) {
printf("%3d: \t", tid);
for(int i = 0; i < VT; ++i)
printf("%3d ", mergedHost[VT * tid + i]);
printf("\n");
}
return 0;
}