-
Notifications
You must be signed in to change notification settings - Fork 29
/
lap.h
477 lines (447 loc) · 14.9 KB
/
lap.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
#include <cassert>
#include <cstdio>
#include <limits>
#include <memory>
#include <immintrin.h>
#ifdef __GNUC__
#define always_inline __attribute__((always_inline)) inline
#define restrict __restrict__
#elif _WIN32
#define always_inline __forceinline
#define restrict __restrict
#else
#define always_inline inline
#define restrict
#endif
template <typename idx, typename cost>
always_inline std::tuple<cost, cost, idx, idx>
find_umins_regular(
idx dim, idx i, const cost *restrict assign_cost,
const cost *restrict v) {
const cost *local_cost = &assign_cost[i * dim];
cost umin = local_cost[0] - v[0];
idx j1 = 0;
idx j2 = -1;
cost usubmin = std::numeric_limits<cost>::max();
for (idx j = 1; j < dim; j++) {
cost h = local_cost[j] - v[j];
if (h < usubmin) {
if (h >= umin) {
usubmin = h;
j2 = j;
} else {
usubmin = umin;
umin = h;
j2 = j1;
j1 = j;
}
}
}
return std::make_tuple(umin, usubmin, j1, j2);
}
// These are not constexpr because of typename idx
#define FLOAT_MIN_DIM 64
#define DOUBLE_MIN_DIM 100000 // 64-bit code is actually always slower
template <typename idx>
always_inline std::tuple<float, float, idx, idx>
find_umins_avx2(
idx dim, idx i, const float *restrict assign_cost,
const float *restrict v) {
if (dim < FLOAT_MIN_DIM) {
return find_umins_regular(dim, i, assign_cost, v);
}
const float *local_cost = assign_cost + i * dim;
__m256i idxvec = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
__m256i j1vec = _mm256_set1_epi32(-1), j2vec = _mm256_set1_epi32(-1);
__m256 uminvec = _mm256_set1_ps(std::numeric_limits<float>::max()),
usubminvec = _mm256_set1_ps(std::numeric_limits<float>::max());
for (idx j = 0; j < dim - 7; j += 8) {
__m256 acvec = _mm256_loadu_ps(local_cost + j);
__m256 vvec = _mm256_loadu_ps(v + j);
__m256 h = _mm256_sub_ps(acvec, vvec);
__m256 cmp = _mm256_cmp_ps(h, uminvec, _CMP_LE_OQ);
usubminvec = _mm256_blendv_ps(usubminvec, uminvec, cmp);
j2vec = _mm256_blendv_epi8(
j2vec, j1vec, _mm256_castps_si256(cmp));
uminvec = _mm256_blendv_ps(uminvec, h, cmp);
j1vec = _mm256_blendv_epi8(
j1vec, idxvec, _mm256_castps_si256(cmp));
cmp = _mm256_andnot_ps(cmp, _mm256_cmp_ps(h, usubminvec, _CMP_LT_OQ));
usubminvec = _mm256_blendv_ps(usubminvec, h, cmp);
j2vec = _mm256_blendv_epi8(
j2vec, idxvec, _mm256_castps_si256(cmp));
idxvec = _mm256_add_epi32(idxvec, _mm256_set1_epi32(8));
}
alignas(__m256) float uminmem[8], usubminmem[8];
alignas(__m256) int32_t j1mem[8], j2mem[8];
_mm256_store_ps(uminmem, uminvec);
_mm256_store_ps(usubminmem, usubminvec);
_mm256_store_si256(reinterpret_cast<__m256i*>(j1mem), j1vec);
_mm256_store_si256(reinterpret_cast<__m256i*>(j2mem), j2vec);
idx j1 = -1, j2 = -1;
float umin = std::numeric_limits<float>::max(),
usubmin = std::numeric_limits<float>::max();
for (int vi = 0; vi < 8; vi++) {
float h = uminmem[vi];
if (h < usubmin) {
idx jnew = j1mem[vi];
if (h >= umin) {
usubmin = h;
j2 = jnew;
} else {
usubmin = umin;
umin = h;
j2 = j1;
j1 = jnew;
}
}
}
for (int vi = 0; vi < 8; vi++) {
float h = usubminmem[vi];
if (h < usubmin) {
usubmin = h;
j2 = j2mem[vi];
}
}
for (idx j = dim & 0xFFFFFFF8u; j < dim; j++) {
float h = local_cost[j] - v[j];
if (h < usubmin) {
if (h >= umin) {
usubmin = h;
j2 = j;
} else {
usubmin = umin;
umin = h;
j2 = j1;
j1 = j;
}
}
}
return std::make_tuple(umin, usubmin, j1, j2);
}
template <typename idx>
always_inline std::tuple<double, double, idx, idx>
find_umins_avx2(
idx dim, idx i, const double *restrict assign_cost,
const double *restrict v) {
if (dim < DOUBLE_MIN_DIM) {
return find_umins_regular(dim, i, assign_cost, v);
}
const double *local_cost = assign_cost + i * dim;
__m256i idxvec = _mm256_setr_epi64x(0, 1, 2, 3);
__m256i j1vec = _mm256_set1_epi64x(-1), j2vec = _mm256_set1_epi64x(-1);
__m256d uminvec = _mm256_set1_pd(std::numeric_limits<double>::max()),
usubminvec = _mm256_set1_pd(std::numeric_limits<double>::max());
for (idx j = 0; j < dim - 3; j += 4) {
__m256d acvec = _mm256_loadu_pd(local_cost + j);
__m256d vvec = _mm256_loadu_pd(v + j);
__m256d h = _mm256_sub_pd(acvec, vvec);
__m256d cmp = _mm256_cmp_pd(h, uminvec, _CMP_LE_OQ);
usubminvec = _mm256_blendv_pd(usubminvec, uminvec, cmp);
j2vec = _mm256_blendv_epi8(
j2vec, j1vec, _mm256_castpd_si256(cmp));
uminvec = _mm256_blendv_pd(uminvec, h, cmp);
j1vec = _mm256_blendv_epi8(
j1vec, idxvec, _mm256_castpd_si256(cmp));
cmp = _mm256_andnot_pd(cmp, _mm256_cmp_pd(h, usubminvec, _CMP_LT_OQ));
usubminvec = _mm256_blendv_pd(usubminvec, h, cmp);
j2vec = _mm256_blendv_epi8(
j2vec, idxvec, _mm256_castpd_si256(cmp));
idxvec = _mm256_add_epi64(idxvec, _mm256_set1_epi64x(4));
}
alignas(__m256d) double uminmem[4], usubminmem[4];
alignas(__m256d) int64_t j1mem[4], j2mem[4];
_mm256_store_pd(uminmem, uminvec);
_mm256_store_pd(usubminmem, usubminvec);
_mm256_store_si256(reinterpret_cast<__m256i*>(j1mem), j1vec);
_mm256_store_si256(reinterpret_cast<__m256i*>(j2mem), j2vec);
idx j1 = -1, j2 = -1;
double umin = std::numeric_limits<double>::max(),
usubmin = std::numeric_limits<double>::max();
for (int vi = 0; vi < 4; vi++) {
double h = uminmem[vi];
if (h < usubmin) {
idx jnew = j1mem[vi];
if (h >= umin) {
usubmin = h;
j2 = jnew;
} else {
usubmin = umin;
umin = h;
j2 = j1;
j1 = jnew;
}
}
}
for (int vi = 0; vi < 4; vi++) {
double h = usubminmem[vi];
if (h < usubmin) {
usubmin = h;
j2 = j2mem[vi];
}
}
for (idx j = dim & 0xFFFFFFFCu; j < dim; j++) {
double h = local_cost[j] - v[j];
if (h < usubmin) {
if (h >= umin) {
usubmin = h;
j2 = j;
} else {
usubmin = umin;
umin = h;
j2 = j1;
j1 = j;
}
}
}
return std::make_tuple(umin, usubmin, j1, j2);
}
template <bool avx2, typename idx, typename cost>
always_inline std::tuple<cost, cost, idx, idx>
find_umins(
idx dim, idx i, const cost *restrict assign_cost,
const cost *restrict v) {
if constexpr(avx2) {
return find_umins_avx2(dim, i, assign_cost, v);
} else {
return find_umins_regular(dim, i, assign_cost, v);
}
}
/// @brief Exact Jonker-Volgenant algorithm.
/// @param dim in problem size
/// @param assign_cost in cost matrix
/// @param verbose in indicates whether to report the progress to stdout
/// @param rowsol out column assigned to row in solution / size dim
/// @param colsol out row assigned to column in solution / size dim
/// @param u out dual variables, row reduction numbers / size dim
/// @param v out dual variables, column reduction numbers / size dim
/// @return achieved minimum assignment cost
template <bool avx2, bool verbose, typename idx, typename cost>
cost lap(int dim, const cost *restrict assign_cost,
idx *restrict rowsol, idx *restrict colsol,
cost *restrict u, cost *restrict v) {
auto collist = std::make_unique<idx[]>(dim); // list of columns to be scanned in various ways.
auto matches = std::make_unique<idx[]>(dim); // counts how many times a row could be assigned.
auto d = std::make_unique<cost[]>(dim); // 'cost-distance' in augmenting path calculation.
auto pred = std::make_unique<idx[]>(dim); // row-predecessor of column in augmenting/alternating path.
// init how many times a row will be assigned in the column reduction.
#if _OPENMP >= 201307
#pragma omp simd
#endif
for (idx i = 0; i < dim; i++) {
matches[i] = 0;
}
// COLUMN REDUCTION
for (idx j = dim - 1; j >= 0; j--) { // reverse order gives better results.
// find minimum cost over rows.
cost min = assign_cost[j];
idx imin = 0;
for (idx i = 1; i < dim; i++) {
const cost *local_cost = &assign_cost[i * dim];
if (local_cost[j] < min) {
min = local_cost[j];
imin = i;
}
}
v[j] = min;
if (++matches[imin] == 1) {
// init assignment if minimum row assigned for first time.
rowsol[imin] = j;
colsol[j] = imin;
} else {
colsol[j] = -1; // row already assigned, column not assigned.
}
}
if (verbose) {
printf("lapjv: COLUMN REDUCTION finished\n");
}
// REDUCTION TRANSFER
auto free = matches.get(); // list of unassigned rows.
idx numfree = 0;
for (idx i = 0; i < dim; i++) {
const cost *local_cost = &assign_cost[i * dim];
if (matches[i] == 0) { // fill list of unassigned 'free' rows.
free[numfree++] = i;
} else if (matches[i] == 1) { // transfer reduction from rows that are assigned once.
idx j1 = rowsol[i];
cost min = std::numeric_limits<cost>::max();
for (idx j = 0; j < dim; j++) {
if (j != j1) {
if (local_cost[j] - v[j] < min) {
min = local_cost[j] - v[j];
}
}
}
v[j1] = v[j1] - min;
}
}
if (verbose) {
printf("lapjv: REDUCTION TRANSFER finished\n");
}
// AUGMENTING ROW REDUCTION
for (int loopcnt = 0; loopcnt < 2; loopcnt++) { // loop to be done twice.
// scan all free rows.
// in some cases, a free row may be replaced with another one to be scanned next.
idx k = 0;
idx prevnumfree = numfree;
numfree = 0; // start list of rows still free after augmenting row reduction.
while (k < prevnumfree) {
idx i = free[k++];
// find minimum and second minimum reduced cost over columns.
cost umin, usubmin;
idx j1, j2;
std::tie(umin, usubmin, j1, j2) = find_umins<avx2>(dim, i, assign_cost, v);
idx i0 = colsol[j1];
cost vj1_new = v[j1] - (usubmin - umin);
bool vj1_lowers = vj1_new < v[j1]; // the trick to eliminate the epsilon bug
if (vj1_lowers) {
// change the reduction of the minimum column to increase the minimum
// reduced cost in the row to the subminimum.
v[j1] = vj1_new;
} else if (i0 >= 0) { // minimum and subminimum equal.
// minimum column j1 is assigned.
// swap columns j1 and j2, as j2 may be unassigned.
j1 = j2;
i0 = colsol[j2];
}
// (re-)assign i to j1, possibly de-assigning an i0.
rowsol[i] = j1;
colsol[j1] = i;
if (i0 >= 0) { // minimum column j1 assigned earlier.
if (vj1_lowers) {
// put in current k, and go back to that k.
// continue augmenting path i - j1 with i0.
free[--k] = i0;
} else {
// no further augmenting reduction possible.
// store i0 in list of free rows for next phase.
free[numfree++] = i0;
}
}
}
if (verbose) {
printf("lapjv: AUGMENTING ROW REDUCTION %d / %d\n", loopcnt + 1, 2);
}
} // for loopcnt
// AUGMENT SOLUTION for each free row.
for (idx f = 0; f < numfree; f++) {
idx endofpath;
idx freerow = free[f]; // start row of augmenting path.
if (verbose) {
printf("lapjv: AUGMENT SOLUTION row %d [%d / %d]\n",
freerow, f + 1, numfree);
}
// Dijkstra shortest path algorithm.
// runs until unassigned column added to shortest path tree.
#if _OPENMP >= 201307
#pragma omp simd
#endif
for (idx j = 0; j < dim; j++) {
d[j] = assign_cost[freerow * dim + j] - v[j];
pred[j] = freerow;
collist[j] = j; // init column list.
}
idx low = 0; // columns in 0..low-1 are ready, now none.
idx up = 0; // columns in low..up-1 are to be scanned for current minimum, now none.
// columns in up..dim-1 are to be considered later to find new minimum,
// at this stage the list simply contains all columns
bool unassigned_found = false;
// initialized in the first iteration: low == up == 0
idx last = 0;
cost min = 0;
do {
if (up == low) { // no more columns to be scanned for current minimum.
last = low - 1;
// scan columns for up..dim-1 to find all indices for which new minimum occurs.
// store these indices between low..up-1 (increasing up).
min = d[collist[up++]];
for (idx k = up; k < dim; k++) {
idx j = collist[k];
cost h = d[j];
if (h <= min) {
if (h < min) { // new minimum.
up = low; // restart list at index low.
min = h;
}
// new index with same minimum, put on undex up, and extend list.
collist[k] = collist[up];
collist[up++] = j;
}
}
// check if any of the minimum columns happens to be unassigned.
// if so, we have an augmenting path right away.
for (idx k = low; k < up; k++) {
if (colsol[collist[k]] < 0) {
endofpath = collist[k];
unassigned_found = true;
break;
}
}
}
if (!unassigned_found) {
// update 'distances' between freerow and all unscanned columns, via next scanned column.
idx j1 = collist[low];
low++;
idx i = colsol[j1];
const cost *local_cost = &assign_cost[i * dim];
cost h = local_cost[j1] - v[j1] - min;
for (idx k = up; k < dim; k++) {
idx j = collist[k];
cost v2 = local_cost[j] - v[j] - h;
if (v2 < d[j]) {
pred[j] = i;
if (v2 == min) { // new column found at same minimum value
if (colsol[j] < 0) {
// if unassigned, shortest augmenting path is complete.
endofpath = j;
unassigned_found = true;
break;
} else { // else add to list to be scanned right away.
collist[k] = collist[up];
collist[up++] = j;
}
}
d[j] = v2;
}
}
}
} while (!unassigned_found);
// update column prices.
#if _OPENMP >= 201307
#pragma omp simd
#endif
for (idx k = 0; k <= last; k++) {
idx j1 = collist[k];
v[j1] = v[j1] + d[j1] - min;
}
// reset row and column assignments along the alternating path.
{
idx i;
do {
i = pred[endofpath];
colsol[endofpath] = i;
idx j1 = endofpath;
endofpath = rowsol[i];
rowsol[i] = j1;
} while (i != freerow);
}
}
if (verbose) {
printf("lapjv: AUGMENT SOLUTION finished\n");
}
// calculate optimal cost.
cost lapcost = 0;
#if _OPENMP >= 201307
#pragma omp simd reduction(+:lapcost)
#endif
for (idx i = 0; i < dim; i++) {
const cost *local_cost = &assign_cost[i * dim];
idx j = rowsol[i];
u[i] = local_cost[j] - v[j];
lapcost += local_cost[j];
}
if (verbose) {
printf("lapjv: optimal cost calculated\n");
}
return lapcost;
}