Skip to content

Commit

Permalink
refactor: replace sorting with quick select
Browse files Browse the repository at this point in the history
This change replaces the deterministic optimized merge-sort with a
highly optimized quick select that performs significantly faster when
the publisher count is more than 20. The previous algorithm is very fast
running on a normal CPU but doesn't work very well in BPF because all
instructions (like load, move, ...) have the same cost as comparisons
and are not optimized. Quick select, on the other hand, minimizes all
the operations in the average case due to its minimal reads and writes.

In 32-publisher setup, quick select reduces the CU from 16.5k to 11.5k
and in the 64-publisher setup 37k to 17.5k. The numbers are the worst
cases running on randomized input.

Worst measured case happens when all elements are equal and is
13k and 24k for 32-publisher and 64-publisher setup respectively.

Unfortunately there is no way to systematically get the compute
usage out of program test. The `test_benchmark` file contains simple
code that helps run benchmarks on various numbers of publishers.

The quick select algorithm has a quadratic worst case, but it is very
unlikely to occur, and no publisher alone can manipulate the list
of prices to trigger it.
  • Loading branch information
ali-bahjati committed May 14, 2024
1 parent 9acf172 commit 1524fb8
Show file tree
Hide file tree
Showing 12 changed files with 192 additions and 553 deletions.
1 change: 0 additions & 1 deletion program/c/makefile
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ cpyth-native: features.h
test: features.h
mkdir -p $(OUT_DIR)/test/
gcc -c ./src/oracle/model/test_price_model.c -o $(OUT_DIR)/test/test_price_model.o -fPIC
gcc -c ./src/oracle/sort/test_sort_stable.c -o $(OUT_DIR)/test/test_sort_stable.o -fPIC
gcc -c ./src/oracle/util/test_align.c -o $(OUT_DIR)/test/test_align.o -fPIC
gcc -c ./src/oracle/util/test_avg.c -o $(OUT_DIR)/test/test_avg.o -fPIC
gcc -c ./src/oracle/util/test_hash.c -o $(OUT_DIR)/test/test_hash.o -fPIC
Expand Down
169 changes: 91 additions & 78 deletions program/c/src/oracle/model/price_model.c
Original file line number Diff line number Diff line change
@@ -1,99 +1,108 @@
#include "price_model.h"
#include "../util/avg.h" /* For avg_2_int64 */

#define SORT_NAME int64_sort_ascending
#define SORT_KEY_T int64_t
#include "../sort/tmpl/sort_stable.c"
/*
 * Quick select implementation highly optimized for reducing the number of compute units in BPF.
 *
 * The article "On Floyd and Rivest's SELECT algorithm" by Krzysztof C. Kiwiel
 * mentions that the average case of this algorithm is 2.5n + o(n) comparisons.
 * The article itself presents a different algorithm (Floyd-Rivest) which
 * performs faster on larger arrays but has a higher overhead. Other deterministic
 * algorithms like median of medians (or combinations of them and quick select)
 * have a higher overhead and are not suitable for BPF.
 */
/* quick_select returns the k-th smallest element (0-indexed) of the n
   element array a.  The array is reordered in place as a side effect.
   Assumes n >= 1 and k in [0,n).  Iterative rather than recursive so the
   stack footprint is O(1). */
int64_t quick_select( int64_t * a, uint64_t n, uint64_t k ) {
  for(;;) {
    if( n==(uint64_t)1 ) return a[0];

    /* Choose the median of the first, middle and last elements as the
       pivot.  This greatly cuts the comparison count in the average
       case and makes the quadratic worst case far less likely to be
       hit on real inputs. */
    int64_t first = a[0];
    int64_t mid   = a[n/2];
    int64_t last  = a[n-1];
    int64_t pivot;
    if( first<mid ) pivot = (mid<last) ? mid : ((first<last) ? last : first);
    else            pivot = (mid>last) ? mid : ((first>last) ? last : first);

    /* Hoare-style partition around the pivot.  An element equal to the
       pivot advances neither cursor on its own; when both cursors sit
       on pivot-valued elements they are swapped (a no-op value-wise)
       and both cursors step past them.  This spreads duplicate pivots
       evenly to both sides, so arrays full of equal elements do not
       degenerate into the quadratic case.

       On exit the array looks like one of:
         1. ....hi lo....   (hi+1 == lo)
            a[x] <= pivot for x <= hi and a[x] >= pivot for x >= lo.
         2. ....hi . lo...  (hi+2 == lo)
            Same bounds as above, and the single element in the gap,
            a[hi+1], is exactly the pivot (the final step had both
            cursors on the same element, which is neither < pivot
            nor > pivot). */
    uint64_t lo = (uint64_t)0;
    uint64_t hi = n-(uint64_t)1;
    while( lo<=hi ) {
      if( a[lo]<pivot ) { lo++; continue; }
      if( a[hi]>pivot ) { hi--; continue; }
      int64_t t = a[lo];
      a[lo] = a[hi];
      a[hi] = t;
      lo++;
      hi--;
    }

    /* Narrow the search to whichever side still holds the k-th
       smallest element, or return the pivot when k lands exactly on
       the one-element gap of case 2 above. */
    if( k<hi+(uint64_t)1 ) n = hi+(uint64_t)1;                        /* left partition  */
    else if( hi+(uint64_t)2==lo && k==hi+(uint64_t)1 ) return pivot;  /* the gap element */
    else {                                                            /* right partition */
      a += hi+(uint64_t)1;
      n -= hi+(uint64_t)1;
      k -= hi+(uint64_t)1;
    }
  }
}

int64_t *
void
price_model_core( uint64_t cnt,
int64_t * quote,
int64_t * _p25,
int64_t * _p50,
int64_t * _p75,
void * scratch ) {

/* Sort the quotes. The sorting implementation used here is a highly
optimized mergesort (merge with an unrolled insertion sorting
network small n base cases). The best case is ~0.5 n lg n compares
and the average and worst cases are ~n lg n compares.
While not completely data oblivious, this has quite low variance in
operation count practically and this is _better_ than quicksort's
average case and quicksort's worst case is a computational
denial-of-service and timing attack vulnerable O(n^2). Unlike
quicksort, this is also stable (but this stability does not
currently matter ... it might be a factor in future models).
A data oblivious sorting network approach might be viable here with
and would have a completely deterministic operations count. It
currently isn't used as the best known practical approaches for
general n have a worse algorithmic cost (O( n (lg n)^2 )) and,
while the application probably doesn't need perfect obliviousness,
mergesort is still moderately oblivious and the application can
benefit from mergesort's lower operations cost. (The main drawback
of mergesort over quicksort is that it isn't in place, but memory
footprint isn't an issue here.)
Given the operations cost model (e.g. cache friendliness is not
incorporated), a radix sort might be viable here (O(n) in best /
average / worst). It currently isn't used as we expect invocations
with small-ish n to be common and radix sort would be have large
coefficients on the O(n) and additional fixed overheads that would
make it more expensive than mergesort in this regime.
Note: price_model_cnt_valid( cnt ) implies
int64_sort_ascending_cnt_valid( cnt ) currently.
Note: consider filtering out "NaN" quotes (i.e. INT64_MIN)? */

int64_t * sort_quote = int64_sort_ascending_stable( quote, cnt, scratch );

/* Extract the p25
There are many variants with subtle tradeoffs here. One option is
to interpolate when the ideal p25 is bracketed by two samples (akin
to the p50 interpolation above when the number of quotes is even).
That is, for p25, interpolate between quotes floor((cnt-2)/4) and
ceil((cnt-2)/4) with the weights determined by cnt mod 4. The
current preference is to not do that as it is slightly more
complex, doesn't exactly always minimize the current loss function
and is more exposed to the confidence intervals getting skewed by
bum quotes with the number of quotes is small.
Another option is to use the inside quote of the above pair. That
is, for p25, use quote ceil((cnt-2)/4) == floor((cnt+1)/4) ==
(cnt+1)>>2. The current preference is not to do this as, though
this has stronger bum quote robustness, it results in p25==p50==p75
when cnt==3. (In this case, the above wants to do an interpolation
between quotes 0 and 1 to for the p25 and between quotes 1 and 2
for the p75. But limiting to just the inside quote results in
p25/p50/p75 all using the median quote.)
A tweak to this option, for p25, is to use floor(cnt/4) == cnt>>2.
This is simple, has the same asymptotic behavior for large cnt, has
good behavior in the cnt==3 case and practically as good bum quote
rejection in the moderate cnt case. */

uint64_t p25_idx = cnt >> 2;

*_p25 = sort_quote[p25_idx];
int64_t * _p75) {

/* Extract the p50 */

if( (cnt & (uint64_t)1) ) { /* Odd number of quotes */

uint64_t p50_idx = cnt >> 1; /* ==ceil((cnt-1)/2) */

*_p50 = sort_quote[p50_idx];
*_p50 = quick_select(quote, cnt, p50_idx);

} else { /* Even number of quotes (at least 2) */

uint64_t p50_idx_right = cnt >> 1; /* == ceil((cnt-1)/2)> 0 */
uint64_t p50_idx_left = p50_idx_right - (uint64_t)1; /* ==floor((cnt-1)/2)>=0 (no overflow/underflow) */

int64_t vl = sort_quote[p50_idx_left ];
int64_t vr = sort_quote[p50_idx_right];
int64_t vl = quick_select(quote, cnt, p50_idx_left);
int64_t vr = quick_select(quote, cnt, p50_idx_right);

/* Compute the average of vl and vr (with floor / round toward
negative infinity rounding and without possibility of
Expand All @@ -102,11 +111,15 @@ price_model_core( uint64_t cnt,
*_p50 = avg_2_int64( vl, vr );
}

/* Extract the p25 */

uint64_t p25_idx = cnt >> 2;

*_p25 = quick_select(quote, cnt, p25_idx);

/* Extract the p75 (this is the mirror image of the p25 case) */

uint64_t p75_idx = cnt - ((uint64_t)1) - p25_idx;

*_p75 = sort_quote[p75_idx];

return sort_quote;
*_p75 = quick_select(quote, cnt, p75_idx);
}
83 changes: 2 additions & 81 deletions program/c/src/oracle/model/price_model.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,91 +8,12 @@
extern "C" {
#endif

/* Returns the minimum and maximum number of quotes the implementation
can handle */

static inline uint64_t
price_model_quote_min( void ) {
return (uint64_t)1;
}

static inline uint64_t
price_model_quote_max( void ) {
return (UINT64_MAX-(uint64_t)alignof(int64_t)+(uint64_t)1) / (uint64_t)sizeof(int64_t);
}

/* price_model_cnt_valid returns non-zero if cnt is a valid value or
zero if not. */

static inline int
price_model_cnt_valid( uint64_t cnt ) {
return price_model_quote_min()<=cnt && cnt<=price_model_quote_max();
}

/* price_model_scratch_footprint returns the number of bytes of scratch
space needed for an arbitrarily aligned scratch region required by
price_model to handle price_model_quote_min() to cnt quotes
inclusive. */

static inline uint64_t
price_model_scratch_footprint( uint64_t cnt ) { /* Assumes price_model_cnt_valid( cnt ) is true */
/* cnt int64_t's plus worst case alignment padding, no overflow
possible as cnt is valid at this point */
return cnt*(uint64_t)sizeof(int64_t)+(uint64_t)alignof(int64_t)-(uint64_t)1;
}

/* price_model_core minimizes (to quote precision in a floor / round
toward negative infinity sense) the loss model of the given quotes.
Assumes valid inputs (e.g. cnt is at least 1 and not unreasonably
large ... typically a multiple of 3 but this is not required,
quote[i] for i in [0,cnt) are the quotes of interest on input, p25,
p50, p75 point to where to write model outputs, scratch points to a
suitable footprint scratch region).
Returns a pointer to the quotes sorted in ascending order. As such,
the min and max and any other rank statistic can be extracted easily
on return. This location will either be quote itself or to a
location in scratch. Use price_model below for a variant that always
replaces quote with the sorted quotes (potentially has extra ops for
copying). Further, on return, *_p25, *_p50, *_p75 will hold the loss
model minimizing values for the input quotes and the scratch region
was clobbered.
Scratch points to a memory region of arbitrary alignment with at
least price_model_scratch_footprint( cnt ) bytes and it will be
clobbered on output. It is sufficient to use a normally aligned /
normally allocated / normally declared array of cnt int64_t's.
The cost of this function is a fast and low variance (but not
completely data oblivious) O(cnt lg cnt) in the best / average /
worst cases. This function uses no heap / dynamic memory allocation.
It is thread safe provided it passed non-conflicting quote, output
and scratch arrays. It has a bounded call depth ~lg cnt <= ~64 (this
could reduce to O(1) by using a non-recursive sort/select
implementation under the hood if desired). */

int64_t * /* Returns pointer to sorted quotes (either quote or ALIGN_UP(scratch,int64_t)) */
void
price_model_core( uint64_t cnt, /* Assumes price_model_cnt_valid( cnt ) is true */
int64_t * quote, /* Assumes quote[i] for i in [0,cnt) is the i-th quote on input */
int64_t * _p25, /* Assumes *_p25 is safe to write to the p25 model output */
int64_t * _p50, /* Assumes *_p50 " */
int64_t * _p75, /* Assumes *_p75 " */
void * scratch ); /* Assumes a suitable scratch region */

/* Same as the above but always returns quote and quote always holds the
sorted quotes on return. */

static inline int64_t *
price_model( uint64_t cnt,
int64_t * quote,
int64_t * _p25,
int64_t * _p50,
int64_t * _p75,
void * scratch ) {
int64_t * tmp = price_model_core( cnt, quote, _p25, _p50, _p75, scratch );
if( tmp!=quote ) for( uint64_t idx=(uint64_t)0; idx<cnt; idx++ ) quote[ idx ] = tmp[ idx ];
return quote;
}
int64_t * _p75); /* Assumes *_p75 " */

#ifdef __cplusplus
}
Expand Down
22 changes: 15 additions & 7 deletions program/c/src/oracle/model/test_price_model.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,14 @@ int test_price_model() {
prng_t _prng[1];
prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) );

# define N 96
# define N 192

int64_t quote0 [N];
int64_t quote [N];
int64_t val [3];
int64_t scratch[N];

for( int iter=0; iter<10000000; iter++ ) {

/* Generate a random test */

uint64_t cnt = (uint64_t)1 + (uint64_t)(prng_uint32( prng ) % (uint32_t)N); /* In [1,N], approx uniform IID */
Expand All @@ -36,21 +35,30 @@ int test_price_model() {
/* Apply the model */

memcpy( quote, quote0, sizeof(int64_t)*(size_t)cnt );
if( price_model( cnt, quote, val+0, val+1, val+2, scratch )!=quote ) { printf( "FAIL (compose)\n" ); return 1; }

price_model_core( cnt, quote, val+0, val+1, val+2 );

/* Validate the results */

qsort( quote0, (size_t)cnt, sizeof(int64_t), qcmp );
if( memcmp( quote, quote0, sizeof(int64_t)*(size_t)cnt ) ) { printf( "FAIL (sort)\n" ); return 1; }

uint64_t p25_idx = cnt>>2;
uint64_t p50_idx = cnt>>1;
uint64_t p75_idx = cnt - (uint64_t)1 - p25_idx;
uint64_t is_even = (uint64_t)!(cnt & (uint64_t)1);

if( val[0]!=quote[ p25_idx ] ) { printf( "FAIL (p25)\n" ); return 1; }
if( val[1]!=avg_2_int64( quote[ p50_idx-is_even ], quote[ p50_idx ] ) ) { printf( "FAIL (p50)\n" ); return 1; }
if( val[2]!=quote[ p75_idx ] ) { printf( "FAIL (p75)\n" ); return 1; }
if( val[0]!=quote0[ p25_idx ] ) {
printf( "FAILED (p25). given: %lld, expected: %lld\n", val[0], quote0[ p25_idx ] );
return 1;
}
if( val[1]!=avg_2_int64( quote0[ p50_idx-is_even ], quote0[ p50_idx ] ) ) {
printf( "FAIL (p50). given: %lld, expected: %lld\n", val[1], avg_2_int64( quote0[ p50_idx-is_even ], quote0[ p50_idx ] ) );
return 1;
}
if( val[2]!=quote0[ p75_idx ] ) {
printf( "FAIL (p75). given: %lld, expected: %lld\n", val[2], quote0[ p75_idx ] );
return 1;
}
}

# undef N
Expand Down
Loading

0 comments on commit 1524fb8

Please sign in to comment.