Skip to content

Commit

Permalink
refactor: replace sorting with quick select
Browse files Browse the repository at this point in the history
This change replaces the deterministic optimized merge-sort with a
highly optimized quick select that performs significantly faster when
the publisher count is more than 20. The previous algorithm is very fast
running on a normal CPU but doesn't work very well in BPF because all
instructions (like load, move, ...) have the same cost as comparisons
and are not optimized. Quick select, on the other hand, minimizes all
the operations in the average case due to its minimal reads and writes.

In 32-publisher setup, quick select reduces the CU from 16.5k to 11.5k
and in the 64-publisher setup 37k to 17.5k. The numbers are the worst
cases running on randomized input.

Worst measured case happens when all elements are equal and is
13k and 24k for 32-publisher and 64-publisher setup respectively.

Unfortunately there is no way to systematically get the compute
usage out of program test. The `test_benchmark` file contains simple
code that helps run benchmarks on various numbers of publishers.

The quick select algorithm has a quadratic worst case, but it is very
unlikely to occur, and no publisher alone can manipulate the list
of prices to trigger it.
  • Loading branch information
ali-bahjati committed May 14, 2024
1 parent 9acf172 commit 1524fb8
Show file tree
Hide file tree
Showing 12 changed files with 192 additions and 553 deletions.
1 change: 0 additions & 1 deletion program/c/makefile
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ cpyth-native: features.h
test: features.h
mkdir -p $(OUT_DIR)/test/
gcc -c ./src/oracle/model/test_price_model.c -o $(OUT_DIR)/test/test_price_model.o -fPIC
gcc -c ./src/oracle/sort/test_sort_stable.c -o $(OUT_DIR)/test/test_sort_stable.o -fPIC
gcc -c ./src/oracle/util/test_align.c -o $(OUT_DIR)/test/test_align.o -fPIC
gcc -c ./src/oracle/util/test_avg.c -o $(OUT_DIR)/test/test_avg.o -fPIC
gcc -c ./src/oracle/util/test_hash.c -o $(OUT_DIR)/test/test_hash.o -fPIC
Expand Down
169 changes: 91 additions & 78 deletions program/c/src/oracle/model/price_model.c
Original file line number Diff line number Diff line change
@@ -1,99 +1,108 @@
#include "price_model.h"
#include "../util/avg.h" /* For avg_2_int64 */

#define SORT_NAME int64_sort_ascending
#define SORT_KEY_T int64_t
#include "../sort/tmpl/sort_stable.c"
/*
 * Quick select implementation highly optimized for reducing the number of compute units in BPF.
 *
 * The article "On Floyd and Rivest's SELECT algorithm" by Krzysztof C. Kiwiel
 * mentions that the average case of this algorithm is 2.5n + o(n) comparisons.
 * The article itself presents a different algorithm (Floyd-Rivest) which
 * performs faster on larger arrays but has a higher overhead. Other deterministic
 * algorithms like median of medians (or combinations of them and quick select)
 * have a higher overhead and are not suitable for BPF.
 */
/* quick_select returns the k-th smallest element (0-indexed) of the n
   element array a.  The array is reordered in place as a side effect.
   Assumes n >= 1 and k in [0,n).  Iterative rather than recursive so the
   stack footprint is O(1). */
int64_t quick_select( int64_t * a, uint64_t n, uint64_t k ) {
  for(;;) {
    if( n==(uint64_t)1 ) return a[0];

    /* Choose the median of the first, middle and last elements as the
       pivot.  This greatly cuts the comparison count in the average
       case and makes the quadratic worst case far less likely to be
       hit on real inputs. */
    int64_t first = a[0];
    int64_t mid   = a[n/2];
    int64_t last  = a[n-1];
    int64_t pivot;
    if( first<mid ) pivot = (mid<last) ? mid : ((first<last) ? last : first);
    else            pivot = (mid>last) ? mid : ((first>last) ? last : first);

    /* Hoare-style partition around the pivot.  An element equal to the
       pivot advances neither cursor on its own; when both cursors sit
       on pivot-valued elements they are swapped (a no-op value-wise)
       and both cursors step past them.  This spreads duplicate pivots
       evenly to both sides, so arrays full of equal elements do not
       degenerate into the quadratic case.

       On exit the array looks like one of:
         1. ....hi lo....   (hi+1 == lo)
            a[x] <= pivot for x <= hi and a[x] >= pivot for x >= lo.
         2. ....hi . lo...  (hi+2 == lo)
            Same bounds as above, and the single element in the gap,
            a[hi+1], is exactly the pivot (the final step had both
            cursors on the same element, which is neither < pivot
            nor > pivot). */
    uint64_t lo = (uint64_t)0;
    uint64_t hi = n-(uint64_t)1;
    while( lo<=hi ) {
      if( a[lo]<pivot ) { lo++; continue; }
      if( a[hi]>pivot ) { hi--; continue; }
      int64_t t = a[lo];
      a[lo] = a[hi];
      a[hi] = t;
      lo++;
      hi--;
    }

    /* Narrow the search to whichever side still holds the k-th
       smallest element, or return the pivot when k lands exactly on
       the one-element gap of case 2 above. */
    if( k<hi+(uint64_t)1 ) n = hi+(uint64_t)1;                        /* left partition  */
    else if( hi+(uint64_t)2==lo && k==hi+(uint64_t)1 ) return pivot;  /* the gap element */
    else {                                                            /* right partition */
      a += hi+(uint64_t)1;
      n -= hi+(uint64_t)1;
      k -= hi+(uint64_t)1;
    }
  }
}

int64_t *
void
price_model_core( uint64_t cnt,
int64_t * quote,
int64_t * _p25,
int64_t * _p50,
int64_t * _p75,
void * scratch ) {

/* Sort the quotes. The sorting implementation used here is a highly
optimized mergesort (merge with an unrolled insertion sorting
network small n base cases). The best case is ~0.5 n lg n compares
and the average and worst cases are ~n lg n compares.
While not completely data oblivious, this has quite low variance in
operation count practically and this is _better_ than quicksort's
average case and quicksort's worst case is a computational
denial-of-service and timing attack vulnerable O(n^2). Unlike
quicksort, this is also stable (but this stability does not
currently matter ... it might be a factor in future models).
A data oblivious sorting network approach might be viable here with
and would have a completely deterministic operations count. It
currently isn't used as the best known practical approaches for
general n have a worse algorithmic cost (O( n (lg n)^2 )) and,
while the application probably doesn't need perfect obliviousness,
mergesort is still moderately oblivious and the application can
benefit from mergesort's lower operations cost. (The main drawback
of mergesort over quicksort is that it isn't in place, but memory
footprint isn't an issue here.)
Given the operations cost model (e.g. cache friendliness is not
incorporated), a radix sort might be viable here (O(n) in best /
average / worst). It currently isn't used as we expect invocations
with small-ish n to be common and radix sort would be have large
coefficients on the O(n) and additional fixed overheads that would
make it more expensive than mergesort in this regime.
Note: price_model_cnt_valid( cnt ) implies
int64_sort_ascending_cnt_valid( cnt ) currently.
Note: consider filtering out "NaN" quotes (i.e. INT64_MIN)? */

int64_t * sort_quote = int64_sort_ascending_stable( quote, cnt, scratch );

/* Extract the p25
There are many variants with subtle tradeoffs here. One option is
to interpolate when the ideal p25 is bracketed by two samples (akin
to the p50 interpolation above when the number of quotes is even).
That is, for p25, interpolate between quotes floor((cnt-2)/4) and
ceil((cnt-2)/4) with the weights determined by cnt mod 4. The
current preference is to not do that as it is slightly more
complex, doesn't exactly always minimize the current loss function
and is more exposed to the confidence intervals getting skewed by
bum quotes with the number of quotes is small.
Another option is to use the inside quote of the above pair. That
is, for p25, use quote ceil((cnt-2)/4) == floor((cnt+1)/4) ==
(cnt+1)>>2. The current preference is not to do this as, though
this has stronger bum quote robustness, it results in p25==p50==p75
when cnt==3. (In this case, the above wants to do an interpolation
between quotes 0 and 1 to for the p25 and between quotes 1 and 2
for the p75. But limiting to just the inside quote results in
p25/p50/p75 all using the median quote.)
A tweak to this option, for p25, is to use floor(cnt/4) == cnt>>2.
This is simple, has the same asymptotic behavior for large cnt, has
good behavior in the cnt==3 case and practically as good bum quote
rejection in the moderate cnt case. */

uint64_t p25_idx = cnt >> 2;

*_p25 = sort_quote[p25_idx];
int64_t * _p75) {

/* Extract the p50 */

if( (cnt & (uint64_t)1) ) { /* Odd number of quotes */

uint64_t p50_idx = cnt >> 1; /* ==ceil((cnt-1)/2) */

*_p50 = sort_quote[p50_idx];
*_p50 = quick_select(quote, cnt, p50_idx);

} else { /* Even number of quotes (at least 2) */

uint64_t p50_idx_right = cnt >> 1; /* == ceil((cnt-1)/2)> 0 */
uint64_t p50_idx_left = p50_idx_right - (uint64_t)1; /* ==floor((cnt-1)/2)>=0 (no overflow/underflow) */

int64_t vl = sort_quote[p50_idx_left ];
int64_t vr = sort_quote[p50_idx_right];
int64_t vl = quick_select(quote, cnt, p50_idx_left);
int64_t vr = quick_select(quote, cnt, p50_idx_right);

/* Compute the average of vl and vr (with floor / round toward
negative infinity rounding and without possibility of
Expand All @@ -102,11 +111,15 @@ price_model_core( uint64_t cnt,
*_p50 = avg_2_int64( vl, vr );
}

/* Extract the p25 */

uint64_t p25_idx = cnt >> 2;

*_p25 = quick_select(quote, cnt, p25_idx);

/* Extract the p75 (this is the mirror image of the p25 case) */

uint64_t p75_idx = cnt - ((uint64_t)1) - p25_idx;

*_p75 = sort_quote[p75_idx];

return sort_quote;
*_p75 = quick_select(quote, cnt, p75_idx);
}
83 changes: 2 additions & 81 deletions program/c/src/oracle/model/price_model.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,91 +8,12 @@
extern "C" {
#endif

/* Returns the minimum and maximum number of quotes the implementation
can handle */

static inline uint64_t
price_model_quote_min( void ) {
return (uint64_t)1;
}

static inline uint64_t
price_model_quote_max( void ) {
return (UINT64_MAX-(uint64_t)alignof(int64_t)+(uint64_t)1) / (uint64_t)sizeof(int64_t);
}

/* price_model_cnt_valid returns non-zero if cnt is a valid value or
zero if not. */

static inline int
price_model_cnt_valid( uint64_t cnt ) {
return price_model_quote_min()<=cnt && cnt<=price_model_quote_max();
}

/* price_model_scratch_footprint returns the number of bytes of scratch
space needed for an arbitrarily aligned scratch region required by
price_model to handle price_model_quote_min() to cnt quotes
inclusive. */

static inline uint64_t
price_model_scratch_footprint( uint64_t cnt ) { /* Assumes price_model_cnt_valid( cnt ) is true */
/* cnt int64_t's plus worst case alignment padding, no overflow
possible as cnt is valid at this point */
return cnt*(uint64_t)sizeof(int64_t)+(uint64_t)alignof(int64_t)-(uint64_t)1;
}

/* price_model_core minimizes (to quote precision in a floor / round
toward negative infinity sense) the loss model of the given quotes.
Assumes valid inputs (e.g. cnt is at least 1 and not unreasonably
large ... typically a multiple of 3 but this is not required,
quote[i] for i in [0,cnt) are the quotes of interest on input, p25,
p50, p75 point to where to write model outputs, scratch points to a
suitable footprint scratch region).
Returns a pointer to the quotes sorted in ascending order. As such,
the min and max and any other rank statistic can be extracted easily
on return. This location will either be quote itself or to a
location in scratch. Use price_model below for a variant that always
replaces quote with the sorted quotes (potentially has extra ops for
copying). Further, on return, *_p25, *_p50, *_p75 will hold the loss
model minimizing values for the input quotes and the scratch region
was clobbered.
Scratch points to a memory region of arbitrary alignment with at
least price_model_scratch_footprint( cnt ) bytes and it will be
clobbered on output. It is sufficient to use a normally aligned /
normally allocated / normally declared array of cnt int64_t's.
The cost of this function is a fast and low variance (but not
completely data oblivious) O(cnt lg cnt) in the best / average /
worst cases. This function uses no heap / dynamic memory allocation.
It is thread safe provided it passed non-conflicting quote, output
and scratch arrays. It has a bounded call depth ~lg cnt <= ~64 (this
could reduce to O(1) by using a non-recursive sort/select
implementation under the hood if desired). */

int64_t * /* Returns pointer to sorted quotes (either quote or ALIGN_UP(scratch,int64_t)) */
void
price_model_core( uint64_t cnt, /* Assumes price_model_cnt_valid( cnt ) is true */
int64_t * quote, /* Assumes quote[i] for i in [0,cnt) is the i-th quote on input */
int64_t * _p25, /* Assumes *_p25 is safe to write to the p25 model output */
int64_t * _p50, /* Assumes *_p50 " */
int64_t * _p75, /* Assumes *_p75 " */
void * scratch ); /* Assumes a suitable scratch region */

/* Same as the above but always returns quote and quote always holds the
sorted quotes on return. */

static inline int64_t *
price_model( uint64_t cnt,
int64_t * quote,
int64_t * _p25,
int64_t * _p50,
int64_t * _p75,
void * scratch ) {
int64_t * tmp = price_model_core( cnt, quote, _p25, _p50, _p75, scratch );
if( tmp!=quote ) for( uint64_t idx=(uint64_t)0; idx<cnt; idx++ ) quote[ idx ] = tmp[ idx ];
return quote;
}
int64_t * _p75); /* Assumes *_p75 " */

#ifdef __cplusplus
}
Expand Down
22 changes: 15 additions & 7 deletions program/c/src/oracle/model/test_price_model.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,14 @@ int test_price_model() {
prng_t _prng[1];
prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) );

# define N 96
# define N 192

int64_t quote0 [N];
int64_t quote [N];
int64_t val [3];
int64_t scratch[N];

for( int iter=0; iter<10000000; iter++ ) {

/* Generate a random test */

uint64_t cnt = (uint64_t)1 + (uint64_t)(prng_uint32( prng ) % (uint32_t)N); /* In [1,N], approx uniform IID */
Expand All @@ -36,21 +35,30 @@ int test_price_model() {
/* Apply the model */

memcpy( quote, quote0, sizeof(int64_t)*(size_t)cnt );
if( price_model( cnt, quote, val+0, val+1, val+2, scratch )!=quote ) { printf( "FAIL (compose)\n" ); return 1; }

price_model_core( cnt, quote, val+0, val+1, val+2 );

/* Validate the results */

qsort( quote0, (size_t)cnt, sizeof(int64_t), qcmp );
if( memcmp( quote, quote0, sizeof(int64_t)*(size_t)cnt ) ) { printf( "FAIL (sort)\n" ); return 1; }

uint64_t p25_idx = cnt>>2;
uint64_t p50_idx = cnt>>1;
uint64_t p75_idx = cnt - (uint64_t)1 - p25_idx;
uint64_t is_even = (uint64_t)!(cnt & (uint64_t)1);

if( val[0]!=quote[ p25_idx ] ) { printf( "FAIL (p25)\n" ); return 1; }
if( val[1]!=avg_2_int64( quote[ p50_idx-is_even ], quote[ p50_idx ] ) ) { printf( "FAIL (p50)\n" ); return 1; }
if( val[2]!=quote[ p75_idx ] ) { printf( "FAIL (p75)\n" ); return 1; }
if( val[0]!=quote0[ p25_idx ] ) {
printf( "FAILED (p25). given: %lld, expected: %lld\n", val[0], quote0[ p25_idx ] );
return 1;
}
if( val[1]!=avg_2_int64( quote0[ p50_idx-is_even ], quote0[ p50_idx ] ) ) {
printf( "FAIL (p50). given: %lld, expected: %lld\n", val[1], avg_2_int64( quote0[ p50_idx-is_even ], quote0[ p50_idx ] ) );
return 1;
}
if( val[2]!=quote0[ p75_idx ] ) {
printf( "FAIL (p75). given: %lld, expected: %lld\n", val[2], quote0[ p75_idx ] );
return 1;
}
}

# undef N
Expand Down
Loading

0 comments on commit 1524fb8

Please sign in to comment.