-
Notifications
You must be signed in to change notification settings - Fork 13
/
frsqrt.hh
450 lines (339 loc) · 9.89 KB
/
frsqrt.hh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
#ifndef __FRSQRT_HH
#define __FRSQRT_HH
//
// Functions for the fast computation of reciprocal square roots
// by means of Newton iteration, e.g. compute a good initial
// guess and iterate up to desired precision.
//
// Optimisations for/by
// Altivec : USE_RSQRT_ALTIVEC
// SSE/SSE2 : USE_RSQRT_SSE / USE_RSQRT_SSE2
// Lookup : Lookup table approach by Ken Turkowski
// USE_RSQRT_LOOKUP
// Magic No. : Magic numbers for initial guess
// USE_RSQRT_MAGIC
// Fallback : standard implementation via libm
// USE_RSQRT_LIBM
//
// If no manual override is specified, the optimisation is
// chosen automatically depending on available compiler.
//
// To initialise RSQRT, call init_rsqrt() first. Afterwards
// "rsqrtf" and "rsqrt" compute the reciprocal square root.
//
// manual override
//#define USE_RSQRT_LOOKUP
// define to minimise FPU usage
#define PURE_VECTOR
//
// automatically decide what specific implementation to use
// if not chosen before
//
#if !defined(USE_RSQRT_ALTIVEC) && !defined(USE_RSQRT_SSE2) && \
!defined(USE_RSQRT_SSE) && !defined(USE_RSQRT_LOOKUP) && \
!defined(USE_RSQRT_MAGIC) && !defined(USE_RSQRT_LIBM)
#if defined __GNUC__ && defined __APPLE_ALTIVEC__
# define USE_RSQRT_ALTIVEC
#elif defined __GNUC__ && defined __SSE__
# if defined __SSE2__
# define USE_RSQRT_SSE2
# else
# define USE_RSQRT_SSE
# endif
#else
# define USE_RSQRT_LIBM
#endif
#endif
//////////////////////////////////////////////////////
//
// GCC and Altivec
//
//////////////////////////////////////////////////////
#ifdef USE_RSQRT_ALTIVEC
#undef USE_RSQRT_ALTIVEC
#include <ppc_intrinsics.h>
inline float
rsqrtf ( float x )
{
float r = __frsqrtes( x );
r *= ((3.0f - r * r * x) * 0.5f);
r *= ((3.0f - r * r * x) * 0.5f);
return r;
}
inline double
rsqrt ( double x )
{
double r = __frsqrte( x );
r *= ((3.0 - r * r * x) * 0.5);
r *= ((3.0 - r * r * x) * 0.5);
r *= ((3.0 - r * r * x) * 0.5);
return r;
}
// dummy routine
inline void init_rsqrt () {}
#endif // USE_RSQRT_ALTIVEC
//////////////////////////////////////////////////////
//
// GCC / ICC with SSE
//
//////////////////////////////////////////////////////
#ifdef USE_RSQRT_SSE
#undef USE_RSQRT_SSE
#include <xmmintrin.h>
const __m128 f3 = _mm_set_ss(3.0f); // 3 as SSE value
const __m128 f05 = _mm_set_ss(0.5f); // 0.5 as SSE value
inline float
rsqrtf ( float x )
{
#ifdef PURE_VECTOR
__m128 xx = _mm_load_ss( & x );
__m128 xr = _mm_rsqrt_ss( xx );
__m128 xt;
xt = _mm_mul_ss( xr, xr );
xt = _mm_mul_ss( xt, xx );
xt = _mm_sub_ss( f3, xt );
xt = _mm_mul_ss( xt, f05 );
xr = _mm_mul_ss( xr, xt );
_mm_store_ss( & x, xr );
return x;
#else
float r;
_mm_store_ss( & r, _mm_rsqrt_ss( _mm_load_ss( & x ) ) );
r *= ((3.0f - r * r * x) * 0.5f);
return r;
#endif
}
inline double
rsqrt ( double x )
{
float arg = x;
_mm_store_ss( & arg, _mm_rsqrt_ss( _mm_load_ss( & arg ) ) );
double r = arg;
r *= ((3.0 - r * r * x) * 0.5);
r *= ((3.0 - r * r * x) * 0.5);
return r;
}
// dummy routine
inline void init_rsqrt () {}
#endif // USE_RSQRT_SSE
//////////////////////////////////////////////////////
//
// GCC / ICC with SSE2
//
//////////////////////////////////////////////////////
#ifdef USE_RSQRT_SSE2
#undef USE_RSQRT_SSE2
#include <emmintrin.h>
const __m128 f3 = _mm_set_ss(3.0f); // 3 as single precision SSE value
const __m128 f05 = _mm_set_ss(0.5f); // 0.5 as single precision SSE value
inline float
rsqrtf ( float x )
{
#ifdef PURE_VECTOR
__m128 xx = _mm_load_ss( & x );
__m128 xr = _mm_rsqrt_ss( xx );
__m128 xt;
xt = _mm_mul_ss( xr, xr );
xt = _mm_mul_ss( xt, xx );
xt = _mm_sub_ss( f3, xt );
xt = _mm_mul_ss( xt, f05 );
xr = _mm_mul_ss( xr, xt );
_mm_store_ss( & x, xr );
return x;
#else
float r;
_mm_store_ss( & r, _mm_rsqrt_ss( _mm_load_ss( & x ) ) );
r *= ((3.0f - r * r * x) * 0.5f);
return r;
#endif
}
const __m128d d3 = _mm_set_sd(3.0); // 3 as double precision SSE value
const __m128d d05 = _mm_set_sd(0.5); // 0.5 as double precision SSE value
inline double
rsqrt ( double x )
{
#ifdef PURE_VECTOR
float fx = x;
__m128d xx = _mm_load_sd( & x );
__m128d xr = _mm_cvtss_sd( xx, _mm_rsqrt_ss( _mm_load_ss( & fx ) ) );
__m128d xt;
xt = _mm_mul_sd( xr, xr );
xt = _mm_mul_sd( xt, xx );
xt = _mm_sub_sd( d3, xt );
xt = _mm_mul_sd( xt, d05 );
xr = _mm_mul_sd( xr, xt );
xt = _mm_mul_sd( xr, xr );
xt = _mm_mul_sd( xt, xx );
xt = _mm_sub_sd( d3, xt );
xt = _mm_mul_sd( xt, d05 );
xr = _mm_mul_sd( xr, xt );
_mm_store_sd( & x, xr );
return x;
#else
float arg = x;
_mm_store_ss( & arg, _mm_rsqrt_ss( _mm_load_ss( & arg ) ) );
double r = arg;
r *= ((3.0 - r * r * x) * 0.5);
r *= ((3.0 - r * r * x) * 0.5);
return r;
#endif
}
// dummy routine
inline void init_rsqrt () {}
#endif // USE_RSQRT_SSE2
//////////////////////////////////////////////////////
//
// lookup table and floating point arithmetics
//
//////////////////////////////////////////////////////
#ifdef USE_RSQRT_LOOKUP
#undef USE_RSQRT_LOOKUP
#include <math.h>
typedef unsigned char rsqrt_uchar;
typedef unsigned int rsqrt_uint;
typedef unsigned long rsqrt_ulong;
// desired precision
const rsqrt_uint PRECISION = 3;
// Specified parameters
const rsqrt_uint LOOKUP_BITS = 8; // Number of mantissa bits for lookup
const rsqrt_uint EXP_POS = 23; // Position of the exponent
const rsqrt_uint EXP_BIAS = 127; // Bias of exponent
// The mantissa is assumed to be just down from the exponent
// Derived parameters
const rsqrt_uint LOOKUP_POS = (EXP_POS-LOOKUP_BITS); // Position of mantissa lookup
const rsqrt_uint SEED_POS = (EXP_POS-8); // Position of mantissa seed
const rsqrt_uint TABLE_SIZE = (2 << LOOKUP_BITS); // Number of entries in table
const rsqrt_uint LOOKUP_MASK = (TABLE_SIZE - 1); // Mask for table input
// extract exponent
#define GET_EXP(a) (((a) >> EXP_POS) & 0xFF)
// set exponent
#define SET_EXP(a) ((a) << EXP_POS)
// extended mantissa MSB's
#define GET_E_MANT(a) (((a) >> LOOKUP_POS) & LOOKUP_MASK)
// set mantissa 8 MSB's
#define SET_MANT_SEED(a) (rsqrt_ulong(a) << SEED_POS)
static rsqrt_uchar rsqrt_table[ TABLE_SIZE ];
typedef union {
rsqrt_uint i;
float f;
} float2int;
static inline void
init_rsqrt ()
{
static bool rsqrt_init = false;
if ( rsqrt_init )
return;
rsqrt_init = true;
rsqrt_uchar * h = rsqrt_table;
for ( rsqrt_uint f = 0; f < TABLE_SIZE; f++ )
{
float2int fi, fo;
fi.i = ((EXP_BIAS-1) << EXP_POS) | (f << LOOKUP_POS);
fo.f = float( 1.0f / ::sqrtf(fi.f) );
// rounding
*h++ = rsqrt_uchar( ((fo.i + (1<<(SEED_POS-2))) >> SEED_POS) & 0xFF );
}// for
// special case for 1.0
rsqrt_table[ TABLE_SIZE / 2 ] = 0xFF;
}
//
// compute the Inverse Square Root
// of an IEEE single precision floating-point number.
// written by Ken Turkowski.
//
inline float
rsqrtf ( float x )
{
float2int seed;
rsqrt_uint a = reinterpret_cast< float2int * >( & x )->i;
seed.i = (SET_EXP(((3*EXP_BIAS-1) - GET_EXP(a)) >> 1) |
SET_MANT_SEED(rsqrt_table[GET_E_MANT (a)]));
// seed: accurate to LOOKUP_BITS
float r = seed.f;
// first iteration: accurate to 2*LOOKUP_BITS
r *= ((3.0f - r * r * x) * 0.5f);
// second iteration: accurate to 4*LOOKUP_BITS
if ( PRECISION >= 2 )
r *= ((3.0f - r * r * x) * 0.5f);
return r;
}
inline double
rsqrt ( double x )
{
float2int seed;
const float arg = x;
const rsqrt_uint a = reinterpret_cast< const float2int * >( & arg )->i;
seed.i = (SET_EXP(((3*EXP_BIAS-1) - GET_EXP(a)) >> 1) |
SET_MANT_SEED(rsqrt_table[GET_E_MANT (a)]));
// seed: accurate to LOOKUP_BITS
double r = seed.f;
// first iteration: accurate to 2*LOOKUP_BITS
r *= ((3.0 - r * r * x) * 0.5);
// second iteration: accurate to 4*LOOKUP_BITS
if ( PRECISION >= 2 )
r *= ((3.0 - r * r * x) * 0.5);
// third iteration: accurate to 8*LOOKUP_BITS
if ( PRECISION >= 3 )
r *= ((3.0 - r * r * x) * 0.5);
return r;
}
#endif // USE_RSQRT_LOOKUP
//////////////////////////////////////////////////////
//
// Using special (magic) constants as first guess
// (as is implemented in Quake 3 and analysed
// in "Fast Inverse Square Root" by Chris Lomont)
//
//////////////////////////////////////////////////////
#ifdef USE_RSQRT_MAGIC
#undef USE_RSQRT_MAGIC
inline float
rsqrtf ( float x )
{
const float xhalf = 0.5f * x;
int i = *(int*) & x;
i = 0x5f375a86 - ( i >> 1 );
x = *(float*) & i;
x = x * ( 1.5f - xhalf * x * x );
x = x * ( 1.5f - xhalf * x * x );
x = x * ( 1.5f - xhalf * x * x );
return x;
}
inline double
rsqrt ( double x )
{
const double xhalf = 0.5 * x;
long long i = *(long long*) & x;
i = 0x5fe6ec85e7de30dall - ( i >> 1 );
x = *(double*) & i;
x = x * ( 1.5 - xhalf * x * x );
x = x * ( 1.5 - xhalf * x * x );
x = x * ( 1.5 - xhalf * x * x );
x = x * ( 1.5 - xhalf * x * x );
return x;
}
// dummy routine
inline void init_rsqrt () {}
#endif // USE_RSQRT_MAGIC
//////////////////////////////////////////////////////
//
// direct computation with libm methods
//
//////////////////////////////////////////////////////
#ifdef USE_RSQRT_LIBM
#undef USE_RSQRT_LIBM
#include <math.h>
inline float
rsqrtf ( float x )
{
return 1.0 / ::sqrtf( x ); // requires C99
}
inline double
rsqrt ( double x )
{
return 1.0 / ::sqrt( x );
}
// dummy routine
inline void init_rsqrt () {}
#endif // USE_RSQRT_LIBM
#endif // __FRSQRT_HH