diff --git a/Makefile b/Makefile index 498a46e5..2a5764b4 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,10 @@ test : cd purec; make test cd simd; make testsse2 testavx - cd java; make test clean : - rm -f *~ - cd java; make clean + rm -f *~ *.out cd purec; make clean cd simd; make clean cd tester; make clean + cd gencoef; make clean diff --git a/README b/README index d398cbce..f418ffc2 100644 --- a/README +++ b/README @@ -21,14 +21,25 @@ Main download page : http://shibatch.sourceforge.net/ The trigonometric functions are tested to return values with specified accuracy when the argument is within the following range. -Double precision trigonometric functions : |arg| <= 10000000 -Single precision trigonometric functions : |arg| <= 10000 +Double precision trigonometric functions : [-1e+14, 1e+14] +Single precision trigonometric functions : [-39000, 39000] -- History +2.110 +* The valid range of argument is extended for trig functions +* Specification of each functions regarding to the domain and accuracy is added +* A coefficient generation tool is added +* New testing tools are introduced +* Following functions returned incorrect values when the argument is very large or small : exp, pow, asinh, acosh +* SIMD xsin and xcos returned values more than 1 when FMA is enabled +* Pure C cbrt returned incorrect values when the argument is negative +* tan_u1 returned values with more than 1 ulp of error on rare occasions +* Removed support for Java language(because no one seems using this) + 2.100 Added support for AVX-512F and Clang Extended Vectors. 2.90 Added ilogbf. All the reported bugs(listed below) are fixed. diff --git a/purec/Makefile b/purec/Makefile index 5790d0d8..4a771cb2 100644 --- a/purec/Makefile +++ b/purec/Makefile @@ -15,6 +15,10 @@ iut : sleefdp.c sleefsp.c iut.c ../tester/testerspu1 : cd ../tester; make testerspu1 +tester2 : tester2.c tester2f.c sleefdp.c sleefsp.c + $(CC) -ffp-contract=off -Wall -DNDEBUG tester2.c sleefdp.c -o tester2 -lm -lmpfr + $(CC) -ffp-contract=off -Wall -DNDEBUG tester2f.c sleefsp.c -o tester2f -lm -lmpfr + test : iut ../tester/tester ../tester/testeru1 ../tester/testersp ../tester/testerspu1 ../tester/tester ./iut ../tester/testeru1 ./iut @@ -22,4 +26,4 @@ test : iut ../tester/tester ../tester/testeru1 ../tester/testersp ../tester/test ../tester/testerspu1 ./iut clean : - rm -f *~ *.o iut + rm -f *~ *.o iut tester2 tester2f diff --git a/purec/sleefdp.c b/purec/sleefdp.c index 0d3f74da..c0bccae1 100644 --- a/purec/sleefdp.c +++ b/purec/sleefdp.c @@ -9,10 +9,16 @@ #include "nonnumber.h" -#define PI4_A 0.78539816290140151978 -#define PI4_B 4.9604678871439933374e-10 -#define PI4_C 1.1258708853173288931e-18 -#define PI4_D 1.7607799325916000908e-27 +#define PI_A 3.1415926218032836914 +#define PI_B 3.1786509424591713469e-08 +#define PI_C 1.2246467864107188502e-16 +#define PI_D 1.2736634327021899816e-24 + +#define M_2_PI_H 0.63661977236758138243 +#define M_2_PI_L -3.9357353350364971764e-17 + +#define TRIGRANGEMAX 1e+14 +#define SQRT_DBL_MAX 1.3407807929942596355e+154 #define M_4_PI 1.273239544735162542821171882678754627704620361328125 @@ -20,6 +26,25 @@ #define L2L .28235290563031577122588448175013436025525412068e-12 #define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931 +char frstr[1000]; +char *toBC(double d) { + union { + double d; + uint64_t u64; + int64_t i64; + } cnv; + + cnv.d = d; + + int64_t l = cnv.i64; + int e = (int)((l >> 52) & ~(-1L << 11)); + int s = (int)(l >> 63); + l = d == 0 ? 0 : ((l & ~((-1L) << 52)) | (1L << 52)); + + sprintf(frstr, "%s%lld*2^%d", s != 0 ? "-" : "", (long long int)l, (e-0x3ff-52)); + return frstr; +} + static inline int64_t doubleToRawLongBits(double d) { union { double f; @@ -49,6 +74,7 @@ static inline double mulsign(double x, double y) { static inline double sign(double d) { return mulsign(1, d); } static inline double mla(double x, double y, double z) { return x * y + z; } static inline double xrint(double x) { return x < 0 ? (int)(x - 0.5) : (int)(x + 0.5); } +static inline double xtrunc(double x) { return (double)(int)x; } static inline int xisnan(double x) { return x != x; } static inline int xisinf(double x) { return x == INFINITY || x == -INFINITY; } @@ -56,6 +82,11 @@ static inline int xisminf(double x) { return x == -INFINITY; } static inline int xispinf(double x) { return x == INFINITY; } static inline int xisnegzero(double x) { return doubleToRawLongBits(x) == doubleToRawLongBits(-0.0); } +static inline int xisint(double d) { + double x = d - (double)(1 << 31) * (int)(d * (1.0 / (1 << 31))); + return (x == (int)x) || (xfabs(d) >= (double)(1LL << 52)); +} + static inline double pow2i(int q) { return longBitsToDouble(((int64_t)(q + 0x3ff)) << 52); } @@ -149,7 +180,10 @@ static inline double2 ddadd_d2_d_d(double x, double y) { double2 r; #ifndef NDEBUG - if (!(checkfp(x) || checkfp(y) || xfabs(x) >= xfabs(y))) fprintf(stderr, "[ddadd_d2_d_d : %g, %g]", x, y); + if (!(checkfp(x) || checkfp(y) || xfabs(x) >= xfabs(y))) { + fprintf(stderr, "[ddadd_d2_d_d : %g, %g]\n", x, y); + fflush(stderr); + } #endif r.x = x + y; @@ -174,7 +208,10 @@ static inline double2 ddadd_d2_d2_d(double2 x, double y) { double2 r; #ifndef NDEBUG - if (!(checkfp(x.x) || checkfp(y) || xfabs(x.x) >= xfabs(y))) fprintf(stderr, "[ddadd_d2_d2_d : %g %g]", x.x, y); + if (!(checkfp(x.x) || checkfp(y) || xfabs(x.x) >= xfabs(y))) { + fprintf(stderr, "[ddadd_d2_d2_d : %g %g]\n", x.x, y); + fflush(stderr); + } #endif r.x = x.x + y; @@ -202,7 +239,10 @@ static inline double2 ddadd_d2_d_d2(double x, double2 y) { double2 r; #ifndef NDEBUG - if (!(checkfp(x) || checkfp(y.x) || xfabs(x) >= xfabs(y.x))) fprintf(stderr, "[ddadd_d2_d_d2 : %g %g]", x, y.x); + if (!(checkfp(x) || checkfp(y.x) || xfabs(x) >= xfabs(y.x))) { + fprintf(stderr, "[ddadd_d2_d_d2 : %g %g]\n", x, y.x); + fflush(stderr); + } #endif r.x = x + y.x; @@ -227,7 +267,10 @@ static inline double2 ddadd_d2_d2_d2(double2 x, double2 y) { double2 r; #ifndef NDEBUG - if (!(checkfp(x.x) || checkfp(y.x) || xfabs(x.x) >= xfabs(y.x))) fprintf(stderr, "[ddadd_d2_d2_d2 : %g %g]", x.x, y.x); + if (!(checkfp(x.x) || checkfp(y.x) || xfabs(x.x) >= xfabs(y.x))) { + fprintf(stderr, "[ddadd_d2_d2_d2 : %g %g]\n", x.x, y.x); + fflush(stderr); + } #endif r.x = x.x + y.x; @@ -253,7 +296,10 @@ static inline double2 ddsub_d2_d2_d2(double2 x, double2 y) { double2 r; #ifndef NDEBUG - if (!(checkfp(x.x) || checkfp(y.x) || xfabs(x.x) >= xfabs(y.x))) fprintf(stderr, "[ddsub_d2_d2_d2 : %g %g]", x.x, y.x); + if (!(checkfp(x.x) || checkfp(y.x) || xfabs(x.x) >= xfabs(y.x))) { + fprintf(stderr, "[ddsub_d2_d2_d2 : %g %g]\n", x.x, y.x); + fflush(stderr); + } #endif r.x = x.x - y.x; @@ -481,8 +527,9 @@ static double2 atan2k_u1(double2 y, double2 x) { t = ddmul_d2_d2_d(t, u); t = ddmul_d2_d2_d2(s, ddadd_d2_d_d2(1, t)); + if (xfabs(s.x) < 1e-200) t = s; t = ddadd2_d2_d2_d2(ddmul_d2_d2_d(dd(1.570796326794896557998982, 6.12323399573676603586882e-17), q), t); - + return t; } @@ -521,19 +568,22 @@ double xatan_u1(double d) { } double xsin(double d) { - int q; - double u, s; - - q = (int)xrint(d * M_1_PI); - - d = mla(q, -PI4_A*4, d); - d = mla(q, -PI4_B*4, d); - d = mla(q, -PI4_C*4, d); - d = mla(q, -PI4_D*4, d); - + double u, s, t = d; + + int qh = xtrunc(d * (M_1_PI / (1 << 24))); + int ql = xrint(d * M_1_PI - qh * (double)(1 << 24)); + + d = mla(qh, -PI_A * (1 << 24), d); + d = mla(ql, -PI_A, d); + d = mla(qh, -PI_B * (1 << 24), d); + d = mla(ql, -PI_B, d); + d = mla(qh, -PI_C * (1 << 24), d); + d = mla(ql, -PI_C, d); + d = mla((double)qh * (1 << 24) + ql, -PI_D, d); + s = d * d; - if ((q & 1) != 0) d = -d; + if ((ql & 1) != 0) d = -d; u = -7.97255955009037868891952e-18; u = mla(u, s, 2.81009972710863200091251e-15); @@ -547,23 +597,26 @@ double xsin(double d) { u = mla(s, u * d, d); - if (xisnegzero(d)) u = -0.0; + if (!xisinf(t) && (xisnegzero(t) || xfabs(t) > TRIGRANGEMAX)) u = -0.0; return u; } double xsin_u1(double d) { - int q; double u; double2 s, t, x; - q = (int)xrint(d * M_1_PI); - - s = ddadd2_d2_d_d(d, q * (-PI4_A*4)); - s = ddadd2_d2_d2_d(s, q * (-PI4_B*4)); - s = ddadd2_d2_d2_d(s, q * (-PI4_C*4)); - s = ddadd2_d2_d2_d(s, q * (-PI4_D*4)); + int qh = xtrunc(d * (M_1_PI / (1 << 24))); + int ql = xrint(d * M_1_PI - qh * (double)(1 << 24)); + s = ddadd2_d2_d_d (d, qh * (-PI_A * (1 << 24))); + s = ddadd2_d2_d2_d(s, ql * (-PI_A )); + s = ddadd2_d2_d2_d(s, qh * (-PI_B * (1 << 24))); + s = ddadd2_d2_d2_d(s, ql * (-PI_B )); + s = ddadd2_d2_d2_d(s, qh * (-PI_C * (1 << 24))); + s = ddadd2_d2_d2_d(s, ql * (-PI_C )); + s = ddadd2_d2_d2_d(s, ((double)qh * (1 << 24) + ql) * -PI_D); + t = s; s = ddsqu_d2_d2(s); @@ -576,30 +629,33 @@ double xsin_u1(double d) { u = mla(u, s.x, 0.00833333333333318056201922); x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(-0.166666666666666657414808, u * s.x), s)); - + x = ddmul_d2_d2_d2(t, x); u = x.x + x.y; - if ((q & 1) != 0) u = -u; - if (xisnegzero(d)) u = -0.0; + if ((ql & 1) != 0) u = -u; + if (!xisinf(d) && (xisnegzero(d) || xfabs(d) > TRIGRANGEMAX)) u = -0.0; return u; } double xcos(double d) { - int q; - double u, s; - - q = 1 + 2*(int)xrint(d * M_1_PI - 0.5); - - d = mla(q, -PI4_A*2, d); - d = mla(q, -PI4_B*2, d); - d = mla(q, -PI4_C*2, d); - d = mla(q, -PI4_D*2, d); - + double u, s, t = d; + + int qh = xtrunc(d * (M_1_PI / (1LL << 23)) - 0.5 * (M_1_PI / (1LL << 23))); + int ql = 2*xrint(d * M_1_PI - 0.5 - qh * (double)(1LL << 23))+1; + + d = mla(qh, -PI_A*0.5*(1LL << 24), d); + d = mla(ql, -PI_A*0.5, d); + d = mla(qh, -PI_B*0.5*(1LL << 24), d); + d = mla(ql, -PI_B*0.5, d); + d = mla(qh, -PI_C*0.5*(1LL << 24), d); + d = mla(ql, -PI_C*0.5, d); + d = mla((double)qh*(1LL << 24) + ql , -PI_D*0.5, d); + s = d * d; - if ((q & 2) == 0) d = -d; + if ((ql & 2) == 0) d = -d; u = -7.97255955009037868891952e-18; u = mla(u, s, 2.81009972710863200091251e-15); @@ -613,22 +669,28 @@ double xcos(double d) { u = mla(s, u * d, d); + if (!xisinf(t) && xfabs(t) > TRIGRANGEMAX) u = 0.0; + return u; } double xcos_u1(double d) { - double u, q; + double u; double2 s, t, x; - d = fabs(d); - - q = mla(2, xrint(d * M_1_PI - 0.5), 1); + d = xfabs(d); - s = ddadd2_d2_d_d(d, q * (-PI4_A*2)); - s = ddadd2_d2_d2_d(s, q * (-PI4_B*2)); - s = ddadd2_d2_d2_d(s, q * (-PI4_C*2)); - s = ddadd2_d2_d2_d(s, q * (-PI4_D*2)); + int qh = xtrunc(d * (M_1_PI / (1LL << (23))) - 0.5 * (M_1_PI / (1LL << (23)))); + int ql = 2*xrint(d * M_1_PI - 0.5 - qh * (double)(1LL << (23)))+1; + s = ddadd2_d2_d_d (d, qh * (-PI_A*0.5 * (1 << 24))); + s = ddadd2_d2_d2_d(s, ql * (-PI_A*0.5 )); + s = ddadd2_d2_d2_d(s, qh * (-PI_B*0.5 * (1 << 24))); + s = ddadd2_d2_d2_d(s, ql * (-PI_B*0.5 )); + s = ddadd2_d2_d2_d(s, qh * (-PI_C*0.5 * (1 << 24))); + s = ddadd2_d2_d2_d(s, ql * (-PI_C*0.5 )); + s = ddadd2_d2_d2_d(s, ((double)qh * (1 << 24) + ql) * (-PI_D*0.5)); + t = s; s = ddsqu_d2_d2(s); @@ -646,25 +708,29 @@ double xcos_u1(double d) { u = x.x + x.y; - if ((((int)q) & 2) == 0) u = -u; + if ((((int)ql) & 2) == 0) u = -u; + if (!xisinf(d) && d > TRIGRANGEMAX) u = 0.0; return u; } double2 xsincos(double d) { - int q; double u, s, t; double2 r; - q = (int)xrint(d * (2 * M_1_PI)); - s = d; - s = mla(-q, PI4_A*2, s); - s = mla(-q, PI4_B*2, s); - s = mla(-q, PI4_C*2, s); - s = mla(-q, PI4_D*2, s); + int qh = xtrunc(d * ((2 * M_1_PI) / (1 << 24))); + int ql = xrint(d * (2 * M_1_PI) - qh * (double)(1 << 24)); + s = mla(qh, -PI_A * 0.5 * (1 << 24), s); + s = mla(ql, -PI_A * 0.5, s); + s = mla(qh, -PI_B * 0.5 * (1 << 24), s); + s = mla(ql, -PI_B * 0.5, s); + s = mla(qh, -PI_C * 0.5 * (1 << 24), s); + s = mla(ql, -PI_C * 0.5, s); + s = mla((double)qh * (1 << 24) + ql, -PI_D * 0.5, s); + t = s; s = s * s; @@ -691,27 +757,31 @@ double2 xsincos(double d) { r.y = u * s + 1; - if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; } - if ((q & 2) != 0) { r.x = -r.x; } - if (((q+1) & 2) != 0) { r.y = -r.y; } + if ((ql & 1) != 0) { s = r.y; r.y = r.x; r.x = s; } + if ((ql & 2) != 0) { r.x = -r.x; } + if (((ql+1) & 2) != 0) { r.y = -r.y; } if (xisinf(d)) { r.x = r.y = NAN; } + if (!xisinf(d) && xfabs(d) > TRIGRANGEMAX) { r.x = r.y = 0; } return r; } double2 xsincos_u1(double d) { - int q; double u; double2 r, s, t, x; - q = (int)xrint(d * (2 * M_1_PI)); - - s = ddadd2_d2_d_d(d, q * (-PI4_A*2)); - s = ddadd2_d2_d2_d(s, q * (-PI4_B*2)); - s = ddadd2_d2_d2_d(s, q * (-PI4_C*2)); - s = ddadd2_d2_d2_d(s, q * (-PI4_D*2)); + int qh = xtrunc(d * ((2 * M_1_PI) / (1 << 24))); + int ql = xrint(d * (2 * M_1_PI) - qh * (double)(1 << 24)); + s = ddadd2_d2_d_d (d, qh * (-PI_A*0.5 * (1 << 24))); + s = ddadd2_d2_d2_d(s, ql * (-PI_A*0.5 )); + s = ddadd2_d2_d2_d(s, qh * (-PI_B*0.5 * (1 << 24))); + s = ddadd2_d2_d2_d(s, ql * (-PI_B*0.5 )); + s = ddadd2_d2_d2_d(s, qh * (-PI_C*0.5 * (1 << 24))); + s = ddadd2_d2_d2_d(s, ql * (-PI_C*0.5 )); + s = ddadd2_d2_d2_d(s, ((double)qh * (1 << 24) + ql) * (-PI_D*0.5)); + t = s; s = ddsqu_d2_d2(s); s.x = s.x + s.y; @@ -741,49 +811,54 @@ double2 xsincos_u1(double d) { x = ddadd_d2_d_d2(1, ddmul_d2_d_d(s.x, u)); r.y = x.x + x.y; - if ((q & 1) != 0) { u = r.y; r.y = r.x; r.x = u; } - if ((q & 2) != 0) { r.x = -r.x; } - if (((q+1) & 2) != 0) { r.y = -r.y; } + if ((ql & 1) != 0) { u = r.y; r.y = r.x; r.x = u; } + if ((ql & 2) != 0) { r.x = -r.x; } + if (((ql+1) & 2) != 0) { r.y = -r.y; } if (xisinf(d)) { r.x = r.y = NAN; } + if (!xisinf(d) && xfabs(d) > TRIGRANGEMAX) { r.x = r.y = 0; } return r; } double xtan(double d) { - int q; double u, s, x; - q = (int)xrint(d * (2 * M_1_PI)); - - x = mla(q, -PI4_A*2, d); - x = mla(q, -PI4_B*2, x); - x = mla(q, -PI4_C*2, x); - x = mla(q, -PI4_D*2, x); + int qh = xtrunc(d * ((2 * M_1_PI) / (1 << 24))); + int ql = xrint(d * (2 * M_1_PI) - qh * (double)(1 << 24)); + x = mla(qh, -PI_A * 0.5 * (1 << 24), d); + x = mla(ql, -PI_A * 0.5, x); + x = mla(qh, -PI_B * 0.5 * (1 << 24), x); + x = mla(ql, -PI_B * 0.5, x); + x = mla(qh, -PI_C * 0.5 * (1 << 24), x); + x = mla(ql, -PI_C * 0.5, x); + x = mla((double)qh * (1 << 24) + ql, -PI_D * 0.5, x); + s = x * x; - if ((q & 1) != 0) x = -x; - - u = 1.01419718511083373224408e-05; - u = mla(u, s, -2.59519791585924697698614e-05); - u = mla(u, s, 5.23388081915899855325186e-05); - u = mla(u, s, -3.05033014433946488225616e-05); - u = mla(u, s, 7.14707504084242744267497e-05); - u = mla(u, s, 8.09674518280159187045078e-05); - u = mla(u, s, 0.000244884931879331847054404); - u = mla(u, s, 0.000588505168743587154904506); - u = mla(u, s, 0.00145612788922812427978848); - u = mla(u, s, 0.00359208743836906619142924); - u = mla(u, s, 0.00886323944362401618113356); - u = mla(u, s, 0.0218694882853846389592078); - u = mla(u, s, 0.0539682539781298417636002); - u = mla(u, s, 0.133333333333125941821962); - u = mla(u, s, 0.333333333333334980164153); - + if ((ql & 1) != 0) x = -x; + + u = 9.99583485362149960784268e-06; + u = mla(u, s, -4.31184585467324750724175e-05); + u = mla(u, s, 0.000103573238391744000389851); + u = mla(u, s, -0.000137892809714281708733524); + u = mla(u, s, 0.000157624358465342784274554); + u = mla(u, s, -6.07500301486087879295969e-05); + u = mla(u, s, 0.000148898734751616411290179); + u = mla(u, s, 0.000219040550724571513561967); + u = mla(u, s, 0.000595799595197098359744547); + u = mla(u, s, 0.00145461240472358871965441); + u = mla(u, s, 0.0035923150771440177410343); + u = mla(u, s, 0.00886321546662684547901456); + u = mla(u, s, 0.0218694899718446938985394); + u = mla(u, s, 0.0539682539049961967903002); + u = mla(u, s, 0.133333333334818976423364); + u = mla(u, s, 0.333333333333320047664472); + u = mla(s, u * x, x); - if ((q & 1) != 0) u = 1.0 / u; + if ((ql & 1) != 0) u = 1.0 / u; if (xisinf(d)) u = NAN; @@ -791,18 +866,22 @@ double xtan(double d) { } double xtan_u1(double d) { - int q; double u; double2 s, t, x; - q = (int)xrint(d * M_2_PI); - - s = ddadd2_d2_d_d(d, q * (-PI4_A*2)); - s = ddadd2_d2_d2_d(s, q * (-PI4_B*2)); - s = ddadd2_d2_d2_d(s, q * (-PI4_C*2)); - s = ddadd2_d2_d2_d(s, q * (-PI4_D*2)); - - if ((q & 1) != 0) s = ddneg_d2_d2(s); + int qh = xtrunc(d * (M_2_PI / (1 << 24))); + s = ddadd2_d2_d2_d(ddmul_d2_d2_d(dd(M_2_PI_H, M_2_PI_L), d), (d < 0 ? -0.5 : 0.5) - qh * (double)(1 << 24)); + int ql = s.x + s.y; + + s = ddadd2_d2_d_d (d, qh * (-PI_A*0.5 * (1 << 24))); + s = ddadd2_d2_d2_d(s, ql * (-PI_A*0.5 )); + s = ddadd2_d2_d2_d(s, qh * (-PI_B*0.5 * (1 << 24))); + s = ddadd2_d2_d2_d(s, ql * (-PI_B*0.5 )); + s = ddadd2_d2_d2_d(s, qh * (-PI_C*0.5 * (1 << 24))); + s = ddadd2_d2_d2_d(s, ql * (-PI_C*0.5 )); + s = ddadd2_d2_d2_d(s, ((double)qh * (1 << 24) + ql) * (-PI_D*0.5)); + + if ((ql & 1) != 0) s = ddneg_d2_d2(s); t = s; s = ddsqu_d2_d2(s); @@ -825,11 +904,11 @@ double xtan_u1(double d) { x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(0.333333333333334980164153, u * s.x), s)); x = ddmul_d2_d2_d2(t, x); - if ((q & 1) != 0) x = ddrec_d2_d2(x); + if ((ql & 1) != 0) x = ddrec_d2_d2(x); u = x.x + x.y; - if (xisnegzero(d)) u = -0.0; + if (!xisinf(d) && (xisnegzero(d) || xfabs(d) > TRIGRANGEMAX)) u = -0.0; return u; } @@ -852,7 +931,7 @@ double xlog(double d) { t = mla(t, x2, 0.399999999950799600689777); t = mla(t, x2, 0.6666666666667778740063); t = mla(t, x2, 2); - + x = x * t + 0.693147180559945286226764 * e; if (xisinf(d)) x = INFINITY; @@ -884,8 +963,8 @@ double xexp(double d) { u = s * s * u + s + 1; u = ldexpk(u, q); - if (xisminf(d)) u = 0; - + if (d < -1000) u = 0; + return u; } @@ -900,17 +979,21 @@ static inline double2 logk(double d) { x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m)); x2 = ddsqu_d2_d2(x); - t = 0.13860436390467167910856; - t = mla(t, x2.x, 0.131699838841615374240845); - t = mla(t, x2.x, 0.153914168346271945653214); - t = mla(t, x2.x, 0.181816523941564611721589); - t = mla(t, x2.x, 0.22222224632662035403996); - t = mla(t, x2.x, 0.285714285511134091777308); - t = mla(t, x2.x, 0.400000000000914013309483); - t = mla(t, x2.x, 0.666666666666664853302393); + t = 0.116255524079935043668677; + t = mla(t, x2.x, 0.103239680901072952701192); + t = mla(t, x2.x, 0.117754809412463995466069); + t = mla(t, x2.x, 0.13332981086846273921509); + t = mla(t, x2.x, 0.153846227114512262845736); + t = mla(t, x2.x, 0.181818180850050775676507); + t = mla(t, x2.x, 0.222222222230083560345903); + t = mla(t, x2.x, 0.285714285714249172087875); + t = mla(t, x2.x, 0.400000000000000077715612); + double2 c = dd(0.666666666666666629659233, 3.80554962542412056336616e-17); return ddadd2_d2_d2_d2(ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e), - ddadd2_d2_d2_d2(ddscale_d2_d2_d(x, 2), ddmul_d2_d2_d(ddmul_d2_d2_d2(x2, x), t))); + ddadd2_d2_d2_d2(ddscale_d2_d2_d(x, 2), + ddmul_d2_d2_d2(ddmul_d2_d2_d2(x2, x), + ddadd2_d2_d2_d2(ddmul_d2_d2_d(x2, t), c)))); } double xlog_u1(double d) { @@ -948,17 +1031,22 @@ static inline double expk(double2 d) { t = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(ddsqu_d2_d2(s), u)); t = ddadd_d2_d_d2(1, t); - return ldexpk(t.x + t.y, q); + + u = ldexpk(t.x + t.y, q); + + if (d.x < -1000) u = 0; + + return u; } double xpow(double x, double y) { - int yisint = (int)y == y; + int yisint = xisint(y); int yisodd = (1 & (int)y) != 0 && yisint; double result = expk(ddmul_d2_d2_d(logk(xfabs(x)), y)); result = xisnan(result) ? INFINITY : result; - result *= (x >= 0 ? 1 : (!yisint ? NAN : (yisodd ? -1 : 1))); + result *= (x > 0 ? 1 : (!yisint ? NAN : (yisodd ? -1 : 1))); double efx = mulsign(xfabs(x) - 1, y); if (xisinf(y)) result = efx < 0 ? 0.0 : (efx == 0 ? 1.0 : INFINITY); @@ -991,7 +1079,7 @@ static inline double2 expk2(double2 d) { t = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(ddsqu_d2_d2(s), u)); t = ddadd_d2_d_d2(1, t); - return ddscale_d2_d2_d(t, pow2i(q)); + return ddscale_d2_d2_d(ddscale_d2_d2_d(t, 2), pow2i(q-1)); } double xsinh(double x) { @@ -1040,13 +1128,13 @@ static inline double2 logk2(double2 d) { double2 x, x2, m; double t; int e; - + e = ilogbk(d.x * (1.0/0.75)); m = ddscale_d2_d2_d(d, pow2i(-e)); x = dddiv_d2_d2_d2(ddadd2_d2_d2_d(m, -1), ddadd2_d2_d2_d(m, 1)); x2 = ddsqu_d2_d2(x); - + t = 0.13860436390467167910856; t = mla(t, x2.x, 0.131699838841615374240845); t = mla(t, x2.x, 0.153914168346271945653214); @@ -1062,21 +1150,27 @@ static inline double2 logk2(double2 d) { double xasinh(double x) { double y = xfabs(x); - double2 d = logk2(ddadd_d2_d2_d(ddsqrt_d2_d2(ddadd2_d2_d2_d(ddmul_d2_d_d(y, y), 1)), y)); + double2 d; + + d = y > 1 ? ddrec_d2_d(x) : dd(y, 0); + d = ddsqrt_d2_d2(ddadd2_d2_d2_d(ddsqu_d2_d2(d), 1)); + d = y > 1 ? ddmul_d2_d2_d(d, y) : d; + + d = logk2(ddnormalize_d2_d2(ddadd_d2_d2_d(d, x))); y = d.x + d.y; - y = xisinf(x) || xisnan(y) ? INFINITY : y; - y = mulsign(y, x); + y = (xfabs(x) > SQRT_DBL_MAX || xisnan(y)) ? mulsign(INFINITY, x) : y; y = xisnan(x) ? NAN : y; - + y = xisnegzero(x) ? -0.0 : y; + return y; } double xacosh(double x) { - double2 d = logk2(ddadd2_d2_d2_d(ddsqrt_d2_d2(ddadd2_d2_d2_d(ddmul_d2_d_d(x, x), -1)), x)); + double2 d = logk2(ddadd2_d2_d2_d(ddmul_d2_d2_d2(ddsqrt_d2_d2(ddadd2_d2_d_d(x, 1)), ddsqrt_d2_d2(ddadd2_d2_d_d(x, -1))), x)); double y = d.x + d.y; - y = xisinf(x) || xisnan(y) ? INFINITY : y; + y = (x > SQRT_DBL_MAX || xisnan(y)) ? INFINITY : y; y = x == 1.0 ? 0.0 : y; y = x < 1.0 ? NAN : y; y = xisnan(x) ? NAN : y; @@ -1149,7 +1243,7 @@ double xcbrt(double d) { // max error : 2 ulps double x, y, q = 1.0; int e, r; - e = ilogbk(d)+1; + e = ilogbk(xfabs(d))+1; d = ldexpk(d, -e); r = (e + 6144) % 3; q = (r == 1) ? 1.2599210498948731647672106 : q; @@ -1178,7 +1272,7 @@ double xcbrt_u1(double d) { double2 q2 = dd(1, 0), u, v; int e, r; - e = ilogbk(d)+1; + e = ilogbk(xfabs(d))+1; d = ldexpk(d, -e); r = (e + 6144) % 3; q2 = (r == 1) ? dd(1.2599210498948731907, -2.5899333753005069177e-17) : q2; @@ -1218,14 +1312,14 @@ double xcbrt_u1(double d) { double xexp2(double a) { double u = expk(ddmul_d2_d2_d(dd(0.69314718055994528623, 2.3190468138462995584e-17), a)); - if (a > 1023) u = INFINITY; + if (a > 1024) u = INFINITY; // log2(DBL_MAX) if (xisminf(a)) u = 0; return u; } double xexp10(double a) { double u = expk(ddmul_d2_d2_d(dd(2.3025850929940459011, -2.1707562233822493508e-16), a)); - if (a > 308) u = INFINITY; + if (a > 308.254715559916743850652254) u = INFINITY; // log10(DBL_MAX) if (xisminf(a)) u = 0; return u; } @@ -1233,8 +1327,8 @@ double xexp10(double a) { double xexpm1(double a) { double2 d = ddadd2_d2_d2_d(expk2(dd(a, 0)), -1.0); double x = d.x + d.y; - if (a > 700) x = INFINITY; - if (a < -0.36043653389117156089696070315825181539851971360337e+2) x = -1; + if (a > 709.782712893383996732223) x = INFINITY; // log(DBL_MAX) + if (a < -36.736800569677101399113302437) x = -1; // log(1 - nexttoward(1, 0)) if (xisnegzero(a)) x = -0.0; return x; } @@ -1254,7 +1348,7 @@ double xlog1p(double a) { double2 d = logk2(ddadd2_d2_d_d(a, 1)); double x = d.x + d.y; - if (xisinf(a)) x = INFINITY; + if (a > 1e+307) x = INFINITY; if (a < -1) x = NAN; if (a == -1) x = -INFINITY; if (xisnegzero(a)) x = -0.0; diff --git a/purec/sleefsp.c b/purec/sleefsp.c index f88c8b51..b792a636 100644 --- a/purec/sleefsp.c +++ b/purec/sleefsp.c @@ -9,10 +9,16 @@ #include "nonnumber.h" -#define PI4_Af 0.78515625f -#define PI4_Bf 0.00024187564849853515625f -#define PI4_Cf 3.7747668102383613586e-08f -#define PI4_Df 1.2816720341285448015e-12f +#define PI_Af 3.140625f +#define PI_Bf 0.0009670257568359375f +#define PI_Cf 6.2771141529083251953e-07f +#define PI_Df 1.2154201256553420762e-10f + +#define PI_XDf 1.2141754268668591976e-10f +#define PI_XEf 1.2446743939339977025e-13f + +#define TRIGRANGEMAXf 1e+7 // 39000 +#define SQRT_FLT_MAX 18446743523953729536.0 #define L2Uf 0.693145751953125f #define L2Lf 1.428606765330187045e-06f @@ -346,14 +352,14 @@ static inline float2 dfsqrt_f2_f2(float2 d) { float xsinf(float d) { int q; - float u, s; + float u, s, t = d; q = (int)xrintf(d * (float)M_1_PI); - d = mlaf(q, -PI4_Af*4, d); - d = mlaf(q, -PI4_Bf*4, d); - d = mlaf(q, -PI4_Cf*4, d); - d = mlaf(q, -PI4_Df*4, d); + d = mlaf(q, -PI_Af, d); + d = mlaf(q, -PI_Bf, d); + d = mlaf(q, -PI_Cf, d); + d = mlaf(q, -PI_Df, d); s = d * d; @@ -367,8 +373,8 @@ float xsinf(float d) { u = mlaf(s, u * d, d); - if (xisinff(d)) u = NANf; - if (xisnegzerof(d)) u = -0.0f; + if (xisnegzerof(t) || xfabsf(t) > TRIGRANGEMAXf) u = -0.0f; + if (xisinff(t)) u = NANf; return u; } @@ -380,10 +386,10 @@ float xsinf_u1(float d) { q = (int)xrintf(d * (float)M_1_PI); - s = dfadd2_f2_f_f(d, q * (-PI4_Af*4)); - s = dfadd2_f2_f2_f(s, q * (-PI4_Bf*4)); - s = dfadd2_f2_f2_f(s, q * (-PI4_Cf*4)); - s = dfadd2_f2_f2_f(s, q * (-PI4_Df*4)); + s = dfadd2_f2_f_f (d, q * (-PI_Af)); + s = dfadd2_f2_f2_f(s, q * (-PI_Bf)); + s = dfadd2_f2_f2_f(s, q * (-PI_Cf)); + s = dfadd2_f2_f2_f(s, q * (-PI_Df)); t = s; s = dfsqu_f2_f2(s); @@ -398,21 +404,21 @@ float xsinf_u1(float d) { u = x.x + x.y; if ((q & 1) != 0) u = -u; - if (xisnegzerof(d)) u = -0.0f; + if (!xisinff(d) && (xisnegzerof(d) || xfabsf(d) > TRIGRANGEMAXf)) u = -0.0f; return u; } float xcosf(float d) { int q; - float u, s; + float u, s, t = d; q = 1 + 2*(int)xrintf(d * (float)M_1_PI - 0.5f); - d = mlaf(q, -PI4_Af*2, d); - d = mlaf(q, -PI4_Bf*2, d); - d = mlaf(q, -PI4_Cf*2, d); - d = mlaf(q, -PI4_Df*2, d); + d = mlaf(q, -PI_Af*0.5f, d); + d = mlaf(q, -PI_Bf*0.5f, d); + d = mlaf(q, -PI_Cf*0.5f, d); + d = mlaf(q, -PI_Df*0.5f, d); s = d * d; @@ -425,8 +431,9 @@ float xcosf(float d) { u = mlaf(s, u * d, d); - if (xisinff(d)) u = NANf; - + if (xfabsf(t) > TRIGRANGEMAXf) u = 0.0f; + if (xisinff(t)) u = NANf; + return u; } @@ -434,14 +441,14 @@ float xcosf_u1(float d) { float u, q; float2 s, t, x; - d = fabsf(d); + d = xfabsf(d); q = 1 + 2*(int)xrintf(d * (float)M_1_PI - 0.5f); - s = dfadd2_f2_f_f(d, q * (-PI4_Af*2)); - s = dfadd2_f2_f2_f(s, q * (-PI4_Bf*2)); - s = dfadd2_f2_f2_f(s, q * (-PI4_Cf*2)); - s = dfadd2_f2_f2_f(s, q * (-PI4_Df*2)); + s = dfadd2_f2_f_f (d, q * (-PI_Af*0.5f)); + s = dfadd2_f2_f2_f(s, q * (-PI_Bf*0.5f)); + s = dfadd2_f2_f2_f(s, q * (-PI_Cf*0.5f)); + s = dfadd2_f2_f2_f(s, q * (-PI_Df*0.5f)); t = s; s = dfsqu_f2_f2(s); @@ -456,7 +463,7 @@ float xcosf_u1(float d) { u = x.x + x.y; if ((((int)q) & 2) == 0) u = -u; - + if (!xisinff(d) && d > TRIGRANGEMAXf) u = 0.0f; return u; } @@ -469,10 +476,10 @@ float2 xsincosf(float d) { s = d; - s = mlaf(q, -PI4_Af*2, s); - s = mlaf(q, -PI4_Bf*2, s); - s = mlaf(q, -PI4_Cf*2, s); - s = mlaf(q, -PI4_Df*2, s); + s = mlaf(q, -PI_Af*0.5f, s); + s = mlaf(q, -PI_Bf*0.5f, s); + s = mlaf(q, -PI_Cf*0.5f, s); + s = mlaf(q, -PI_Df*0.5f, s); t = s; @@ -499,6 +506,7 @@ float2 xsincosf(float d) { if ((q & 2) != 0) { r.x = -r.x; } if (((q+1) & 2) != 0) { r.y = -r.y; } + if (xfabsf(d) > TRIGRANGEMAXf) { r.x = r.y = 0; } if (xisinff(d)) { r.x = r.y = NANf; } return r; @@ -511,10 +519,10 @@ float2 xsincosf_u1(float d) { q = (int)xrintf(d * (float)(2 * M_1_PI)); - s = dfadd2_f2_f_f(d, q * (-PI4_Af*2)); - s = dfadd2_f2_f2_f(s, q * (-PI4_Bf*2)); - s = dfadd2_f2_f2_f(s, q * (-PI4_Cf*2)); - s = dfadd2_f2_f2_f(s, q * (-PI4_Df*2)); + s = dfadd2_f2_f_f (d, q * (-PI_Af*0.5f)); + s = dfadd2_f2_f2_f(s, q * (-PI_Bf*0.5f)); + s = dfadd2_f2_f2_f(s, q * (-PI_Cf*0.5f)); + s = dfadd2_f2_f2_f(s, q * (-PI_Df*0.5f)); t = s; s = dfsqu_f2_f2(s); @@ -543,6 +551,7 @@ float2 xsincosf_u1(float d) { if ((q & 2) != 0) { r.x = -r.x; } if (((q+1) & 2) != 0) { r.y = -r.y; } + if (xfabsf(d) > TRIGRANGEMAXf) { r.x = r.y = 0; } if (xisinff(d)) { r.x = r.y = NAN; } return r; @@ -556,10 +565,10 @@ float xtanf(float d) { x = d; - x = mlaf(q, -PI4_Af*2, x); - x = mlaf(q, -PI4_Bf*2, x); - x = mlaf(q, -PI4_Cf*2, x); - x = mlaf(q, -PI4_Df*2, x); + x = mlaf(q, -PI_Af*0.5f, x); + x = mlaf(q, -PI_Bf*0.5f, x); + x = mlaf(q, -PI_Cf*0.5f, x); + x = mlaf(q, -PI_Df*0.5f, x); s = x * x; @@ -587,11 +596,12 @@ float xtanf_u1(float d) { float2 s, t, x; q = (int)xrintf(d * (float)(2 * M_1_PI)); - - s = dfadd2_f2_f_f(d, q * (-PI4_Af*2)); - s = dfadd2_f2_f2_f(s, q * (-PI4_Bf*2)); - s = dfadd2_f2_f2_f(s, q * (-PI4_Cf*2)); - s = dfadd2_f2_f2_f(s, q * (-PI4_Df*2)); + + s = dfadd2_f2_f_f (d, q * (-PI_Af*0.5f)); + s = dfadd2_f2_f2_f (s, q * (-PI_Bf*0.5f)); + s = dfadd2_f2_f2_f (s, q * (-PI_Cf*0.5f)); + s = dfadd2_f2_f2_f (s, q * (-PI_XDf*0.5f)); + s = dfadd2_f2_f2_f (s, q * (-PI_XEf*0.5f)); if ((q & 1) != 0) s = dfneg_f2_f2(s); @@ -613,7 +623,7 @@ float xtanf_u1(float d) { u = x.x + x.y; - if (xisnegzerof(d)) u = -0.0f; + if (!xisinff(d) && (xisnegzerof(d) || xfabsf(d) > TRIGRANGEMAXf)) u = -0.0f; return u; } @@ -709,9 +719,6 @@ static float2 atan2kf_u1(float2 y, float2 x) { u = mlaf(u, t.x, -0.142626821994781494140625f); u = mlaf(u, t.x, 0.199983194470405578613281f); - //u = mlaf(u, t.x, -0.333332866430282592773438f); - //t = dfmul_f2_f2_f(t, u); - t = dfmul_f2_f2_f2(t, dfadd_f2_f_f(-0.333332866430282592773438f, u * t.x)); t = dfmul_f2_f2_f2(s, dfadd_f2_f_f2(1, t)); t = dfadd2_f2_f2_f2(dfmul_f2_f2_f(df(1.5707963705062866211f, -4.3711388286737928865e-08f), q), t); @@ -785,24 +792,29 @@ float xexpf(float d) { s = mlaf(q, -L2Uf, d); s = mlaf(q, -L2Lf, s); +#if 0 u = 0.00136324646882712841033936f; u = mlaf(u, s, 0.00836596917361021041870117f); u = mlaf(u, s, 0.0416710823774337768554688f); u = mlaf(u, s, 0.166665524244308471679688f); u = mlaf(u, s, 0.499999850988388061523438f); - +#else + u = 0.000198527617612853646278381; + u = mlaf(u, s, 0.00139304355252534151077271); + u = mlaf(u, s, 0.00833336077630519866943359); + u = mlaf(u, s, 0.0416664853692054748535156); + u = mlaf(u, s, 0.166666671633720397949219); + u = mlaf(u, s, 0.5); +#endif + u = s * s * u + s + 1.0f; u = ldexpkf(u, q); - if (xisminff(d)) u = 0; + if (d < -104) u = 0; return u; } -//#define L2Af 0.693145751953125 -//#define L2Bf 1.4285906217992305756e-06 -//#define L2Cf 1.619850954759360917e-11 - static inline float expkf(float2 d) { int q = (int)xrintf((d.x + d.y) * R_LN2f); float2 s, t; @@ -811,10 +823,6 @@ static inline float expkf(float2 d) { s = dfadd2_f2_f2_f(d, q * -L2Uf); s = dfadd2_f2_f2_f(s, q * -L2Lf); - //s = dfadd2_f2_f2_f(d, q * -L2Af); - //s = dfadd2_f2_f2_f(s, q * -L2Bf); - //s = dfadd2_f2_f2_f(s, q * -L2Cf); - s = dfnormalize_f2_f2(s); u = 0.00136324646882712841033936f; @@ -826,7 +834,12 @@ static inline float expkf(float2 d) { t = dfadd_f2_f2_f2(s, dfmul_f2_f2_f(dfsqu_f2_f2(s), u)); t = dfadd_f2_f_f2(1, t); - return ldexpkf(t.x + t.y, q); + + u = ldexpkf(t.x + t.y, q); + + if (d.x < -104) u = 0; + + return u; } static inline float2 logkf(float d) { @@ -836,17 +849,19 @@ static inline float2 logkf(float d) { e = ilogbkf(d * (1.0f/0.75f)); m = ldexpkf(d, -e); - + x = dfdiv_f2_f2_f2(dfadd2_f2_f_f(-1, m), dfadd2_f2_f_f(1, m)); x2 = dfsqu_f2_f2(x); - t = 0.2392828464508056640625f; - t = mlaf(t, x2.x, 0.28518211841583251953125f); - t = mlaf(t, x2.x, 0.400005877017974853515625f); - t = mlaf(t, x2.x, 0.666666686534881591796875f); + t = 0.240320354700088500976562; + t = mlaf(t, x2.x, 0.285112679004669189453125); + t = mlaf(t, x2.x, 0.400007992982864379882812); + float2 c = df(0.66666662693023681640625f, 3.69183861259614332084311e-09f); return dfadd2_f2_f2_f2(dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), e), - dfadd2_f2_f2_f2(dfscale_f2_f2_f(x, 2), dfmul_f2_f2_f(dfmul_f2_f2_f2(x2, x), t))); + dfadd2_f2_f2_f2(dfscale_f2_f2_f(x, 2), + dfmul_f2_f2_f2(dfmul_f2_f2_f2(x2, x), + dfadd2_f2_f2_f2(dfmul_f2_f2_f(x2, t), c)))); } float xlogf_u1(float d) { @@ -877,11 +892,11 @@ static inline float2 expk2f(float2 d) { t = dfadd_f2_f2_f2(s, dfmul_f2_f2_f(dfsqu_f2_f2(s), u)); t = dfadd_f2_f_f2(1, t); - return dfscale_f2_f2_f(t, pow2if(q)); + return dfscale_f2_f2_f(dfscale_f2_f2_f(t, 2), pow2if(q-1)); } float xpowf(float x, float y) { - int yisint = (int)y == y; + int yisint = (y == (int)y) || (xfabsf(y) >= (float)(1LL << 23)); int yisodd = (1 & (int)y) != 0 && yisint; float result = expkf(dfmul_f2_f2_f(logkf(xfabsf(x)), y)); @@ -932,7 +947,7 @@ float xtanhf(float x) { d = dfdiv_f2_f2_f2(dfsub_f2_f2_f2(d, e), dfadd_f2_f2_f2(d, e)); y = d.x + d.y; - y = xfabsf(x) > 8.664339742f ? 1.0f : y; + y = xfabsf(x) > 18.714973875f ? 1.0f : y; y = xisnanf(y) ? 1.0f : y; y = mulsignf(y, x); y = xisnanf(x) ? NANf : y; @@ -962,21 +977,27 @@ static inline float2 logk2f(float2 d) { float xasinhf(float x) { float y = xfabsf(x); - float2 d = logk2f(dfadd2_f2_f2_f(dfsqrt_f2_f2(dfadd2_f2_f2_f(dfmul_f2_f_f(y, y), 1)), y)); - y = d.x + d.y; + float2 d; - y = xisinff(x) || xisnanf(y) ? INFINITYf : y; - y = mulsignf(y, x); + d = y > 1 ? dfrec_f2_f(x) : df(y, 0); + d = dfsqrt_f2_f2(dfadd2_f2_f2_f(dfsqu_f2_f2(d), 1)); + d = y > 1 ? dfmul_f2_f2_f(d, y) : d; + + d = logk2f(dfnormalize_f2_f2(dfadd_f2_f2_f(d, x))); + y = d.x + d.y; + + y = (xfabsf(x) > SQRT_FLT_MAX || xisnanf(y)) ? mulsignf(INFINITYf, x) : y; y = xisnanf(x) ? NANf : y; + y = xisnegzerof(x) ? -0.0f : y; return y; } float xacoshf(float x) { - float2 d = logk2f(dfadd2_f2_f2_f(dfsqrt_f2_f2(dfadd2_f2_f2_f(dfmul_f2_f_f(x, x), -1)), x)); + float2 d = logk2f(dfadd2_f2_f2_f(dfmul_f2_f2_f2(dfsqrt_f2_f2(dfadd2_f2_f_f(x, 1)), dfsqrt_f2_f2(dfadd2_f2_f_f(x, -1))), x)); float y = d.x + d.y; - y = xisinff(x) || xisnanf(y) ? INFINITYf : y; + y = (x > SQRT_FLT_MAX || xisnanf(y)) ? INFINITYf : y; y = x == 1.0f ? 0.0f : y; y = x < 1.0f ? NANf : y; y = xisnanf(x) ? NANf : y; @@ -987,7 +1008,7 @@ float xacoshf(float x) { float xatanhf(float x) { float y = xfabsf(x); float2 d = logk2f(dfdiv_f2_f2_f2(dfadd2_f2_f_f(1, y), dfadd2_f2_f_f(1, -y))); - y = y > 1.0 ? NANf : (y == 1.0 ? INFINITYf : (d.x + d.y) * 0.5f); + y = y > 1.0f ? NANf : (y == 1.0f ? INFINITYf : (d.x + d.y) * 0.5f); y = xisinff(x) || xisnanf(y) ? NANf : y; y = mulsignf(y, x); @@ -998,14 +1019,14 @@ float xatanhf(float x) { float xexp2f(float a) { float u = expkf(dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), a)); - if (xispinff(a)) u = INFINITYf; + if (a >= 128) u = INFINITYf; if (xisminff(a)) u = 0; return u; } float xexp10f(float a) { float u = expkf(dfmul_f2_f2_f(df(2.3025851249694824219f, -3.1975436520781386207e-08f), a)); - if (xispinff(a)) u = INFINITYf; + if (a > 38.531839419103626f) u = INFINITYf; if (xisminff(a)) u = 0; return u; } @@ -1013,8 +1034,8 @@ float xexp10f(float a) { float xexpm1f(float a) { float2 d = dfadd2_f2_f2_f(expk2f(df(a, 0)), -1.0f); float x = d.x + d.y; - if (a > 88.0f) x = INFINITYf; - if (a < -0.15942385152878742116596338793538061065739925620174e+2f) x = -1; + if (a > 88.72283905206835f) x = INFINITYf; + if (a < -16.635532333438687426013570f) x = -1; if (xisnegzerof(a)) x = -0.0f; return x; } @@ -1034,7 +1055,7 @@ float xlog1pf(float a) { float2 d = logk2f(dfadd2_f2_f_f(a, 1)); float x = d.x + d.y; - if (xisinff(a)) x = INFINITYf; + if (a > 1e+38) x = INFINITYf; if (a < -1) x = NANf; if (a == -1) x = -INFINITYf; if (xisnegzerof(a)) x = -0.0f; @@ -1048,7 +1069,7 @@ float xcbrtf(float d) { float x, y, q = 1.0f; int e, r; - e = ilogbkf(d)+1; + e = ilogbkf(xfabsf(d))+1; d = ldexpkf(d, -e); r = (e + 6144) % 3; q = (r == 1) ? 1.2599210498948731647672106f : q; @@ -1076,7 +1097,7 @@ float xcbrtf_u1(float d) { float2 q2 = df(1, 0), u, v; int e, r; - e = ilogbkf(d)+1; + e = ilogbkf(xfabsf(d))+1; d = ldexpkf(d, -e); r = (e + 6144) % 3; q2 = (r == 1) ? df(1.2599210739135742188, -2.4018701694217270415e-08) : q2; diff --git a/purec/starttester2.sh b/purec/starttester2.sh new file mode 100644 index 00000000..5fabae17 --- /dev/null +++ b/purec/starttester2.sh @@ -0,0 +1,3 @@ +#!/bin/sh +nohup nice ./tester2 >> ../tester2.out & +nohup nice ./tester2f >> ../tester2.out & diff --git a/purec/stoptester2.sh b/purec/stoptester2.sh new file mode 100644 index 00000000..e777caca --- /dev/null +++ b/purec/stoptester2.sh @@ -0,0 +1,2 @@ +#!/bin/sh +killall tester2 tester2f diff --git a/purec/tester2.c b/purec/tester2.c new file mode 100644 index 00000000..e65252f0 --- /dev/null +++ b/purec/tester2.c @@ -0,0 +1,514 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#define _GNU_SOURCE +#include +#include +#include + +#include "sleef.h" + +mpfr_t fra, frb, frc, frd, frw, frx, fry, frz; + +#define DENORMAL_DBL_MIN (4.94066e-324) + +double countULP(double d, mpfr_t c) { + double c2 = mpfr_get_d(c, GMP_RNDN); + if (c2 == 0 && d != 0) return 10000; + if (!isfinite(c2) && !isfinite(d)) return 0; + + int e; + frexpl(mpfr_get_d(c, GMP_RNDN), &e); + mpfr_set_ld(frw, fmaxl(ldexpl(1.0, e-53), DENORMAL_DBL_MIN), GMP_RNDN); + + mpfr_set_d(frd, d, GMP_RNDN); + mpfr_sub(fry, frd, c, GMP_RNDN); + mpfr_div(fry, fry, frw, GMP_RNDN); + double u = fabs(mpfr_get_d(fry, GMP_RNDN)); + + return u; +} + +double countULP2(double d, mpfr_t c) { + double c2 = mpfr_get_d(c, GMP_RNDN); + if (c2 == 0 && d != 0) return 10000; + if (!isfinite(c2) && !isfinite(d)) return 0; + + int e; + frexpl(mpfr_get_d(c, GMP_RNDN), &e); + mpfr_set_ld(frw, fmaxl(ldexpl(1.0, e-53), DBL_MIN), GMP_RNDN); + + mpfr_set_d(frd, d, GMP_RNDN); + mpfr_sub(fry, frd, c, GMP_RNDN); + mpfr_div(fry, fry, frw, GMP_RNDN); + double u = fabs(mpfr_get_d(fry, GMP_RNDN)); + + return u; +} + +typedef union { + double d; + uint64_t u64; + int64_t i64; +} conv_t; + +double rnd_fr() { + conv_t c; + do { +#if 1 + syscall(SYS_getrandom, &c.u64, sizeof(c.u64), 0); +#else + c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62); +#endif + } while(!isfinite(c.d)); + return c.d; +} + +double rnd_zo() { + conv_t c; + do { +#if 1 + syscall(SYS_getrandom, &c.u64, sizeof(c.u64), 0); +#else + c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62); +#endif + } while(!isfinite(c.d) || c.d < -1 || 1 < c.d); + return c.d; +} + +int main(int argc,char **argv) +{ + mpfr_set_default_prec(256); + mpfr_inits(fra, frb, frc, frd, frw, frx, fry, frz, NULL); + + conv_t cd; + double d, t; + + + int cnt; + + srandom(time(NULL)); + +#if 0 + cd.d = M_PI; + mpfr_set_d(frx, cd.d, GMP_RNDN); + cd.i64+=3; + printf("%g\n", countULP(cd.d, frx)); +#endif + + const double rangemax = 1e+14; // 2^(24*2-1) + + for(cnt = 0;;cnt++) { + switch(cnt & 7) { + case 0: + d = (2 * (double)random() / RAND_MAX - 1) * rangemax; + break; + case 1: + cd.d = rint((2 * (double)random() / RAND_MAX - 1) * rangemax) * M_PI_4; + cd.i64 += (random() & 31) - 15; + d = cd.d; + break; + case 2: + d = (2 * (double)random() / RAND_MAX - 1) * 1e+7; + break; + case 3: + cd.d = rint((2 * (double)random() / RAND_MAX - 1) * 1e+7) * M_PI_4; + cd.i64 += (random() & 31) - 15; + d = cd.d; + break; + case 4: + d = (2 * (double)random() / RAND_MAX - 1) * 10000; + break; + case 5: + cd.d = rint((2 * (double)random() / RAND_MAX - 1) * 10000) * M_PI_4; + cd.i64 += (random() & 31) - 15; + d = cd.d; + break; + default: + d = rnd_fr(); + break; + } + + if (!isfinite(d)) continue; + + + double2 sc = xsincos(d); + double2 sc2 = xsincos_u1(d); + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_sin(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xsin(d), frx); + + if ((fabs(d) <= rangemax && u0 > 3.5) || fabs(t) > 1 || !isfinite(t)) { + printf("Pure C sin arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(sc.x, frx); + + if ((fabs(d) <= rangemax && u1 > 3.5) || fabs(t) > 1 || !isfinite(t)) { + printf("Pure C sincos sin arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + + double u2 = countULP(t = xsin_u1(d), frx); + + if ((fabs(d) <= rangemax && u2 > 1) || fabs(t) > 1 || !isfinite(t)) { + printf("Pure C sin_u1 arg=%.20g ulp=%.20g\n", d, u2); + fflush(stdout); + } + + double u3 = countULP(t = sc2.x, frx); + + if ((fabs(d) <= rangemax && u3 > 1) || fabs(t) > 1 || !isfinite(t)) { + printf("Pure C sincos_u1 sin arg=%.20g ulp=%.20g\n", d, u3); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cos(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xcos(d), frx); + + if ((fabs(d) <= rangemax && u0 > 3.5) || fabs(t) > 1 || !isfinite(t)) { + printf("Pure C cos arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = sc.y, frx); + + if ((fabs(d) <= rangemax && u1 > 3.5) || fabs(t) > 1 || !isfinite(t)) { + printf("Pure C sincos cos arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + + double u2 = countULP(t = xcos_u1(d), frx); + + if ((fabs(d) <= rangemax && u2 > 1) || fabs(t) > 1 || !isfinite(t)) { + printf("Pure C cos_u1 arg=%.20g ulp=%.20g\n", d, u2); + fflush(stdout); + } + + double u3 = countULP(t = sc2.y, frx); + + if ((fabs(d) <= rangemax && u3 > 1) || fabs(t) > 1 || !isfinite(t)) { + printf("Pure C sincos_u1 cos arg=%.20g ulp=%.20g\n", d, u3); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_tan(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xtan(d), frx); + + if ((fabs(d) < 1e+7 && u0 > 3.5) || (fabs(d) <= rangemax && u0 > 5) || isnan(t)) { + printf("Pure C tan arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xtan_u1(d), frx); + + if ((fabs(d) <= rangemax && u1 > 1) || isnan(t)) { + printf("Pure C tan_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + d = rnd_fr(); + double d2 = rnd_fr(), zo = rnd_zo(); + + { + mpfr_set_d(frx, fabs(d), GMP_RNDN); + mpfr_log(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xlog(fabs(d)), frx); + + if (u0 > 3.5) { + printf("Pure C log arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xlog_u1(fabs(d)), frx); + + if (u1 > 1) { + printf("Pure C log_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, fabs(d), GMP_RNDN); + mpfr_log10(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xlog10(fabs(d)), frx); + + if (u0 > 1) { + printf("Pure C log10 arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_log1p(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xlog1p(d), frx); + + if ((-1 <= d && d <= 1e+307 && u0 > 1) || + (d < -1 && !isnan(t)) || + (d > 1e+307 && !(u0 <= 1 || isinf(t)))) { + printf("Pure C log1p arg=%.20g ulp=%.20g\n", d, u0); + printf("%g\n", t); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xexp(d), frx); + + if (u0 > 1) { + printf("Pure C exp arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp2(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xexp2(d), frx); + + if (u0 > 1) { + printf("Pure C exp2 arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp10(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xexp10(d), frx); + + if (u0 > 1) { + printf("Pure C exp10 arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_expm1(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xexpm1(d), frx); + + if (u0 > 1) { + printf("Pure C expm1 arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_set_d(fry, d2, GMP_RNDN); + mpfr_pow(frx, fry, frx, GMP_RNDN); + + double u0 = countULP(t = xpow(d2, d), frx); + + if (u0 > 1) { + printf("Pure C pow arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cbrt(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xcbrt(d), frx); + + if (u0 > 3.5) { + printf("Pure C cbrt arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xcbrt_u1(d), frx); + + if (u1 > 1) { + printf("Pure C cbrt_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, zo, GMP_RNDN); + mpfr_asin(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xasin(zo), frx); + + if (u0 > 3.5) { + printf("Pure C asin arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xasin_u1(zo), frx); + + if (u1 > 1) { + printf("Pure C asin_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, zo, GMP_RNDN); + mpfr_acos(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xacos(zo), frx); + + if (u0 > 3.5) { + printf("Pure C acos arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xacos_u1(zo), frx); + + if (u1 > 1) { + printf("Pure C acos_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_atan(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xatan(d), frx); + + if (u0 > 3.5) { + printf("Pure C atan arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xatan_u1(d), frx); + + if (u1 > 1) { + printf("Pure C atan_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_set_d(fry, d2, GMP_RNDN); + mpfr_atan2(frx, fry, frx, GMP_RNDN); + + double u0 = countULP(t = xatan2(d2, d), frx); + + if (u0 > 3.5) { + printf("Pure C atan2 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0); + fflush(stdout); + } + + double u1 = countULP2(t = xatan2_u1(d2, d), frx); + + if (u1 > 1) { + printf("Pure C atan2_u1 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_sinh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xsinh(d), frx); + + if ((fabs(d) <= 709 && u0 > 1) || + (d > 709 && !(u0 <= 1 || (isinf(t) && t > 0))) || + (d < -709 && !(u0 <= 1 || (isinf(t) && t < 0)))) { + printf("Pure C sinh arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cosh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xcosh(d), frx); + + if ((fabs(d) <= 709 && u0 > 1) || !(u0 <= 1 || (isinf(t) && t > 0))) { + printf("Pure C cosh arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_tanh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xtanh(d), frx); + + if (u0 > 1) { + printf("Pure C tanh arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_asinh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xasinh(d), frx); + + if ((fabs(d) < sqrt(DBL_MAX) && u0 > 1) || + (d >= sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t > 0))) || + (d <= -sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t < 0)))) { + printf("Pure C asinh arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_acosh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xacosh(d), frx); + + if ((fabs(d) < sqrt(DBL_MAX) && u0 > 1) || + (d >= sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t > 0))) || + (d <= -sqrt(DBL_MAX) && !isnan(t))) { + printf("Pure C acosh arg=%.20g ulp=%.20g\n", d, u0); + printf("%.20g\n", t); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_atanh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xatanh(d), frx); + + if (u0 > 1) { + printf("Pure C atanh arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + } +} diff --git a/purec/tester2f.c b/purec/tester2f.c new file mode 100644 index 00000000..7f33e0e3 --- /dev/null +++ b/purec/tester2f.c @@ -0,0 +1,515 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#define _GNU_SOURCE +#include +#include +#include + +#include "sleef.h" + +mpfr_t fra, frb, frc, frd, frw, frx, fry, frz; + +#define DENORMAL_FLT_MIN (1.40130e-45f) + +double countULP(float d, mpfr_t c) { + float c2 = mpfr_get_d(c, GMP_RNDN); + if (c2 == 0 && d != 0) return 10000; + if (!isfinite(c2) && !isfinite(d)) return 0; + + int e; + frexpl(mpfr_get_d(c, GMP_RNDN), &e); + mpfr_set_ld(frw, fmaxl(ldexpl(1.0, e-24), DENORMAL_FLT_MIN), GMP_RNDN); + + mpfr_set_d(frd, d, GMP_RNDN); + mpfr_sub(fry, frd, c, GMP_RNDN); + mpfr_div(fry, fry, frw, GMP_RNDN); + double u = fabs(mpfr_get_d(fry, GMP_RNDN)); + + return u; +} + +double countULP2(float d, mpfr_t c) { + float c2 = mpfr_get_d(c, GMP_RNDN); + if (c2 == 0 && d != 0) return 10000; + if (!isfinite(c2) && !isfinite(d)) return 0; + + int e; + frexpl(mpfr_get_d(c, GMP_RNDN), &e); + mpfr_set_ld(frw, fmaxl(ldexpl(1.0, e-24), FLT_MIN), GMP_RNDN); + + mpfr_set_d(frd, d, GMP_RNDN); + mpfr_sub(fry, frd, c, GMP_RNDN); + mpfr_div(fry, fry, frw, GMP_RNDN); + double u = fabs(mpfr_get_d(fry, GMP_RNDN)); + + return u; +} + +typedef union { + double d; + uint64_t u64; + int64_t i64; +} conv64_t; + +typedef union { + float f; + uint32_t u32; + int32_t i32; +} conv32_t; + +float rnd_fr() { + conv32_t c; + do { +#if 1 + syscall(SYS_getrandom, &c.u32, sizeof(c.u32), 0); +#else + c.u32 = (uint32_t)random() | ((uint32_t)random() << 31); +#endif + } while(!isfinite(c.f)); + return c.f; +} + +float rnd_zo() { + conv32_t c; + do { +#if 1 + syscall(SYS_getrandom, &c.u32, sizeof(c.u32), 0); +#else + c.u32 = (uint32_t)random() | ((uint32_t)random() << 31); +#endif + } while(!isfinite(c.f) || c.f < -1 || 1 < c.f); + return c.f; +} + +int main(int argc,char **argv) +{ + mpfr_set_default_prec(256); + mpfr_inits(fra, frb, frc, frd, frw, frx, fry, frz, NULL); + + conv32_t cd; + float d, t; + int cnt; + + srandom(time(NULL)); + +#if 0 + cd.f = M_PI; + mpfr_set_d(frx, cd.f, GMP_RNDN); + cd.i32+=3; + printf("%g\n", countULP(cd.f, frx)); +#endif + + const float rangemax = 39000; + + for(cnt = 0;;cnt++) { + switch(cnt & 7) { + case 0: + d = (2 * (float)random() / RAND_MAX - 1) * rangemax; + break; + case 1: + cd.f = rint((2 * (float)random() / RAND_MAX - 1) * rangemax) * M_PI_4; + cd.i32 += (random() & 31) - 15; + d = cd.f; + break; + case 2: + d = (2 * (float)random() / RAND_MAX - 1) * rangemax; + break; + case 3: + cd.f = rint((2 * (float)random() / RAND_MAX - 1) * rangemax) * M_PI_4; + cd.i32 += (random() & 31) - 15; + d = cd.f; + break; + case 4: + d = (2 * (float)random() / RAND_MAX - 1) * 10000; + break; + case 5: + cd.f = rint((2 * (float)random() / RAND_MAX - 1) * 10000) * M_PI_4; + cd.i32 += (random() & 31) - 15; + d = cd.f; + break; + default: + d = rnd_fr(); + break; + } + + if (!isfinite(d)) continue; + + float2 sc = xsincosf(d); + float2 sc2 = xsincosf_u1(d); + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_sin(frx, frx, GMP_RNDN); + + float u0 = countULP(t = xsinf(d), frx); + + if ((fabs(d) <= rangemax && u0 > 3.5) || fabs(t) > 1 || !isfinite(t)) { + printf("Pure C sinf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + float u1 = countULP(t = sc.x, frx); + + if ((fabs(d) <= rangemax && u1 > 3.5) || fabs(t) > 1 || !isfinite(t)) { + printf("Pure C sincosf sin arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + + float u2 = countULP(t = xsinf_u1(d), frx); + + if ((fabs(d) <= rangemax && u2 > 1) || fabs(t) > 1 || !isfinite(t)) { + printf("Pure C sinf_u1 arg=%.20g ulp=%.20g\n", d, u2); + fflush(stdout); + } + + float u3 = countULP(t = sc2.x, frx); + + if ((fabs(d) <= rangemax && u3 > 1) || fabs(t) > 1 || !isfinite(t)) { + printf("Pure C sincosf_u1 sin arg=%.20g ulp=%.20g\n", d, u3); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cos(frx, frx, GMP_RNDN); + + float u0 = countULP(t = xcosf(d), frx); + + if ((fabs(d) <= rangemax && u0 > 3.5) || fabs(t) > 1 || !isfinite(t)) { + printf("Pure C cosf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + float u1 = countULP(t = sc.y, frx); + + if ((fabs(d) <= rangemax && u1 > 3.5) || fabs(t) > 1 || !isfinite(t)) { + printf("Pure C sincosf cos arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + + float u2 = countULP(t = xcosf_u1(d), frx); + + if ((fabs(d) <= rangemax && u2 > 1) || fabs(t) > 1 || !isfinite(t)) { + printf("Pure C cosf_u1 arg=%.20g ulp=%.20g\n", d, u2); + fflush(stdout); + } + + float u3 = countULP(t = sc2.y, frx); + + if ((fabs(d) <= rangemax && u3 > 1) || fabs(t) > 1 || !isfinite(t)) { + printf("Pure C sincosf_u1 cos arg=%.20g ulp=%.20g\n", d, u3); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_tan(frx, frx, GMP_RNDN); + + float u0 = countULP(t = xtanf(d), frx); + + if ((fabs(d) < rangemax && u0 > 3.5) || isnan(t)) { + printf("Pure C tanf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + float u1 = countULP(t = xtanf_u1(d), frx); + + if ((fabs(d) <= rangemax && u1 > 1) || isnan(t)) { + printf("Pure C tanf_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + d = rnd_fr(); + float d2 = rnd_fr(), zo = rnd_zo(); + + { + mpfr_set_d(frx, fabsf(d), GMP_RNDN); + mpfr_log(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xlogf(fabsf(d)), frx); + + if (u0 > 3.5) { + printf("Pure C logf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xlogf_u1(fabsf(d)), frx); + + if (u1 > 1) { + printf("Pure C logf_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, fabsf(d), GMP_RNDN); + mpfr_log10(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xlog10f(fabsf(d)), frx); + + if (u0 > 1) { + printf("Pure C log10f arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_log1p(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xlog1pf(d), frx); + + if ((-1 <= d && d <= 1e+38 && u0 > 1) || + (d < -1 && !isnan(t)) || + (d > 1e+38 && !(u0 <= 1 || isinf(t)))) { + printf("Pure C log1pf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xexpf(d), frx); + + if (u0 > 1) { + printf("Pure C expf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp2(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xexp2f(d), frx); + + if (u0 > 1) { + printf("Pure C exp2f arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp10(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xexp10f(d), frx); + + if (u0 > 1) { + printf("Pure C exp10f arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_expm1(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xexpm1f(d), frx); + + if (u0 > 1) { + printf("Pure C expm1f arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_set_d(fry, d2, GMP_RNDN); + mpfr_pow(frx, fry, frx, GMP_RNDN); + + double u0 = countULP(t = xpowf(d2, d), frx); + + if (u0 > 1) { + printf("Pure C powf arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cbrt(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xcbrtf(d), frx); + + if (u0 > 3.5) { + printf("Pure C cbrtf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xcbrtf_u1(d), frx); + + if (u1 > 1) { + printf("Pure C cbrtf_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, zo, GMP_RNDN); + mpfr_asin(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xasinf(zo), frx); + + if (u0 > 3.5) { + printf("Pure C asinf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xasinf_u1(zo), frx); + + if (u1 > 1) { + printf("Pure C asinf_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, zo, GMP_RNDN); + mpfr_acos(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xacosf(zo), frx); + + if (u0 > 3.5) { + printf("Pure C acosf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xacosf_u1(zo), frx); + + if (u1 > 1) { + printf("Pure C acosf_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_atan(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xatanf(d), frx); + + if (u0 > 3.5) { + printf("Pure C atanf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xatanf_u1(d), frx); + + if (u1 > 1) { + printf("Pure C atanf_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_set_d(fry, d2, GMP_RNDN); + mpfr_atan2(frx, fry, frx, GMP_RNDN); + + double u0 = countULP(t = xatan2f(d2, d), frx); + + if (u0 > 3.5) { + printf("Pure C atan2f arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0); + fflush(stdout); + } + + double u1 = countULP2(t = xatan2f_u1(d2, d), frx); + + if (u1 > 1) { + printf("Pure C atan2f_u1 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_sinh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xsinhf(d), frx); + + if ((fabs(d) <= 88.5 && u0 > 1) || + (d > 88.5 && !(u0 <= 1 || (isinf(t) && t > 0))) || + (d < -88.5 && !(u0 <= 1 || (isinf(t) && t < 0)))) { + printf("Pure C sinhf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cosh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xcoshf(d), frx); + + if ((fabs(d) <= 88.5 && u0 > 1) || !(u0 <= 1 || (isinf(t) && t > 0))) { + printf("Pure C coshf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_tanh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xtanhf(d), frx); + + if (u0 > 1.0001) { + printf("Pure C tanhf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_asinh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xasinhf(d), frx); + + if ((fabs(d) < sqrt(FLT_MAX) && u0 > 1.0001) || + (d >= sqrt(FLT_MAX) && !(u0 <= 1.0001 || (isinf(t) && t > 0))) || + (d <= -sqrt(FLT_MAX) && !(u0 <= 1.0001 || (isinf(t) && t < 0)))) { + printf("Pure C asinhf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_acosh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xacoshf(d), frx); + + if ((fabs(d) < sqrt(FLT_MAX) && u0 > 1.0001) || + (d >= sqrt(FLT_MAX) && !(u0 <= 1.0001 || (isinff(t) && t > 0))) || + (d <= -sqrt(FLT_MAX) && !isnan(t))) { + printf("Pure C acoshf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_atanh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xatanhf(d), frx); + + if (u0 > 1.0001) { + printf("Pure C atanhf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + } +} diff --git a/simd/Makefile b/simd/Makefile index 3636c29c..26740bae 100644 --- a/simd/Makefile +++ b/simd/Makefile @@ -1,6 +1,6 @@ -CC=gcc-5 +CC=gcc OPT=-O -Wall -Wno-unused -Wno-attributes -ffp-contract=off -fmax-errors=3 -SDE=sde +SDE=sde64 all : testsse2 testavx testavx2 @@ -19,6 +19,42 @@ iutavx512f : sleefsimddp.c sleefsimdsp.c helperavx512f.h iut.c iutfma4 : sleefsimddp.c sleefsimdsp.c helperfma4.h iut.c $(CC) $(OPT) -DENABLE_FMA4 -mavx -mfma4 iut.c sleefsimddp.c sleefsimdsp.c -o iutfma4 -lm +# + +tester2 : tester2sse2 tester2avx tester2avx2 tester2avx512f tester2fma4 tester2fsse2 tester2favx tester2favx2 tester2favx512f tester2ffma4 + +tester2sse2 : sleefsimddp.c helpersse2.h tester2simd.c + $(CC) $(OPT) -DENABLE_SSE2 -msse2 tester2simd.c sleefsimddp.c -o tester2sse2 -lm -lmpfr + +tester2avx : sleefsimddp.c helperavx.h tester2simd.c + $(CC) $(OPT) -DENABLE_AVX -mavx tester2simd.c sleefsimddp.c -o tester2avx -lm -lmpfr + +tester2avx2 : sleefsimddp.c helperavx2.h tester2simd.c + $(CC) $(OPT) -DENABLE_AVX2 -mavx2 -mfma tester2simd.c sleefsimddp.c -o tester2avx2 -lm -lmpfr + +tester2avx512f : sleefsimddp.c helperavx512f.h tester2simd.c + $(CC) $(OPT) -DENABLE_AVX512F -mavx512f tester2simd.c sleefsimddp.c -o tester2avx512f -lm -lmpfr + +tester2fma4 : sleefsimddp.c helperfma4.h tester2simd.c + $(CC) $(OPT) -DENABLE_FMA4 -mavx -mfma4 tester2simd.c sleefsimddp.c -o tester2fma4 -lm -lmpfr + +tester2fsse2 : sleefsimdsp.c helpersse2.h tester2fsimd.c + $(CC) $(OPT) -DENABLE_SSE2 -msse2 tester2fsimd.c sleefsimdsp.c -o tester2fsse2 -lm -lmpfr + +tester2favx : sleefsimdsp.c helperavx.h tester2fsimd.c + $(CC) $(OPT) -DENABLE_AVX -mavx tester2fsimd.c sleefsimdsp.c -o tester2favx -lm -lmpfr + +tester2favx2 : sleefsimdsp.c helperavx2.h tester2fsimd.c + $(CC) $(OPT) -DENABLE_AVX2 -mavx2 -mfma tester2fsimd.c sleefsimdsp.c -o tester2favx2 -lm -lmpfr + +tester2favx512f : sleefsimdsp.c helperavx512f.h tester2fsimd.c + $(CC) $(OPT) -DENABLE_AVX512F -mavx512f tester2fsimd.c sleefsimdsp.c -o tester2favx512f -lm -lmpfr + +tester2ffma4 : sleefsimdsp.c helperfma4.h tester2fsimd.c + $(CC) $(OPT) -DENABLE_FMA4 -mavx -mfma4 tester2fsimd.c sleefsimdsp.c -o tester2ffma4 -lm -lmpfr + +# + ../tester/tester : cd ../tester; make tester @@ -62,4 +98,4 @@ testfma4 : iutfma4 ../tester/tester ../tester/testeru1 ../tester/testersp ../tes ../tester/testerspu1 ./iutfma4 clean : - rm -f *~ *.o *.s iutsse2 iutavx iutavx2 iutavx512f iutfma4 iutneon iutclangvec + rm -f *~ *.o *.s *.out iutsse2 iutavx iutavx2 iutavx512f iutfma4 iutneon iutclangvec tester2sse2 tester2avx tester2avx2 tester2avx512f tester2fma4 tester2clangvec tester2fsse2 tester2favx tester2favx2 tester2favx512f tester2ffma4 tester2fclangvec diff --git a/simd/Makefile.clang b/simd/Makefile.clang index 9106cd84..7034bbe2 100644 --- a/simd/Makefile.clang +++ b/simd/Makefile.clang @@ -22,6 +22,18 @@ iutfma4 : sleefsimddp.c sleefsimdsp.c helperfma4.h iut.c iutclangvec : sleefsimddp.c sleefsimdsp.c helperclangvec.h iut.c $(CC) $(OPT) -DENABLE_CLANGVEC iut.c sleefsimddp.c sleefsimdsp.c -o iutclangvec -lm +# + +tester2 : tester2clangvec tester2fclangvec + +tester2clangvec : sleefsimddp.c sleefsimddp.c helperclangvec.h tester2simd.c + $(CC) $(OPT) -DENABLE_CLANGVEC -msse2 tester2simd.c sleefsimddp.c -o tester2clangvec -lm -lmpfr + +tester2fclangvec : sleefsimddp.c sleefsimdsp.c helperclangvec.h tester2simd.c + $(CC) $(OPT) -DENABLE_CLANGVEC -msse2 tester2fsimd.c sleefsimdsp.c -o tester2fclangvec -lm -lmpfr + +# + ../tester/tester : cd ../tester; make tester diff --git a/simd/helperavx.h b/simd/helperavx.h index fa40bfe3..f81a850e 100644 --- a/simd/helperavx.h +++ b/simd/helperavx.h @@ -48,6 +48,9 @@ static INLINE vopmask vcast_vo64_vo32(vopmask o) { static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); } static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); } +static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); } +static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } +static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); } static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); } static INLINE vint2 vcastu_vi2_vi(vint vi) { diff --git a/simd/helperavx2.h b/simd/helperavx2.h index ce880d65..35985fbd 100644 --- a/simd/helperavx2.h +++ b/simd/helperavx2.h @@ -51,6 +51,9 @@ static INLINE vopmask vcast_vo64_vo32(vopmask o) { static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); } static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); } +static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); } +static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } +static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); } static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); } diff --git a/simd/helperavx512f.h b/simd/helperavx512f.h index 5c7f7bc9..04501a36 100644 --- a/simd/helperavx512f.h +++ b/simd/helperavx512f.h @@ -55,6 +55,9 @@ static INLINE vint vtruncate_vi_vd(vdouble vd) { static INLINE vdouble vcast_vd_vi(vint vi) { return _mm512_cvtepi32_pd(vi); } static INLINE vint vcast_vi_i(int i) { return _mm256_set1_epi32(i); } +static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); } +static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); } + static INLINE vint2 vcastu_vi2_vi(vint vi) { return _mm512_maskz_permutexvar_epi32(0xaaaa, _mm512_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), _mm512_castsi256_si512(vi)); } @@ -179,6 +182,7 @@ static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm((vmask)_mm static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm512_cvtepi32_ps((vmask)vcast_vm_vi2(vi)); } static INLINE vfloat vcast_vf_f(float f) { return _mm512_set1_ps(f); } static INLINE vint2 vcast_vi2_i(int i) { return _mm512_set1_epi32(i); } +static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); } static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (__m512i)vf; } static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (__m512)vm; } diff --git a/simd/helperclangvec.h b/simd/helperclangvec.h index 858a214b..8f708852 100644 --- a/simd/helperclangvec.h +++ b/simd/helperclangvec.h @@ -1,5 +1,7 @@ #include +// When you change VECTLENDP, you also need to change the same macro in sleefsimd.h + #ifndef VECTLENDP #define VECTLENDP 8 #endif @@ -81,6 +83,8 @@ static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { return (v static INLINE vdouble vcast_vd_vi(vint vi) { return __builtin_convertvector(vi, vdouble); } static INLINE vint vtruncate_vi_vd(vdouble vd) { return __builtin_convertvector(vd, vint); } static INLINE vint vrint_vi_vd(vdouble vd) { return vtruncate_vi_vd(vsel_vd_vo_vd_vd((vopmask)(vd < 0.0), vd - 0.5, vd + 0.5)); } +static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); } +static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); } static INLINE vint vcast_vi_i(int i) { return (vint)(i); } static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { @@ -187,6 +191,7 @@ static INLINE vfloat vcast_vf_vi2(vint2 vi) { return __builtin_convertvector(vi, static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return __builtin_convertvector(vf, vint2); } static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vtruncate_vi2_vf(vsel_vf_vo_vf_vf((vopmask)(vf < 0), vf - 0.5f, vf + 0.5)); } static INLINE vint2 vcast_vi2_i(int i) { return (vint2)(i); } +static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); } static INLINE vfloat vcast_vf_f(float f) { return (vfloat)(f); } static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; } diff --git a/simd/helperfma4.h b/simd/helperfma4.h index 1209c3dc..43bea24a 100644 --- a/simd/helperfma4.h +++ b/simd/helperfma4.h @@ -51,6 +51,9 @@ static INLINE vopmask vcast_vo64_vo32(vopmask o) { static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); } static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); } +static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); } +static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } +static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); } static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); } static INLINE vint2 vcastu_vi2_vi(vint vi) { diff --git a/simd/helperneon32.h b/simd/helperneon32.h index 89b5b598..4ef9882c 100644 --- a/simd/helperneon32.h +++ b/simd/helperneon32.h @@ -59,6 +59,7 @@ static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); } static INLINE vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); } static INLINE vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); } static INLINE vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); } +static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); } static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; } static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; } static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return (vfloat)vm; } diff --git a/simd/helpersse2.h b/simd/helpersse2.h index 7595dee1..3c83048d 100644 --- a/simd/helpersse2.h +++ b/simd/helpersse2.h @@ -45,9 +45,18 @@ static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); } static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); } static INLINE vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); } static INLINE vint vcast_vi_i(int i) { return _mm_set_epi32(0, 0, i, i); } -static INLINE vint2 vcastu_vi2_vi(vint vi) { return _mm_shuffle_epi32(vi, 0x73); } +static INLINE vint2 vcastu_vi2_vi(vint vi) { return _mm_and_si128(_mm_shuffle_epi32(vi, 0x73), _mm_set_epi32(-1, 0, -1, 0)); } static INLINE vint vcastu_vi_vi2(vint2 vi) { return _mm_shuffle_epi32(vi, 0x0d); } +#ifdef __SSE4_1__ +static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } +static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } +static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); } +#else +static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); } +static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); } +#endif + static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm_set_epi32(i0, i1, i0, i1); } static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { @@ -149,6 +158,10 @@ static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (__m128)vm; } static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return (__m128)vm; } static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (__m128i)vf; } +#ifndef __SSE4_1__ +static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); } +#endif + static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_add_ps(x, y); } static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm_sub_ps(x, y); } static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm_mul_ps(x, y); } diff --git a/simd/sleefsimd.h b/simd/sleefsimd.h index 65897b51..a6d48506 100644 --- a/simd/sleefsimd.h +++ b/simd/sleefsimd.h @@ -7,6 +7,7 @@ #define VECTLENDP 2 #define VECTLENSP 4 +#define SLEEF_ARCH "SSE2" typedef __m128d vdouble; typedef __m128i vint; @@ -38,6 +39,7 @@ static void vstoreui(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, (__m12 #define VECTLENDP 4 #define VECTLENSP 8 +#define SLEEF_ARCH "AVX" typedef __m256d vdouble; typedef __m128i vint; @@ -78,6 +80,7 @@ static void vstoreui(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, (__m12 #define VECTLENDP 4 #define VECTLENSP 8 +#define SLEEF_ARCH "AVX2" typedef __m256d vdouble; typedef __m128i vint; @@ -109,6 +112,7 @@ static void vstoreui(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, (__m12 #define VECTLENDP 8 #define VECTLENSP 16 +#define SLEEF_ARCH "AVX512F" typedef __m512d vdouble; typedef __m256i vint; @@ -140,6 +144,7 @@ static void vstoreui(int32_t *p, vint v) { return _mm256_storeu_si256((__m256i * #define VECTLENDP 2 #define VECTLENSP 4 +#define SLEEF_ARCH "NEON32" //typedef __m128d vdouble; typedef int32x4_t vint; @@ -167,6 +172,7 @@ static void vstoreui2(int32_t *p, vint2 v) { vst1q_s32(p, v); } #define VECTLENDP 2 #define VECTLENSP 4 +#define SLEEF_ARCH "NEON64" //typedef __m128d vdouble; typedef int32x4_t vint; @@ -192,6 +198,7 @@ static void vstoreui2(int32_t *p, vint2 v) { vst1q_s32(p, v); } #ifdef ENABLE_CLANGVEC #define VECTLENDP 8 #define VECTLENSP (VECTLENDP*2) +#define SLEEF_ARCH "CLANGVEC" typedef double vdouble __attribute__((ext_vector_type(VECTLENDP))); typedef int32_t vint __attribute__((ext_vector_type(VECTLENDP))); diff --git a/simd/sleefsimddp.c b/simd/sleefsimddp.c index 5bb563f6..e91f39c4 100644 --- a/simd/sleefsimddp.c +++ b/simd/sleefsimddp.c @@ -46,11 +46,22 @@ // +#define PI_A 3.1415926218032836914 +#define PI_B 3.1786509424591713469e-08 +#define PI_C 1.2246467864107188502e-16 +#define PI_D 1.2736634327021899816e-24 + #define PI4_A 0.78539816290140151978 #define PI4_B 4.9604678871439933374e-10 #define PI4_C 1.1258708853173288931e-18 #define PI4_D 1.7607799325916000908e-27 +#define M_2_PI_H 0.63661977236758138243 +#define M_2_PI_L -3.9357353350364971764e-17 + +#define TRIGRANGEMAX 1e+14 +#define SQRT_DBL_MAX 1.3407807929942596355e+154 + #define M_4_PI 1.273239544735162542821171882678754627704620361328125 #define L2U .69314718055966295651160180568695068359375 @@ -59,17 +70,6 @@ // -#define PI4_Af 0.78515625f -#define PI4_Bf 0.00024187564849853515625f -#define PI4_Cf 3.7747668102383613586e-08f -#define PI4_Df 1.2816720341285448015e-12f - -#define L2Uf 0.693145751953125f -#define L2Lf 1.428606765330187045e-06f -#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f - -// - static INLINE vopmask vsignbit_vo_vd(vdouble d) { return veq64_vo_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vcast_vd_d(-0.0))); } @@ -126,6 +126,13 @@ static INLINE vint vilogbk_vi_vd(vdouble d) { } #endif +static INLINE vopmask visint(vdouble d) { + vdouble x = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0 / (1 << 31)))); + x = vmla_vd_vd_vd_vd(vcast_vd_d(-(double)(1 << 31)), x, d); + return vor_vo_vo_vo(veq_vo_vd_vd(vtruncate_vd_vd(x), x), + vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1LL << 52))); +} + // vdouble xldexp(vdouble x, vint q) { return vldexp_vd_vd_vi(x, q); } @@ -139,20 +146,34 @@ vint xilogb(vdouble d) { } vdouble xsin(vdouble d) { - vint q; - vdouble u, s; - - q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI))); + vdouble u, s, r = d; +#if 0 + vint ql = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI))); - u = vcast_vd_vi(q); + u = vcast_vd_vi(ql); d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_A*4), d); d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_B*4), d); d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_C*4), d); d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_D*4), d); - +#else + vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 24)))); + vint ql = vrint_vi_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)), + vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24)))); + vdouble dql = vcast_vd_vi(ql); + + d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * (1 << 24)), d); + d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A ), d); + d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * (1 << 24)), d); + d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B ), d); + d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * (1 << 24)), d); + d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C ), d); + d = vmla_vd_vd_vd_vd(vmla_vd_vd_vd_vd(dqh, vcast_vd_d(1 << 24), dql), + vcast_vd_d(-PI_D), d); +#endif + s = vmul_vd_vd_vd(d, d); - d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1))), (vmask)vcast_vd_d(-0.0)), (vmask)d)); + d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))), (vmask)vcast_vd_d(-0.0)), (vmask)d)); u = vcast_vd_d(-7.97255955009037868891952e-18); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.81009972710863200091251e-15)); @@ -164,26 +185,43 @@ vdouble xsin(vdouble d) { u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00833333333333332974823815)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808)); - u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, d), d); + u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d); - u = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), u); + u = vsel_vd_vo_vd_vd(vandnot_vo_vo_vo(visinf_vo_vd(r), + vor_vo_vo_vo(visnegzero_vo_vd(r), + vgt_vo_vd_vd(vabs_vd_vd(r), vcast_vd_d(TRIGRANGEMAX)))), + vcast_vd_d(-0.0), u); return u; } vdouble xsin_u1(vdouble d) { - vint q; vdouble u; vdouble2 s, t, x; - - q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI))); - u = vcast_vd_vi(q); +#if 0 + vint ql = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI))); + u = vcast_vd_vi(ql); s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_A*4))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_B*4))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_C*4))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_D*4))); - +#else + vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 24)))); + vint ql = vrint_vi_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)), + vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24)))); + vdouble dql = vcast_vd_vi(ql); + + s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_A * (1 << 24)))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A ))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B * (1 << 24)))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B ))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C * (1 << 24)))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C ))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(vmla_vd_vd_vd_vd(dqh, vcast_vd_d(1 << 24), dql), + vcast_vd_d(-PI_D))); +#endif + t = s; s = ddsqu_vd2_vd2(s); @@ -200,28 +238,44 @@ vdouble xsin_u1(vdouble d) { x = ddmul_vd2_vd2_vd2(t, x); u = vadd_vd_vd_vd(x.x, x.y); - u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1))), (vmask)vcast_vd_d(-0.0)), (vmask)u)); - u = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), u); + u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))), (vmask)vcast_vd_d(-0.0)), (vmask)u)); + u = vsel_vd_vo_vd_vd(vandnot_vo_vo_vo(visinf_vo_vd(d), vor_vo_vo_vo(visnegzero_vo_vd(d), + vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX)))), + vcast_vd_d(-0.0), u); return u; } vdouble xcos(vdouble d) { - vint q; - vdouble u, s; - - q = vrint_vi_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5))); - q = vadd_vi_vi_vi(vadd_vi_vi_vi(q, q), vcast_vi_i(1)); + vdouble u, s, r = d; +#if 0 + vint ql = vrint_vi_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5))); + ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vcast_vi_i(1)); - u = vcast_vd_vi(q); + u = vcast_vd_vi(ql); d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_A*2), d); d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_B*2), d); d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_C*2), d); d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_D*2), d); - +#else + vdouble dqh = vtruncate_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 23)), vcast_vd_d(-0.5 * M_1_PI / (1 << 23)))); + vint ql = vrint_vi_vd(vadd_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)), + vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-(1 << 23)), vcast_vd_d(-0.5)))); + ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vcast_vi_i(1)); + vdouble dql = vcast_vd_vi(ql); + + d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5 * (1 << 24)), d); + d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5 ), d); + d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5 * (1 << 24)), d); + d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5 ), d); + d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5 * (1 << 24)), d); + d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5 ), d); + d = vmla_vd_vd_vd_vd(vmla_vd_vd_vd_vd(dqh, vcast_vd_d(1 << 24), dql), + vcast_vd_d(-PI_D * 0.5), d); +#endif s = vmul_vd_vd_vd(d, d); - d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(0))), (vmask)vcast_vd_d(-0.0)), (vmask)d)); + d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), (vmask)vcast_vd_d(-0.0)), (vmask)d)); u = vcast_vd_d(-7.97255955009037868891952e-18); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.81009972710863200091251e-15)); @@ -233,25 +287,43 @@ vdouble xcos(vdouble d) { u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00833333333333332974823815)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808)); - u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, d), d); + u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d); + u = (vdouble)vandnot_vm_vo64_vm(vandnot_vo_vo_vo(visinf_vo_vd(r), vgt_vo_vd_vd(vabs_vd_vd(r), vcast_vd_d(TRIGRANGEMAX))), + (vmask)u); + return u; } vdouble xcos_u1(vdouble d) { - vint q; vdouble u; vdouble2 s, t, x; - - q = vrint_vi_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5))); - q = vadd_vi_vi_vi(vadd_vi_vi_vi(q, q), vcast_vi_i(1)); - u = vcast_vd_vi(q); +#if 0 + vint ql = vrint_vi_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5))); + ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vcast_vi_i(1)); + u = vcast_vd_vi(ql); s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_A*2))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_B*2))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_C*2))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_D*2))); - +#else + vdouble dqh = vtruncate_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 23)), vcast_vd_d(-0.5 * M_1_PI / (1 << 23)))); + vint ql = vrint_vi_vd(vadd_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)), + vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-(1 << 23)), vcast_vd_d(-0.5)))); + ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vcast_vi_i(1)); + vdouble dql = vcast_vd_vi(ql); + + s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_A*0.5 * (1 << 24)))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5 ))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5 * (1 << 24)))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5 ))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5 * (1 << 24)))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5 ))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(vmla_vd_vd_vd_vd(dqh, vcast_vd_d(1 << 24), dql), + vcast_vd_d(-PI_D*0.5))); +#endif + t = s; s = ddsqu_vd2_vd2(s); @@ -268,27 +340,44 @@ vdouble xcos_u1(vdouble d) { x = ddmul_vd2_vd2_vd2(t, x); u = vadd_vd_vd_vd(x.x, x.y); - u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(0))), (vmask)vcast_vd_d(-0.0)), (vmask)u)); + u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), (vmask)vcast_vd_d(-0.0)), (vmask)u)); + u = (vdouble)vandnot_vm_vo64_vm(vandnot_vo_vo_vo(visinf_vo_vd(d), vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))), + (vmask)u); + return u; } vdouble2 xsincos(vdouble d) { - vint q; vopmask o; vdouble u, s, t, rx, ry; vdouble2 r; - q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_2_PI))); - s = d; +#if 0 + vint ql = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_2_PI))); - u = vcast_vd_vi(q); + u = vcast_vd_vi(ql); s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_A*2), s); s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_B*2), s); s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_C*2), s); s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_D*2), s); - +#else + vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24)))); + vint ql = vrint_vi_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), + vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24)))); + vdouble dql = vcast_vd_vi(ql); + + s = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5 * (1 << 24)), s); + s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5 ), s); + s = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5 * (1 << 24)), s); + s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5 ), s); + s = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5 * (1 << 24)), s); + s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5 ), s); + s = vmla_vd_vd_vd_vd(vmla_vd_vd_vd_vd(dqh, vcast_vd_d(1 << 24), dql), + vcast_vd_d(-PI_D * 0.5), s); +#endif + t = s; s = vmul_vd_vd_vd(s, s); @@ -314,16 +403,20 @@ vdouble2 xsincos(vdouble d) { ry = vmla_vd_vd_vd_vd(s, u, vcast_vd_d(1)); - o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(0))); + o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(0))); r.x = vsel_vd_vo_vd_vd(o, rx, ry); r.y = vsel_vd_vo_vd_vd(o, ry, rx); - o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(2))); + o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(2))); r.x = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x))); - o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2))); + o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2))); r.y = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y))); + o = vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX)); + r.x = (vdouble)vandnot_vm_vo64_vm(o, (vmask)r.x); + r.y = (vdouble)vandnot_vm_vo64_vm(o, (vmask)r.y); + o = visinf_vo_vd(d); r.x = (vdouble)vor_vm_vo64_vm(o, (vmask)r.x); r.y = (vdouble)vor_vm_vo64_vm(o, (vmask)r.y); @@ -332,19 +425,33 @@ vdouble2 xsincos(vdouble d) { } vdouble2 xsincos_u1(vdouble d) { - vint q; vopmask o; vdouble u, rx, ry; vdouble2 r, s, t, x; - - q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI))); - u = vcast_vd_vi(q); +#if 0 + vint ql = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI))); + u = vcast_vd_vi(ql); s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_A*2))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_B*2))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_C*2))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_D*2))); - +#else + vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24)))); + vint ql = vrint_vi_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), + vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24)))); + vdouble dql = vcast_vd_vi(ql); + + s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_A*0.5 * (1 << 24)))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5 ))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5 * (1 << 24)))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5 ))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5 * (1 << 24)))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5 ))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(vmla_vd_vd_vd_vd(dqh, vcast_vd_d(1 << 24), dql), + vcast_vd_d(-PI_D*0.5))); +#endif + t = s; s = ddsqu_vd2_vd2(s); @@ -375,16 +482,20 @@ vdouble2 xsincos_u1(vdouble d) { x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd_vd(s.x, u)); ry = vadd_vd_vd_vd(x.x, x.y); - o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(0))); + o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(0))); r.x = vsel_vd_vo_vd_vd(o, rx, ry); r.y = vsel_vd_vo_vd_vd(o, ry, rx); - o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(2))); + o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(2))); r.x = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x))); - o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2))); + o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2))); r.y = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y))); + o = vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX)); + r.x = (vdouble)vandnot_vm_vo64_vm(o, (vmask)r.x); + r.y = (vdouble)vandnot_vm_vo64_vm(o, (vmask)r.y); + o = visinf_vo_vd(d); r.x = (vdouble)vor_vm_vo64_vm(o, (vmask)r.x); r.y = (vdouble)vor_vm_vo64_vm(o, (vmask)r.y); @@ -393,63 +504,96 @@ vdouble2 xsincos_u1(vdouble d) { } vdouble xtan(vdouble d) { - vint q; vdouble u, s, x; vopmask o; +#if 0 + vint ql = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_2_PI))); - q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_2_PI))); - - u = vcast_vd_vi(q); + u = vcast_vd_vi(ql); x = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_A*2), d); x = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_B*2), x); x = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_C*2), x); x = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_D*2), x); - +#else + vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24)))); + vint ql = vrint_vi_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), + vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24)))); + vdouble dql = vcast_vd_vi(ql); + + x = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5 * (1 << 24)), d); + x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5 ), x); + x = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5 * (1 << 24)), x); + x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5 ), x); + x = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5 * (1 << 24)), x); + x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5 ), x); + x = vmla_vd_vd_vd_vd(vmla_vd_vd_vd_vd(dqh, vcast_vd_d(1 << 24), dql), + vcast_vd_d(-PI_D * 0.5), x); +#endif + s = vmul_vd_vd_vd(x, x); - o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1))); + o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))); x = (vdouble)vxor_vm_vm_vm(vand_vm_vo64_vm(o, (vmask)vcast_vd_d(-0.0)), (vmask)x); - u = vcast_vd_d(1.01419718511083373224408e-05); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.59519791585924697698614e-05)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(5.23388081915899855325186e-05)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-3.05033014433946488225616e-05)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(7.14707504084242744267497e-05)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(8.09674518280159187045078e-05)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000244884931879331847054404)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000588505168743587154904506)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00145612788922812427978848)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00359208743836906619142924)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00886323944362401618113356)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0218694882853846389592078)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0539682539781298417636002)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.133333333333125941821962)); - u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.333333333333334980164153)); + u = vcast_vd_d(9.99583485362149960784268e-06); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-4.31184585467324750724175e-05)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000103573238391744000389851)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000137892809714281708733524)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000157624358465342784274554)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-6.07500301486087879295969e-05)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000148898734751616411290179)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000219040550724571513561967)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000595799595197098359744547)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00145461240472358871965441)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0035923150771440177410343)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00886321546662684547901456)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0218694899718446938985394)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0539682539049961967903002)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.133333333334818976423364)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.333333333333320047664472)); u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, x), x); u = vsel_vd_vo_vd_vd(o, vrec_vd_vd(u), u); u = (vdouble)vor_vm_vo64_vm(visinf_vo_vd(d), (vmask)u); + u = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), u); return u; } vdouble xtan_u1(vdouble d) { - vint q; vdouble u; vdouble2 s, t, x; vopmask o; +#if 0 + vint ql = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_2_PI))); - q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_2_PI))); - u = vcast_vd_vi(q); - + u = vcast_vd_vi(ql); s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_A*2))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_B*2))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_C*2))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_D*2))); - - o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1))); +#else + vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24)))); + s = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd(vcast_vd2_d_d(M_2_PI_H, M_2_PI_L), d), + vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-(double)(1 << 24)), + vsel_vd_vo_vd_vd(vlt_vo_vd_vd(d, vcast_vd_d(0)), + vcast_vd_d(-0.5), vcast_vd_d(0.5)))); + vint ql = vtruncate_vi_vd(vadd_vd_vd_vd(s.x, s.y)); + vdouble dql = vcast_vd_vi(ql); + + s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_A*0.5 * (1 << 24)))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5 ))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5 * (1 << 24)))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5 ))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5 * (1 << 24)))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5 ))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(vmla_vd_vd_vd_vd(dqh, vcast_vd_d(1 << 24), dql), + vcast_vd_d(-PI_D*0.5))); +#endif + + o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))); vmask n = vand_vm_vo64_vm(o, (vmask)vcast_vd_d(-0.0)); s.x = (vdouble)vxor_vm_vm_vm((vmask)s.x, n); s.y = (vdouble)vxor_vm_vm_vm((vmask)s.y, n); @@ -479,7 +623,10 @@ vdouble xtan_u1(vdouble d) { u = vadd_vd_vd_vd(x.x, x.y); - u = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), u); + u = vsel_vd_vo_vd_vd(vandnot_vo_vo_vo(visinf_vo_vd(d), + vor_vo_vo_vo(visnegzero_vo_vd(d), + vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX)))), + vcast_vd_d(-0.0), u); return u; } @@ -755,7 +902,8 @@ vdouble xexp(vdouble d) { u = vldexp_vd_vd_vi(u, q); - u = (vdouble)vandnot_vm_vo64_vm(visminf_vo_vd(d), (vmask)u); + u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d, vcast_vd_d(709.78271114955742909217217426)), vcast_vd_d(INFINITY), u); + u = (vdouble)vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-1000)), (vmask)u); return u; } @@ -776,22 +924,27 @@ static INLINE vdouble2 logk(vdouble d) { x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m)); x2 = ddsqu_vd2_vd2(x); - t = vcast_vd_d(0.13860436390467167910856); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.131699838841615374240845)); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.153914168346271945653214)); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.181816523941564611721589)); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.22222224632662035403996)); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.285714285511134091777308)); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.400000000000914013309483)); - t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.666666666666664853302393)); + t = vcast_vd_d(0.116255524079935043668677); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.103239680901072952701192)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.117754809412463995466069)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.13332981086846273921509)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.153846227114512262845736)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.181818180850050775676507)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.222222222230083560345903)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.285714285714249172087875)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.400000000000000077715612)); + vdouble2 c = vcast_vd2_d_d(0.666666666666666629659233, 3.80554962542412056336616e-17); #ifndef ENABLE_AVX512F - return ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd(vcast_vd2_vd_vd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)), - vcast_vd_vi(e)), - ddadd2_vd2_vd2_vd2(ddscale_vd2_vd2_vd(x, vcast_vd_d(2)), ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(x2, x), t))); + return ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e)), + ddadd2_vd2_vd2_vd2(ddscale_vd2_vd2_vd(x, vcast_vd_d(2)), + ddmul_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(x2, x), + ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd(x2, t), c)))); #else return ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd(vcast_vd2_vd_vd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)), e), - ddadd2_vd2_vd2_vd2(ddscale_vd2_vd2_vd(x, vcast_vd_d(2)), ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(x2, x), t))); + ddadd2_vd2_vd2_vd2(ddscale_vd2_vd2_vd(x, vcast_vd_d(2)), + ddmul_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(x2, x), + ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd(x2, t), c)))); #endif } @@ -833,21 +986,24 @@ static INLINE vdouble expk(vdouble2 d) { u = vadd_vd_vd_vd(t.x, t.y); u = vldexp_vd_vd_vi(u, q); + u = (vdouble)vandnot_vm_vo64_vm(vlt_vo_vd_vd(d.x, vcast_vd_d(-1000)), (vmask)u); + return u; } vdouble xpow(vdouble x, vdouble y) { #if 1 - vdouble debug = vcast_vd_vi(vrint_vi_vd(y)); - vopmask yisnint = vneq_vo_vd_vd(vcast_vd_vi(vrint_vi_vd(y)), y); - vopmask yisodd = vandnot_vo_vo_vo(yisnint, vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vrint_vi_vd(y), vcast_vi_i(1)), vcast_vi_i(1)))); + vopmask yisint = visint(y); + vopmask yisodd = vand_vo_vo_vo(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vtruncate_vi_vd(y), vcast_vi_i(1)), vcast_vi_i(1))), yisint); - vdouble result = expk(ddmul_vd2_vd2_vd(logk(vabs_vd_vd(x)), y)); + vdouble2 d = ddmul_vd2_vd2_vd(logk(vabs_vd_vd(x)), y); + vdouble result = expk(d); + result = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d.x, vcast_vd_d(709.78271114955742909217217426)), vcast_vd_d(INFINITY), result); result = vmul_vd_vd_vd(result, vsel_vd_vo_vd_vd(vgt_vo_vd_vd(x, vcast_vd_d(0)), vcast_vd_d(1), - (vdouble)vor_vm_vo64_vm(yisnint, (vmask)vsel_vd_vo_vd_vd(yisodd, vcast_vd_d(-1.0), vcast_vd_d(1))))); + vsel_vd_vo_vd_vd(yisint, vsel_vd_vo_vd_vd(yisodd, vcast_vd_d(-1.0), vcast_vd_d(1)), vcast_vd_d(NAN)))); vdouble efx = vmulsign_vd_vd_vd(vsub_vd_vd_vd(vabs_vd_vd(x), vcast_vd_d(1)), y); @@ -897,7 +1053,7 @@ static INLINE vdouble2 expk2(vdouble2 d) { t = ddadd_vd2_vd_vd2(vcast_vd_d(1), t); - return ddscale_vd2_vd2_vd(t, vpow2i_vd_vi(q)); + return ddscale_vd2_vd2_vd(ddscale_vd2_vd2_vd(t, vcast_vd_d(2)), vpow2i_vd_vi(vsub_vi_vi_vi(q, vcast_vi_i(1)))); } vdouble xsinh(vdouble x) { @@ -970,21 +1126,32 @@ static INLINE vdouble2 logk2(vdouble2 d) { vdouble xasinh(vdouble x) { vdouble y = vabs_vd_vd(x); - vdouble2 d = logk2(ddadd2_vd2_vd2_vd(ddsqrt_vd2_vd2(ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd(y, y), vcast_vd_d(1))), y)); - y = vadd_vd_vd_vd(d.x, d.y); + vopmask o = vgt_vo_vd_vd(y, vcast_vd_d(1)); + vdouble2 d; + + d = vsel_vd2_vo_vd2_vd2(o, ddrec_vd2_vd(x), vcast_vd2_vd_vd(y, vcast_vd_d(0))); + d = ddsqrt_vd2_vd2(ddadd2_vd2_vd2_vd(ddsqu_vd2_vd2(d), vcast_vd_d(1))); + d = vsel_vd2_vo_vd2_vd2(o, ddmul_vd2_vd2_vd(d, y), d); - y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), visnan_vo_vd(y)), vcast_vd_d(INFINITY), y); - y = vmulsign_vd_vd_vd(y, x); + d = logk2(ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd(d, x))); + y = vadd_vd_vd_vd(d.x, d.y); + + y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(SQRT_DBL_MAX)), + visnan_vo_vd(y)), + vmulsign_vd_vd_vd(vcast_vd_d(INFINITY), x), y); y = (vdouble)vor_vm_vo64_vm(visnan_vo_vd(x), (vmask)y); + y = vsel_vd_vo_vd_vd(visnegzero_vo_vd(x), vcast_vd_d(-0.0), y); return y; } vdouble xacosh(vdouble x) { - vdouble2 d = logk2(ddadd2_vd2_vd2_vd(ddsqrt_vd2_vd2(ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd(x, x), vcast_vd_d(-1))), x)); + vdouble2 d = logk2(ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddsqrt_vd2_vd2(ddadd2_vd2_vd_vd(x, vcast_vd_d(1))), ddsqrt_vd2_vd2(ddadd2_vd2_vd_vd(x, vcast_vd_d(-1)))), x)); vdouble y = vadd_vd_vd_vd(d.x, d.y); - y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), visnan_vo_vd(y)), vcast_vd_d(INFINITY), y); + y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(SQRT_DBL_MAX)), + visnan_vo_vd(y)), + vcast_vd_d(INFINITY), y); y = (vdouble)vandnot_vm_vo64_vm(veq_vo_vd_vd(x, vcast_vd_d(1.0)), (vmask)y); y = (vdouble)vor_vm_vo64_vm(vlt_vo_vd_vd(x, vcast_vd_d(1.0)), (vmask)y); @@ -999,7 +1166,6 @@ vdouble xatanh(vdouble x) { y = (vdouble)vor_vm_vo64_vm(vgt_vo_vd_vd(y, vcast_vd_d(1.0)), (vmask)vsel_vd_vo_vd_vd(veq_vo_vd_vd(y, vcast_vd_d(1.0)), vcast_vd_d(INFINITY), vmul_vd_vd_vd(vadd_vd_vd_vd(d.x, d.y), vcast_vd_d(0.5)))); y = (vdouble)vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(x), visnan_vo_vd(y)), (vmask)y); - y = vmulsign_vd_vd_vd(y, x); y = (vdouble)vor_vm_vo64_vm(visnan_vo_vd(x), (vmask)y); @@ -1009,9 +1175,11 @@ vdouble xatanh(vdouble x) { vdouble xcbrt(vdouble d) { vdouble x, y, q = vcast_vd_d(1.0); vint e, qu, re; - vdouble t, s; + vdouble t; - s = d; +#ifdef ENABLE_GETEXP_DP + vdouble s = d; +#endif e = vadd_vi_vi_vi(vilogbk_vi_vd(vabs_vd_vd(d)), vcast_vi_i(1)); d = vldexp_vd_vd_vi(d, vneg_vi_vi(e)); @@ -1047,11 +1215,13 @@ vdouble xcbrt(vdouble d) { } vdouble xcbrt_u1(vdouble d) { - vdouble x, y, z, t, s; + vdouble x, y, z, t; vdouble2 q2 = vcast_vd2_d_d(1, 0), u, v; vint e, qu, re; - s = d; +#ifdef ENABLE_GETEXP_DP + vdouble s = d; +#endif e = vadd_vi_vi_vi(vilogbk_vi_vd(vabs_vd_vd(d)), vcast_vi_i(1)); d = vldexp_vd_vd_vi(d, vneg_vi_vi(e)); @@ -1101,14 +1271,14 @@ vdouble xcbrt_u1(vdouble d) { vdouble xexp2(vdouble a) { vdouble u = expk(ddmul_vd2_vd2_vd(vcast_vd2_vd_vd(vcast_vd_d(0.69314718055994528623), vcast_vd_d(2.3190468138462995584e-17)), a)); - u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(a, vcast_vd_d(1023)), vcast_vd_d(INFINITY), u); + u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(a, vcast_vd_d(1024)), vcast_vd_d(INFINITY), u); u = (vdouble)vandnot_vm_vo64_vm(visminf_vo_vd(a), (vmask)u); return u; } vdouble xexp10(vdouble a) { vdouble u = expk(ddmul_vd2_vd2_vd(vcast_vd2_vd_vd(vcast_vd_d(2.3025850929940459011), vcast_vd_d(-2.1707562233822493508e-16)), a)); - u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(a, vcast_vd_d(308)), vcast_vd_d(INFINITY), u); + u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(a, vcast_vd_d(308.254715559916743850652254)), vcast_vd_d(INFINITY), u); u = (vdouble)vandnot_vm_vo64_vm(visminf_vo_vd(a), (vmask)u); return u; } @@ -1116,8 +1286,8 @@ vdouble xexp10(vdouble a) { vdouble xexpm1(vdouble a) { vdouble2 d = ddadd2_vd2_vd2_vd(expk2(vcast_vd2_vd_vd(a, vcast_vd_d(0))), vcast_vd_d(-1.0)); vdouble x = vadd_vd_vd_vd(d.x, d.y); - x = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(a, vcast_vd_d(700)), vcast_vd_d(INFINITY), x); - x = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(a, vcast_vd_d(-0.36043653389117156089696070315825181539851971360337e+2)), vcast_vd_d(-1), x); + x = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(a, vcast_vd_d(709.782712893383996732223)), vcast_vd_d(INFINITY), x); + x = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(a, vcast_vd_d(-36.736800569677101399113302437)), vcast_vd_d(-1), x); x = vsel_vd_vo_vd_vd(visnegzero_vo_vd(a), vcast_vd_d(-0.0), x); return x; } @@ -1137,7 +1307,7 @@ vdouble xlog1p(vdouble a) { vdouble2 d = logk2(ddadd2_vd2_vd_vd(a, vcast_vd_d(1))); vdouble x = vadd_vd_vd_vd(d.x, d.y); - x = vsel_vd_vo_vd_vd(vispinf_vo_vd(a), vcast_vd_d(INFINITY), x); + x = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(a, vcast_vd_d(1e+307)), vcast_vd_d(INFINITY), x); x = (vdouble)vor_vm_vo64_vm(vgt_vo_vd_vd(vcast_vd_d(-1.0), a), (vmask)x); x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(a, vcast_vd_d(-1)), vcast_vd_d(-INFINITY), x); x = vsel_vd_vo_vd_vd(visnegzero_vo_vd(a), vcast_vd_d(-0.0), x); diff --git a/simd/sleefsimdsp.c b/simd/sleefsimdsp.c index 647cfcb0..760cf04e 100644 --- a/simd/sleefsimdsp.c +++ b/simd/sleefsimdsp.c @@ -55,6 +55,17 @@ #define PI4_Cf 3.7747668102383613586e-08f #define PI4_Df 1.2816720341285448015e-12f +#define PI_Af 3.140625f +#define PI_Bf 0.0009670257568359375f +#define PI_Cf 6.2771141529083251953e-07f +#define PI_Df 1.2154201256553420762e-10f + +#define PI_XDf 1.2141754268668591976e-10f +#define PI_XEf 1.2446743939339977025e-13f + +#define TRIGRANGEMAXf 1e+7 // 39000 +#define SQRT_FLT_MAX 18446743523953729536.0 + #define L2Uf 0.693145751953125f #define L2Lf 1.428606765330187045e-06f #define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f @@ -132,15 +143,15 @@ vfloat xldexpf(vfloat x, vint2 q) { return vldexp_vf_vf_vi2(x, q); } vfloat xsinf(vfloat d) { vint2 q; - vfloat u, s; + vfloat u, s, r = d; q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI))); u = vcast_vf_vi2(q); - d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Af*4), d); - d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*4), d); - d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*4), d); - d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Df*4), d); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af), d); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf), d); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf), d); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df), d); s = vmul_vf_vf_vf(d, d); @@ -151,27 +162,29 @@ vfloat xsinf(vfloat d) { u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f)); u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f)); - u = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(u, d), d); + u = vadd_vf_vf_vf(vmul_vf_vf_vf(s, vmul_vf_vf_vf(u, d)), d); - u = (vfloat)vor_vm_vo32_vm(visinf_vo_vf(d), (vmask)u); + u = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visnegzero_vo_vf(r), + vgt_vo_vf_vf(vabs_vf_vf(r), vcast_vf_f(TRIGRANGEMAXf))), + vcast_vf_f(-0.0), u); - u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0), u); + u = (vfloat)vor_vm_vo32_vm(visinf_vo_vf(d), (vmask)u); return u; } vfloat xcosf(vfloat d) { vint2 q; - vfloat u, s; + vfloat u, s, r = d; q = vrint_vi2_vf(vsub_vf_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI)), vcast_vf_f(0.5f))); q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vcast_vi2_i(1)); u = vcast_vf_vi2(q); - d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2), d); - d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2), d); - d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2), d); - d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2), d); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f), d); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f), d); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f), d); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), d); s = vmul_vf_vf_vf(d, d); @@ -182,7 +195,10 @@ vfloat xcosf(vfloat d) { u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f)); u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f)); - u = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(u, d), d); + u = vadd_vf_vf_vf(vmul_vf_vf_vf(s, vmul_vf_vf_vf(u, d)), d); + + u = (vfloat)vandnot_vm_vo32_vm(vgt_vo_vf_vf(vabs_vf_vf(r), vcast_vf_f(TRIGRANGEMAXf)), + (vmask)u); u = (vfloat)vor_vm_vo32_vm(visinf_vo_vf(d), (vmask)u); @@ -200,10 +216,10 @@ vfloat2 xsincosf(vfloat d) { s = d; u = vcast_vf_vi2(q); - s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2), s); - s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2), s); - s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2), s); - s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2), s); + s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f), s); + s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f), s); + s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f), s); + s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), s); t = s; @@ -235,8 +251,11 @@ vfloat2 xsincosf(vfloat d) { o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2)); r.y = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y))); + o = vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf)); + r.x = (vfloat)vandnot_vm_vo32_vm(o, (vmask)r.x); + r.y = (vfloat)vandnot_vm_vo32_vm(o, (vmask)r.y); + o = visinf_vo_vf(d); - r.x = (vfloat)vor_vm_vo32_vm(o, (vmask)r.x); r.y = (vfloat)vor_vm_vo32_vm(o, (vmask)r.y); @@ -253,10 +272,10 @@ vfloat xtanf(vfloat d) { x = d; u = vcast_vf_vi2(q); - x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2), x); - x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2), x); - x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2), x); - x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2), x); + x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f), x); + x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f), x); + x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f), x); + x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), x); s = vmul_vf_vf_vf(x, x); @@ -287,10 +306,10 @@ vfloat xsinf_u1(vfloat d) { q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(M_1_PI))); u = vcast_vf_vi2(q); - s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Af*4))); - s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*4))); - s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*4))); - s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Df*4))); + s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Af))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Bf))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Cf))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Df))); t = s; s = dfsqu_vf2_vf2(s); @@ -305,7 +324,9 @@ vfloat xsinf_u1(vfloat d) { u = vadd_vf_vf_vf(x.x, x.y); u = (vfloat)vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), (vmask)vcast_vf_f(-0.0)), (vmask)u); - u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0), u); + u = vsel_vf_vo_vf_vf(vandnot_vo_vo_vo(visinf_vo_vf(d), vor_vo_vo_vo(visnegzero_vo_vf(d), + vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf)))), + vcast_vf_f(-0.0), u); return u; } @@ -319,10 +340,10 @@ vfloat xcosf_u1(vfloat d) { q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vcast_vi2_i(1)); u = vcast_vf_vi2(q); - s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2))); - s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2))); - s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2))); - s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2))); + s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f))); t = s; s = dfsqu_vf2_vf2(s); @@ -338,6 +359,9 @@ vfloat xcosf_u1(vfloat d) { u = (vfloat)vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0)), (vmask)vcast_vf_f(-0.0)), (vmask)u); + u = (vfloat)vandnot_vm_vo32_vm(vandnot_vo_vo_vo(visinf_vo_vf(d), vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf))), + (vmask)u); + return u; } @@ -350,10 +374,10 @@ vfloat2 xsincosf_u1(vfloat d) { q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(2 * M_1_PI))); u = vcast_vf_vi2(q); - s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2))); - s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2))); - s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2))); - s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2))); + s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f))); t = s; @@ -390,6 +414,10 @@ vfloat2 xsincosf_u1(vfloat d) { o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2)); r.y = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y))); + o = vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf)); + r.x = (vfloat)vandnot_vm_vo32_vm(o, (vmask)r.x); + r.y = (vfloat)vandnot_vm_vo32_vm(o, (vmask)r.y); + o = visinf_vo_vf(d); r.x = (vfloat)vor_vm_vo32_vm(o, (vmask)r.x); r.y = (vfloat)vor_vm_vo32_vm(o, (vmask)r.y); @@ -406,10 +434,11 @@ vfloat xtanf_u1(vfloat d) { q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(M_2_PI))); u = vcast_vf_vi2(q); - s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2))); - s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2))); - s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2))); - s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2))); + s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_XDf*0.5f))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_XEf*0.5f))); o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)); vmask n = vand_vm_vo32_vm(o, (vmask)vcast_vf_f(-0.0)); @@ -434,7 +463,10 @@ vfloat xtanf_u1(vfloat d) { u = vadd_vf_vf_vf(x.x, x.y); - u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), u); + u = vsel_vf_vo_vf_vf(vandnot_vo_vo_vo(visinf_vo_vf(d), + vor_vo_vo_vo(visnegzero_vo_vf(d), + vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf)))), + vcast_vf_f(-0.0f), u); return u; } @@ -667,17 +699,25 @@ vfloat xexpf(vfloat d) { s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf), d); s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf), s); +#if 0 u = vcast_vf_f(0.00136324646882712841033936f); u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00836596917361021041870117f)); u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416710823774337768554688f)); u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.166665524244308471679688f)); u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.499999850988388061523438f)); - +#else + u = vcast_vf_f(0.000198527617612853646278381); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00139304355252534151077271)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833336077630519866943359)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416664853692054748535156)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.166666671633720397949219)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.5)); +#endif u = vadd_vf_vf_vf(vcast_vf_f(1.0f), vmla_vf_vf_vf_vf(vmul_vf_vf_vf(s, s), u, s)); u = vldexp_vf_vf_vi2(u, q); - u = (vfloat)vandnot_vm_vo32_vm(visminf_vo_vf(d), (vmask)u); + u = (vfloat)vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-104)), (vmask)u); return u; } @@ -710,10 +750,12 @@ vfloat xsqrtf(vfloat d) { return vsqrt_vf_vf(d); } #endif vfloat xcbrtf(vfloat d) { - vfloat x, y, q = vcast_vf_f(1.0), t, s; + vfloat x, y, q = vcast_vf_f(1.0), t; vint2 e, qu, re; - s = d; +#ifdef ENABLE_GETEXP_DP + vfloat s = d; +#endif e = vadd_vi2_vi2_vi2(vilogbk_vi2_vf(vabs_vf_vf(d)), vcast_vi2_i(1)); d = vldexp_vf_vf_vi2(d, vneg_vi2_vi2(e)); @@ -747,11 +789,13 @@ vfloat xcbrtf(vfloat d) { } vfloat xcbrtf_u1(vfloat d) { - vfloat x, y, z, t, s; + vfloat x, y, z, t; vfloat2 q2 = vcast_vf2_f_f(1, 0), u, v; vint2 e, qu, re; - s = d; +#ifdef ENABLE_GETEXP_DP + vfloat s = d; +#endif e = vadd_vi2_vi2_vi2(vilogbk_vi2_vf(vabs_vf_vf(d)), vcast_vi2_i(1)); d = vldexp_vf_vf_vi2(d, vneg_vi2_vi2(e)); @@ -815,18 +859,22 @@ static INLINE vfloat2 logkf(vfloat d) { x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(-1), m), dfadd2_vf2_vf_vf(vcast_vf_f(1), m)); x2 = dfsqu_vf2_vf2(x); - t = vcast_vf_f(0.2392828464508056640625f); - t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.28518211841583251953125f)); - t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.400005877017974853515625f)); - t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.666666686534881591796875f)); + t = vcast_vf_f(0.240320354700088500976562); + t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.285112679004669189453125)); + t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.400007992982864379882812)); + vfloat2 c = vcast_vf2_f_f(0.66666662693023681640625f, 3.69183861259614332084311e-09f); #ifndef ENABLE_AVX512F return dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf(vcast_vf2_vf_vf(vcast_vf_f(0.69314718246459960938f), vcast_vf_f(-1.904654323148236017e-09f)), - vcast_vf_vi2(e)), - dfadd2_vf2_vf2_vf2(dfscale_vf2_vf2_vf(x, vcast_vf_f(2)), dfmul_vf2_vf2_vf(dfmul_vf2_vf2_vf2(x2, x), t))); + vcast_vf_vi2(e)), + dfadd2_vf2_vf2_vf2(dfscale_vf2_vf2_vf(x, vcast_vf_f(2)), + dfmul_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(x2, x), + dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf(x2, t), c)))); #else return dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf(vcast_vf2_vf_vf(vcast_vf_f(0.69314718246459960938f), vcast_vf_f(-1.904654323148236017e-09f)), e), - dfadd2_vf2_vf2_vf2(dfscale_vf2_vf2_vf(x, vcast_vf_f(2)), dfmul_vf2_vf2_vf(dfmul_vf2_vf2_vf2(x2, x), t))); + dfadd2_vf2_vf2_vf2(dfscale_vf2_vf2_vf(x, vcast_vf_f(2)), + dfmul_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(x2, x), + dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf(x2, t), c)))); #endif } @@ -867,20 +915,26 @@ static INLINE vfloat expkf(vfloat2 d) { u = vadd_vf_vf_vf(t.x, t.y); u = vldexp_vf_vf_vi2(u, q); + u = (vfloat)vandnot_vm_vo32_vm(vlt_vo_vf_vf(d.x, vcast_vf_f(-104)), (vmask)u); + return u; } vfloat xpowf(vfloat x, vfloat y) { #if 1 - vopmask yisnint = vneq_vo_vf_vf(vcast_vf_vi2(vrint_vi2_vf(y)), y); - vopmask yisodd = vandnot_vo_vo_vo(yisnint, veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vrint_vi2_vf(y), vcast_vi2_i(1)), vcast_vi2_i(1))); + vopmask yisint = vor_vo_vo_vo(veq_vo_vf_vf(vtruncate_vf_vf(y), y), vgt_vo_vf_vf(vabs_vf_vf(y), vcast_vf_f(1 << 23))); + vopmask yisodd = vand_vo_vo_vo(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vtruncate_vi2_vf(y), vcast_vi2_i(1)), vcast_vi2_i(1)), yisint); + +#ifdef ENABLE_NEON32 + yisodd = vandnot_vm_vo32_vm(visinf_vo_vf(y), yisodd); +#endif vfloat result = expkf(dfmul_vf2_vf2_vf(logkf(vabs_vf_vf(x)), y)); result = vmul_vf_vf_vf(result, vsel_vf_vo_vf_vf(vgt_vo_vf_vf(x, vcast_vf_f(0)), vcast_vf_f(1), - (vfloat)vor_vm_vo32_vm(yisnint, (vmask)vsel_vf_vo_vf_vf(yisodd, vcast_vf_f(-1), vcast_vf_f(1))))); + vsel_vf_vo_vf_vf(yisint, vsel_vf_vo_vf_vf(yisodd, vcast_vf_f(-1.0f), vcast_vf_f(1)), vcast_vf_f(NANf)))); vfloat efx = vmulsign_vf_vf_vf(vsub_vf_vf_vf(vabs_vf_vf(x), vcast_vf_f(1)), y); @@ -924,8 +978,8 @@ static INLINE vfloat2 expk2f(vfloat2 d) { t = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfsqu_vf2_vf2(s), u)); t = dfadd_vf2_vf_vf2(vcast_vf_f(1), t); - - return dfscale_vf2_vf2_vf(t, vpow2i_vf_vi2(q)); + return dfscale_vf2_vf2_vf(dfscale_vf2_vf2_vf(t, vcast_vf_f(2)), + vpow2i_vf_vi2(vsub_vi2_vi2_vi2(q, vcast_vi2_i(1)))); } vfloat xsinhf(vfloat x) { @@ -997,21 +1051,32 @@ static INLINE vfloat2 logk2f(vfloat2 d) { vfloat xasinhf(vfloat x) { vfloat y = vabs_vf_vf(x); - vfloat2 d = logk2f(dfadd_vf2_vf2_vf(dfsqrt_vf2_vf2(dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf(y, y), vcast_vf_f(1))), y)); + vopmask o = vgt_vo_vf_vf(y, vcast_vf_f(1)); + vfloat2 d; + + d = vsel_vf2_vo_vf2_vf2(o, dfrec_vf2_vf(x), vcast_vf2_vf_vf(y, vcast_vf_f(0))); + d = dfsqrt_vf2_vf2(dfadd2_vf2_vf2_vf(dfsqu_vf2_vf2(d), vcast_vf_f(1))); + d = vsel_vf2_vo_vf2_vf2(o, dfmul_vf2_vf2_vf(d, y), d); + + d = logk2f(dfnormalize_vf2_vf2(dfadd2_vf2_vf2_vf(d, x))); y = vadd_vf_vf_vf(d.x, d.y); - y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), visnan_vo_vf(y)), vcast_vf_f(INFINITYf), y); - y = vmulsign_vf_vf_vf(y, x); + y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(SQRT_FLT_MAX)), + visnan_vo_vf(y)), + vmulsign_vf_vf_vf(vcast_vf_f(INFINITYf), x), y); y = (vfloat)vor_vm_vo32_vm(visnan_vo_vf(x), (vmask)y); + y = vsel_vf_vo_vf_vf(visnegzero_vo_vf(x), vcast_vf_f(-0.0), y); return y; } vfloat xacoshf(vfloat x) { - vfloat2 d = logk2f(dfadd2_vf2_vf2_vf(dfsqrt_vf2_vf2(dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf(x, x), vcast_vf_f(-1))), x)); + vfloat2 d = logk2f(dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfsqrt_vf2_vf2(dfadd2_vf2_vf_vf(x, vcast_vf_f(1))), dfsqrt_vf2_vf2(dfadd2_vf2_vf_vf(x, vcast_vf_f(-1)))), x)); vfloat y = vadd_vf_vf_vf(d.x, d.y); - y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), visnan_vo_vf(y)), vcast_vf_f(INFINITYf), y); + y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(SQRT_FLT_MAX)), + visnan_vo_vf(y)), + vcast_vf_f(INFINITYf), y); y = (vfloat)vandnot_vm_vo32_vm(veq_vo_vf_vf(x, vcast_vf_f(1.0f)), (vmask)y); @@ -1035,22 +1100,14 @@ vfloat xatanhf(vfloat x) { vfloat xexp2f(vfloat a) { vfloat u = expkf(dfmul_vf2_vf2_vf(vcast_vf2_vf_vf(vcast_vf_f(0.69314718246459960938f), vcast_vf_f(-1.904654323148236017e-09f)), a)); -#ifdef ENABLE_NEON32 - u = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(a, vcast_vf_f(127.0f)), vcast_vf_f(INFINITYf), u); -#else - u = vsel_vf_vo_vf_vf(vispinf_vo_vf(a), vcast_vf_f(INFINITYf), u); -#endif + u = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(a, vcast_vf_f(128.0f)), vcast_vf_f(INFINITYf), u); u = (vfloat)vandnot_vm_vo32_vm(visminf_vo_vf(a), (vmask)u); return u; } vfloat xexp10f(vfloat a) { vfloat u = expkf(dfmul_vf2_vf2_vf(vcast_vf2_vf_vf(vcast_vf_f(2.3025851249694824219f), vcast_vf_f(-3.1975436520781386207e-08f)), a)); -#ifdef ENABLE_NEON32 - u = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(a, vcast_vf_f(38.0f)), vcast_vf_f(INFINITYf), u); -#else - u = vsel_vf_vo_vf_vf(vispinf_vo_vf(a), vcast_vf_f(INFINITYf), u); -#endif + u = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(a, vcast_vf_f(38.531839419103626f)), vcast_vf_f(INFINITYf), u); u = (vfloat)vandnot_vm_vo32_vm(visminf_vo_vf(a), (vmask)u); return u; } @@ -1058,8 +1115,8 @@ vfloat xexp10f(vfloat a) { vfloat xexpm1f(vfloat a) { vfloat2 d = dfadd2_vf2_vf2_vf(expk2f(vcast_vf2_vf_vf(a, vcast_vf_f(0))), vcast_vf_f(-1.0)); vfloat x = vadd_vf_vf_vf(d.x, d.y); - x = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(a, vcast_vf_f(88.0f)), vcast_vf_f(INFINITYf), x); - x = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(a, vcast_vf_f(-0.15942385152878742116596338793538061065739925620174e+2f)), vcast_vf_f(-1), x); + x = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(a, vcast_vf_f(88.72283905206835f)), vcast_vf_f(INFINITYf), x); + x = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(a, vcast_vf_f(-16.635532333438687426013570f)), vcast_vf_f(-1), x); x = vsel_vf_vo_vf_vf(visnegzero_vo_vf(a), vcast_vf_f(-0.0f), x); return x; } @@ -1079,7 +1136,7 @@ vfloat xlog1pf(vfloat a) { vfloat2 d = logk2f(dfadd2_vf2_vf_vf(a, vcast_vf_f(1))); vfloat x = vadd_vf_vf_vf(d.x, d.y); - x = vsel_vf_vo_vf_vf(vispinf_vo_vf(a), vcast_vf_f(INFINITYf), x); + x = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(a, vcast_vf_f(1e+38)), vcast_vf_f(INFINITYf), x); x = (vfloat)vor_vm_vo32_vm(vgt_vo_vf_vf(vcast_vf_f(-1), a), (vmask)x); x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(a, vcast_vf_f(-1)), vcast_vf_f(-INFINITYf), x); x = vsel_vf_vo_vf_vf(visnegzero_vo_vf(a), vcast_vf_f(-0.0f), x); diff --git a/simd/starttester2.sh b/simd/starttester2.sh new file mode 100644 index 00000000..e7a5b6c1 --- /dev/null +++ b/simd/starttester2.sh @@ -0,0 +1,9 @@ +#!/bin/sh +nohup nice ./tester2sse2 >> ../tester2.out & +nohup nice ./tester2fsse2 >> ../tester2.out & +nohup nice ./tester2avx >> ../tester2.out & +nohup nice ./tester2favx >> ../tester2.out & +nohup nice ./tester2avx2 >> ../tester2.out & +nohup nice ./tester2favx2 >> ../tester2.out & +nohup nice sde64 -- ./tester2avx512f >> ../tester2.out & +nohup nice sde64 -- ./tester2favx512f >> ../tester2.out & diff --git a/simd/stoptester2.sh b/simd/stoptester2.sh new file mode 100644 index 00000000..da6503ac --- /dev/null +++ b/simd/stoptester2.sh @@ -0,0 +1,2 @@ +#!/bin/sh +killall tester2avx tester2avx2 tester2sse2 tester2avx512f tester2favx tester2favx2 tester2fsse2 tester2favx512f diff --git a/simd/tester2fsimd.c b/simd/tester2fsimd.c new file mode 100644 index 00000000..7179386f --- /dev/null +++ b/simd/tester2fsimd.c @@ -0,0 +1,520 @@ +#include +#include +#include +#include +#include +#include +#include + +#define _GNU_SOURCE +#include +#include +#include "sleefsimd.h" + +mpfr_t fra, frb, frc, frd, frw, frx, fry, frz; + +#define DENORMAL_FLT_MIN (1.40130e-45f) + +double countULP(float d, mpfr_t c) { + float c2 = mpfr_get_d(c, GMP_RNDN); + if (c2 == 0 && d != 0) return 10000; + if (!isfinite(c2) && !isfinite(d)) return 0; + + int e; + frexpl(mpfr_get_d(c, GMP_RNDN), &e); + mpfr_set_ld(frw, fmaxl(ldexpl(1.0, e-24), DENORMAL_FLT_MIN), GMP_RNDN); + + mpfr_set_d(frd, d, GMP_RNDN); + mpfr_sub(fry, frd, c, GMP_RNDN); + mpfr_div(fry, fry, frw, GMP_RNDN); + double u = fabs(mpfr_get_d(fry, GMP_RNDN)); + + return u; +} + +double countULP2(float d, mpfr_t c) { + float c2 = mpfr_get_d(c, GMP_RNDN); + if (c2 == 0 && d != 0) return 10000; + if (!isfinite(c2) && !isfinite(d)) return 0; + + int e; + frexpl(mpfr_get_d(c, GMP_RNDN), &e); + mpfr_set_ld(frw, fmaxl(ldexpl(1.0, e-24), FLT_MIN), GMP_RNDN); + + mpfr_set_d(frd, d, GMP_RNDN); + mpfr_sub(fry, frd, c, GMP_RNDN); + mpfr_div(fry, fry, frw, GMP_RNDN); + double u = fabs(mpfr_get_d(fry, GMP_RNDN)); + + return u; +} + +typedef union { + double d; + uint64_t u64; + int64_t i64; +} conv64_t; + +typedef union { + float f; + uint32_t u32; + int32_t i32; +} conv32_t; + +float rnd_fr() { + conv32_t c; + do { +#if 1 + syscall(SYS_getrandom, &c.u32, sizeof(c.u32), 0); +#else + c.u32 = (uint32_t)random() | ((uint32_t)random() << 31); +#endif + } while(!isfinite(c.f)); + return c.f; +} + +float rnd_zo() { + conv32_t c; + do { +#if 1 + syscall(SYS_getrandom, &c.u32, sizeof(c.u32), 0); +#else + c.u32 = (uint32_t)random() | ((uint32_t)random() << 31); +#endif + } while(!isfinite(c.f) || c.f < -1 || 1 < c.f); + return c.f; +} + +int main(int argc,char **argv) +{ + mpfr_set_default_prec(256); + mpfr_inits(fra, frb, frc, frd, frw, frx, fry, frz, NULL); + + conv32_t cd; + float d, t; + vfloat vd, vd2, vzo, vad; + vfloat2 sc, sc2; + int cnt; + + srandom(time(NULL)); + +#if 0 + cd.f = M_PI; + mpfr_set_d(frx, cd.f, GMP_RNDN); + cd.i32+=3; + printf("%g\n", countULP(cd.f, frx)); +#endif + + const float rangemax = 39000; + + for(cnt = 0;;cnt++) { + int e = cnt % VECTLENSP; + switch(cnt & 7) { + case 0: + d = (2 * (float)random() / RAND_MAX - 1) * rangemax; + break; + case 1: + cd.f = rint((2 * (float)random() / RAND_MAX - 1) * rangemax) * M_PI_4; + cd.i32 += (random() & 31) - 15; + d = cd.f; + break; + case 2: + d = (2 * (float)random() / RAND_MAX - 1) * rangemax; + break; + case 3: + cd.f = rint((2 * (float)random() / RAND_MAX - 1) * rangemax) * M_PI_4; + cd.i32 += (random() & 31) - 15; + d = cd.f; + break; + case 4: + d = (2 * (float)random() / RAND_MAX - 1) * 10000; + break; + case 5: + cd.f = rint((2 * (float)random() / RAND_MAX - 1) * 10000) * M_PI_4; + cd.i32 += (random() & 31) - 15; + d = cd.f; + break; + default: + d = rnd_fr(); + break; + } + + if (!isfinite(d)) continue; + + vd[e] = d; + sc = xsincosf(vd); + sc2 = xsincosf_u1(vd); + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_sin(frx, frx, GMP_RNDN); + + float u0 = countULP(t = xsinf(vd)[e], frx); + + if ((fabs(d) <= rangemax && u0 > 3.5) || fabs(t) > 1 || !isfinite(t)) { + printf(SLEEF_ARCH " sinf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + float u1 = countULP(t = sc.x[e], frx); + + if ((fabs(d) <= rangemax && u1 > 3.5) || fabs(t) > 1 || !isfinite(t)) { + printf(SLEEF_ARCH " sincosf sin arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + + float u2 = countULP(t = xsinf_u1(vd)[e], frx); + + if ((fabs(d) <= rangemax && u2 > 1) || fabs(t) > 1 || !isfinite(t)) { + printf(SLEEF_ARCH " sinf_u1 arg=%.20g ulp=%.20g\n", d, u2); + fflush(stdout); + } + + float u3 = countULP(t = sc2.x[e], frx); + + if ((fabs(d) <= rangemax && u3 > 1) || fabs(t) > 1 || !isfinite(t)) { + printf(SLEEF_ARCH " sincosf_u1 sin arg=%.20g ulp=%.20g\n", d, u3); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cos(frx, frx, GMP_RNDN); + + float u0 = countULP(t = xcosf(vd)[e], frx); + + if ((fabs(d) <= rangemax && u0 > 3.5) || fabs(t) > 1 || !isfinite(t)) { + printf(SLEEF_ARCH " cosf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + float u1 = countULP(t = sc.y[e], frx); + + if ((fabs(d) <= rangemax && u1 > 3.5) || fabs(t) > 1 || !isfinite(t)) { + printf(SLEEF_ARCH " sincosf cos arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + + float u2 = countULP(t = xcosf_u1(vd)[e], frx); + + if ((fabs(d) <= rangemax && u2 > 1) || fabs(t) > 1 || !isfinite(t)) { + printf(SLEEF_ARCH " cosf_u1 arg=%.20g ulp=%.20g\n", d, u2); + fflush(stdout); + } + + float u3 = countULP(t = sc2.y[e], frx); + + if ((fabs(d) <= rangemax && u3 > 1) || fabs(t) > 1 || !isfinite(t)) { + printf(SLEEF_ARCH " sincosf_u1 cos arg=%.20g ulp=%.20g\n", d, u3); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_tan(frx, frx, GMP_RNDN); + + float u0 = countULP(t = xtanf(vd)[e], frx); + + if ((fabs(d) < rangemax && u0 > 3.5) || isnan(t)) { + printf(SLEEF_ARCH " tanf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + float u1 = countULP(t = xtanf_u1(vd)[e], frx); + + if ((fabs(d) <= rangemax && u1 > 1) || isnan(t)) { + printf(SLEEF_ARCH " tanf_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + d = rnd_fr(); + float d2 = rnd_fr(), zo = rnd_zo(); + vd[e] = d; + vd2[e] = d2; + vzo[e] = zo; + vad[e] = fabs(d); + + { + mpfr_set_d(frx, fabsf(d), GMP_RNDN); + mpfr_log(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xlogf(vad)[e], frx); + + if (u0 > 3.5) { + printf(SLEEF_ARCH " logf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xlogf_u1(vad)[e], frx); + + if (u1 > 1) { + printf(SLEEF_ARCH " logf_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, fabsf(d), GMP_RNDN); + mpfr_log10(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xlog10f(vad)[e], frx); + + if (u0 > 1) { + printf(SLEEF_ARCH " log10f arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_log1p(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xlog1pf(vd)[e], frx); + + if ((-1 <= d && d <= 1e+38 && u0 > 1) || + (d < -1 && !isnan(t)) || + (d > 1e+38 && !(u0 <= 1 || isinf(t)))) { + printf(SLEEF_ARCH " log1pf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xexpf(vd)[e], frx); + + if (u0 > 1) { + printf(SLEEF_ARCH " expf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp2(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xexp2f(vd)[e], frx); + + if (u0 > 1) { + printf(SLEEF_ARCH " exp2f arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp10(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xexp10f(vd)[e], frx); + + if (u0 > 1) { + printf(SLEEF_ARCH " exp10f arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_expm1(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xexpm1f(vd)[e], frx); + + if (u0 > 1) { + printf(SLEEF_ARCH " expm1f arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_set_d(fry, d2, GMP_RNDN); + mpfr_pow(frx, fry, frx, GMP_RNDN); + + double u0 = countULP(t = xpowf(vd2, vd)[e], frx); + + if (u0 > 1) { + printf(SLEEF_ARCH " powf arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cbrt(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xcbrtf(vd)[e], frx); + + if (u0 > 3.5) { + printf(SLEEF_ARCH " cbrtf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xcbrtf_u1(vd)[e], frx); + + if (u1 > 1) { + printf(SLEEF_ARCH " cbrtf_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, zo, GMP_RNDN); + mpfr_asin(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xasinf(vzo)[e], frx); + + if (u0 > 3.5) { + printf(SLEEF_ARCH " asinf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xasinf_u1(vzo)[e], frx); + + if (u1 > 1) { + printf(SLEEF_ARCH " asinf_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, zo, GMP_RNDN); + mpfr_acos(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xacosf(vzo)[e], frx); + + if (u0 > 3.5) { + printf(SLEEF_ARCH " acosf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xacosf_u1(vzo)[e], frx); + + if (u1 > 1) { + printf(SLEEF_ARCH " acosf_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_atan(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xatanf(vd)[e], frx); + + if (u0 > 3.5) { + printf(SLEEF_ARCH " atanf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xatanf_u1(vd)[e], frx); + + if (u1 > 1) { + printf(SLEEF_ARCH " atanf_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_set_d(fry, d2, GMP_RNDN); + mpfr_atan2(frx, fry, frx, GMP_RNDN); + + double u0 = countULP(t = xatan2f(vd2, vd)[e], frx); + + if (u0 > 3.5) { + printf(SLEEF_ARCH " atan2f arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0); + fflush(stdout); + } + + double u1 = countULP2(t = xatan2f_u1(vd2, vd)[e], frx); + + if (u1 > 1) { + printf(SLEEF_ARCH " atan2f_u1 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_sinh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xsinhf(vd)[e], frx); + + if ((fabs(d) <= 88.5 && u0 > 1) || + (d > 88.5 && !(u0 <= 1 || (isinf(t) && t > 0))) || + (d < -88.5 && !(u0 <= 1 || (isinf(t) && t < 0)))) { + printf(SLEEF_ARCH " sinhf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cosh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xcoshf(vd)[e], frx); + + if ((fabs(d) <= 88.5 && u0 > 1) || !(u0 <= 1 || (isinf(t) && t > 0))) { + printf(SLEEF_ARCH " coshf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_tanh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xtanhf(vd)[e], frx); + + if (u0 > 1.0001) { + printf(SLEEF_ARCH " tanhf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_asinh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xasinhf(vd)[e], frx); + + if ((fabs(d) < sqrt(FLT_MAX) && u0 > 1.0001) || + (d >= sqrt(FLT_MAX) && !(u0 <= 1.0001 || (isinf(t) && t > 0))) || + (d <= -sqrt(FLT_MAX) && !(u0 <= 1.0001 || (isinf(t) && t < 0)))) { + printf(SLEEF_ARCH " asinhf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_acosh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xacoshf(vd)[e], frx); + + if ((fabs(d) < sqrt(FLT_MAX) && u0 > 1.0001) || + (d >= sqrt(FLT_MAX) && !(u0 <= 1.0001 || (isinff(t) && t > 0))) || + (d <= -sqrt(FLT_MAX) && !isnan(t))) { + printf(SLEEF_ARCH " acoshf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_atanh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xatanhf(vd)[e], frx); + + if (u0 > 1.0001) { + printf(SLEEF_ARCH " atanhf arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + } +} diff --git a/simd/tester2simd.c b/simd/tester2simd.c new file mode 100644 index 00000000..f701ee91 --- /dev/null +++ b/simd/tester2simd.c @@ -0,0 +1,515 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#define _GNU_SOURCE +#include +#include +#include "sleefsimd.h" + +mpfr_t fra, frb, frc, frd, frw, frx, fry, frz; + +#define DENORMAL_DBL_MIN (4.94066e-324) + +double countULP(double d, mpfr_t c) { + double c2 = mpfr_get_d(c, GMP_RNDN); + if (c2 == 0 && d != 0) return 10000; + if (!isfinite(c2) && !isfinite(d)) return 0; + + int e; + frexpl(mpfr_get_d(c, GMP_RNDN), &e); + mpfr_set_ld(frw, fmaxl(ldexpl(1.0, e-53), DENORMAL_DBL_MIN), GMP_RNDN); + + mpfr_set_d(frd, d, GMP_RNDN); + mpfr_sub(fry, frd, c, GMP_RNDN); + mpfr_div(fry, fry, frw, GMP_RNDN); + double u = fabs(mpfr_get_d(fry, GMP_RNDN)); + + return u; +} + +double countULP2(double d, mpfr_t c) { + double c2 = mpfr_get_d(c, GMP_RNDN); + if (c2 == 0 && d != 0) return 10000; + if (!isfinite(c2) && !isfinite(d)) return 0; + + int e; + frexpl(mpfr_get_d(c, GMP_RNDN), &e); + mpfr_set_ld(frw, fmaxl(ldexpl(1.0, e-53), DBL_MIN), GMP_RNDN); + + mpfr_set_d(frd, d, GMP_RNDN); + mpfr_sub(fry, frd, c, GMP_RNDN); + mpfr_div(fry, fry, frw, GMP_RNDN); + double u = fabs(mpfr_get_d(fry, GMP_RNDN)); + + return u; +} + +typedef union { + double d; + uint64_t u64; + int64_t i64; +} conv_t; + +double rnd_fr() { + conv_t c; + do { +#if 1 + syscall(SYS_getrandom, &c.u64, sizeof(c.u64), 0); +#else + c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62); +#endif + } while(!isfinite(c.d)); + return c.d; +} + +double rnd_zo() { + conv_t c; + do { +#if 1 + syscall(SYS_getrandom, &c.u64, sizeof(c.u64), 0); +#else + c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62); +#endif + } while(!isfinite(c.d) || c.d < -1 || 1 < c.d); + return c.d; +} + +int main(int argc,char **argv) +{ + mpfr_set_default_prec(256); + mpfr_inits(fra, frb, frc, frd, frw, frx, fry, frz, NULL); + + conv_t cd; + double d, t; + vdouble vd, vd2, vzo, vad; + vdouble2 sc, sc2; + int cnt; + + srandom(time(NULL)); + +#if 0 + cd.d = M_PI; + mpfr_set_d(frx, cd.d, GMP_RNDN); + cd.i64+=3; + printf("%g\n", countULP(cd.d, frx)); +#endif + + const double rangemax = 1e+14; // 2^(24*2-1) + + for(cnt = 0;;cnt++) { + int e = cnt % VECTLENDP; + switch(cnt & 7) { + case 0: + d = (2 * (double)random() / RAND_MAX - 1) * rangemax; + break; + case 1: + cd.d = rint((2 * (double)random() / RAND_MAX - 1) * rangemax) * M_PI_4; + cd.i64 += (random() & 31) - 15; + d = cd.d; + break; + case 2: + d = (2 * (double)random() / RAND_MAX - 1) * 1e+7; + break; + case 3: + cd.d = rint((2 * (double)random() / RAND_MAX - 1) * 1e+7) * M_PI_4; + cd.i64 += (random() & 31) - 15; + d = cd.d; + break; + case 4: + d = (2 * (double)random() / RAND_MAX - 1) * 10000; + break; + case 5: + cd.d = rint((2 * (double)random() / RAND_MAX - 1) * 10000) * M_PI_4; + cd.i64 += (random() & 31) - 15; + d = cd.d; + break; + default: + d = rnd_fr(); + break; + } + + if (!isfinite(d)) continue; + + vd[e] = d; + sc = xsincos(vd); + sc2 = xsincos_u1(vd); + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_sin(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xsin(vd)[e], frx); + + if ((fabs(d) <= rangemax && u0 > 3.5) || fabs(t) > 1 || !isfinite(t)) { + printf(SLEEF_ARCH " sin arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = sc.x[e], frx); + + if ((fabs(d) <= rangemax && u1 > 3.5) || fabs(t) > 1 || !isfinite(t)) { + printf(SLEEF_ARCH " sincos sin arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + + double u2 = countULP(t = xsin_u1(vd)[e], frx); + + if ((fabs(d) <= rangemax && u2 > 1) || fabs(t) > 1 || !isfinite(t)) { + printf(SLEEF_ARCH " sin_u1 arg=%.20g ulp=%.20g\n", d, u2); + fflush(stdout); + } + + double u3 = countULP(t = sc2.x[e], frx); + + if ((fabs(d) <= rangemax && u3 > 1) || fabs(t) > 1 || !isfinite(t)) { + printf(SLEEF_ARCH " sincos_u1 sin arg=%.20g ulp=%.20g\n", d, u3); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cos(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xcos(vd)[e], frx); + + if ((fabs(d) <= rangemax && u0 > 3.5) || fabs(t) > 1 || !isfinite(t)) { + printf(SLEEF_ARCH " cos arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = sc.y[e], frx); + + if ((fabs(d) <= rangemax && u1 > 3.5) || fabs(t) > 1 || !isfinite(t)) { + printf(SLEEF_ARCH " sincos cos arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + + double u2 = countULP(t = xcos_u1(vd)[e], frx); + + if ((fabs(d) <= rangemax && u2 > 1) || fabs(t) > 1 || !isfinite(t)) { + printf(SLEEF_ARCH " cos_u1 arg=%.20g ulp=%.20g\n", d, u2); + fflush(stdout); + } + + double u3 = countULP(t = sc2.y[e], frx); + + if ((fabs(d) <= rangemax && u3 > 1) || fabs(t) > 1 || !isfinite(t)) { + printf(SLEEF_ARCH " sincos_u1 cos arg=%.20g ulp=%.20g\n", d, u3); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_tan(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xtan(vd)[e], frx); + + if ((fabs(d) < 1e+7 && u0 > 3.5) || (fabs(d) <= rangemax && u0 > 5) || isnan(t)) { + printf(SLEEF_ARCH " tan arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xtan_u1(vd)[e], frx); + + if ((fabs(d) <= rangemax && u1 > 1) || isnan(t)) { + printf(SLEEF_ARCH " tan_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + d = rnd_fr(); + double d2 = rnd_fr(), zo = rnd_zo(); + vd[e] = d; + vd2[e] = d2; + vzo[e] = zo; + vad[e] = fabs(d); + + { + mpfr_set_d(frx, fabs(d), GMP_RNDN); + mpfr_log(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xlog(vad)[e], frx); + + if (u0 > 3.5) { + printf(SLEEF_ARCH " log arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xlog_u1(vad)[e], frx); + + if (u1 > 1) { + printf(SLEEF_ARCH " log_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, fabs(d), GMP_RNDN); + mpfr_log10(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xlog10(vad)[e], frx); + + if (u0 > 1) { + printf(SLEEF_ARCH " log10 arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_log1p(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xlog1p(vd)[e], frx); + + if ((-1 <= d && d <= 1e+307 && u0 > 1) || + (d < -1 && !isnan(t)) || + (d > 1e+307 && !(u0 <= 1 || isinf(t)))) { + printf(SLEEF_ARCH " log1p arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xexp(vd)[e], frx); + + if (u0 > 1) { + printf(SLEEF_ARCH " exp arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp2(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xexp2(vd)[e], frx); + + if (u0 > 1) { + printf(SLEEF_ARCH " exp2 arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp10(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xexp10(vd)[e], frx); + + if (u0 > 1) { + printf(SLEEF_ARCH " exp10 arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_expm1(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xexpm1(vd)[e], frx); + + if (u0 > 1) { + printf(SLEEF_ARCH " expm1 arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_set_d(fry, d2, GMP_RNDN); + mpfr_pow(frx, fry, frx, GMP_RNDN); + + double u0 = countULP(t = xpow(vd2, vd)[e], frx); + + if (u0 > 1) { + printf(SLEEF_ARCH " pow arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cbrt(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xcbrt(vd)[e], frx); + + if (u0 > 3.5) { + printf(SLEEF_ARCH " cbrt arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xcbrt_u1(vd)[e], frx); + + if (u1 > 1) { + printf(SLEEF_ARCH " cbrt_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, zo, GMP_RNDN); + mpfr_asin(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xasin(vzo)[e], frx); + + if (u0 > 3.5) { + printf(SLEEF_ARCH " asin arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xasin_u1(vzo)[e], frx); + + if (u1 > 1) { + printf(SLEEF_ARCH " asin_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, zo, GMP_RNDN); + mpfr_acos(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xacos(vzo)[e], frx); + + if (u0 > 3.5) { + printf(SLEEF_ARCH " acos arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xacos_u1(vzo)[e], frx); + + if (u1 > 1) { + printf(SLEEF_ARCH " acos_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_atan(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xatan(vd)[e], frx); + + if (u0 > 3.5) { + printf(SLEEF_ARCH " atan arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + + double u1 = countULP(t = xatan_u1(vd)[e], frx); + + if (u1 > 1) { + printf(SLEEF_ARCH " atan_u1 arg=%.20g ulp=%.20g\n", d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_set_d(fry, d2, GMP_RNDN); + mpfr_atan2(frx, fry, frx, GMP_RNDN); + + double u0 = countULP(t = xatan2(vd2, vd)[e], frx); + + if (u0 > 3.5) { + printf(SLEEF_ARCH " atan2 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0); + fflush(stdout); + } + + double u1 = countULP2(t = xatan2_u1(vd2, vd)[e], frx); + + if (u1 > 1) { + printf(SLEEF_ARCH " atan2_u1 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u1); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_sinh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xsinh(vd)[e], frx); + + if ((fabs(d) <= 709 && u0 > 1) || + (d > 709 && !(u0 <= 1 || (isinf(t) && t > 0))) || + (d < -709 && !(u0 <= 1 || (isinf(t) && t < 0)))) { + printf(SLEEF_ARCH " sinh arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cosh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xcosh(vd)[e], frx); + + if ((fabs(d) <= 709 && u0 > 1) || !(u0 <= 1 || (isinf(t) && t > 0))) { + printf(SLEEF_ARCH " cosh arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_tanh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xtanh(vd)[e], frx); + + if (u0 > 1) { + printf(SLEEF_ARCH " tanh arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_asinh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xasinh(vd)[e], frx); + + if ((fabs(d) < sqrt(DBL_MAX) && u0 > 1) || + (d >= sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t > 0))) || + (d <= -sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t < 0)))) { + printf(SLEEF_ARCH " asinh arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_acosh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xacosh(vd)[e], frx); + + if ((fabs(d) < sqrt(DBL_MAX) && u0 > 1) || + (d >= sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t > 0))) || + (d <= -sqrt(DBL_MAX) && !isnan(t))) { + printf(SLEEF_ARCH " acosh arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + + { + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_atanh(frx, frx, GMP_RNDN); + + double u0 = countULP(t = xatanh(vd)[e], frx); + + if (u0 > 1) { + printf(SLEEF_ARCH " atanh arg=%.20g ulp=%.20g\n", d, u0); + fflush(stdout); + } + } + } +} diff --git a/tester/tester.c b/tester/tester.c index a37acaeb..8626b0b7 100644 --- a/tester/tester.c +++ b/tester/tester.c @@ -1629,6 +1629,7 @@ void do_test() { boolean success = true; for(i=0;i 1000) { - fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); - goto STOP_SIN; + if (u > 20) { + fprintf(stderr, "arg = %.20g, correct = %.20g, test = %.20g, ulp = %g\n", d, (double)c, q, u); } } - for(d = -10000000;d < 10000000;d += 200.1) { + for(d = -1e+14;d < 1e+14;d += (1e+9 + 0.1)) { double q = child_sin(d); long double c = sinlfr(d); - double u = countULP(q, c); + double u = countULP(q, c); max = fmax(max, u); - if (u > 1000) { - fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); - goto STOP_SIN; + if (u > 20) { + fprintf(stderr, "arg = %.20g, correct = %.20g, test = %.20g, ulp = %g\n", d, (double)c, q, u); } } - int i; + int64_t i; - for(i=1;i<10000;i++) { + for(i=(int64_t)-1e+14;i<(int64_t)1e+14;i+=(int64_t)1e+11) { double start = u2d(d2u(M_PI_4 * i)-20); double end = u2d(d2u(M_PI_4 * i)+20); for(d = start;d <= end;d = u2d(d2u(d)+1)) { double q = child_sin(d); long double c = sinlfr(d); - double u = countULP(q, c); + double u = countULP(q, c); max = fmax(max, u); - if (u > 1000) { - fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); - goto STOP_SIN; + if (u > 20) { + fprintf(stderr, "arg = %.20g, correct = %.20g, test = %.20g, ulp = %g\n", d, (double)c, q, u); } } } - STOP_SIN: - fprintf(stderr, "sin : %lf ... ", max); - showResult(max < 5); + showResult(max < 10); } { @@ -2300,23 +2298,23 @@ void do_test() { max = fmax(max, u); } - for(d = -10000000;d < 10000000;d += 200.1) { + for(d = -1e+14;d < 1e+14;d += (1e+9 + 0.1)) { double q = child_cos(d); long double c = coslfr(d); double u = countULP(q, c); max = fmax(max, u); } - int i; + int64_t i; - for(i=1;i<10000;i++) { + for(i=(int64_t)-1e+14;i<(int64_t)1e+14;i+=(int64_t)1e+11) { double start = u2d(d2u(M_PI_4 * i)-20); double end = u2d(d2u(M_PI_4 * i)+20); for(d = start;d <= end;d = u2d(d2u(d)+1)) { double q = child_cos(d); long double c = coslfr(d); - double u = countULP(q, c); + double u = countULP(q, c); max = fmax(max, u); } } @@ -2340,7 +2338,7 @@ void do_test() { } } - for(d = -10000000;d < 10000000;d += 200.1) { + for(d = -1e+14;d < 1e+14;d += (1e+9 + 0.1)) { double2 q = child_sincos(d); long double c = sinlfr(d); double u = fabs((q.x - c) / ulp(c)); @@ -2386,7 +2384,8 @@ void do_test() { max = fmax(max, u); } - for(d = -10000000;d < 10000000;d += 200.1) { + + for(d = -1e+14;d < 1e+14;d += (1e+9 + 0.1)) { double2 q = child_sincos(d); long double c = coslfr(d); double u = fabs((q.y - c) / ulp(c)); @@ -2422,7 +2421,7 @@ void do_test() { max = fmax(max, u); } - for(d = -10000000;d < 10000000;d += 200.1) { + for(d = -1e+14;d < 1e+14;d += (1e+9 + 0.1)) { double q = child_tan(d); long double c = tanlfr(d); double u = countULP(q, c); diff --git a/tester/testersp.c b/tester/testersp.c index 1eb4ef87..08a44d66 100644 --- a/tester/testersp.c +++ b/tester/testersp.c @@ -1723,7 +1723,7 @@ void do_test() { { fprintf(stderr, "asinf denormal/nonnumber test ... "); - float xa[] = { NANf, POSITIVE_INFINITYf, NEGATIVE_INFINITYf, 2, -2, 1, -1, +0.0f, -0.0f }; + float xa[] = { NANf, POSITIVE_INFINITYf, NEGATIVE_INFINITYf, 2, -2, 1, -1, nextafterf(1, 2), nextafterf(-1, -2), +0.0f, -0.0f }; boolean success = true; for(i=0;i