diff --git a/Makefile b/Makefile
index 498a46e5..2a5764b4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,11 +1,10 @@
 test :
 	cd purec; make test
 	cd simd; make testsse2 testavx
-	cd java; make test
 
 clean :
-	rm -f *~
-	cd java; make clean
+	rm -f *~ *.out
 	cd purec; make clean
 	cd simd; make clean
 	cd tester; make clean
+	cd gencoef; make clean
diff --git a/README b/README
index d398cbce..f418ffc2 100644
--- a/README
+++ b/README
@@ -21,14 +21,25 @@ Main download page : http://shibatch.sourceforge.net/
 The trigonometric functions are tested to return values with specified
 accuracy when the argument is within the following range.
 
-Double precision trigonometric functions : |arg| <= 10000000
-Single precision trigonometric functions : |arg| <= 10000
+Double precision trigonometric functions : [-1e+14, 1e+14]
+Single precision trigonometric functions : [-39000, 39000]
 
 --
 
 
 History
 
+2.110
+* The valid range of argument is extended for trig functions
+* Specification of each functions regarding to the domain and accuracy is added
+* A coefficient generation tool is added
+* New testing tools are introduced
+* Following functions returned incorrect values when the argument is very large or small : exp, pow, asinh, acosh
+* SIMD xsin and xcos returned values more than 1 when FMA is enabled
+* Pure C cbrt returned incorrect values when the argument is negative
+* tan_u1 returned values with more than 1 ulp of error on rare occasions
+* Removed support for Java language(because no one seems using this)
+
 2.100 Added support for AVX-512F and Clang Extended Vectors.
 
 2.90 Added ilogbf. All the reported bugs(listed below) are fixed.
diff --git a/purec/Makefile b/purec/Makefile
index 5790d0d8..4a771cb2 100644
--- a/purec/Makefile
+++ b/purec/Makefile
@@ -15,6 +15,10 @@ iut : sleefdp.c sleefsp.c iut.c
 ../tester/testerspu1 :
 	cd ../tester; make testerspu1
 
+tester2 : tester2.c tester2f.c sleefdp.c sleefsp.c
+	$(CC) -ffp-contract=off -Wall -DNDEBUG tester2.c sleefdp.c -o tester2 -lm -lmpfr
+	$(CC) -ffp-contract=off -Wall -DNDEBUG tester2f.c sleefsp.c -o tester2f -lm -lmpfr
+
 test : iut ../tester/tester ../tester/testeru1 ../tester/testersp ../tester/testerspu1
 	../tester/tester ./iut
 	../tester/testeru1 ./iut
@@ -22,4 +26,4 @@ test : iut ../tester/tester ../tester/testeru1 ../tester/testersp ../tester/test
 	../tester/testerspu1 ./iut
 
 clean :
-	rm -f *~ *.o iut
+	rm -f *~ *.o iut tester2 tester2f
diff --git a/purec/sleefdp.c b/purec/sleefdp.c
index 0d3f74da..c0bccae1 100644
--- a/purec/sleefdp.c
+++ b/purec/sleefdp.c
@@ -9,10 +9,16 @@
 
 #include "nonnumber.h"
 
-#define PI4_A 0.78539816290140151978
-#define PI4_B 4.9604678871439933374e-10
-#define PI4_C 1.1258708853173288931e-18
-#define PI4_D 1.7607799325916000908e-27
+#define PI_A 3.1415926218032836914
+#define PI_B 3.1786509424591713469e-08
+#define PI_C 1.2246467864107188502e-16
+#define PI_D 1.2736634327021899816e-24
+
+#define M_2_PI_H 0.63661977236758138243
+#define M_2_PI_L -3.9357353350364971764e-17
+
+#define TRIGRANGEMAX 1e+14
+#define SQRT_DBL_MAX 1.3407807929942596355e+154
 
 #define M_4_PI 1.273239544735162542821171882678754627704620361328125
 
@@ -20,6 +26,25 @@
 #define L2L .28235290563031577122588448175013436025525412068e-12
 #define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
 
+char frstr[1000];
+char *toBC(double d) {
+  union {
+    double d;
+    uint64_t u64;
+    int64_t i64;
+  } cnv;
+
+  cnv.d = d;
+  
+  int64_t l = cnv.i64;
+  int e = (int)((l >> 52) & ~(-1L << 11));
+  int s = (int)(l >> 63);
+  l = d == 0 ? 0 : ((l & ~((-1L) << 52)) | (1L << 52));
+
+  sprintf(frstr, "%s%lld*2^%d", s != 0 ? "-" : "", (long long int)l, (e-0x3ff-52));
+  return frstr;
+}
+
 static inline int64_t doubleToRawLongBits(double d) {
   union {
     double f;
@@ -49,6 +74,7 @@ static inline double mulsign(double x, double y) {
 static inline double sign(double d) { return mulsign(1, d); }
 static inline double mla(double x, double y, double z) { return x * y + z; }
 static inline double xrint(double x) { return x < 0 ? (int)(x - 0.5) : (int)(x + 0.5); }
+static inline double xtrunc(double x) { return (double)(int)x; }
 
 static inline int xisnan(double x) { return x != x; }
 static inline int xisinf(double x) { return x == INFINITY || x == -INFINITY; }
@@ -56,6 +82,11 @@ static inline int xisminf(double x) { return x == -INFINITY; }
 static inline int xispinf(double x) { return x == INFINITY; }
 static inline int xisnegzero(double x) { return doubleToRawLongBits(x) == doubleToRawLongBits(-0.0); }
 
+static inline int xisint(double d) {
+  double x = d - (double)(1 << 31) * (int)(d * (1.0 / (1 << 31)));
+  return (x == (int)x) || (xfabs(d) >= (double)(1LL << 52));
+}
+
 static inline double pow2i(int q) {
   return longBitsToDouble(((int64_t)(q + 0x3ff)) << 52);
 }
@@ -149,7 +180,10 @@ static inline double2 ddadd_d2_d_d(double x, double y) {
   double2 r;
 
 #ifndef NDEBUG
-  if (!(checkfp(x) || checkfp(y) || xfabs(x) >= xfabs(y))) fprintf(stderr, "[ddadd_d2_d_d : %g, %g]", x, y);
+  if (!(checkfp(x) || checkfp(y) || xfabs(x) >= xfabs(y))) {
+    fprintf(stderr, "[ddadd_d2_d_d : %g, %g]\n", x, y);
+    fflush(stderr);
+  }
 #endif
 
   r.x = x + y;
@@ -174,7 +208,10 @@ static inline double2 ddadd_d2_d2_d(double2 x, double y) {
   double2 r;
 
 #ifndef NDEBUG
-  if (!(checkfp(x.x) || checkfp(y) || xfabs(x.x) >= xfabs(y))) fprintf(stderr, "[ddadd_d2_d2_d : %g %g]", x.x, y);
+  if (!(checkfp(x.x) || checkfp(y) || xfabs(x.x) >= xfabs(y))) {
+    fprintf(stderr, "[ddadd_d2_d2_d : %g %g]\n", x.x, y);
+    fflush(stderr);
+  }
 #endif
 
   r.x = x.x + y;
@@ -202,7 +239,10 @@ static inline double2 ddadd_d2_d_d2(double x, double2 y) {
   double2 r;
 
 #ifndef NDEBUG
-  if (!(checkfp(x) || checkfp(y.x) || xfabs(x) >= xfabs(y.x))) fprintf(stderr, "[ddadd_d2_d_d2 : %g %g]", x, y.x);
+  if (!(checkfp(x) || checkfp(y.x) || xfabs(x) >= xfabs(y.x))) {
+    fprintf(stderr, "[ddadd_d2_d_d2 : %g %g]\n", x, y.x);
+    fflush(stderr);
+  }
 #endif
 
   r.x = x + y.x;
@@ -227,7 +267,10 @@ static inline double2 ddadd_d2_d2_d2(double2 x, double2 y) {
   double2 r;
 
 #ifndef NDEBUG
-  if (!(checkfp(x.x) || checkfp(y.x) || xfabs(x.x) >= xfabs(y.x))) fprintf(stderr, "[ddadd_d2_d2_d2 : %g %g]", x.x, y.x);
+  if (!(checkfp(x.x) || checkfp(y.x) || xfabs(x.x) >= xfabs(y.x))) {
+    fprintf(stderr, "[ddadd_d2_d2_d2 : %g %g]\n", x.x, y.x);
+    fflush(stderr);
+  }
 #endif
 
   r.x = x.x + y.x;
@@ -253,7 +296,10 @@ static inline double2 ddsub_d2_d2_d2(double2 x, double2 y) {
   double2 r;
 
 #ifndef NDEBUG
-  if (!(checkfp(x.x) || checkfp(y.x) || xfabs(x.x) >= xfabs(y.x))) fprintf(stderr, "[ddsub_d2_d2_d2 : %g %g]", x.x, y.x);
+  if (!(checkfp(x.x) || checkfp(y.x) || xfabs(x.x) >= xfabs(y.x))) {
+    fprintf(stderr, "[ddsub_d2_d2_d2 : %g %g]\n", x.x, y.x);
+    fflush(stderr);
+  }
 #endif
 
   r.x = x.x - y.x;
@@ -481,8 +527,9 @@ static double2 atan2k_u1(double2 y, double2 x) {
 
   t = ddmul_d2_d2_d(t, u);
   t = ddmul_d2_d2_d2(s, ddadd_d2_d_d2(1, t));
+  if (xfabs(s.x) < 1e-200) t = s;
   t = ddadd2_d2_d2_d2(ddmul_d2_d2_d(dd(1.570796326794896557998982, 6.12323399573676603586882e-17), q), t);
-
+  
   return t;
 }
 
@@ -521,19 +568,22 @@ double xatan_u1(double d) {
 }
 
 double xsin(double d) {
-  int q;
-  double u, s;
-
-  q = (int)xrint(d * M_1_PI);
-
-  d = mla(q, -PI4_A*4, d);
-  d = mla(q, -PI4_B*4, d);
-  d = mla(q, -PI4_C*4, d);
-  d = mla(q, -PI4_D*4, d);
-
+  double u, s, t = d;
+
+  int qh = xtrunc(d * (M_1_PI / (1 << 24)));
+  int ql = xrint(d * M_1_PI - qh * (double)(1 << 24));
+
+  d = mla(qh, -PI_A * (1 << 24), d);
+  d = mla(ql, -PI_A,             d);
+  d = mla(qh, -PI_B * (1 << 24), d);
+  d = mla(ql, -PI_B,             d);
+  d = mla(qh, -PI_C * (1 << 24), d);
+  d = mla(ql, -PI_C,             d);
+  d = mla((double)qh * (1 << 24) + ql, -PI_D, d);
+  
   s = d * d;
 
-  if ((q & 1) != 0) d = -d;
+  if ((ql & 1) != 0) d = -d;
 
   u = -7.97255955009037868891952e-18;
   u = mla(u, s, 2.81009972710863200091251e-15);
@@ -547,23 +597,26 @@ double xsin(double d) {
 
   u = mla(s, u * d, d);
 
-  if (xisnegzero(d)) u = -0.0;
+  if (!xisinf(t) && (xisnegzero(t) || xfabs(t) > TRIGRANGEMAX)) u = -0.0;
 
   return u;
 }
 
 double xsin_u1(double d) {
-  int q;
   double u;
   double2 s, t, x;
 
-  q = (int)xrint(d * M_1_PI);
-
-  s = ddadd2_d2_d_d(d, q * (-PI4_A*4));
-  s = ddadd2_d2_d2_d(s, q * (-PI4_B*4));
-  s = ddadd2_d2_d2_d(s, q * (-PI4_C*4));
-  s = ddadd2_d2_d2_d(s, q * (-PI4_D*4));
+  int qh = xtrunc(d * (M_1_PI / (1 << 24)));
+  int ql = xrint(d * M_1_PI - qh * (double)(1 << 24));
 
+  s = ddadd2_d2_d_d (d, qh * (-PI_A * (1 << 24)));
+  s = ddadd2_d2_d2_d(s, ql * (-PI_A            ));
+  s = ddadd2_d2_d2_d(s, qh * (-PI_B * (1 << 24)));
+  s = ddadd2_d2_d2_d(s, ql * (-PI_B            ));
+  s = ddadd2_d2_d2_d(s, qh * (-PI_C * (1 << 24)));
+  s = ddadd2_d2_d2_d(s, ql * (-PI_C            ));
+  s = ddadd2_d2_d2_d(s, ((double)qh * (1 << 24) + ql) * -PI_D);
+  
   t = s;
   s = ddsqu_d2_d2(s);
 
@@ -576,30 +629,33 @@ double xsin_u1(double d) {
   u = mla(u, s.x, 0.00833333333333318056201922);
 
   x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(-0.166666666666666657414808, u * s.x), s));
-
+  
   x = ddmul_d2_d2_d2(t, x);
   u = x.x + x.y;
 
-  if ((q & 1) != 0) u = -u;
-  if (xisnegzero(d)) u = -0.0;
+  if ((ql & 1) != 0) u = -u;
+  if (!xisinf(d) && (xisnegzero(d) || xfabs(d) > TRIGRANGEMAX)) u = -0.0;
 
   return u;
 }
 
 double xcos(double d) {
-  int q;
-  double u, s;
-
-  q = 1 + 2*(int)xrint(d * M_1_PI - 0.5);
-
-  d = mla(q, -PI4_A*2, d);
-  d = mla(q, -PI4_B*2, d);
-  d = mla(q, -PI4_C*2, d);
-  d = mla(q, -PI4_D*2, d);
-
+  double u, s, t = d;
+
+  int qh = xtrunc(d * (M_1_PI / (1LL << 23)) - 0.5 * (M_1_PI / (1LL << 23)));
+  int ql = 2*xrint(d * M_1_PI - 0.5 - qh * (double)(1LL << 23))+1;
+
+  d = mla(qh, -PI_A*0.5*(1LL << 24), d);
+  d = mla(ql, -PI_A*0.5,             d);
+  d = mla(qh, -PI_B*0.5*(1LL << 24), d);
+  d = mla(ql, -PI_B*0.5,             d);
+  d = mla(qh, -PI_C*0.5*(1LL << 24), d);
+  d = mla(ql, -PI_C*0.5,             d);
+  d = mla((double)qh*(1LL << 24) + ql , -PI_D*0.5, d);
+  
   s = d * d;
 
-  if ((q & 2) == 0) d = -d;
+  if ((ql & 2) == 0) d = -d;
 
   u = -7.97255955009037868891952e-18;
   u = mla(u, s, 2.81009972710863200091251e-15);
@@ -613,22 +669,28 @@ double xcos(double d) {
 
   u = mla(s, u * d, d);
 
+  if (!xisinf(t) && xfabs(t) > TRIGRANGEMAX) u = 0.0;
+
   return u;
 }
 
 double xcos_u1(double d) {
-  double u, q;
+  double u;
   double2 s, t, x;
 
-  d = fabs(d);
-
-  q = mla(2, xrint(d * M_1_PI - 0.5), 1);
+  d = xfabs(d);
 
-  s = ddadd2_d2_d_d(d, q * (-PI4_A*2));
-  s = ddadd2_d2_d2_d(s, q * (-PI4_B*2));
-  s = ddadd2_d2_d2_d(s, q * (-PI4_C*2));
-  s = ddadd2_d2_d2_d(s, q * (-PI4_D*2));
+  int qh = xtrunc(d * (M_1_PI / (1LL << (23))) - 0.5 * (M_1_PI / (1LL << (23))));
+  int ql = 2*xrint(d * M_1_PI - 0.5 - qh * (double)(1LL << (23)))+1;
 
+  s = ddadd2_d2_d_d (d, qh * (-PI_A*0.5 * (1 << 24)));
+  s = ddadd2_d2_d2_d(s, ql * (-PI_A*0.5            ));
+  s = ddadd2_d2_d2_d(s, qh * (-PI_B*0.5 * (1 << 24)));
+  s = ddadd2_d2_d2_d(s, ql * (-PI_B*0.5            ));
+  s = ddadd2_d2_d2_d(s, qh * (-PI_C*0.5 * (1 << 24)));
+  s = ddadd2_d2_d2_d(s, ql * (-PI_C*0.5            ));
+  s = ddadd2_d2_d2_d(s, ((double)qh * (1 << 24) + ql) * (-PI_D*0.5));
+  
   t = s;
   s = ddsqu_d2_d2(s);
 
@@ -646,25 +708,29 @@ double xcos_u1(double d) {
 
   u = x.x + x.y;
 
-  if ((((int)q) & 2) == 0) u = -u;
+  if ((((int)ql) & 2) == 0) u = -u;
+  if (!xisinf(d) && d > TRIGRANGEMAX) u = 0.0;
 
   return u;
 }
 
 double2 xsincos(double d) {
-  int q;
   double u, s, t;
   double2 r;
 
-  q = (int)xrint(d * (2 * M_1_PI));
-
   s = d;
 
-  s = mla(-q, PI4_A*2, s);
-  s = mla(-q, PI4_B*2, s);
-  s = mla(-q, PI4_C*2, s);
-  s = mla(-q, PI4_D*2, s);
+  int qh = xtrunc(d * ((2 * M_1_PI) / (1 << 24)));
+  int ql = xrint(d * (2 * M_1_PI) - qh * (double)(1 << 24));
 
+  s = mla(qh, -PI_A * 0.5 * (1 << 24), s);
+  s = mla(ql, -PI_A * 0.5,             s);
+  s = mla(qh, -PI_B * 0.5 * (1 << 24), s);
+  s = mla(ql, -PI_B * 0.5,             s);
+  s = mla(qh, -PI_C * 0.5 * (1 << 24), s);
+  s = mla(ql, -PI_C * 0.5,             s);
+  s = mla((double)qh * (1 << 24) + ql, -PI_D * 0.5, s);
+  
   t = s;
 
   s = s * s;
@@ -691,27 +757,31 @@ double2 xsincos(double d) {
 
   r.y = u * s + 1;
 
-  if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; }
-  if ((q & 2) != 0) { r.x = -r.x; }
-  if (((q+1) & 2) != 0) { r.y = -r.y; }
+  if ((ql & 1) != 0) { s = r.y; r.y = r.x; r.x = s; }
+  if ((ql & 2) != 0) { r.x = -r.x; }
+  if (((ql+1) & 2) != 0) { r.y = -r.y; }
 
   if (xisinf(d)) { r.x = r.y = NAN; }
+  if (!xisinf(d) && xfabs(d) > TRIGRANGEMAX) { r.x = r.y = 0; }
 
   return r;
 }
 
 double2 xsincos_u1(double d) {
-  int q;
   double u;
   double2 r, s, t, x;
 
-  q = (int)xrint(d * (2 * M_1_PI));
-
-  s = ddadd2_d2_d_d(d, q * (-PI4_A*2));
-  s = ddadd2_d2_d2_d(s, q * (-PI4_B*2));
-  s = ddadd2_d2_d2_d(s, q * (-PI4_C*2));
-  s = ddadd2_d2_d2_d(s, q * (-PI4_D*2));
+  int qh = xtrunc(d * ((2 * M_1_PI) / (1 << 24)));
+  int ql = xrint(d * (2 * M_1_PI) - qh * (double)(1 << 24));
 
+  s = ddadd2_d2_d_d (d, qh * (-PI_A*0.5 * (1 << 24)));
+  s = ddadd2_d2_d2_d(s, ql * (-PI_A*0.5            ));
+  s = ddadd2_d2_d2_d(s, qh * (-PI_B*0.5 * (1 << 24)));
+  s = ddadd2_d2_d2_d(s, ql * (-PI_B*0.5            ));
+  s = ddadd2_d2_d2_d(s, qh * (-PI_C*0.5 * (1 << 24)));
+  s = ddadd2_d2_d2_d(s, ql * (-PI_C*0.5            ));
+  s = ddadd2_d2_d2_d(s, ((double)qh * (1 << 24) + ql) * (-PI_D*0.5));
+  
   t = s;
   s = ddsqu_d2_d2(s);
   s.x = s.x + s.y;
@@ -741,49 +811,54 @@ double2 xsincos_u1(double d) {
   x = ddadd_d2_d_d2(1, ddmul_d2_d_d(s.x, u));
   r.y = x.x + x.y;
 
-  if ((q & 1) != 0) { u = r.y; r.y = r.x; r.x = u; }
-  if ((q & 2) != 0) { r.x = -r.x; }
-  if (((q+1) & 2) != 0) { r.y = -r.y; }
+  if ((ql & 1) != 0) { u = r.y; r.y = r.x; r.x = u; }
+  if ((ql & 2) != 0) { r.x = -r.x; }
+  if (((ql+1) & 2) != 0) { r.y = -r.y; }
 
   if (xisinf(d)) { r.x = r.y = NAN; }
+  if (!xisinf(d) && xfabs(d) > TRIGRANGEMAX) { r.x = r.y = 0; }
 
   return r;
 }
 
 double xtan(double d) {
-  int q;
   double u, s, x;
 
-  q = (int)xrint(d * (2 * M_1_PI));
-
-  x = mla(q, -PI4_A*2, d);
-  x = mla(q, -PI4_B*2, x);
-  x = mla(q, -PI4_C*2, x);
-  x = mla(q, -PI4_D*2, x);
+  int qh = xtrunc(d * ((2 * M_1_PI) / (1 << 24)));
+  int ql = xrint(d * (2 * M_1_PI) - qh * (double)(1 << 24));
 
+  x = mla(qh, -PI_A * 0.5 * (1 << 24), d);
+  x = mla(ql, -PI_A * 0.5,             x);
+  x = mla(qh, -PI_B * 0.5 * (1 << 24), x);
+  x = mla(ql, -PI_B * 0.5,             x);
+  x = mla(qh, -PI_C * 0.5 * (1 << 24), x);
+  x = mla(ql, -PI_C * 0.5,             x);
+  x = mla((double)qh * (1 << 24) + ql, -PI_D * 0.5, x);
+  
   s = x * x;
 
-  if ((q & 1) != 0) x = -x;
-
-  u = 1.01419718511083373224408e-05;
-  u = mla(u, s, -2.59519791585924697698614e-05);
-  u = mla(u, s, 5.23388081915899855325186e-05);
-  u = mla(u, s, -3.05033014433946488225616e-05);
-  u = mla(u, s, 7.14707504084242744267497e-05);
-  u = mla(u, s, 8.09674518280159187045078e-05);
-  u = mla(u, s, 0.000244884931879331847054404);
-  u = mla(u, s, 0.000588505168743587154904506);
-  u = mla(u, s, 0.00145612788922812427978848);
-  u = mla(u, s, 0.00359208743836906619142924);
-  u = mla(u, s, 0.00886323944362401618113356);
-  u = mla(u, s, 0.0218694882853846389592078);
-  u = mla(u, s, 0.0539682539781298417636002);
-  u = mla(u, s, 0.133333333333125941821962);
-  u = mla(u, s, 0.333333333333334980164153);
-
+  if ((ql & 1) != 0) x = -x;
+
+  u = 9.99583485362149960784268e-06;
+  u = mla(u, s, -4.31184585467324750724175e-05);
+  u = mla(u, s, 0.000103573238391744000389851);
+  u = mla(u, s, -0.000137892809714281708733524);
+  u = mla(u, s, 0.000157624358465342784274554);
+  u = mla(u, s, -6.07500301486087879295969e-05);
+  u = mla(u, s, 0.000148898734751616411290179);
+  u = mla(u, s, 0.000219040550724571513561967);
+  u = mla(u, s, 0.000595799595197098359744547);
+  u = mla(u, s, 0.00145461240472358871965441);
+  u = mla(u, s, 0.0035923150771440177410343);
+  u = mla(u, s, 0.00886321546662684547901456);
+  u = mla(u, s, 0.0218694899718446938985394);
+  u = mla(u, s, 0.0539682539049961967903002);
+  u = mla(u, s, 0.133333333334818976423364);
+  u = mla(u, s, 0.333333333333320047664472);
+  
   u = mla(s, u * x, x);
 
-  if ((q & 1) != 0) u = 1.0 / u;
+  if ((ql & 1) != 0) u = 1.0 / u;
 
   if (xisinf(d)) u = NAN;
 
@@ -791,18 +866,22 @@ double xtan(double d) {
 }
 
 double xtan_u1(double d) {
-  int q;
   double u;
   double2 s, t, x;
 
-  q = (int)xrint(d * M_2_PI);
-
-  s = ddadd2_d2_d_d(d, q * (-PI4_A*2));
-  s = ddadd2_d2_d2_d(s, q * (-PI4_B*2));
-  s = ddadd2_d2_d2_d(s, q * (-PI4_C*2));
-  s = ddadd2_d2_d2_d(s, q * (-PI4_D*2));
-
-  if ((q & 1) != 0) s = ddneg_d2_d2(s);
+  int qh = xtrunc(d * (M_2_PI / (1 << 24)));
+  s = ddadd2_d2_d2_d(ddmul_d2_d2_d(dd(M_2_PI_H, M_2_PI_L), d), (d < 0 ? -0.5 : 0.5) - qh * (double)(1 << 24));
+  int ql = s.x + s.y;
+  
+  s = ddadd2_d2_d_d (d, qh * (-PI_A*0.5 * (1 << 24)));
+  s = ddadd2_d2_d2_d(s, ql * (-PI_A*0.5            ));
+  s = ddadd2_d2_d2_d(s, qh * (-PI_B*0.5 * (1 << 24)));
+  s = ddadd2_d2_d2_d(s, ql * (-PI_B*0.5            ));
+  s = ddadd2_d2_d2_d(s, qh * (-PI_C*0.5 * (1 << 24)));
+  s = ddadd2_d2_d2_d(s, ql * (-PI_C*0.5            ));
+  s = ddadd2_d2_d2_d(s, ((double)qh * (1 << 24) + ql) * (-PI_D*0.5));
+  
+  if ((ql & 1) != 0) s = ddneg_d2_d2(s);
 
   t = s;
   s = ddsqu_d2_d2(s);
@@ -825,11 +904,11 @@ double xtan_u1(double d) {
   x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(0.333333333333334980164153, u * s.x), s));
   x = ddmul_d2_d2_d2(t, x);
 
-  if ((q & 1) != 0) x = ddrec_d2_d2(x);
+  if ((ql & 1) != 0) x = ddrec_d2_d2(x);
 
   u = x.x + x.y;
 
-  if (xisnegzero(d)) u = -0.0;
+  if (!xisinf(d) && (xisnegzero(d) || xfabs(d) > TRIGRANGEMAX)) u = -0.0;
   
   return u;
 }
@@ -852,7 +931,7 @@ double xlog(double d) {
   t = mla(t, x2, 0.399999999950799600689777);
   t = mla(t, x2, 0.6666666666667778740063);
   t = mla(t, x2, 2);
-
+  
   x = x * t + 0.693147180559945286226764 * e;
   
   if (xisinf(d)) x = INFINITY;
@@ -884,8 +963,8 @@ double xexp(double d) {
   u = s * s * u + s + 1;
   u = ldexpk(u, q);
 
-  if (xisminf(d)) u = 0;
-
+  if (d < -1000) u = 0;
+  
   return u;
 }
 
@@ -900,17 +979,21 @@ static inline double2 logk(double d) {
   x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m));
   x2 = ddsqu_d2_d2(x);
 
-  t = 0.13860436390467167910856;
-  t = mla(t, x2.x, 0.131699838841615374240845);
-  t = mla(t, x2.x, 0.153914168346271945653214);
-  t = mla(t, x2.x, 0.181816523941564611721589);
-  t = mla(t, x2.x, 0.22222224632662035403996);
-  t = mla(t, x2.x, 0.285714285511134091777308);
-  t = mla(t, x2.x, 0.400000000000914013309483);
-  t = mla(t, x2.x, 0.666666666666664853302393);
+  t = 0.116255524079935043668677;
+  t = mla(t, x2.x, 0.103239680901072952701192);
+  t = mla(t, x2.x, 0.117754809412463995466069);
+  t = mla(t, x2.x, 0.13332981086846273921509);
+  t = mla(t, x2.x, 0.153846227114512262845736);
+  t = mla(t, x2.x, 0.181818180850050775676507);
+  t = mla(t, x2.x, 0.222222222230083560345903);
+  t = mla(t, x2.x, 0.285714285714249172087875);
+  t = mla(t, x2.x, 0.400000000000000077715612);
+  double2 c = dd(0.666666666666666629659233, 3.80554962542412056336616e-17);
 
   return ddadd2_d2_d2_d2(ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e),
-			 ddadd2_d2_d2_d2(ddscale_d2_d2_d(x, 2), ddmul_d2_d2_d(ddmul_d2_d2_d2(x2, x), t)));
+			 ddadd2_d2_d2_d2(ddscale_d2_d2_d(x, 2),
+					 ddmul_d2_d2_d2(ddmul_d2_d2_d2(x2, x),
+							ddadd2_d2_d2_d2(ddmul_d2_d2_d(x2, t), c))));
 }
 
 double xlog_u1(double d) {
@@ -948,17 +1031,22 @@ static inline double expk(double2 d) {
   t = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(ddsqu_d2_d2(s), u));
 
   t = ddadd_d2_d_d2(1, t);
-  return ldexpk(t.x + t.y, q);
+
+  u = ldexpk(t.x + t.y, q);
+
+  if (d.x < -1000) u = 0;
+
+  return u;
 }
 
 double xpow(double x, double y) {
-  int yisint = (int)y == y;
+  int yisint = xisint(y);
   int yisodd = (1 & (int)y) != 0 && yisint;
 
   double result = expk(ddmul_d2_d2_d(logk(xfabs(x)), y));
 
   result = xisnan(result) ? INFINITY : result;
-  result *=  (x >= 0 ? 1 : (!yisint ? NAN : (yisodd ? -1 : 1)));
+  result *= (x > 0 ? 1 : (!yisint ? NAN : (yisodd ? -1 : 1)));
 
   double efx = mulsign(xfabs(x) - 1, y);
   if (xisinf(y)) result = efx < 0 ? 0.0 : (efx == 0 ? 1.0 : INFINITY);
@@ -991,7 +1079,7 @@ static inline double2 expk2(double2 d) {
   t = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(ddsqu_d2_d2(s), u));
 
   t = ddadd_d2_d_d2(1, t);
-  return ddscale_d2_d2_d(t, pow2i(q));
+  return ddscale_d2_d2_d(ddscale_d2_d2_d(t, 2), pow2i(q-1));
 }
 
 double xsinh(double x) {
@@ -1040,13 +1128,13 @@ static inline double2 logk2(double2 d) {
   double2 x, x2, m;
   double t;
   int e;
-
+  
   e = ilogbk(d.x * (1.0/0.75));
   m = ddscale_d2_d2_d(d, pow2i(-e));
 
   x = dddiv_d2_d2_d2(ddadd2_d2_d2_d(m, -1), ddadd2_d2_d2_d(m, 1));
   x2 = ddsqu_d2_d2(x);
-
+  
   t = 0.13860436390467167910856;
   t = mla(t, x2.x, 0.131699838841615374240845);
   t = mla(t, x2.x, 0.153914168346271945653214);
@@ -1062,21 +1150,27 @@ static inline double2 logk2(double2 d) {
 
 double xasinh(double x) {
   double y = xfabs(x);
-  double2 d = logk2(ddadd_d2_d2_d(ddsqrt_d2_d2(ddadd2_d2_d2_d(ddmul_d2_d_d(y, y),  1)), y));
+  double2 d;
+
+  d = y > 1 ? ddrec_d2_d(x) : dd(y, 0);
+  d = ddsqrt_d2_d2(ddadd2_d2_d2_d(ddsqu_d2_d2(d), 1));
+  d = y > 1 ? ddmul_d2_d2_d(d, y) : d;
+  
+  d = logk2(ddnormalize_d2_d2(ddadd_d2_d2_d(d, x)));
   y = d.x + d.y;
 
-  y = xisinf(x) || xisnan(y) ? INFINITY : y;
-  y = mulsign(y, x);
+  y = (xfabs(x) > SQRT_DBL_MAX || xisnan(y)) ? mulsign(INFINITY, x) : y;
   y = xisnan(x) ? NAN : y;
-
+  y = xisnegzero(x) ? -0.0 : y;
+  
   return y;
 }
 
 double xacosh(double x) {
-  double2 d = logk2(ddadd2_d2_d2_d(ddsqrt_d2_d2(ddadd2_d2_d2_d(ddmul_d2_d_d(x, x), -1)), x));
+  double2 d = logk2(ddadd2_d2_d2_d(ddmul_d2_d2_d2(ddsqrt_d2_d2(ddadd2_d2_d_d(x, 1)), ddsqrt_d2_d2(ddadd2_d2_d_d(x, -1))), x));
   double y = d.x + d.y;
 
-  y = xisinf(x) || xisnan(y) ? INFINITY : y;
+  y = (x > SQRT_DBL_MAX || xisnan(y)) ? INFINITY : y;
   y = x == 1.0 ? 0.0 : y;
   y = x < 1.0 ? NAN : y;
   y = xisnan(x) ? NAN : y;
@@ -1149,7 +1243,7 @@ double xcbrt(double d) { // max error : 2 ulps
   double x, y, q = 1.0;
   int e, r;
 
-  e = ilogbk(d)+1;
+  e = ilogbk(xfabs(d))+1;
   d = ldexpk(d, -e);
   r = (e + 6144) % 3;
   q = (r == 1) ? 1.2599210498948731647672106 : q;
@@ -1178,7 +1272,7 @@ double xcbrt_u1(double d) {
   double2 q2 = dd(1, 0), u, v;
   int e, r;
 
-  e = ilogbk(d)+1;
+  e = ilogbk(xfabs(d))+1;
   d = ldexpk(d, -e);
   r = (e + 6144) % 3;
   q2 = (r == 1) ? dd(1.2599210498948731907, -2.5899333753005069177e-17) : q2;
@@ -1218,14 +1312,14 @@ double xcbrt_u1(double d) {
 
 double xexp2(double a) {
   double u = expk(ddmul_d2_d2_d(dd(0.69314718055994528623, 2.3190468138462995584e-17), a));
-  if (a > 1023) u = INFINITY;
+  if (a > 1024) u = INFINITY; // log2(DBL_MAX)
   if (xisminf(a)) u = 0;
   return u;
 }
 
 double xexp10(double a) {
   double u = expk(ddmul_d2_d2_d(dd(2.3025850929940459011, -2.1707562233822493508e-16), a));
-  if (a > 308) u = INFINITY;
+  if (a > 308.254715559916743850652254) u = INFINITY; // log10(DBL_MAX)
   if (xisminf(a)) u = 0;
   return u;
 }
@@ -1233,8 +1327,8 @@ double xexp10(double a) {
 double xexpm1(double a) {
   double2 d = ddadd2_d2_d2_d(expk2(dd(a, 0)), -1.0);
   double x = d.x + d.y;
-  if (a > 700) x = INFINITY;
-  if (a < -0.36043653389117156089696070315825181539851971360337e+2) x = -1;
+  if (a > 709.782712893383996732223) x = INFINITY; // log(DBL_MAX)
+  if (a < -36.736800569677101399113302437) x = -1; // log(1 - nexttoward(1, 0))
   if (xisnegzero(a)) x = -0.0;
   return x;
 }
@@ -1254,7 +1348,7 @@ double xlog1p(double a) {
   double2 d = logk2(ddadd2_d2_d_d(a, 1));
   double x = d.x + d.y;
 
-  if (xisinf(a)) x = INFINITY;
+  if (a > 1e+307) x = INFINITY;
   if (a < -1) x = NAN;
   if (a == -1) x = -INFINITY;
   if (xisnegzero(a)) x = -0.0;
diff --git a/purec/sleefsp.c b/purec/sleefsp.c
index f88c8b51..b792a636 100644
--- a/purec/sleefsp.c
+++ b/purec/sleefsp.c
@@ -9,10 +9,16 @@
 
 #include "nonnumber.h"
 
-#define PI4_Af 0.78515625f
-#define PI4_Bf 0.00024187564849853515625f
-#define PI4_Cf 3.7747668102383613586e-08f
-#define PI4_Df 1.2816720341285448015e-12f
+#define PI_Af 3.140625f
+#define PI_Bf 0.0009670257568359375f
+#define PI_Cf 6.2771141529083251953e-07f
+#define PI_Df 1.2154201256553420762e-10f
+
+#define PI_XDf 1.2141754268668591976e-10f
+#define PI_XEf 1.2446743939339977025e-13f
+
+#define TRIGRANGEMAXf 1e+7 // 39000
+#define SQRT_FLT_MAX 18446743523953729536.0
 
 #define L2Uf 0.693145751953125f
 #define L2Lf 1.428606765330187045e-06f
@@ -346,14 +352,14 @@ static inline float2 dfsqrt_f2_f2(float2 d) {
 
 float xsinf(float d) {
   int q;
-  float u, s;
+  float u, s, t = d;
 
   q = (int)xrintf(d * (float)M_1_PI);
 
-  d = mlaf(q, -PI4_Af*4, d);
-  d = mlaf(q, -PI4_Bf*4, d);
-  d = mlaf(q, -PI4_Cf*4, d);
-  d = mlaf(q, -PI4_Df*4, d);
+  d = mlaf(q, -PI_Af, d);
+  d = mlaf(q, -PI_Bf, d);
+  d = mlaf(q, -PI_Cf, d);
+  d = mlaf(q, -PI_Df, d);
 
   s = d * d;
 
@@ -367,8 +373,8 @@ float xsinf(float d) {
 
   u = mlaf(s, u * d, d);
 
-  if (xisinff(d)) u = NANf;
-  if (xisnegzerof(d)) u = -0.0f;
+  if (xisnegzerof(t) || xfabsf(t) > TRIGRANGEMAXf) u = -0.0f;
+  if (xisinff(t)) u = NANf;
 
   return u;
 }
@@ -380,10 +386,10 @@ float xsinf_u1(float d) {
 
   q = (int)xrintf(d * (float)M_1_PI);
 
-  s = dfadd2_f2_f_f(d, q * (-PI4_Af*4));
-  s = dfadd2_f2_f2_f(s, q * (-PI4_Bf*4));
-  s = dfadd2_f2_f2_f(s, q * (-PI4_Cf*4));
-  s = dfadd2_f2_f2_f(s, q * (-PI4_Df*4));
+  s = dfadd2_f2_f_f (d, q * (-PI_Af));
+  s = dfadd2_f2_f2_f(s, q * (-PI_Bf));
+  s = dfadd2_f2_f2_f(s, q * (-PI_Cf));
+  s = dfadd2_f2_f2_f(s, q * (-PI_Df));
 
   t = s;
   s = dfsqu_f2_f2(s);
@@ -398,21 +404,21 @@ float xsinf_u1(float d) {
   u = x.x + x.y;
 
   if ((q & 1) != 0) u = -u;
-  if (xisnegzerof(d)) u = -0.0f;
+  if (!xisinff(d) && (xisnegzerof(d) || xfabsf(d) > TRIGRANGEMAXf)) u = -0.0f;
 
   return u;
 }
 
 float xcosf(float d) {
   int q;
-  float u, s;
+  float u, s, t = d;
 
   q = 1 + 2*(int)xrintf(d * (float)M_1_PI - 0.5f);
 
-  d = mlaf(q, -PI4_Af*2, d);
-  d = mlaf(q, -PI4_Bf*2, d);
-  d = mlaf(q, -PI4_Cf*2, d);
-  d = mlaf(q, -PI4_Df*2, d);
+  d = mlaf(q, -PI_Af*0.5f, d);
+  d = mlaf(q, -PI_Bf*0.5f, d);
+  d = mlaf(q, -PI_Cf*0.5f, d);
+  d = mlaf(q, -PI_Df*0.5f, d);
 
   s = d * d;
 
@@ -425,8 +431,9 @@ float xcosf(float d) {
 
   u = mlaf(s, u * d, d);
 
-  if (xisinff(d)) u = NANf;
-
+  if (xfabsf(t) > TRIGRANGEMAXf) u = 0.0f;
+  if (xisinff(t)) u = NANf;
+  
   return u;
 }
 
@@ -434,14 +441,14 @@ float xcosf_u1(float d) {
   float u, q;
   float2 s, t, x;
 
-  d = fabsf(d);
+  d = xfabsf(d);
 
   q = 1 + 2*(int)xrintf(d * (float)M_1_PI - 0.5f);
 
-  s = dfadd2_f2_f_f(d, q * (-PI4_Af*2));
-  s = dfadd2_f2_f2_f(s, q * (-PI4_Bf*2));
-  s = dfadd2_f2_f2_f(s, q * (-PI4_Cf*2));
-  s = dfadd2_f2_f2_f(s, q * (-PI4_Df*2));
+  s = dfadd2_f2_f_f (d, q * (-PI_Af*0.5f));
+  s = dfadd2_f2_f2_f(s, q * (-PI_Bf*0.5f));
+  s = dfadd2_f2_f2_f(s, q * (-PI_Cf*0.5f));
+  s = dfadd2_f2_f2_f(s, q * (-PI_Df*0.5f));
 
   t = s;
   s = dfsqu_f2_f2(s);
@@ -456,7 +463,7 @@ float xcosf_u1(float d) {
   u = x.x + x.y;
 
   if ((((int)q) & 2) == 0) u = -u;
-
+  if (!xisinff(d) && d > TRIGRANGEMAXf) u = 0.0f;
   return u;
 }
 
@@ -469,10 +476,10 @@ float2 xsincosf(float d) {
 
   s = d;
 
-  s = mlaf(q, -PI4_Af*2, s);
-  s = mlaf(q, -PI4_Bf*2, s);
-  s = mlaf(q, -PI4_Cf*2, s);
-  s = mlaf(q, -PI4_Df*2, s);
+  s = mlaf(q, -PI_Af*0.5f, s);
+  s = mlaf(q, -PI_Bf*0.5f, s);
+  s = mlaf(q, -PI_Cf*0.5f, s);
+  s = mlaf(q, -PI_Df*0.5f, s);
 
   t = s;
 
@@ -499,6 +506,7 @@ float2 xsincosf(float d) {
   if ((q & 2) != 0) { r.x = -r.x; }
   if (((q+1) & 2) != 0) { r.y = -r.y; }
 
+  if (xfabsf(d) > TRIGRANGEMAXf) { r.x = r.y = 0; }
   if (xisinff(d)) { r.x = r.y = NANf; }
 
   return r;
@@ -511,10 +519,10 @@ float2 xsincosf_u1(float d) {
 
   q = (int)xrintf(d * (float)(2 * M_1_PI));
 
-  s = dfadd2_f2_f_f(d, q * (-PI4_Af*2));
-  s = dfadd2_f2_f2_f(s, q * (-PI4_Bf*2));
-  s = dfadd2_f2_f2_f(s, q * (-PI4_Cf*2));
-  s = dfadd2_f2_f2_f(s, q * (-PI4_Df*2));
+  s = dfadd2_f2_f_f (d, q * (-PI_Af*0.5f));
+  s = dfadd2_f2_f2_f(s, q * (-PI_Bf*0.5f));
+  s = dfadd2_f2_f2_f(s, q * (-PI_Cf*0.5f));
+  s = dfadd2_f2_f2_f(s, q * (-PI_Df*0.5f));
 
   t = s;
   s = dfsqu_f2_f2(s);
@@ -543,6 +551,7 @@ float2 xsincosf_u1(float d) {
   if ((q & 2) != 0) { r.x = -r.x; }
   if (((q+1) & 2) != 0) { r.y = -r.y; }
 
+  if (xfabsf(d) > TRIGRANGEMAXf) { r.x = r.y = 0; }
   if (xisinff(d)) { r.x = r.y = NAN; }
 
   return r;
@@ -556,10 +565,10 @@ float xtanf(float d) {
 
   x = d;
 
-  x = mlaf(q, -PI4_Af*2, x);
-  x = mlaf(q, -PI4_Bf*2, x);
-  x = mlaf(q, -PI4_Cf*2, x);
-  x = mlaf(q, -PI4_Df*2, x);
+  x = mlaf(q, -PI_Af*0.5f, x);
+  x = mlaf(q, -PI_Bf*0.5f, x);
+  x = mlaf(q, -PI_Cf*0.5f, x);
+  x = mlaf(q, -PI_Df*0.5f, x);
 
   s = x * x;
 
@@ -587,11 +596,12 @@ float xtanf_u1(float d) {
   float2 s, t, x;
 
   q = (int)xrintf(d * (float)(2 * M_1_PI));
-
-  s = dfadd2_f2_f_f(d, q * (-PI4_Af*2));
-  s = dfadd2_f2_f2_f(s, q * (-PI4_Bf*2));
-  s = dfadd2_f2_f2_f(s, q * (-PI4_Cf*2));
-  s = dfadd2_f2_f2_f(s, q * (-PI4_Df*2));
+  
+  s = dfadd2_f2_f_f  (d, q * (-PI_Af*0.5f));
+  s = dfadd2_f2_f2_f (s, q * (-PI_Bf*0.5f));
+  s = dfadd2_f2_f2_f (s, q * (-PI_Cf*0.5f));
+  s = dfadd2_f2_f2_f (s, q * (-PI_XDf*0.5f));
+  s = dfadd2_f2_f2_f (s, q * (-PI_XEf*0.5f));
 
   if ((q & 1) != 0) s = dfneg_f2_f2(s);
 
@@ -613,7 +623,7 @@ float xtanf_u1(float d) {
 
   u = x.x + x.y;
 
-  if (xisnegzerof(d)) u = -0.0f;
+  if (!xisinff(d) && (xisnegzerof(d) || xfabsf(d) > TRIGRANGEMAXf)) u = -0.0f;
 
   return u;
 }
@@ -709,9 +719,6 @@ static float2 atan2kf_u1(float2 y, float2 x) {
   u = mlaf(u, t.x, -0.142626821994781494140625f);
   u = mlaf(u, t.x, 0.199983194470405578613281f);
 
-  //u = mlaf(u, t.x, -0.333332866430282592773438f);
-  //t = dfmul_f2_f2_f(t, u);
-
   t = dfmul_f2_f2_f2(t, dfadd_f2_f_f(-0.333332866430282592773438f, u * t.x));
   t = dfmul_f2_f2_f2(s, dfadd_f2_f_f2(1, t));
   t = dfadd2_f2_f2_f2(dfmul_f2_f2_f(df(1.5707963705062866211f, -4.3711388286737928865e-08f), q), t);
@@ -785,24 +792,29 @@ float xexpf(float d) {
   s = mlaf(q, -L2Uf, d);
   s = mlaf(q, -L2Lf, s);
 
+#if 0
   u = 0.00136324646882712841033936f;
   u = mlaf(u, s, 0.00836596917361021041870117f);
   u = mlaf(u, s, 0.0416710823774337768554688f);
   u = mlaf(u, s, 0.166665524244308471679688f);
   u = mlaf(u, s, 0.499999850988388061523438f);
-
+#else
+  u = 0.000198527617612853646278381;
+  u = mlaf(u, s, 0.00139304355252534151077271);
+  u = mlaf(u, s, 0.00833336077630519866943359);
+  u = mlaf(u, s, 0.0416664853692054748535156);
+  u = mlaf(u, s, 0.166666671633720397949219);
+  u = mlaf(u, s, 0.5);
+#endif
+  
   u = s * s * u + s + 1.0f;
   u = ldexpkf(u, q);
 
-  if (xisminff(d)) u = 0;
+  if (d < -104) u = 0;
 
   return u;
 }
 
-//#define L2Af 0.693145751953125
-//#define L2Bf 1.4285906217992305756e-06
-//#define L2Cf 1.619850954759360917e-11
-
 static inline float expkf(float2 d) {
   int q = (int)xrintf((d.x + d.y) * R_LN2f);
   float2 s, t;
@@ -811,10 +823,6 @@ static inline float expkf(float2 d) {
   s = dfadd2_f2_f2_f(d, q * -L2Uf);
   s = dfadd2_f2_f2_f(s, q * -L2Lf);
 
-  //s = dfadd2_f2_f2_f(d, q * -L2Af);
-  //s = dfadd2_f2_f2_f(s, q * -L2Bf);
-  //s = dfadd2_f2_f2_f(s, q * -L2Cf);
-
   s = dfnormalize_f2_f2(s);
 
   u = 0.00136324646882712841033936f;
@@ -826,7 +834,12 @@ static inline float expkf(float2 d) {
   t = dfadd_f2_f2_f2(s, dfmul_f2_f2_f(dfsqu_f2_f2(s), u));
 
   t = dfadd_f2_f_f2(1, t);
-  return ldexpkf(t.x + t.y, q);
+
+  u = ldexpkf(t.x + t.y, q);
+
+  if (d.x < -104) u = 0;
+  
+  return u;
 }
 
 static inline float2 logkf(float d) {
@@ -836,17 +849,19 @@ static inline float2 logkf(float d) {
 
   e = ilogbkf(d * (1.0f/0.75f));
   m = ldexpkf(d, -e);
-
+  
   x = dfdiv_f2_f2_f2(dfadd2_f2_f_f(-1, m), dfadd2_f2_f_f(1, m));
   x2 = dfsqu_f2_f2(x);
   
-  t = 0.2392828464508056640625f;
-  t = mlaf(t, x2.x, 0.28518211841583251953125f);
-  t = mlaf(t, x2.x, 0.400005877017974853515625f);
-  t = mlaf(t, x2.x, 0.666666686534881591796875f);
+  t = 0.240320354700088500976562;
+  t = mlaf(t, x2.x, 0.285112679004669189453125);
+  t = mlaf(t, x2.x, 0.400007992982864379882812);
+  float2 c = df(0.66666662693023681640625f, 3.69183861259614332084311e-09f);
 
   return dfadd2_f2_f2_f2(dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), e),
-			 dfadd2_f2_f2_f2(dfscale_f2_f2_f(x, 2), dfmul_f2_f2_f(dfmul_f2_f2_f2(x2, x), t)));
+			 dfadd2_f2_f2_f2(dfscale_f2_f2_f(x, 2),
+					 dfmul_f2_f2_f2(dfmul_f2_f2_f2(x2, x),
+							dfadd2_f2_f2_f2(dfmul_f2_f2_f(x2, t), c))));
 }
 
 float xlogf_u1(float d) {
@@ -877,11 +892,11 @@ static inline float2 expk2f(float2 d) {
   t = dfadd_f2_f2_f2(s, dfmul_f2_f2_f(dfsqu_f2_f2(s), u));
 
   t = dfadd_f2_f_f2(1, t);
-  return dfscale_f2_f2_f(t, pow2if(q));
+  return dfscale_f2_f2_f(dfscale_f2_f2_f(t, 2), pow2if(q-1));
 }
 
 float xpowf(float x, float y) {
-  int yisint = (int)y == y;
+  int yisint = (y == (int)y) || (xfabsf(y) >= (float)(1LL << 23));
   int yisodd = (1 & (int)y) != 0 && yisint;
 
   float result = expkf(dfmul_f2_f2_f(logkf(xfabsf(x)), y));
@@ -932,7 +947,7 @@ float xtanhf(float x) {
   d = dfdiv_f2_f2_f2(dfsub_f2_f2_f2(d, e), dfadd_f2_f2_f2(d, e));
   y = d.x + d.y;
 
-  y = xfabsf(x) > 8.664339742f ? 1.0f : y;
+  y = xfabsf(x) > 18.714973875f ? 1.0f : y;
   y = xisnanf(y) ? 1.0f : y;
   y = mulsignf(y, x);
   y = xisnanf(x) ? NANf : y;
@@ -962,21 +977,27 @@ static inline float2 logk2f(float2 d) {
 
 float xasinhf(float x) {
   float y = xfabsf(x);
-  float2 d = logk2f(dfadd2_f2_f2_f(dfsqrt_f2_f2(dfadd2_f2_f2_f(dfmul_f2_f_f(y, y),  1)), y));
-  y = d.x + d.y;
+  float2 d;
 
-  y = xisinff(x) || xisnanf(y) ? INFINITYf : y;
-  y = mulsignf(y, x);
+  d = y > 1 ? dfrec_f2_f(x) : df(y, 0);
+  d = dfsqrt_f2_f2(dfadd2_f2_f2_f(dfsqu_f2_f2(d), 1));
+  d = y > 1 ? dfmul_f2_f2_f(d, y) : d;
+  
+  d = logk2f(dfnormalize_f2_f2(dfadd_f2_f2_f(d, x)));
+  y = d.x + d.y;
+  
+  y = (xfabsf(x) > SQRT_FLT_MAX || xisnanf(y)) ? mulsignf(INFINITYf, x) : y;
   y = xisnanf(x) ? NANf : y;
+  y = xisnegzerof(x) ? -0.0f : y;
 
   return y;
 }
 
 float xacoshf(float x) {
-  float2 d = logk2f(dfadd2_f2_f2_f(dfsqrt_f2_f2(dfadd2_f2_f2_f(dfmul_f2_f_f(x, x), -1)), x));
+  float2 d = logk2f(dfadd2_f2_f2_f(dfmul_f2_f2_f2(dfsqrt_f2_f2(dfadd2_f2_f_f(x, 1)), dfsqrt_f2_f2(dfadd2_f2_f_f(x, -1))), x));
   float y = d.x + d.y;
 
-  y = xisinff(x) || xisnanf(y) ? INFINITYf : y;
+  y = (x > SQRT_FLT_MAX || xisnanf(y)) ? INFINITYf : y;
   y = x == 1.0f ? 0.0f : y;
   y = x < 1.0f ? NANf : y;
   y = xisnanf(x) ? NANf : y;
@@ -987,7 +1008,7 @@ float xacoshf(float x) {
 float xatanhf(float x) {
   float y = xfabsf(x);
   float2 d = logk2f(dfdiv_f2_f2_f2(dfadd2_f2_f_f(1, y), dfadd2_f2_f_f(1, -y)));
-  y = y > 1.0 ? NANf : (y == 1.0 ? INFINITYf : (d.x + d.y) * 0.5f);
+  y = y > 1.0f ? NANf : (y == 1.0f ? INFINITYf : (d.x + d.y) * 0.5f);
 
   y = xisinff(x) || xisnanf(y) ? NANf : y;
   y = mulsignf(y, x);
@@ -998,14 +1019,14 @@ float xatanhf(float x) {
 
 float xexp2f(float a) {
   float u = expkf(dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), a));
-  if (xispinff(a)) u = INFINITYf;
+  if (a >= 128) u = INFINITYf;
   if (xisminff(a)) u = 0;
   return u;
 }
 
 float xexp10f(float a) {
   float u = expkf(dfmul_f2_f2_f(df(2.3025851249694824219f, -3.1975436520781386207e-08f), a));
-  if (xispinff(a)) u = INFINITYf;
+  if (a > 38.531839419103626f) u = INFINITYf;
   if (xisminff(a)) u = 0;
   return u;
 }
@@ -1013,8 +1034,8 @@ float xexp10f(float a) {
 float xexpm1f(float a) {
   float2 d = dfadd2_f2_f2_f(expk2f(df(a, 0)), -1.0f);
   float x = d.x + d.y;
-  if (a > 88.0f) x = INFINITYf;
-  if (a < -0.15942385152878742116596338793538061065739925620174e+2f) x = -1;
+  if (a > 88.72283905206835f) x = INFINITYf;
+  if (a < -16.635532333438687426013570f) x = -1;
   if (xisnegzerof(a)) x = -0.0f;
   return x;
 }
@@ -1034,7 +1055,7 @@ float xlog1pf(float a) {
   float2 d = logk2f(dfadd2_f2_f_f(a, 1));
   float x = d.x + d.y;
 
-  if (xisinff(a)) x = INFINITYf;
+  if (a > 1e+38) x = INFINITYf;
   if (a < -1) x = NANf;
   if (a == -1) x = -INFINITYf;
   if (xisnegzerof(a)) x = -0.0f;
@@ -1048,7 +1069,7 @@ float xcbrtf(float d) {
   float x, y, q = 1.0f;
   int e, r;
 
-  e = ilogbkf(d)+1;
+  e = ilogbkf(xfabsf(d))+1;
   d = ldexpkf(d, -e);
   r = (e + 6144) % 3;
   q = (r == 1) ? 1.2599210498948731647672106f : q;
@@ -1076,7 +1097,7 @@ float xcbrtf_u1(float d) {
   float2 q2 = df(1, 0), u, v;
   int e, r;
 
-  e = ilogbkf(d)+1;
+  e = ilogbkf(xfabsf(d))+1;
   d = ldexpkf(d, -e);
   r = (e + 6144) % 3;
   q2 = (r == 1) ? df(1.2599210739135742188, -2.4018701694217270415e-08) : q2;
diff --git a/purec/starttester2.sh b/purec/starttester2.sh
new file mode 100644
index 00000000..5fabae17
--- /dev/null
+++ b/purec/starttester2.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+nohup nice ./tester2 >> ../tester2.out &
+nohup nice ./tester2f >> ../tester2.out &
diff --git a/purec/stoptester2.sh b/purec/stoptester2.sh
new file mode 100644
index 00000000..e777caca
--- /dev/null
+++ b/purec/stoptester2.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+killall tester2 tester2f
diff --git a/purec/tester2.c b/purec/tester2.c
new file mode 100644
index 00000000..e65252f0
--- /dev/null
+++ b/purec/tester2.c
@@ -0,0 +1,514 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpfr.h>
+#include <time.h>
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <linux/random.h>
+
+#include "sleef.h"
+
+mpfr_t fra, frb, frc, frd, frw, frx, fry, frz;
+
+#define DENORMAL_DBL_MIN (4.94066e-324)
+
+double countULP(double d, mpfr_t c) {
+  double c2 = mpfr_get_d(c, GMP_RNDN);
+  if (c2 == 0 && d != 0) return 10000;
+  if (!isfinite(c2) && !isfinite(d)) return 0;
+
+  int e;
+  frexpl(mpfr_get_d(c, GMP_RNDN), &e);
+  mpfr_set_ld(frw, fmaxl(ldexpl(1.0, e-53), DENORMAL_DBL_MIN), GMP_RNDN);
+
+  mpfr_set_d(frd, d, GMP_RNDN);
+  mpfr_sub(fry, frd, c, GMP_RNDN);
+  mpfr_div(fry, fry, frw, GMP_RNDN);
+  double u = fabs(mpfr_get_d(fry, GMP_RNDN));
+
+  return u;
+}
+
+double countULP2(double d, mpfr_t c) {
+  double c2 = mpfr_get_d(c, GMP_RNDN);
+  if (c2 == 0 && d != 0) return 10000;
+  if (!isfinite(c2) && !isfinite(d)) return 0;
+
+  int e;
+  frexpl(mpfr_get_d(c, GMP_RNDN), &e);
+  mpfr_set_ld(frw, fmaxl(ldexpl(1.0, e-53), DBL_MIN), GMP_RNDN);
+
+  mpfr_set_d(frd, d, GMP_RNDN);
+  mpfr_sub(fry, frd, c, GMP_RNDN);
+  mpfr_div(fry, fry, frw, GMP_RNDN);
+  double u = fabs(mpfr_get_d(fry, GMP_RNDN));
+
+  return u;
+}
+
+typedef union {
+  double d;
+  uint64_t u64;
+  int64_t i64;
+} conv_t;
+
+double rnd_fr() {
+  conv_t c;
+  do {
+#if 1
+    syscall(SYS_getrandom, &c.u64, sizeof(c.u64), 0);
+#else
+    c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62);
+#endif
+  } while(!isfinite(c.d));
+  return c.d;
+}
+
+double rnd_zo() {
+  conv_t c;
+  do {
+#if 1
+    syscall(SYS_getrandom, &c.u64, sizeof(c.u64), 0);
+#else
+    c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62);
+#endif
+  } while(!isfinite(c.d) || c.d < -1 || 1 < c.d);
+  return c.d;
+}
+
+int main(int argc,char **argv)
+{
+  mpfr_set_default_prec(256);
+  mpfr_inits(fra, frb, frc, frd, frw, frx, fry, frz, NULL);
+
+  conv_t cd;
+  double d, t;
+
+
+  int cnt;
+  
+  srandom(time(NULL));
+
+#if 0
+  cd.d = M_PI;
+  mpfr_set_d(frx, cd.d, GMP_RNDN);
+  cd.i64+=3;
+  printf("%g\n", countULP(cd.d, frx));
+#endif
+
+  const double rangemax = 1e+14; // 2^(24*2-1)
+  
+  for(cnt = 0;;cnt++) {
+    switch(cnt & 7) {
+    case 0:
+      d = (2 * (double)random() / RAND_MAX - 1) * rangemax;
+      break;
+    case 1:
+      cd.d = rint((2 * (double)random() / RAND_MAX - 1) * rangemax) * M_PI_4;
+      cd.i64 += (random() & 31) - 15;
+      d = cd.d;
+      break;
+    case 2:
+      d = (2 * (double)random() / RAND_MAX - 1) * 1e+7;
+      break;
+    case 3:
+      cd.d = rint((2 * (double)random() / RAND_MAX - 1) * 1e+7) * M_PI_4;
+      cd.i64 += (random() & 31) - 15;
+      d = cd.d;
+      break;
+    case 4:
+      d = (2 * (double)random() / RAND_MAX - 1) * 10000;
+      break;
+    case 5:
+      cd.d = rint((2 * (double)random() / RAND_MAX - 1) * 10000) * M_PI_4;
+      cd.i64 += (random() & 31) - 15;
+      d = cd.d;
+      break;
+    default:
+      d = rnd_fr();
+      break;
+    }
+
+    if (!isfinite(d)) continue;
+
+    
+    double2 sc = xsincos(d);
+    double2 sc2 = xsincos_u1(d);
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_sin(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xsin(d), frx);
+      
+      if ((fabs(d) <= rangemax && u0 > 3.5) || fabs(t) > 1 || !isfinite(t)) {
+	printf("Pure C sin arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(sc.x, frx);
+      
+      if ((fabs(d) <= rangemax && u1 > 3.5) || fabs(t) > 1 || !isfinite(t)) {
+	printf("Pure C sincos sin arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+
+      double u2 = countULP(t = xsin_u1(d), frx);
+      
+      if ((fabs(d) <= rangemax && u2 > 1) || fabs(t) > 1 || !isfinite(t)) {
+	printf("Pure C sin_u1 arg=%.20g ulp=%.20g\n", d, u2);
+	fflush(stdout);
+      }
+
+      double u3 = countULP(t = sc2.x, frx);
+      
+      if ((fabs(d) <= rangemax && u3 > 1) || fabs(t) > 1 || !isfinite(t)) {
+	printf("Pure C sincos_u1 sin arg=%.20g ulp=%.20g\n", d, u3);
+	fflush(stdout);
+      }
+    }
+
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_cos(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xcos(d), frx);
+      
+      if ((fabs(d) <= rangemax && u0 > 3.5) || fabs(t) > 1 || !isfinite(t)) {
+	printf("Pure C cos arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = sc.y, frx);
+      
+      if ((fabs(d) <= rangemax && u1 > 3.5) || fabs(t) > 1 || !isfinite(t)) {
+	printf("Pure C sincos cos arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+
+      double u2 = countULP(t = xcos_u1(d), frx);
+      
+      if ((fabs(d) <= rangemax && u2 > 1) || fabs(t) > 1 || !isfinite(t)) {
+	printf("Pure C cos_u1 arg=%.20g ulp=%.20g\n", d, u2);
+	fflush(stdout);
+      }
+
+      double u3 = countULP(t = sc2.y, frx);
+      
+      if ((fabs(d) <= rangemax && u3 > 1) || fabs(t) > 1 || !isfinite(t)) {
+	printf("Pure C sincos_u1 cos arg=%.20g ulp=%.20g\n", d, u3);
+	fflush(stdout);
+      }
+    }
+
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_tan(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xtan(d), frx);
+      
+      if ((fabs(d) < 1e+7 && u0 > 3.5) || (fabs(d) <= rangemax && u0 > 5) || isnan(t)) {
+	printf("Pure C tan arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xtan_u1(d), frx);
+      
+      if ((fabs(d) <= rangemax && u1 > 1) || isnan(t)) {
+	printf("Pure C tan_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+
+    d = rnd_fr();
+    double d2 = rnd_fr(), zo = rnd_zo();
+    
+    {
+      mpfr_set_d(frx, fabs(d), GMP_RNDN);
+      mpfr_log(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xlog(fabs(d)), frx);
+      
+      if (u0 > 3.5) {
+	printf("Pure C log arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xlog_u1(fabs(d)), frx);
+      
+      if (u1 > 1) {
+	printf("Pure C log_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, fabs(d), GMP_RNDN);
+      mpfr_log10(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xlog10(fabs(d)), frx);
+      
+      if (u0 > 1) {
+	printf("Pure C log10 arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_log1p(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xlog1p(d), frx);
+      
+      if ((-1 <= d && d <= 1e+307 && u0 > 1) ||
+	  (d < -1 && !isnan(t)) ||
+	  (d > 1e+307 && !(u0 <= 1 || isinf(t)))) {
+	printf("Pure C log1p arg=%.20g ulp=%.20g\n", d, u0);
+	printf("%g\n", t);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_exp(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xexp(d), frx);
+      
+      if (u0 > 1) {
+	printf("Pure C exp arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_exp2(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xexp2(d), frx);
+      
+      if (u0 > 1) {
+	printf("Pure C exp2 arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_exp10(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xexp10(d), frx);
+      
+      if (u0 > 1) {
+	printf("Pure C exp10 arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_expm1(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xexpm1(d), frx);
+      
+      if (u0 > 1) {
+	printf("Pure C expm1 arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_set_d(fry, d2, GMP_RNDN);
+      mpfr_pow(frx, fry, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xpow(d2, d), frx);
+      
+      if (u0 > 1) {
+	printf("Pure C pow arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_cbrt(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xcbrt(d), frx);
+      
+      if (u0 > 3.5) {
+	printf("Pure C cbrt arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xcbrt_u1(d), frx);
+      
+      if (u1 > 1) {
+	printf("Pure C cbrt_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, zo, GMP_RNDN);
+      mpfr_asin(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xasin(zo), frx);
+      
+      if (u0 > 3.5) {
+	printf("Pure C asin arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xasin_u1(zo), frx);
+      
+      if (u1 > 1) {
+	printf("Pure C asin_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, zo, GMP_RNDN);
+      mpfr_acos(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xacos(zo), frx);
+      
+      if (u0 > 3.5) {
+	printf("Pure C acos arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xacos_u1(zo), frx);
+      
+      if (u1 > 1) {
+	printf("Pure C acos_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_atan(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xatan(d), frx);
+      
+      if (u0 > 3.5) {
+	printf("Pure C atan arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xatan_u1(d), frx);
+      
+      if (u1 > 1) {
+	printf("Pure C atan_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_set_d(fry, d2, GMP_RNDN);
+      mpfr_atan2(frx, fry, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xatan2(d2, d), frx);
+      
+      if (u0 > 3.5) {
+	printf("Pure C atan2 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP2(t = xatan2_u1(d2, d), frx);
+      
+      if (u1 > 1) {
+	printf("Pure C atan2_u1 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_sinh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xsinh(d), frx);
+      
+      if ((fabs(d) <= 709 && u0 > 1) ||
+	  (d >  709 && !(u0 <= 1 || (isinf(t) && t > 0))) ||
+	  (d < -709 && !(u0 <= 1 || (isinf(t) && t < 0)))) {
+	printf("Pure C sinh arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_cosh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xcosh(d), frx);
+      
+      if ((fabs(d) <= 709 && u0 > 1) || !(u0 <= 1 || (isinf(t) && t > 0))) {
+	printf("Pure C cosh arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_tanh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xtanh(d), frx);
+      
+      if (u0 > 1) {
+	printf("Pure C tanh arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_asinh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xasinh(d), frx);
+      
+      if ((fabs(d) < sqrt(DBL_MAX) && u0 > 1) ||
+	  (d >=  sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t > 0))) ||
+	  (d <= -sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t < 0)))) {
+	printf("Pure C asinh arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_acosh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xacosh(d), frx);
+      
+      if ((fabs(d) < sqrt(DBL_MAX) && u0 > 1) ||
+	  (d >=  sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t > 0))) ||
+	  (d <= -sqrt(DBL_MAX) && !isnan(t))) {
+	printf("Pure C acosh arg=%.20g ulp=%.20g\n", d, u0);
+	printf("%.20g\n", t);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_atanh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xatanh(d), frx);
+      
+      if (u0 > 1) {
+	printf("Pure C atanh arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+  }
+}
diff --git a/purec/tester2f.c b/purec/tester2f.c
new file mode 100644
index 00000000..7f33e0e3
--- /dev/null
+++ b/purec/tester2f.c
@@ -0,0 +1,515 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpfr.h>
+#include <time.h>
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <linux/random.h>
+
+#include "sleef.h"
+
+mpfr_t fra, frb, frc, frd, frw, frx, fry, frz;
+
+#define DENORMAL_FLT_MIN (1.40130e-45f)
+
+double countULP(float d, mpfr_t c) {
+  float c2 = mpfr_get_d(c, GMP_RNDN);
+  if (c2 == 0 && d != 0) return 10000;
+  if (!isfinite(c2) && !isfinite(d)) return 0;
+
+  int e;
+  frexpl(mpfr_get_d(c, GMP_RNDN), &e);
+  mpfr_set_ld(frw, fmaxl(ldexpl(1.0, e-24), DENORMAL_FLT_MIN), GMP_RNDN);
+
+  mpfr_set_d(frd, d, GMP_RNDN);
+  mpfr_sub(fry, frd, c, GMP_RNDN);
+  mpfr_div(fry, fry, frw, GMP_RNDN);
+  double u = fabs(mpfr_get_d(fry, GMP_RNDN));
+
+  return u;
+}
+
+double countULP2(float d, mpfr_t c) {
+  float c2 = mpfr_get_d(c, GMP_RNDN);
+  if (c2 == 0 && d != 0) return 10000;
+  if (!isfinite(c2) && !isfinite(d)) return 0;
+
+  int e;
+  frexpl(mpfr_get_d(c, GMP_RNDN), &e);
+  mpfr_set_ld(frw, fmaxl(ldexpl(1.0, e-24), FLT_MIN), GMP_RNDN);
+
+  mpfr_set_d(frd, d, GMP_RNDN);
+  mpfr_sub(fry, frd, c, GMP_RNDN);
+  mpfr_div(fry, fry, frw, GMP_RNDN);
+  double u = fabs(mpfr_get_d(fry, GMP_RNDN));
+
+  return u;
+}
+
+typedef union {
+  double d;
+  uint64_t u64;
+  int64_t i64;
+} conv64_t;
+
+typedef union {
+  float f;
+  uint32_t u32;
+  int32_t i32;
+} conv32_t;
+
+float rnd_fr() {
+  conv32_t c;
+  do {
+#if 1
+    syscall(SYS_getrandom, &c.u32, sizeof(c.u32), 0);
+#else
+    c.u32 = (uint32_t)random() | ((uint32_t)random() << 31);
+#endif
+  } while(!isfinite(c.f));
+  return c.f;
+}
+
+float rnd_zo() {
+  conv32_t c;
+  do {
+#if 1
+    syscall(SYS_getrandom, &c.u32, sizeof(c.u32), 0);
+#else
+    c.u32 = (uint32_t)random() | ((uint32_t)random() << 31);
+#endif
+  } while(!isfinite(c.f) || c.f < -1 || 1 < c.f);
+  return c.f;
+}
+
+int main(int argc,char **argv)
+{
+  mpfr_set_default_prec(256);
+  mpfr_inits(fra, frb, frc, frd, frw, frx, fry, frz, NULL);
+
+  conv32_t cd;
+  float d, t;
+  int cnt;
+  
+  srandom(time(NULL));
+
+#if 0
+  cd.f = M_PI;
+  mpfr_set_d(frx, cd.f, GMP_RNDN);
+  cd.i32+=3;
+  printf("%g\n", countULP(cd.f, frx));
+#endif
+
+  const float rangemax = 39000;
+  
+  for(cnt = 0;;cnt++) {
+    switch(cnt & 7) {
+    case 0:
+      d = (2 * (float)random() / RAND_MAX - 1) * rangemax;
+      break;
+    case 1:
+      cd.f = rint((2 * (float)random() / RAND_MAX - 1) * rangemax) * M_PI_4;
+      cd.i32 += (random() & 31) - 15;
+      d = cd.f;
+      break;
+    case 2:
+      d = (2 * (float)random() / RAND_MAX - 1) * rangemax;
+      break;
+    case 3:
+      cd.f = rint((2 * (float)random() / RAND_MAX - 1) * rangemax) * M_PI_4;
+      cd.i32 += (random() & 31) - 15;
+      d = cd.f;
+      break;
+    case 4:
+      d = (2 * (float)random() / RAND_MAX - 1) * 10000;
+      break;
+    case 5:
+      cd.f = rint((2 * (float)random() / RAND_MAX - 1) * 10000) * M_PI_4;
+      cd.i32 += (random() & 31) - 15;
+      d = cd.f;
+      break;
+    default:
+      d = rnd_fr();
+      break;
+    }
+
+    if (!isfinite(d)) continue;
+
+    float2 sc = xsincosf(d);
+    float2 sc2 = xsincosf_u1(d);
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_sin(frx, frx, GMP_RNDN);
+
+      float u0 = countULP(t = xsinf(d), frx);
+      
+      if ((fabs(d) <= rangemax && u0 > 3.5) || fabs(t) > 1 || !isfinite(t)) {
+	printf("Pure C sinf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      float u1 = countULP(t = sc.x, frx);
+      
+      if ((fabs(d) <= rangemax && u1 > 3.5) || fabs(t) > 1 || !isfinite(t)) {
+	printf("Pure C sincosf sin arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+
+      float u2 = countULP(t = xsinf_u1(d), frx);
+      
+      if ((fabs(d) <= rangemax && u2 > 1) || fabs(t) > 1 || !isfinite(t)) {
+	printf("Pure C sinf_u1 arg=%.20g ulp=%.20g\n", d, u2);
+	fflush(stdout);
+      }
+
+      float u3 = countULP(t = sc2.x, frx);
+      
+      if ((fabs(d) <= rangemax && u3 > 1) || fabs(t) > 1 || !isfinite(t)) {
+	printf("Pure C sincosf_u1 sin arg=%.20g ulp=%.20g\n", d, u3);
+	fflush(stdout);
+      }
+    }
+
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_cos(frx, frx, GMP_RNDN);
+
+      float u0 = countULP(t = xcosf(d), frx);
+      
+      if ((fabs(d) <= rangemax && u0 > 3.5) || fabs(t) > 1 || !isfinite(t)) {
+	printf("Pure C cosf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      float u1 = countULP(t = sc.y, frx);
+      
+      if ((fabs(d) <= rangemax && u1 > 3.5) || fabs(t) > 1 || !isfinite(t)) {
+	printf("Pure C sincosf cos arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+
+      float u2 = countULP(t = xcosf_u1(d), frx);
+      
+      if ((fabs(d) <= rangemax && u2 > 1) || fabs(t) > 1 || !isfinite(t)) {
+	printf("Pure C cosf_u1 arg=%.20g ulp=%.20g\n", d, u2);
+	fflush(stdout);
+      }
+
+      float u3 = countULP(t = sc2.y, frx);
+      
+      if ((fabs(d) <= rangemax && u3 > 1) || fabs(t) > 1 || !isfinite(t)) {
+	printf("Pure C sincosf_u1 cos arg=%.20g ulp=%.20g\n", d, u3);
+	fflush(stdout);
+      }
+    }
+
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_tan(frx, frx, GMP_RNDN);
+
+      float u0 = countULP(t = xtanf(d), frx);
+      
+      if ((fabs(d) < rangemax && u0 > 3.5) || isnan(t)) {
+	printf("Pure C tanf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      float u1 = countULP(t = xtanf_u1(d), frx);
+      
+      if ((fabs(d) <= rangemax && u1 > 1) || isnan(t)) {
+	printf("Pure C tanf_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+
+    d = rnd_fr();
+    float d2 = rnd_fr(), zo = rnd_zo();
+
+    {
+      mpfr_set_d(frx, fabsf(d), GMP_RNDN);
+      mpfr_log(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xlogf(fabsf(d)), frx);
+      
+      if (u0 > 3.5) {
+	printf("Pure C logf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xlogf_u1(fabsf(d)), frx);
+      
+      if (u1 > 1) {
+	printf("Pure C logf_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+
+    {
+      mpfr_set_d(frx, fabsf(d), GMP_RNDN);
+      mpfr_log10(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xlog10f(fabsf(d)), frx);
+      
+      if (u0 > 1) {
+	printf("Pure C log10f arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_log1p(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xlog1pf(d), frx);
+      
+      if ((-1 <= d && d <= 1e+38 && u0 > 1) ||
+	  (d < -1 && !isnan(t)) ||
+	  (d > 1e+38 && !(u0 <= 1 || isinf(t)))) {
+	printf("Pure C log1pf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_exp(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xexpf(d), frx);
+      
+      if (u0 > 1) {
+	printf("Pure C expf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_exp2(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xexp2f(d), frx);
+      
+      if (u0 > 1) {
+	printf("Pure C exp2f arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_exp10(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xexp10f(d), frx);
+      
+      if (u0 > 1) {
+	printf("Pure C exp10f arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_expm1(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xexpm1f(d), frx);
+      
+      if (u0 > 1) {
+	printf("Pure C expm1f arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_set_d(fry, d2, GMP_RNDN);
+      mpfr_pow(frx, fry, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xpowf(d2, d), frx);
+      
+      if (u0 > 1) {
+	printf("Pure C powf arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_cbrt(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xcbrtf(d), frx);
+      
+      if (u0 > 3.5) {
+	printf("Pure C cbrtf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xcbrtf_u1(d), frx);
+      
+      if (u1 > 1) {
+	printf("Pure C cbrtf_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, zo, GMP_RNDN);
+      mpfr_asin(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xasinf(zo), frx);
+      
+      if (u0 > 3.5) {
+	printf("Pure C asinf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xasinf_u1(zo), frx);
+      
+      if (u1 > 1) {
+	printf("Pure C asinf_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, zo, GMP_RNDN);
+      mpfr_acos(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xacosf(zo), frx);
+      
+      if (u0 > 3.5) {
+	printf("Pure C acosf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xacosf_u1(zo), frx);
+      
+      if (u1 > 1) {
+	printf("Pure C acosf_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_atan(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xatanf(d), frx);
+      
+      if (u0 > 3.5) {
+	printf("Pure C atanf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xatanf_u1(d), frx);
+      
+      if (u1 > 1) {
+	printf("Pure C atanf_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_set_d(fry, d2, GMP_RNDN);
+      mpfr_atan2(frx, fry, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xatan2f(d2, d), frx);
+      
+      if (u0 > 3.5) {
+	printf("Pure C atan2f arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP2(t = xatan2f_u1(d2, d), frx);
+      
+      if (u1 > 1) {
+	printf("Pure C atan2f_u1 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_sinh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xsinhf(d), frx);
+      
+      if ((fabs(d) <= 88.5 && u0 > 1) ||
+	  (d >  88.5 && !(u0 <= 1 || (isinf(t) && t > 0))) ||
+	  (d < -88.5 && !(u0 <= 1 || (isinf(t) && t < 0)))) {
+	printf("Pure C sinhf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_cosh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xcoshf(d), frx);
+      
+      if ((fabs(d) <= 88.5 && u0 > 1) || !(u0 <= 1 || (isinf(t) && t > 0))) {
+	printf("Pure C coshf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_tanh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xtanhf(d), frx);
+      
+      if (u0 > 1.0001) {
+	printf("Pure C tanhf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_asinh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xasinhf(d), frx);
+      
+      if ((fabs(d) < sqrt(FLT_MAX) && u0 > 1.0001) ||
+	  (d >=  sqrt(FLT_MAX) && !(u0 <= 1.0001 || (isinf(t) && t > 0))) ||
+	  (d <= -sqrt(FLT_MAX) && !(u0 <= 1.0001 || (isinf(t) && t < 0)))) {
+	printf("Pure C asinhf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_acosh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xacoshf(d), frx);
+      
+      if ((fabs(d) < sqrt(FLT_MAX) && u0 > 1.0001) ||
+	  (d >=  sqrt(FLT_MAX) && !(u0 <= 1.0001 || (isinff(t) && t > 0))) ||
+	  (d <= -sqrt(FLT_MAX) && !isnan(t))) {
+	printf("Pure C acoshf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_atanh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xatanhf(d), frx);
+      
+      if (u0 > 1.0001) {
+	printf("Pure C atanhf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+  }
+}
diff --git a/simd/Makefile b/simd/Makefile
index 3636c29c..26740bae 100644
--- a/simd/Makefile
+++ b/simd/Makefile
@@ -1,6 +1,6 @@
-CC=gcc-5
+CC=gcc
 OPT=-O -Wall -Wno-unused -Wno-attributes -ffp-contract=off -fmax-errors=3
-SDE=sde
+SDE=sde64
 
 all : testsse2 testavx testavx2
 
@@ -19,6 +19,42 @@ iutavx512f : sleefsimddp.c sleefsimdsp.c helperavx512f.h iut.c
 iutfma4 : sleefsimddp.c sleefsimdsp.c helperfma4.h iut.c
 	$(CC) $(OPT) -DENABLE_FMA4 -mavx -mfma4 iut.c sleefsimddp.c sleefsimdsp.c -o iutfma4 -lm
 
+#
+
+tester2 : tester2sse2 tester2avx tester2avx2 tester2avx512f tester2fma4 tester2fsse2 tester2favx tester2favx2 tester2favx512f tester2ffma4
+
+tester2sse2 : sleefsimddp.c helpersse2.h tester2simd.c
+	$(CC) $(OPT) -DENABLE_SSE2 -msse2 tester2simd.c sleefsimddp.c -o tester2sse2 -lm -lmpfr
+
+tester2avx : sleefsimddp.c helperavx.h tester2simd.c
+	$(CC) $(OPT) -DENABLE_AVX -mavx tester2simd.c sleefsimddp.c -o tester2avx -lm -lmpfr
+
+tester2avx2 : sleefsimddp.c helperavx2.h tester2simd.c
+	$(CC) $(OPT) -DENABLE_AVX2 -mavx2 -mfma tester2simd.c sleefsimddp.c -o tester2avx2 -lm -lmpfr
+
+tester2avx512f : sleefsimddp.c helperavx512f.h tester2simd.c
+	$(CC) $(OPT) -DENABLE_AVX512F -mavx512f tester2simd.c sleefsimddp.c -o tester2avx512f -lm -lmpfr
+
+tester2fma4 : sleefsimddp.c helperfma4.h tester2simd.c
+	$(CC) $(OPT) -DENABLE_FMA4 -mavx -mfma4 tester2simd.c sleefsimddp.c -o tester2fma4 -lm -lmpfr
+
+tester2fsse2 : sleefsimdsp.c helpersse2.h tester2fsimd.c
+	$(CC) $(OPT) -DENABLE_SSE2 -msse2 tester2fsimd.c sleefsimdsp.c -o tester2fsse2 -lm -lmpfr
+
+tester2favx : sleefsimdsp.c helperavx.h tester2fsimd.c
+	$(CC) $(OPT) -DENABLE_AVX -mavx tester2fsimd.c sleefsimdsp.c -o tester2favx -lm -lmpfr
+
+tester2favx2 : sleefsimdsp.c helperavx2.h tester2fsimd.c
+	$(CC) $(OPT) -DENABLE_AVX2 -mavx2 -mfma tester2fsimd.c sleefsimdsp.c -o tester2favx2 -lm -lmpfr
+
+tester2favx512f : sleefsimdsp.c helperavx512f.h tester2fsimd.c
+	$(CC) $(OPT) -DENABLE_AVX512F -mavx512f tester2fsimd.c sleefsimdsp.c -o tester2favx512f -lm -lmpfr
+
+tester2ffma4 : sleefsimdsp.c helperfma4.h tester2fsimd.c
+	$(CC) $(OPT) -DENABLE_FMA4 -mavx -mfma4 tester2fsimd.c sleefsimdsp.c -o tester2ffma4 -lm -lmpfr
+
+#
+
 ../tester/tester :
 	cd ../tester; make tester
 
@@ -62,4 +98,4 @@ testfma4 : iutfma4 ../tester/tester ../tester/testeru1 ../tester/testersp ../tes
 	../tester/testerspu1 ./iutfma4
 
 clean :
-	rm -f *~ *.o *.s iutsse2 iutavx iutavx2 iutavx512f iutfma4 iutneon iutclangvec
+	rm -f *~ *.o *.s *.out iutsse2 iutavx iutavx2 iutavx512f iutfma4 iutneon iutclangvec tester2sse2 tester2avx tester2avx2 tester2avx512f tester2fma4 tester2clangvec tester2fsse2 tester2favx tester2favx2 tester2favx512f tester2ffma4 tester2fclangvec
diff --git a/simd/Makefile.clang b/simd/Makefile.clang
index 9106cd84..7034bbe2 100644
--- a/simd/Makefile.clang
+++ b/simd/Makefile.clang
@@ -22,6 +22,18 @@ iutfma4 : sleefsimddp.c sleefsimdsp.c helperfma4.h iut.c
 iutclangvec : sleefsimddp.c sleefsimdsp.c helperclangvec.h iut.c
 	$(CC) $(OPT) -DENABLE_CLANGVEC iut.c sleefsimddp.c sleefsimdsp.c -o iutclangvec -lm
 
+#
+
+tester2 : tester2clangvec tester2fclangvec
+
+tester2clangvec : sleefsimddp.c sleefsimddp.c helperclangvec.h tester2simd.c
+	$(CC) $(OPT) -DENABLE_CLANGVEC -msse2 tester2simd.c sleefsimddp.c -o tester2clangvec -lm -lmpfr
+
+tester2fclangvec : sleefsimddp.c sleefsimdsp.c helperclangvec.h tester2simd.c
+	$(CC) $(OPT) -DENABLE_CLANGVEC -msse2 tester2fsimd.c sleefsimdsp.c -o tester2fclangvec -lm -lmpfr
+
+#
+
 ../tester/tester :
 	cd ../tester; make tester
 
diff --git a/simd/helperavx.h b/simd/helperavx.h
index fa40bfe3..f81a850e 100644
--- a/simd/helperavx.h
+++ b/simd/helperavx.h
@@ -48,6 +48,9 @@ static INLINE vopmask vcast_vo64_vo32(vopmask o) {
 
 static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); }
 static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); }
+static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
+static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
+static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
 static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); }
 static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }
 static INLINE vint2 vcastu_vi2_vi(vint vi) {
diff --git a/simd/helperavx2.h b/simd/helperavx2.h
index ce880d65..35985fbd 100644
--- a/simd/helperavx2.h
+++ b/simd/helperavx2.h
@@ -51,6 +51,9 @@ static INLINE vopmask vcast_vo64_vo32(vopmask o) {
 
 static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); }
 static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); }
+static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
+static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
+static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
 static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); }
 static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }
 
diff --git a/simd/helperavx512f.h b/simd/helperavx512f.h
index 5c7f7bc9..04501a36 100644
--- a/simd/helperavx512f.h
+++ b/simd/helperavx512f.h
@@ -55,6 +55,9 @@ static INLINE vint vtruncate_vi_vd(vdouble vd) {
 static INLINE vdouble vcast_vd_vi(vint vi) { return _mm512_cvtepi32_pd(vi); }
 static INLINE vint vcast_vi_i(int i) { return _mm256_set1_epi32(i); }
 
+static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
+static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
+
 static INLINE vint2 vcastu_vi2_vi(vint vi) {
   return _mm512_maskz_permutexvar_epi32(0xaaaa, _mm512_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), _mm512_castsi256_si512(vi));
 }
@@ -179,6 +182,7 @@ static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm((vmask)_mm
 static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm512_cvtepi32_ps((vmask)vcast_vm_vi2(vi)); }
 static INLINE vfloat vcast_vf_f(float f) { return _mm512_set1_ps(f); }
 static INLINE vint2 vcast_vi2_i(int i) { return _mm512_set1_epi32(i); }
+static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
 static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (__m512i)vf; }
 static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (__m512)vm; }
 
diff --git a/simd/helperclangvec.h b/simd/helperclangvec.h
index 858a214b..8f708852 100644
--- a/simd/helperclangvec.h
+++ b/simd/helperclangvec.h
@@ -1,5 +1,7 @@
 #include <stdint.h>
 
+// When you change VECTLENDP, you also need to change the same macro in sleefsimd.h
+
 #ifndef VECTLENDP
 #define VECTLENDP 8
 #endif
@@ -81,6 +83,8 @@ static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { return (v
 static INLINE vdouble vcast_vd_vi(vint vi) { return __builtin_convertvector(vi, vdouble); }
 static INLINE vint vtruncate_vi_vd(vdouble vd) { return __builtin_convertvector(vd, vint); }
 static INLINE vint vrint_vi_vd(vdouble vd) { return vtruncate_vi_vd(vsel_vd_vo_vd_vd((vopmask)(vd < 0.0), vd - 0.5, vd + 0.5)); }
+static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
+static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
 static INLINE vint vcast_vi_i(int i) { return (vint)(i); }
 
 static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
@@ -187,6 +191,7 @@ static INLINE vfloat vcast_vf_vi2(vint2 vi) { return __builtin_convertvector(vi,
 static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return __builtin_convertvector(vf, vint2); }
 static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vtruncate_vi2_vf(vsel_vf_vo_vf_vf((vopmask)(vf < 0), vf - 0.5f, vf + 0.5)); }
 static INLINE vint2 vcast_vi2_i(int i) { return (vint2)(i); }
+static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
 
 static INLINE vfloat vcast_vf_f(float f) { return (vfloat)(f); }
 static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; }
diff --git a/simd/helperfma4.h b/simd/helperfma4.h
index 1209c3dc..43bea24a 100644
--- a/simd/helperfma4.h
+++ b/simd/helperfma4.h
@@ -51,6 +51,9 @@ static INLINE vopmask vcast_vo64_vo32(vopmask o) {
 
 static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); }
 static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); }
+static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
+static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
+static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
 static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); }
 static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }
 static INLINE vint2 vcastu_vi2_vi(vint vi) {
diff --git a/simd/helperneon32.h b/simd/helperneon32.h
index 89b5b598..4ef9882c 100644
--- a/simd/helperneon32.h
+++ b/simd/helperneon32.h
@@ -59,6 +59,7 @@ static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); }
 static INLINE vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); }
 static INLINE vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); }
 static INLINE vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); }
+static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
 static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; }
 static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; }
 static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return (vfloat)vm; }
diff --git a/simd/helpersse2.h b/simd/helpersse2.h
index 7595dee1..3c83048d 100644
--- a/simd/helpersse2.h
+++ b/simd/helpersse2.h
@@ -45,9 +45,18 @@ static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); }
 static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); }
 static INLINE vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); }
 static INLINE vint vcast_vi_i(int i) { return _mm_set_epi32(0, 0, i, i); }
-static INLINE vint2 vcastu_vi2_vi(vint vi) { return _mm_shuffle_epi32(vi, 0x73); }
+static INLINE vint2 vcastu_vi2_vi(vint vi) { return _mm_and_si128(_mm_shuffle_epi32(vi, 0x73), _mm_set_epi32(-1, 0, -1, 0)); }
 static INLINE vint vcastu_vi_vi2(vint2 vi) { return _mm_shuffle_epi32(vi, 0x0d); }
 
+#ifdef __SSE4_1__
+static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
+static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
+static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
+#else
+static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
+static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
+#endif
+
 static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm_set_epi32(i0, i1, i0, i1); }
 
 static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
@@ -149,6 +158,10 @@ static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (__m128)vm; }
 static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return (__m128)vm; }
 static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (__m128i)vf; }
 
+#ifndef __SSE4_1__
+static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
+#endif
+
 static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_add_ps(x, y); }
 static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm_sub_ps(x, y); }
 static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm_mul_ps(x, y); }
diff --git a/simd/sleefsimd.h b/simd/sleefsimd.h
index 65897b51..a6d48506 100644
--- a/simd/sleefsimd.h
+++ b/simd/sleefsimd.h
@@ -7,6 +7,7 @@
 
 #define VECTLENDP 2
 #define VECTLENSP 4
+#define SLEEF_ARCH "SSE2"
 
 typedef __m128d vdouble;
 typedef __m128i vint;
@@ -38,6 +39,7 @@ static void vstoreui(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, (__m12
 
 #define VECTLENDP 4
 #define VECTLENSP 8
+#define SLEEF_ARCH "AVX"
 
 typedef __m256d vdouble;
 typedef __m128i vint;
@@ -78,6 +80,7 @@ static void vstoreui(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, (__m12
 
 #define VECTLENDP 4
 #define VECTLENSP 8
+#define SLEEF_ARCH "AVX2"
 
 typedef __m256d vdouble;
 typedef __m128i vint;
@@ -109,6 +112,7 @@ static void vstoreui(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, (__m12
 
 #define VECTLENDP 8
 #define VECTLENSP 16
+#define SLEEF_ARCH "AVX512F"
 
 typedef __m512d vdouble;
 typedef __m256i vint;
@@ -140,6 +144,7 @@ static void vstoreui(int32_t *p, vint v) { return _mm256_storeu_si256((__m256i *
 
 #define VECTLENDP 2
 #define VECTLENSP 4
+#define SLEEF_ARCH "NEON32"
 
 //typedef __m128d vdouble;
 typedef int32x4_t vint;
@@ -167,6 +172,7 @@ static void vstoreui2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
 
 #define VECTLENDP 2
 #define VECTLENSP 4
+#define SLEEF_ARCH "NEON64"
 
 //typedef __m128d vdouble;
 typedef int32x4_t vint;
@@ -192,6 +198,7 @@ static void vstoreui2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
 #ifdef ENABLE_CLANGVEC
 #define VECTLENDP 8
 #define VECTLENSP (VECTLENDP*2)
+#define SLEEF_ARCH "CLANGVEC"
 
 typedef double vdouble __attribute__((ext_vector_type(VECTLENDP)));
 typedef int32_t vint __attribute__((ext_vector_type(VECTLENDP)));
diff --git a/simd/sleefsimddp.c b/simd/sleefsimddp.c
index 5bb563f6..e91f39c4 100644
--- a/simd/sleefsimddp.c
+++ b/simd/sleefsimddp.c
@@ -46,11 +46,22 @@
 
 //
 
+#define PI_A 3.1415926218032836914
+#define PI_B 3.1786509424591713469e-08
+#define PI_C 1.2246467864107188502e-16
+#define PI_D 1.2736634327021899816e-24
+
 #define PI4_A 0.78539816290140151978
 #define PI4_B 4.9604678871439933374e-10
 #define PI4_C 1.1258708853173288931e-18
 #define PI4_D 1.7607799325916000908e-27
 
+#define M_2_PI_H 0.63661977236758138243
+#define M_2_PI_L -3.9357353350364971764e-17
+
+#define TRIGRANGEMAX 1e+14
+#define SQRT_DBL_MAX 1.3407807929942596355e+154
+
 #define M_4_PI 1.273239544735162542821171882678754627704620361328125
 
 #define L2U .69314718055966295651160180568695068359375
@@ -59,17 +70,6 @@
 
 //
 
-#define PI4_Af 0.78515625f
-#define PI4_Bf 0.00024187564849853515625f
-#define PI4_Cf 3.7747668102383613586e-08f
-#define PI4_Df 1.2816720341285448015e-12f
-
-#define L2Uf 0.693145751953125f
-#define L2Lf 1.428606765330187045e-06f
-#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
-
-//
-
 static INLINE vopmask vsignbit_vo_vd(vdouble d) {
   return veq64_vo_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
 }
@@ -126,6 +126,13 @@ static INLINE vint vilogbk_vi_vd(vdouble d) {
 }
 #endif
 
+static INLINE vopmask visint(vdouble d) {
+  vdouble x = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0 / (1 << 31))));
+  x = vmla_vd_vd_vd_vd(vcast_vd_d(-(double)(1 << 31)), x, d);
+  return vor_vo_vo_vo(veq_vo_vd_vd(vtruncate_vd_vd(x), x),
+		      vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1LL << 52)));
+}
+
 //
 
 vdouble xldexp(vdouble x, vint q) { return vldexp_vd_vd_vi(x, q); }
@@ -139,20 +146,34 @@ vint xilogb(vdouble d) {
 }
 
 vdouble xsin(vdouble d) {
-  vint q;
-  vdouble u, s;
-
-  q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)));
+  vdouble u, s, r = d;
+#if 0
+  vint ql = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)));
 
-  u = vcast_vd_vi(q);
+  u = vcast_vd_vi(ql);
   d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_A*4), d);
   d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_B*4), d);
   d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_C*4), d);
   d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_D*4), d);
-
+#else
+  vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 24))));
+  vint ql = vrint_vi_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)),
+				      vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24))));
+  vdouble dql = vcast_vd_vi(ql);
+
+  d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * (1 << 24)), d);
+  d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A            ), d);
+  d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * (1 << 24)), d);
+  d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B            ), d);
+  d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * (1 << 24)), d);
+  d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C            ), d);
+  d = vmla_vd_vd_vd_vd(vmla_vd_vd_vd_vd(dqh, vcast_vd_d(1 << 24), dql),
+		       vcast_vd_d(-PI_D), d);
+#endif
+  
   s = vmul_vd_vd_vd(d, d);
 
-  d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1))), (vmask)vcast_vd_d(-0.0)), (vmask)d));
+  d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))), (vmask)vcast_vd_d(-0.0)), (vmask)d));
 
   u = vcast_vd_d(-7.97255955009037868891952e-18);
   u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.81009972710863200091251e-15));
@@ -164,26 +185,43 @@ vdouble xsin(vdouble d) {
   u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00833333333333332974823815));
   u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808));
 
-  u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, d), d);
+  u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d);
 
-  u = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), u);
+  u = vsel_vd_vo_vd_vd(vandnot_vo_vo_vo(visinf_vo_vd(r),
+					vor_vo_vo_vo(visnegzero_vo_vd(r),
+						     vgt_vo_vd_vd(vabs_vd_vd(r), vcast_vd_d(TRIGRANGEMAX)))),
+		       vcast_vd_d(-0.0), u);
   
   return u;
 }
 
 vdouble xsin_u1(vdouble d) {
-  vint q;
   vdouble u;
   vdouble2 s, t, x;
-
-  q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)));
-  u = vcast_vd_vi(q);
+#if 0
+  vint ql = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)));
+  u = vcast_vd_vi(ql);
 
   s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_A*4)));
   s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_B*4)));
   s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_C*4)));
   s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_D*4)));
-
+#else
+  vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 24))));
+  vint ql = vrint_vi_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)),
+				      vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24))));
+  vdouble dql = vcast_vd_vi(ql);
+
+  s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_A * (1 << 24))));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A            )));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B * (1 << 24))));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B            )));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C * (1 << 24))));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C            )));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(vmla_vd_vd_vd_vd(dqh, vcast_vd_d(1 << 24), dql),
+					 vcast_vd_d(-PI_D)));
+#endif
+  
   t = s;
   s = ddsqu_vd2_vd2(s);
 
@@ -200,28 +238,44 @@ vdouble xsin_u1(vdouble d) {
   x = ddmul_vd2_vd2_vd2(t, x);
   u = vadd_vd_vd_vd(x.x, x.y);
 
-  u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1))), (vmask)vcast_vd_d(-0.0)), (vmask)u));
-  u = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), u);
+  u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))), (vmask)vcast_vd_d(-0.0)), (vmask)u));
+  u = vsel_vd_vo_vd_vd(vandnot_vo_vo_vo(visinf_vo_vd(d), vor_vo_vo_vo(visnegzero_vo_vd(d),
+								      vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX)))),
+		       vcast_vd_d(-0.0), u);
 
   return u;
 }
 
 vdouble xcos(vdouble d) {
-  vint q;
-  vdouble u, s;
-
-  q = vrint_vi_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5)));
-  q = vadd_vi_vi_vi(vadd_vi_vi_vi(q, q), vcast_vi_i(1));
+  vdouble u, s, r = d;
+#if 0
+  vint ql = vrint_vi_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5)));
+  ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vcast_vi_i(1));
 
-  u = vcast_vd_vi(q);
+  u = vcast_vd_vi(ql);
   d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_A*2), d);
   d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_B*2), d);
   d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_C*2), d);
   d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_D*2), d);
-
+#else
+  vdouble dqh = vtruncate_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 23)), vcast_vd_d(-0.5 * M_1_PI / (1 << 23))));
+  vint ql = vrint_vi_vd(vadd_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)),
+				      vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-(1 << 23)), vcast_vd_d(-0.5))));
+  ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vcast_vi_i(1));
+  vdouble dql = vcast_vd_vi(ql);
+
+  d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5 * (1 << 24)), d);
+  d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5            ), d);
+  d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5 * (1 << 24)), d);
+  d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5            ), d);
+  d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5 * (1 << 24)), d);
+  d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5            ), d);
+  d = vmla_vd_vd_vd_vd(vmla_vd_vd_vd_vd(dqh, vcast_vd_d(1 << 24), dql),
+		       vcast_vd_d(-PI_D * 0.5), d);
+#endif
   s = vmul_vd_vd_vd(d, d);
 
-  d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(0))), (vmask)vcast_vd_d(-0.0)), (vmask)d));
+  d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), (vmask)vcast_vd_d(-0.0)), (vmask)d));
 
   u = vcast_vd_d(-7.97255955009037868891952e-18);
   u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.81009972710863200091251e-15));
@@ -233,25 +287,43 @@ vdouble xcos(vdouble d) {
   u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00833333333333332974823815));
   u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808));
 
-  u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, d), d);
+  u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d);
 
+  u = (vdouble)vandnot_vm_vo64_vm(vandnot_vo_vo_vo(visinf_vo_vd(r), vgt_vo_vd_vd(vabs_vd_vd(r), vcast_vd_d(TRIGRANGEMAX))),
+				  (vmask)u);
+  
   return u;
 }
 
 vdouble xcos_u1(vdouble d) {
-  vint q;
   vdouble u;
   vdouble2 s, t, x;
-
-  q = vrint_vi_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5)));
-  q = vadd_vi_vi_vi(vadd_vi_vi_vi(q, q), vcast_vi_i(1));
-  u = vcast_vd_vi(q);
+#if 0
+  vint ql = vrint_vi_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5)));
+  ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vcast_vi_i(1));
+  u = vcast_vd_vi(ql);
 
   s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_A*2)));
   s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_B*2)));
   s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_C*2)));
   s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_D*2)));
-
+#else
+  vdouble dqh = vtruncate_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 23)), vcast_vd_d(-0.5 * M_1_PI / (1 << 23))));
+  vint ql = vrint_vi_vd(vadd_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)),
+				      vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-(1 << 23)), vcast_vd_d(-0.5))));
+  ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vcast_vi_i(1));
+  vdouble dql = vcast_vd_vi(ql);
+
+  s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_A*0.5 * (1 << 24))));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5            )));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5 * (1 << 24))));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5            )));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5 * (1 << 24))));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5            )));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(vmla_vd_vd_vd_vd(dqh, vcast_vd_d(1 << 24), dql),
+					 vcast_vd_d(-PI_D*0.5)));
+#endif
+  
   t = s;
   s = ddsqu_vd2_vd2(s);
 
@@ -268,27 +340,44 @@ vdouble xcos_u1(vdouble d) {
   x = ddmul_vd2_vd2_vd2(t, x);
   u = vadd_vd_vd_vd(x.x, x.y);
 
-  u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(0))), (vmask)vcast_vd_d(-0.0)), (vmask)u));
+  u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), (vmask)vcast_vd_d(-0.0)), (vmask)u));
 
+  u = (vdouble)vandnot_vm_vo64_vm(vandnot_vo_vo_vo(visinf_vo_vd(d), vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))),
+				  (vmask)u);
+  
   return u;
 }
 
 vdouble2 xsincos(vdouble d) {
-  vint q;
   vopmask o;
   vdouble u, s, t, rx, ry;
   vdouble2 r;
 
-  q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_2_PI)));
-
   s = d;
+#if 0
+  vint ql = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_2_PI)));
 
-  u = vcast_vd_vi(q);
+  u = vcast_vd_vi(ql);
   s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_A*2), s);
   s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_B*2), s);
   s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_C*2), s);
   s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_D*2), s);
-
+#else
+  vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
+  vint ql = vrint_vi_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)),
+				      vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24))));
+  vdouble dql = vcast_vd_vi(ql);
+
+  s = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5 * (1 << 24)), s);
+  s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5            ), s);
+  s = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5 * (1 << 24)), s);
+  s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5            ), s);
+  s = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5 * (1 << 24)), s);
+  s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5            ), s);
+  s = vmla_vd_vd_vd_vd(vmla_vd_vd_vd_vd(dqh, vcast_vd_d(1 << 24), dql),
+		       vcast_vd_d(-PI_D * 0.5), s);
+#endif
+  
   t = s;
 
   s = vmul_vd_vd_vd(s, s);
@@ -314,16 +403,20 @@ vdouble2 xsincos(vdouble d) {
 
   ry = vmla_vd_vd_vd_vd(s, u, vcast_vd_d(1));
 
-  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(0)));
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(0)));
   r.x = vsel_vd_vo_vd_vd(o, rx, ry);
   r.y = vsel_vd_vo_vd_vd(o, ry, rx);
 
-  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(2)));
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(2)));
   r.x = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x)));
 
-  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2)));
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2)));
   r.y = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y)));
 
+  o = vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX));
+  r.x = (vdouble)vandnot_vm_vo64_vm(o, (vmask)r.x);
+  r.y = (vdouble)vandnot_vm_vo64_vm(o, (vmask)r.y);
+  
   o = visinf_vo_vd(d);
   r.x = (vdouble)vor_vm_vo64_vm(o, (vmask)r.x);
   r.y = (vdouble)vor_vm_vo64_vm(o, (vmask)r.y);
@@ -332,19 +425,33 @@ vdouble2 xsincos(vdouble d) {
 }
 
 vdouble2 xsincos_u1(vdouble d) {
-  vint q;
   vopmask o;
   vdouble u, rx, ry;
   vdouble2 r, s, t, x;
-
-  q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));
-  u = vcast_vd_vi(q);
+#if 0
+  vint ql = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));
+  u = vcast_vd_vi(ql);
 
   s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_A*2)));
   s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_B*2)));
   s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_C*2)));
   s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_D*2)));
-
+#else
+  vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
+  vint ql = vrint_vi_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)),
+				      vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24))));
+  vdouble dql = vcast_vd_vi(ql);
+
+  s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_A*0.5 * (1 << 24))));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5            )));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5 * (1 << 24))));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5            )));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5 * (1 << 24))));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5            )));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(vmla_vd_vd_vd_vd(dqh, vcast_vd_d(1 << 24), dql),
+					 vcast_vd_d(-PI_D*0.5)));
+#endif
+  
   t = s;
 
   s = ddsqu_vd2_vd2(s);
@@ -375,16 +482,20 @@ vdouble2 xsincos_u1(vdouble d) {
   x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd_vd(s.x, u));
   ry = vadd_vd_vd_vd(x.x, x.y);
 
-  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(0)));
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(0)));
   r.x = vsel_vd_vo_vd_vd(o, rx, ry);
   r.y = vsel_vd_vo_vd_vd(o, ry, rx);
 
-  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(2)));
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(2)));
   r.x = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x)));
 
-  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2)));
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2)));
   r.y = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y)));
 
+  o = vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX));
+  r.x = (vdouble)vandnot_vm_vo64_vm(o, (vmask)r.x);
+  r.y = (vdouble)vandnot_vm_vo64_vm(o, (vmask)r.y);
+
   o = visinf_vo_vd(d);
   r.x = (vdouble)vor_vm_vo64_vm(o, (vmask)r.x);
   r.y = (vdouble)vor_vm_vo64_vm(o, (vmask)r.y);
@@ -393,63 +504,96 @@ vdouble2 xsincos_u1(vdouble d) {
 }
 
 vdouble xtan(vdouble d) {
-  vint q;
   vdouble u, s, x;
   vopmask o;
+#if 0
+  vint ql = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_2_PI)));
 
-  q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_2_PI)));
-
-  u = vcast_vd_vi(q);
+  u = vcast_vd_vi(ql);
   x = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_A*2), d);
   x = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_B*2), x);
   x = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_C*2), x);
   x = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_D*2), x);
-
+#else
+  vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
+  vint ql = vrint_vi_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)),
+				      vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24))));
+  vdouble dql = vcast_vd_vi(ql);
+
+  x = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5 * (1 << 24)), d);
+  x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5            ), x);
+  x = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5 * (1 << 24)), x);
+  x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5            ), x);
+  x = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5 * (1 << 24)), x);
+  x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5            ), x);
+  x = vmla_vd_vd_vd_vd(vmla_vd_vd_vd_vd(dqh, vcast_vd_d(1 << 24), dql),
+		       vcast_vd_d(-PI_D * 0.5), x);
+#endif
+  
   s = vmul_vd_vd_vd(x, x);
 
-  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1)));
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1)));
   x = (vdouble)vxor_vm_vm_vm(vand_vm_vo64_vm(o, (vmask)vcast_vd_d(-0.0)), (vmask)x);
 
-  u = vcast_vd_d(1.01419718511083373224408e-05);
-  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.59519791585924697698614e-05));
-  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(5.23388081915899855325186e-05));
-  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-3.05033014433946488225616e-05));
-  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(7.14707504084242744267497e-05));
-  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(8.09674518280159187045078e-05));
-  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000244884931879331847054404));
-  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000588505168743587154904506));
-  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00145612788922812427978848));
-  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00359208743836906619142924));
-  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00886323944362401618113356));
-  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0218694882853846389592078));
-  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0539682539781298417636002));
-  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.133333333333125941821962));
-  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.333333333333334980164153));
+  u = vcast_vd_d(9.99583485362149960784268e-06);
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-4.31184585467324750724175e-05));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000103573238391744000389851));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000137892809714281708733524));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000157624358465342784274554));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-6.07500301486087879295969e-05));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000148898734751616411290179));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000219040550724571513561967));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000595799595197098359744547));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00145461240472358871965441));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0035923150771440177410343));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00886321546662684547901456));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0218694899718446938985394));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0539682539049961967903002));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.133333333334818976423364));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.333333333333320047664472));
 
   u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, x), x);
 
   u = vsel_vd_vo_vd_vd(o, vrec_vd_vd(u), u);
 
   u = (vdouble)vor_vm_vo64_vm(visinf_vo_vd(d), (vmask)u);
+  u = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), u);
 
   return u;
 }
 
 vdouble xtan_u1(vdouble d) {
-  vint q;
   vdouble u;
   vdouble2 s, t, x;
   vopmask o;
+#if 0
+  vint ql = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_2_PI)));
 
-  q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_2_PI)));
-  u = vcast_vd_vi(q);
-
+  u = vcast_vd_vi(ql);
   s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_A*2)));
   s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_B*2)));
   s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_C*2)));
   s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_D*2)));
-
-  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1)));
+#else
+  vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
+  s = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd(vcast_vd2_d_d(M_2_PI_H, M_2_PI_L), d),
+			vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-(double)(1 << 24)),
+					 vsel_vd_vo_vd_vd(vlt_vo_vd_vd(d, vcast_vd_d(0)),
+							  vcast_vd_d(-0.5), vcast_vd_d(0.5))));
+  vint ql = vtruncate_vi_vd(vadd_vd_vd_vd(s.x, s.y));
+  vdouble dql = vcast_vd_vi(ql);
+  
+  s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_A*0.5 * (1 << 24))));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5            )));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5 * (1 << 24))));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5            )));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5 * (1 << 24))));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5            )));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(vmla_vd_vd_vd_vd(dqh, vcast_vd_d(1 << 24), dql),
+					 vcast_vd_d(-PI_D*0.5)));
+#endif
+  
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1)));
   vmask n = vand_vm_vo64_vm(o, (vmask)vcast_vd_d(-0.0));
   s.x = (vdouble)vxor_vm_vm_vm((vmask)s.x, n);
   s.y = (vdouble)vxor_vm_vm_vm((vmask)s.y, n);
@@ -479,7 +623,10 @@ vdouble xtan_u1(vdouble d) {
 
   u = vadd_vd_vd_vd(x.x, x.y);
 
-  u = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), u);
+  u = vsel_vd_vo_vd_vd(vandnot_vo_vo_vo(visinf_vo_vd(d),
+					vor_vo_vo_vo(visnegzero_vo_vd(d),
+						     vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX)))),
+		       vcast_vd_d(-0.0), u);
 
   return u;
 }
@@ -755,7 +902,8 @@ vdouble xexp(vdouble d) {
 
   u = vldexp_vd_vd_vi(u, q);
 
-  u = (vdouble)vandnot_vm_vo64_vm(visminf_vo_vd(d), (vmask)u);
+  u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d, vcast_vd_d(709.78271114955742909217217426)), vcast_vd_d(INFINITY), u);
+  u = (vdouble)vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-1000)), (vmask)u);
 
   return u;
 }
@@ -776,22 +924,27 @@ static INLINE vdouble2 logk(vdouble d) {
   x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m));
   x2 = ddsqu_vd2_vd2(x);
 
-  t = vcast_vd_d(0.13860436390467167910856);
-  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.131699838841615374240845));
-  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.153914168346271945653214));
-  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.181816523941564611721589));
-  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.22222224632662035403996));
-  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.285714285511134091777308));
-  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.400000000000914013309483));
-  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.666666666666664853302393));
+  t = vcast_vd_d(0.116255524079935043668677);
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.103239680901072952701192));
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.117754809412463995466069));
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.13332981086846273921509));
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.153846227114512262845736));
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.181818180850050775676507));
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.222222222230083560345903));
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.285714285714249172087875));
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.400000000000000077715612));
+  vdouble2 c = vcast_vd2_d_d(0.666666666666666629659233, 3.80554962542412056336616e-17);
 
 #ifndef ENABLE_AVX512F
-  return ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd(vcast_vd2_vd_vd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
-		       vcast_vd_vi(e)),
-		ddadd2_vd2_vd2_vd2(ddscale_vd2_vd2_vd(x, vcast_vd_d(2)), ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(x2, x), t)));
+  return ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e)),
+			    ddadd2_vd2_vd2_vd2(ddscale_vd2_vd2_vd(x, vcast_vd_d(2)),
+					       ddmul_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(x2, x),
+								 ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd(x2, t), c))));
 #else
   return ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd(vcast_vd2_vd_vd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)), e),
-			    ddadd2_vd2_vd2_vd2(ddscale_vd2_vd2_vd(x, vcast_vd_d(2)), ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(x2, x), t)));
+			    ddadd2_vd2_vd2_vd2(ddscale_vd2_vd2_vd(x, vcast_vd_d(2)),
+					       ddmul_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(x2, x),
+								 ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd(x2, t), c))));
 #endif
 }
 
@@ -833,21 +986,24 @@ static INLINE vdouble expk(vdouble2 d) {
   u = vadd_vd_vd_vd(t.x, t.y);
   u = vldexp_vd_vd_vi(u, q);
 
+  u = (vdouble)vandnot_vm_vo64_vm(vlt_vo_vd_vd(d.x, vcast_vd_d(-1000)), (vmask)u);
+  
   return u;
 }
 
 vdouble xpow(vdouble x, vdouble y) {
 #if 1
-  vdouble debug = vcast_vd_vi(vrint_vi_vd(y));
-  vopmask yisnint = vneq_vo_vd_vd(vcast_vd_vi(vrint_vi_vd(y)), y);
-  vopmask yisodd = vandnot_vo_vo_vo(yisnint, vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vrint_vi_vd(y), vcast_vi_i(1)), vcast_vi_i(1))));
+  vopmask yisint = visint(y);
+  vopmask yisodd = vand_vo_vo_vo(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vtruncate_vi_vd(y), vcast_vi_i(1)), vcast_vi_i(1))), yisint);
 
-  vdouble result = expk(ddmul_vd2_vd2_vd(logk(vabs_vd_vd(x)), y));
+  vdouble2 d = ddmul_vd2_vd2_vd(logk(vabs_vd_vd(x)), y);
+  vdouble result = expk(d);
+  result = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d.x, vcast_vd_d(709.78271114955742909217217426)), vcast_vd_d(INFINITY), result);
 
   result = vmul_vd_vd_vd(result,
 			 vsel_vd_vo_vd_vd(vgt_vo_vd_vd(x, vcast_vd_d(0)),
 					  vcast_vd_d(1),
-					  (vdouble)vor_vm_vo64_vm(yisnint, (vmask)vsel_vd_vo_vd_vd(yisodd, vcast_vd_d(-1.0), vcast_vd_d(1)))));
+					  vsel_vd_vo_vd_vd(yisint, vsel_vd_vo_vd_vd(yisodd, vcast_vd_d(-1.0), vcast_vd_d(1)), vcast_vd_d(NAN))));
 
   vdouble efx = vmulsign_vd_vd_vd(vsub_vd_vd_vd(vabs_vd_vd(x), vcast_vd_d(1)), y);
 
@@ -897,7 +1053,7 @@ static INLINE vdouble2 expk2(vdouble2 d) {
 
   t = ddadd_vd2_vd_vd2(vcast_vd_d(1), t);
 
-  return ddscale_vd2_vd2_vd(t, vpow2i_vd_vi(q));
+  return ddscale_vd2_vd2_vd(ddscale_vd2_vd2_vd(t, vcast_vd_d(2)), vpow2i_vd_vi(vsub_vi_vi_vi(q, vcast_vi_i(1))));
 }
 
 vdouble xsinh(vdouble x) {
@@ -970,21 +1126,32 @@ static INLINE vdouble2 logk2(vdouble2 d) {
 
 vdouble xasinh(vdouble x) {
   vdouble y = vabs_vd_vd(x);
-  vdouble2 d = logk2(ddadd2_vd2_vd2_vd(ddsqrt_vd2_vd2(ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd(y, y),  vcast_vd_d(1))), y));
-  y = vadd_vd_vd_vd(d.x, d.y);
+  vopmask o = vgt_vo_vd_vd(y, vcast_vd_d(1));
+  vdouble2 d;
+  
+  d = vsel_vd2_vo_vd2_vd2(o, ddrec_vd2_vd(x), vcast_vd2_vd_vd(y, vcast_vd_d(0)));
+  d = ddsqrt_vd2_vd2(ddadd2_vd2_vd2_vd(ddsqu_vd2_vd2(d), vcast_vd_d(1)));
+  d = vsel_vd2_vo_vd2_vd2(o, ddmul_vd2_vd2_vd(d, y), d);
 
-  y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), visnan_vo_vd(y)), vcast_vd_d(INFINITY), y);
-  y = vmulsign_vd_vd_vd(y, x);
+  d = logk2(ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd(d, x)));
+  y = vadd_vd_vd_vd(d.x, d.y);
+  
+  y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(SQRT_DBL_MAX)),
+				    visnan_vo_vd(y)),
+		       vmulsign_vd_vd_vd(vcast_vd_d(INFINITY), x), y);
   y = (vdouble)vor_vm_vo64_vm(visnan_vo_vd(x), (vmask)y);
+  y = vsel_vd_vo_vd_vd(visnegzero_vo_vd(x), vcast_vd_d(-0.0), y);
 
   return y;
 }
 
 vdouble xacosh(vdouble x) {
-  vdouble2 d = logk2(ddadd2_vd2_vd2_vd(ddsqrt_vd2_vd2(ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd(x, x), vcast_vd_d(-1))), x));
+  vdouble2 d = logk2(ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddsqrt_vd2_vd2(ddadd2_vd2_vd_vd(x, vcast_vd_d(1))), ddsqrt_vd2_vd2(ddadd2_vd2_vd_vd(x, vcast_vd_d(-1)))), x));
   vdouble y = vadd_vd_vd_vd(d.x, d.y);
 
-  y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), visnan_vo_vd(y)), vcast_vd_d(INFINITY), y);
+  y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(SQRT_DBL_MAX)),
+				    visnan_vo_vd(y)),
+		       vcast_vd_d(INFINITY), y);
   y = (vdouble)vandnot_vm_vo64_vm(veq_vo_vd_vd(x, vcast_vd_d(1.0)), (vmask)y);
 
   y = (vdouble)vor_vm_vo64_vm(vlt_vo_vd_vd(x, vcast_vd_d(1.0)), (vmask)y);
@@ -999,7 +1166,6 @@ vdouble xatanh(vdouble x) {
   y = (vdouble)vor_vm_vo64_vm(vgt_vo_vd_vd(y, vcast_vd_d(1.0)), (vmask)vsel_vd_vo_vd_vd(veq_vo_vd_vd(y, vcast_vd_d(1.0)), vcast_vd_d(INFINITY), vmul_vd_vd_vd(vadd_vd_vd_vd(d.x, d.y), vcast_vd_d(0.5))));
 
   y = (vdouble)vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(x), visnan_vo_vd(y)), (vmask)y);
-
   y = vmulsign_vd_vd_vd(y, x);
   y = (vdouble)vor_vm_vo64_vm(visnan_vo_vd(x), (vmask)y);
 
@@ -1009,9 +1175,11 @@ vdouble xatanh(vdouble x) {
 vdouble xcbrt(vdouble d) {
   vdouble x, y, q = vcast_vd_d(1.0);
   vint e, qu, re;
-  vdouble t, s;
+  vdouble t;
 
-  s = d;
+#ifdef ENABLE_GETEXP_DP
+  vdouble s = d;
+#endif
   e = vadd_vi_vi_vi(vilogbk_vi_vd(vabs_vd_vd(d)), vcast_vi_i(1));
   d = vldexp_vd_vd_vi(d, vneg_vi_vi(e));
 
@@ -1047,11 +1215,13 @@ vdouble xcbrt(vdouble d) {
 }
 
 vdouble xcbrt_u1(vdouble d) {
-  vdouble x, y, z, t, s;
+  vdouble x, y, z, t;
   vdouble2 q2 = vcast_vd2_d_d(1, 0), u, v;
   vint e, qu, re;
 
-  s = d;
+#ifdef ENABLE_GETEXP_DP
+  vdouble s = d;
+#endif
   e = vadd_vi_vi_vi(vilogbk_vi_vd(vabs_vd_vd(d)), vcast_vi_i(1));
   d = vldexp_vd_vd_vi(d, vneg_vi_vi(e));
 
@@ -1101,14 +1271,14 @@ vdouble xcbrt_u1(vdouble d) {
 
 vdouble xexp2(vdouble a) {
   vdouble u = expk(ddmul_vd2_vd2_vd(vcast_vd2_vd_vd(vcast_vd_d(0.69314718055994528623), vcast_vd_d(2.3190468138462995584e-17)), a));
-  u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(a, vcast_vd_d(1023)), vcast_vd_d(INFINITY), u);
+  u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(a, vcast_vd_d(1024)), vcast_vd_d(INFINITY), u);
   u = (vdouble)vandnot_vm_vo64_vm(visminf_vo_vd(a), (vmask)u);
   return u;
 }
 
 vdouble xexp10(vdouble a) {
   vdouble u = expk(ddmul_vd2_vd2_vd(vcast_vd2_vd_vd(vcast_vd_d(2.3025850929940459011), vcast_vd_d(-2.1707562233822493508e-16)), a));
-  u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(a, vcast_vd_d(308)), vcast_vd_d(INFINITY), u);
+  u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(a, vcast_vd_d(308.254715559916743850652254)), vcast_vd_d(INFINITY), u);
   u = (vdouble)vandnot_vm_vo64_vm(visminf_vo_vd(a), (vmask)u);
   return u;
 }
@@ -1116,8 +1286,8 @@ vdouble xexp10(vdouble a) {
 vdouble xexpm1(vdouble a) {
   vdouble2 d = ddadd2_vd2_vd2_vd(expk2(vcast_vd2_vd_vd(a, vcast_vd_d(0))), vcast_vd_d(-1.0));
   vdouble x = vadd_vd_vd_vd(d.x, d.y);
-  x = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(a, vcast_vd_d(700)), vcast_vd_d(INFINITY), x);
-  x = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(a, vcast_vd_d(-0.36043653389117156089696070315825181539851971360337e+2)), vcast_vd_d(-1), x);
+  x = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(a, vcast_vd_d(709.782712893383996732223)), vcast_vd_d(INFINITY), x);
+  x = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(a, vcast_vd_d(-36.736800569677101399113302437)), vcast_vd_d(-1), x);
   x = vsel_vd_vo_vd_vd(visnegzero_vo_vd(a), vcast_vd_d(-0.0), x);
   return x;
 }
@@ -1137,7 +1307,7 @@ vdouble xlog1p(vdouble a) {
   vdouble2 d = logk2(ddadd2_vd2_vd_vd(a, vcast_vd_d(1)));
   vdouble x = vadd_vd_vd_vd(d.x, d.y);
 
-  x = vsel_vd_vo_vd_vd(vispinf_vo_vd(a), vcast_vd_d(INFINITY), x);
+  x = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(a, vcast_vd_d(1e+307)), vcast_vd_d(INFINITY), x);
   x = (vdouble)vor_vm_vo64_vm(vgt_vo_vd_vd(vcast_vd_d(-1.0), a), (vmask)x);
   x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(a, vcast_vd_d(-1)), vcast_vd_d(-INFINITY), x);
   x = vsel_vd_vo_vd_vd(visnegzero_vo_vd(a), vcast_vd_d(-0.0), x);
diff --git a/simd/sleefsimdsp.c b/simd/sleefsimdsp.c
index 647cfcb0..760cf04e 100644
--- a/simd/sleefsimdsp.c
+++ b/simd/sleefsimdsp.c
@@ -55,6 +55,17 @@
 #define PI4_Cf 3.7747668102383613586e-08f
 #define PI4_Df 1.2816720341285448015e-12f
 
+#define PI_Af 3.140625f
+#define PI_Bf 0.0009670257568359375f
+#define PI_Cf 6.2771141529083251953e-07f
+#define PI_Df 1.2154201256553420762e-10f
+
+#define PI_XDf 1.2141754268668591976e-10f
+#define PI_XEf 1.2446743939339977025e-13f
+
+#define TRIGRANGEMAXf 1e+7 // 39000
+#define SQRT_FLT_MAX 18446743523953729536.0
+
 #define L2Uf 0.693145751953125f
 #define L2Lf 1.428606765330187045e-06f
 #define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
@@ -132,15 +143,15 @@ vfloat xldexpf(vfloat x, vint2 q) { return vldexp_vf_vf_vi2(x, q); }
 
 vfloat xsinf(vfloat d) {
   vint2 q;
-  vfloat u, s;
+  vfloat u, s, r = d;
 
   q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI)));
   u = vcast_vf_vi2(q);
 
-  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Af*4), d);
-  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*4), d);
-  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*4), d);
-  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Df*4), d);
+  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af), d);
+  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf), d);
+  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf), d);
+  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df), d);
 
   s = vmul_vf_vf_vf(d, d);
 
@@ -151,27 +162,29 @@ vfloat xsinf(vfloat d) {
   u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f));
   u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f));
 
-  u = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(u, d), d);
+  u = vadd_vf_vf_vf(vmul_vf_vf_vf(s, vmul_vf_vf_vf(u, d)), d);
 
-  u = (vfloat)vor_vm_vo32_vm(visinf_vo_vf(d), (vmask)u);
+  u = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visnegzero_vo_vf(r),
+				    vgt_vo_vf_vf(vabs_vf_vf(r), vcast_vf_f(TRIGRANGEMAXf))),
+		       vcast_vf_f(-0.0), u);
 
-  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0), u);
+  u = (vfloat)vor_vm_vo32_vm(visinf_vo_vf(d), (vmask)u);
 
   return u;
 }
 
 vfloat xcosf(vfloat d) {
   vint2 q;
-  vfloat u, s;
+  vfloat u, s, r = d;
 
   q = vrint_vi2_vf(vsub_vf_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI)), vcast_vf_f(0.5f)));
   q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vcast_vi2_i(1));
 
   u = vcast_vf_vi2(q);
-  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2), d);
-  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2), d);
-  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2), d);
-  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2), d);
+  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f), d);
+  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f), d);
+  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f), d);
+  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), d);
 
   s = vmul_vf_vf_vf(d, d);
 
@@ -182,7 +195,10 @@ vfloat xcosf(vfloat d) {
   u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f));
   u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f));
 
-  u = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(u, d), d);
+  u = vadd_vf_vf_vf(vmul_vf_vf_vf(s, vmul_vf_vf_vf(u, d)), d);
+
+  u = (vfloat)vandnot_vm_vo32_vm(vgt_vo_vf_vf(vabs_vf_vf(r), vcast_vf_f(TRIGRANGEMAXf)),
+				 (vmask)u);
 
   u = (vfloat)vor_vm_vo32_vm(visinf_vo_vf(d), (vmask)u);
 
@@ -200,10 +216,10 @@ vfloat2 xsincosf(vfloat d) {
   s = d;
 
   u = vcast_vf_vi2(q);
-  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2), s);
-  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2), s);
-  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2), s);
-  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2), s);
+  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f), s);
+  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f), s);
+  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f), s);
+  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), s);
 
   t = s;
 
@@ -235,8 +251,11 @@ vfloat2 xsincosf(vfloat d) {
   o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
   r.y = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y)));
 
+  o = vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf));
+  r.x = (vfloat)vandnot_vm_vo32_vm(o, (vmask)r.x);
+  r.y = (vfloat)vandnot_vm_vo32_vm(o, (vmask)r.y);
+  
   o = visinf_vo_vf(d);
-
   r.x = (vfloat)vor_vm_vo32_vm(o, (vmask)r.x);
   r.y = (vfloat)vor_vm_vo32_vm(o, (vmask)r.y);
 
@@ -253,10 +272,10 @@ vfloat xtanf(vfloat d) {
   x = d;
 
   u = vcast_vf_vi2(q);
-  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2), x);
-  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2), x);
-  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2), x);
-  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2), x);
+  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f), x);
+  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f), x);
+  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f), x);
+  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), x);
 
   s = vmul_vf_vf_vf(x, x);
 
@@ -287,10 +306,10 @@ vfloat xsinf_u1(vfloat d) {
   q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(M_1_PI)));
   u = vcast_vf_vi2(q);
 
-  s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Af*4)));
-  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*4)));
-  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*4)));
-  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Df*4)));
+  s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Af)));
+  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Bf)));
+  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Cf)));
+  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Df)));
 
   t = s;
   s = dfsqu_vf2_vf2(s);
@@ -305,7 +324,9 @@ vfloat xsinf_u1(vfloat d) {
   u = vadd_vf_vf_vf(x.x, x.y);
 
   u = (vfloat)vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), (vmask)vcast_vf_f(-0.0)), (vmask)u);
-  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0), u);
+  u = vsel_vf_vo_vf_vf(vandnot_vo_vo_vo(visinf_vo_vf(d), vor_vo_vo_vo(visnegzero_vo_vf(d),
+								      vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf)))),
+		       vcast_vf_f(-0.0), u);
 
   return u;
 }
@@ -319,10 +340,10 @@ vfloat xcosf_u1(vfloat d) {
   q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vcast_vi2_i(1));
   u = vcast_vf_vi2(q);
 
-  s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2)));
-  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2)));
-  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2)));
-  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2)));
+  s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f)));
+  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f)));
+  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f)));
+  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f)));
 
   t = s;
   s = dfsqu_vf2_vf2(s);
@@ -338,6 +359,9 @@ vfloat xcosf_u1(vfloat d) {
 
   u = (vfloat)vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0)), (vmask)vcast_vf_f(-0.0)), (vmask)u);
 
+  u = (vfloat)vandnot_vm_vo32_vm(vandnot_vo_vo_vo(visinf_vo_vf(d), vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf))),
+				 (vmask)u);
+  
   return u;
 }
 
@@ -350,10 +374,10 @@ vfloat2 xsincosf_u1(vfloat d) {
   q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(2 * M_1_PI)));
   u = vcast_vf_vi2(q);
 
-  s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2)));
-  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2)));
-  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2)));
-  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2)));
+  s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f)));
+  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f)));
+  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f)));
+  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f)));
 
   t = s;
 
@@ -390,6 +414,10 @@ vfloat2 xsincosf_u1(vfloat d) {
   o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
   r.y = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y)));
 
+  o = vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf));
+  r.x = (vfloat)vandnot_vm_vo32_vm(o, (vmask)r.x);
+  r.y = (vfloat)vandnot_vm_vo32_vm(o, (vmask)r.y);
+  
   o = visinf_vo_vf(d);
   r.x = (vfloat)vor_vm_vo32_vm(o, (vmask)r.x);
   r.y = (vfloat)vor_vm_vo32_vm(o, (vmask)r.y);
@@ -406,10 +434,11 @@ vfloat xtanf_u1(vfloat d) {
   q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(M_2_PI)));
   u = vcast_vf_vi2(q);
 
-  s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2)));
-  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2)));
-  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2)));
-  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2)));
+  s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f)));
+  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f)));
+  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f)));
+  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_XDf*0.5f)));
+  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_XEf*0.5f)));
 
   o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
   vmask n = vand_vm_vo32_vm(o, (vmask)vcast_vf_f(-0.0));
@@ -434,7 +463,10 @@ vfloat xtanf_u1(vfloat d) {
 
   u = vadd_vf_vf_vf(x.x, x.y);
 
-  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), u);
+  u = vsel_vf_vo_vf_vf(vandnot_vo_vo_vo(visinf_vo_vf(d),
+					vor_vo_vo_vo(visnegzero_vo_vf(d),
+						     vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf)))),
+		       vcast_vf_f(-0.0f), u);
   
   return u;
 }
@@ -667,17 +699,25 @@ vfloat xexpf(vfloat d) {
   s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf), d);
   s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf), s);
 
+#if 0
   u = vcast_vf_f(0.00136324646882712841033936f);
   u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00836596917361021041870117f));
   u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416710823774337768554688f));
   u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.166665524244308471679688f));
   u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.499999850988388061523438f));
-
+#else
+  u = vcast_vf_f(0.000198527617612853646278381);
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00139304355252534151077271));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833336077630519866943359));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416664853692054748535156));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.166666671633720397949219));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.5));
+#endif
   u = vadd_vf_vf_vf(vcast_vf_f(1.0f), vmla_vf_vf_vf_vf(vmul_vf_vf_vf(s, s), u, s));
 
   u = vldexp_vf_vf_vi2(u, q);
 
-  u = (vfloat)vandnot_vm_vo32_vm(visminf_vo_vf(d), (vmask)u);
+  u = (vfloat)vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-104)), (vmask)u);
 
   return u;
 }
@@ -710,10 +750,12 @@ vfloat xsqrtf(vfloat d) { return vsqrt_vf_vf(d); }
 #endif
 
 vfloat xcbrtf(vfloat d) {
-  vfloat x, y, q = vcast_vf_f(1.0), t, s;
+  vfloat x, y, q = vcast_vf_f(1.0), t;
   vint2 e, qu, re;
 
-  s = d;
+#ifdef ENABLE_GETEXP_DP
+  vfloat s = d;
+#endif
   e = vadd_vi2_vi2_vi2(vilogbk_vi2_vf(vabs_vf_vf(d)), vcast_vi2_i(1));
   d = vldexp_vf_vf_vi2(d, vneg_vi2_vi2(e));
 
@@ -747,11 +789,13 @@ vfloat xcbrtf(vfloat d) {
 }
 
 vfloat xcbrtf_u1(vfloat d) {
-  vfloat x, y, z, t, s;
+  vfloat x, y, z, t;
   vfloat2 q2 = vcast_vf2_f_f(1, 0), u, v;
   vint2 e, qu, re;
 
-  s = d;
+#ifdef ENABLE_GETEXP_DP
+  vfloat s = d;
+#endif
   e = vadd_vi2_vi2_vi2(vilogbk_vi2_vf(vabs_vf_vf(d)), vcast_vi2_i(1));
   d = vldexp_vf_vf_vi2(d, vneg_vi2_vi2(e));
 
@@ -815,18 +859,22 @@ static INLINE vfloat2 logkf(vfloat d) {
   x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(-1), m), dfadd2_vf2_vf_vf(vcast_vf_f(1), m));
   x2 = dfsqu_vf2_vf2(x);
 
-  t = vcast_vf_f(0.2392828464508056640625f);
-  t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.28518211841583251953125f));
-  t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.400005877017974853515625f));
-  t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.666666686534881591796875f));
+  t = vcast_vf_f(0.240320354700088500976562);
+  t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.285112679004669189453125));
+  t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.400007992982864379882812));
+  vfloat2 c = vcast_vf2_f_f(0.66666662693023681640625f, 3.69183861259614332084311e-09f);
 
 #ifndef ENABLE_AVX512F
   return dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf(vcast_vf2_vf_vf(vcast_vf_f(0.69314718246459960938f), vcast_vf_f(-1.904654323148236017e-09f)),
-		       vcast_vf_vi2(e)),
-		dfadd2_vf2_vf2_vf2(dfscale_vf2_vf2_vf(x, vcast_vf_f(2)), dfmul_vf2_vf2_vf(dfmul_vf2_vf2_vf2(x2, x), t)));
+					     vcast_vf_vi2(e)),
+			    dfadd2_vf2_vf2_vf2(dfscale_vf2_vf2_vf(x, vcast_vf_f(2)),
+					       dfmul_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(x2, x),
+								 dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf(x2, t), c))));
 #else
   return dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf(vcast_vf2_vf_vf(vcast_vf_f(0.69314718246459960938f), vcast_vf_f(-1.904654323148236017e-09f)), e),
-			    dfadd2_vf2_vf2_vf2(dfscale_vf2_vf2_vf(x, vcast_vf_f(2)), dfmul_vf2_vf2_vf(dfmul_vf2_vf2_vf2(x2, x), t)));
+			    dfadd2_vf2_vf2_vf2(dfscale_vf2_vf2_vf(x, vcast_vf_f(2)),
+					       dfmul_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(x2, x),
+								 dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf(x2, t), c))));
 #endif
 }
 
@@ -867,20 +915,26 @@ static INLINE vfloat expkf(vfloat2 d) {
   u = vadd_vf_vf_vf(t.x, t.y);
   u = vldexp_vf_vf_vi2(u, q);
 
+  u = (vfloat)vandnot_vm_vo32_vm(vlt_vo_vf_vf(d.x, vcast_vf_f(-104)), (vmask)u);
+  
   return u;
 }
 
 vfloat xpowf(vfloat x, vfloat y) {
 #if 1
-  vopmask yisnint = vneq_vo_vf_vf(vcast_vf_vi2(vrint_vi2_vf(y)), y);
-  vopmask yisodd = vandnot_vo_vo_vo(yisnint, veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vrint_vi2_vf(y), vcast_vi2_i(1)), vcast_vi2_i(1)));
+  vopmask yisint = vor_vo_vo_vo(veq_vo_vf_vf(vtruncate_vf_vf(y), y), vgt_vo_vf_vf(vabs_vf_vf(y), vcast_vf_f(1 << 23)));
+  vopmask yisodd = vand_vo_vo_vo(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vtruncate_vi2_vf(y), vcast_vi2_i(1)), vcast_vi2_i(1)), yisint);
+
+#ifdef ENABLE_NEON32
+  yisodd = vandnot_vm_vo32_vm(visinf_vo_vf(y), yisodd);
+#endif
 
   vfloat result = expkf(dfmul_vf2_vf2_vf(logkf(vabs_vf_vf(x)), y));
 
   result = vmul_vf_vf_vf(result,
 			 vsel_vf_vo_vf_vf(vgt_vo_vf_vf(x, vcast_vf_f(0)),
 					  vcast_vf_f(1),
-					  (vfloat)vor_vm_vo32_vm(yisnint, (vmask)vsel_vf_vo_vf_vf(yisodd, vcast_vf_f(-1), vcast_vf_f(1)))));
+					  vsel_vf_vo_vf_vf(yisint, vsel_vf_vo_vf_vf(yisodd, vcast_vf_f(-1.0f), vcast_vf_f(1)), vcast_vf_f(NANf))));
 
   vfloat efx = vmulsign_vf_vf_vf(vsub_vf_vf_vf(vabs_vf_vf(x), vcast_vf_f(1)), y);
 
@@ -924,8 +978,8 @@ static INLINE vfloat2 expk2f(vfloat2 d) {
   t = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfsqu_vf2_vf2(s), u));
 
   t = dfadd_vf2_vf_vf2(vcast_vf_f(1), t);
-
-  return dfscale_vf2_vf2_vf(t, vpow2i_vf_vi2(q));
+  return dfscale_vf2_vf2_vf(dfscale_vf2_vf2_vf(t, vcast_vf_f(2)),
+			    vpow2i_vf_vi2(vsub_vi2_vi2_vi2(q, vcast_vi2_i(1))));
 }
 
 vfloat xsinhf(vfloat x) {
@@ -997,21 +1051,32 @@ static INLINE vfloat2 logk2f(vfloat2 d) {
 
 vfloat xasinhf(vfloat x) {
   vfloat y = vabs_vf_vf(x);
-  vfloat2 d = logk2f(dfadd_vf2_vf2_vf(dfsqrt_vf2_vf2(dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf(y, y),  vcast_vf_f(1))), y));
+  vopmask o = vgt_vo_vf_vf(y, vcast_vf_f(1));
+  vfloat2 d;
+  
+  d = vsel_vf2_vo_vf2_vf2(o, dfrec_vf2_vf(x), vcast_vf2_vf_vf(y, vcast_vf_f(0)));
+  d = dfsqrt_vf2_vf2(dfadd2_vf2_vf2_vf(dfsqu_vf2_vf2(d), vcast_vf_f(1)));
+  d = vsel_vf2_vo_vf2_vf2(o, dfmul_vf2_vf2_vf(d, y), d);
+
+  d = logk2f(dfnormalize_vf2_vf2(dfadd2_vf2_vf2_vf(d, x)));
   y = vadd_vf_vf_vf(d.x, d.y);
 
-  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), visnan_vo_vf(y)), vcast_vf_f(INFINITYf), y);
-  y = vmulsign_vf_vf_vf(y, x);
+  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(SQRT_FLT_MAX)),
+				    visnan_vo_vf(y)),
+		       vmulsign_vf_vf_vf(vcast_vf_f(INFINITYf), x), y);
   y = (vfloat)vor_vm_vo32_vm(visnan_vo_vf(x), (vmask)y);
+  y = vsel_vf_vo_vf_vf(visnegzero_vo_vf(x), vcast_vf_f(-0.0), y);
 
   return y;
 }
 
 vfloat xacoshf(vfloat x) {
-  vfloat2 d = logk2f(dfadd2_vf2_vf2_vf(dfsqrt_vf2_vf2(dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf(x, x), vcast_vf_f(-1))), x));
+  vfloat2 d = logk2f(dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfsqrt_vf2_vf2(dfadd2_vf2_vf_vf(x, vcast_vf_f(1))), dfsqrt_vf2_vf2(dfadd2_vf2_vf_vf(x, vcast_vf_f(-1)))), x));
   vfloat y = vadd_vf_vf_vf(d.x, d.y);
 
-  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), visnan_vo_vf(y)), vcast_vf_f(INFINITYf), y);
+  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(SQRT_FLT_MAX)),
+				    visnan_vo_vf(y)),
+		       vcast_vf_f(INFINITYf), y);
 
   y = (vfloat)vandnot_vm_vo32_vm(veq_vo_vf_vf(x, vcast_vf_f(1.0f)), (vmask)y);
 
@@ -1035,22 +1100,14 @@ vfloat xatanhf(vfloat x) {
 
 vfloat xexp2f(vfloat a) {
   vfloat u = expkf(dfmul_vf2_vf2_vf(vcast_vf2_vf_vf(vcast_vf_f(0.69314718246459960938f), vcast_vf_f(-1.904654323148236017e-09f)), a));
-#ifdef ENABLE_NEON32
-  u = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(a, vcast_vf_f(127.0f)), vcast_vf_f(INFINITYf), u);
-#else
-  u = vsel_vf_vo_vf_vf(vispinf_vo_vf(a), vcast_vf_f(INFINITYf), u);
-#endif
+  u = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(a, vcast_vf_f(128.0f)), vcast_vf_f(INFINITYf), u);
   u = (vfloat)vandnot_vm_vo32_vm(visminf_vo_vf(a), (vmask)u);
   return u;
 }
 
 vfloat xexp10f(vfloat a) {
   vfloat u = expkf(dfmul_vf2_vf2_vf(vcast_vf2_vf_vf(vcast_vf_f(2.3025851249694824219f), vcast_vf_f(-3.1975436520781386207e-08f)), a));
-#ifdef ENABLE_NEON32
-  u = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(a, vcast_vf_f(38.0f)), vcast_vf_f(INFINITYf), u);
-#else
-  u = vsel_vf_vo_vf_vf(vispinf_vo_vf(a), vcast_vf_f(INFINITYf), u);
-#endif
+  u = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(a, vcast_vf_f(38.531839419103626f)), vcast_vf_f(INFINITYf), u);
   u = (vfloat)vandnot_vm_vo32_vm(visminf_vo_vf(a), (vmask)u);
   return u;
 }
@@ -1058,8 +1115,8 @@ vfloat xexp10f(vfloat a) {
 vfloat xexpm1f(vfloat a) {
   vfloat2 d = dfadd2_vf2_vf2_vf(expk2f(vcast_vf2_vf_vf(a, vcast_vf_f(0))), vcast_vf_f(-1.0));
   vfloat x = vadd_vf_vf_vf(d.x, d.y);
-  x = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(a, vcast_vf_f(88.0f)), vcast_vf_f(INFINITYf), x);
-  x = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(a, vcast_vf_f(-0.15942385152878742116596338793538061065739925620174e+2f)), vcast_vf_f(-1), x);
+  x = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(a, vcast_vf_f(88.72283905206835f)), vcast_vf_f(INFINITYf), x);
+  x = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(a, vcast_vf_f(-16.635532333438687426013570f)), vcast_vf_f(-1), x);
   x = vsel_vf_vo_vf_vf(visnegzero_vo_vf(a), vcast_vf_f(-0.0f), x);
   return x;
 }
@@ -1079,7 +1136,7 @@ vfloat xlog1pf(vfloat a) {
   vfloat2 d = logk2f(dfadd2_vf2_vf_vf(a, vcast_vf_f(1)));
   vfloat x = vadd_vf_vf_vf(d.x, d.y);
 
-  x = vsel_vf_vo_vf_vf(vispinf_vo_vf(a), vcast_vf_f(INFINITYf), x);
+  x = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(a, vcast_vf_f(1e+38)), vcast_vf_f(INFINITYf), x);
   x = (vfloat)vor_vm_vo32_vm(vgt_vo_vf_vf(vcast_vf_f(-1), a), (vmask)x);
   x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(a, vcast_vf_f(-1)), vcast_vf_f(-INFINITYf), x);
   x = vsel_vf_vo_vf_vf(visnegzero_vo_vf(a), vcast_vf_f(-0.0f), x);
diff --git a/simd/starttester2.sh b/simd/starttester2.sh
new file mode 100644
index 00000000..e7a5b6c1
--- /dev/null
+++ b/simd/starttester2.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+nohup nice ./tester2sse2 >> ../tester2.out &
+nohup nice ./tester2fsse2 >> ../tester2.out &
+nohup nice ./tester2avx >> ../tester2.out &
+nohup nice ./tester2favx >> ../tester2.out &
+nohup nice ./tester2avx2 >> ../tester2.out &
+nohup nice ./tester2favx2 >> ../tester2.out &
+nohup nice sde64 -- ./tester2avx512f >> ../tester2.out &
+nohup nice sde64 -- ./tester2favx512f >> ../tester2.out &
diff --git a/simd/stoptester2.sh b/simd/stoptester2.sh
new file mode 100644
index 00000000..da6503ac
--- /dev/null
+++ b/simd/stoptester2.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+killall tester2avx tester2avx2 tester2sse2 tester2avx512f tester2favx tester2favx2 tester2fsse2 tester2favx512f
diff --git a/simd/tester2fsimd.c b/simd/tester2fsimd.c
new file mode 100644
index 00000000..7179386f
--- /dev/null
+++ b/simd/tester2fsimd.c
@@ -0,0 +1,520 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpfr.h>
+#include <time.h>
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <sys/syscall.h>
+#include "sleefsimd.h"
+
+mpfr_t fra, frb, frc, frd, frw, frx, fry, frz;
+
+#define DENORMAL_FLT_MIN (1.40130e-45f)
+
+double countULP(float d, mpfr_t c) {
+  float c2 = mpfr_get_d(c, GMP_RNDN);
+  if (c2 == 0 && d != 0) return 10000;
+  if (!isfinite(c2) && !isfinite(d)) return 0;
+
+  int e;
+  frexpl(mpfr_get_d(c, GMP_RNDN), &e);
+  mpfr_set_ld(frw, fmaxl(ldexpl(1.0, e-24), DENORMAL_FLT_MIN), GMP_RNDN);
+
+  mpfr_set_d(frd, d, GMP_RNDN);
+  mpfr_sub(fry, frd, c, GMP_RNDN);
+  mpfr_div(fry, fry, frw, GMP_RNDN);
+  double u = fabs(mpfr_get_d(fry, GMP_RNDN));
+
+  return u;
+}
+
+double countULP2(float d, mpfr_t c) {
+  float c2 = mpfr_get_d(c, GMP_RNDN);
+  if (c2 == 0 && d != 0) return 10000;
+  if (!isfinite(c2) && !isfinite(d)) return 0;
+
+  int e;
+  frexpl(mpfr_get_d(c, GMP_RNDN), &e);
+  mpfr_set_ld(frw, fmaxl(ldexpl(1.0, e-24), FLT_MIN), GMP_RNDN);
+
+  mpfr_set_d(frd, d, GMP_RNDN);
+  mpfr_sub(fry, frd, c, GMP_RNDN);
+  mpfr_div(fry, fry, frw, GMP_RNDN);
+  double u = fabs(mpfr_get_d(fry, GMP_RNDN));
+
+  return u;
+}
+
+typedef union {
+  double d;
+  uint64_t u64;
+  int64_t i64;
+} conv64_t;
+
+typedef union {
+  float f;
+  uint32_t u32;
+  int32_t i32;
+} conv32_t;
+
+float rnd_fr() {
+  conv32_t c;
+  do {
+#if 1
+    syscall(SYS_getrandom, &c.u32, sizeof(c.u32), 0);
+#else
+    c.u32 = (uint32_t)random() | ((uint32_t)random() << 31);
+#endif
+  } while(!isfinite(c.f));
+  return c.f;
+}
+
+float rnd_zo() {
+  conv32_t c;
+  do {
+#if 1
+    syscall(SYS_getrandom, &c.u32, sizeof(c.u32), 0);
+#else
+    c.u32 = (uint32_t)random() | ((uint32_t)random() << 31);
+#endif
+  } while(!isfinite(c.f) || c.f < -1 || 1 < c.f);
+  return c.f;
+}
+
+int main(int argc,char **argv)
+{
+  mpfr_set_default_prec(256);
+  mpfr_inits(fra, frb, frc, frd, frw, frx, fry, frz, NULL);
+
+  conv32_t cd;
+  float d, t;
+  vfloat vd, vd2, vzo, vad;
+  vfloat2 sc, sc2;
+  int cnt;
+  
+  srandom(time(NULL));
+
+#if 0
+  cd.f = M_PI;
+  mpfr_set_d(frx, cd.f, GMP_RNDN);
+  cd.i32+=3;
+  printf("%g\n", countULP(cd.f, frx));
+#endif
+
+  const float rangemax = 39000;
+  
+  for(cnt = 0;;cnt++) {
+    int e = cnt % VECTLENSP;
+    switch(cnt & 7) {
+    case 0:
+      d = (2 * (float)random() / RAND_MAX - 1) * rangemax;
+      break;
+    case 1:
+      cd.f = rint((2 * (float)random() / RAND_MAX - 1) * rangemax) * M_PI_4;
+      cd.i32 += (random() & 31) - 15;
+      d = cd.f;
+      break;
+    case 2:
+      d = (2 * (float)random() / RAND_MAX - 1) * rangemax;
+      break;
+    case 3:
+      cd.f = rint((2 * (float)random() / RAND_MAX - 1) * rangemax) * M_PI_4;
+      cd.i32 += (random() & 31) - 15;
+      d = cd.f;
+      break;
+    case 4:
+      d = (2 * (float)random() / RAND_MAX - 1) * 10000;
+      break;
+    case 5:
+      cd.f = rint((2 * (float)random() / RAND_MAX - 1) * 10000) * M_PI_4;
+      cd.i32 += (random() & 31) - 15;
+      d = cd.f;
+      break;
+    default:
+      d = rnd_fr();
+      break;
+    }
+
+    if (!isfinite(d)) continue;
+
+    vd[e] = d;
+    sc = xsincosf(vd);
+    sc2 = xsincosf_u1(vd);
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_sin(frx, frx, GMP_RNDN);
+
+      float u0 = countULP(t = xsinf(vd)[e], frx);
+      
+      if ((fabs(d) <= rangemax && u0 > 3.5) || fabs(t) > 1 || !isfinite(t)) {
+	printf(SLEEF_ARCH " sinf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      float u1 = countULP(t = sc.x[e], frx);
+      
+      if ((fabs(d) <= rangemax && u1 > 3.5) || fabs(t) > 1 || !isfinite(t)) {
+	printf(SLEEF_ARCH " sincosf sin arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+
+      float u2 = countULP(t = xsinf_u1(vd)[e], frx);
+      
+      if ((fabs(d) <= rangemax && u2 > 1) || fabs(t) > 1 || !isfinite(t)) {
+	printf(SLEEF_ARCH " sinf_u1 arg=%.20g ulp=%.20g\n", d, u2);
+	fflush(stdout);
+      }
+
+      float u3 = countULP(t = sc2.x[e], frx);
+      
+      if ((fabs(d) <= rangemax && u3 > 1) || fabs(t) > 1 || !isfinite(t)) {
+	printf(SLEEF_ARCH " sincosf_u1 sin arg=%.20g ulp=%.20g\n", d, u3);
+	fflush(stdout);
+      }
+    }
+
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_cos(frx, frx, GMP_RNDN);
+
+      float u0 = countULP(t = xcosf(vd)[e], frx);
+      
+      if ((fabs(d) <= rangemax && u0 > 3.5) || fabs(t) > 1 || !isfinite(t)) {
+	printf(SLEEF_ARCH " cosf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      float u1 = countULP(t = sc.y[e], frx);
+      
+      if ((fabs(d) <= rangemax && u1 > 3.5) || fabs(t) > 1 || !isfinite(t)) {
+	printf(SLEEF_ARCH " sincosf cos arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+
+      float u2 = countULP(t = xcosf_u1(vd)[e], frx);
+      
+      if ((fabs(d) <= rangemax && u2 > 1) || fabs(t) > 1 || !isfinite(t)) {
+	printf(SLEEF_ARCH " cosf_u1 arg=%.20g ulp=%.20g\n", d, u2);
+	fflush(stdout);
+      }
+
+      float u3 = countULP(t = sc2.y[e], frx);
+      
+      if ((fabs(d) <= rangemax && u3 > 1) || fabs(t) > 1 || !isfinite(t)) {
+	printf(SLEEF_ARCH " sincosf_u1 cos arg=%.20g ulp=%.20g\n", d, u3);
+	fflush(stdout);
+      }
+    }
+
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_tan(frx, frx, GMP_RNDN);
+
+      float u0 = countULP(t = xtanf(vd)[e], frx);
+      
+      if ((fabs(d) < rangemax && u0 > 3.5) || isnan(t)) {
+	printf(SLEEF_ARCH " tanf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      float u1 = countULP(t = xtanf_u1(vd)[e], frx);
+      
+      if ((fabs(d) <= rangemax && u1 > 1) || isnan(t)) {
+	printf(SLEEF_ARCH " tanf_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+
+    d = rnd_fr();
+    float d2 = rnd_fr(), zo = rnd_zo();
+    vd[e] = d;
+    vd2[e] = d2;
+    vzo[e] = zo;
+    vad[e] = fabs(d);
+
+    {
+      mpfr_set_d(frx, fabsf(d), GMP_RNDN);
+      mpfr_log(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xlogf(vad)[e], frx);
+      
+      if (u0 > 3.5) {
+	printf(SLEEF_ARCH " logf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xlogf_u1(vad)[e], frx);
+      
+      if (u1 > 1) {
+	printf(SLEEF_ARCH " logf_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+
+    {
+      mpfr_set_d(frx, fabsf(d), GMP_RNDN);
+      mpfr_log10(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xlog10f(vad)[e], frx);
+      
+      if (u0 > 1) {
+	printf(SLEEF_ARCH " log10f arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_log1p(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xlog1pf(vd)[e], frx);
+      
+      if ((-1 <= d && d <= 1e+38 && u0 > 1) ||
+	  (d < -1 && !isnan(t)) ||
+	  (d > 1e+38 && !(u0 <= 1 || isinf(t)))) {
+	printf(SLEEF_ARCH " log1pf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_exp(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xexpf(vd)[e], frx);
+      
+      if (u0 > 1) {
+	printf(SLEEF_ARCH " expf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_exp2(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xexp2f(vd)[e], frx);
+      
+      if (u0 > 1) {
+	printf(SLEEF_ARCH " exp2f arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_exp10(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xexp10f(vd)[e], frx);
+      
+      if (u0 > 1) {
+	printf(SLEEF_ARCH " exp10f arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_expm1(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xexpm1f(vd)[e], frx);
+      
+      if (u0 > 1) {
+	printf(SLEEF_ARCH " expm1f arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_set_d(fry, d2, GMP_RNDN);
+      mpfr_pow(frx, fry, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xpowf(vd2, vd)[e], frx);
+      
+      if (u0 > 1) {
+	printf(SLEEF_ARCH " powf arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_cbrt(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xcbrtf(vd)[e], frx);
+      
+      if (u0 > 3.5) {
+	printf(SLEEF_ARCH " cbrtf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xcbrtf_u1(vd)[e], frx);
+      
+      if (u1 > 1) {
+	printf(SLEEF_ARCH " cbrtf_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, zo, GMP_RNDN);
+      mpfr_asin(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xasinf(vzo)[e], frx);
+      
+      if (u0 > 3.5) {
+	printf(SLEEF_ARCH " asinf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xasinf_u1(vzo)[e], frx);
+      
+      if (u1 > 1) {
+	printf(SLEEF_ARCH " asinf_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, zo, GMP_RNDN);
+      mpfr_acos(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xacosf(vzo)[e], frx);
+      
+      if (u0 > 3.5) {
+	printf(SLEEF_ARCH " acosf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xacosf_u1(vzo)[e], frx);
+      
+      if (u1 > 1) {
+	printf(SLEEF_ARCH " acosf_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_atan(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xatanf(vd)[e], frx);
+      
+      if (u0 > 3.5) {
+	printf(SLEEF_ARCH " atanf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xatanf_u1(vd)[e], frx);
+      
+      if (u1 > 1) {
+	printf(SLEEF_ARCH " atanf_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_set_d(fry, d2, GMP_RNDN);
+      mpfr_atan2(frx, fry, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xatan2f(vd2, vd)[e], frx);
+      
+      if (u0 > 3.5) {
+	printf(SLEEF_ARCH " atan2f arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP2(t = xatan2f_u1(vd2, vd)[e], frx);
+      
+      if (u1 > 1) {
+	printf(SLEEF_ARCH " atan2f_u1 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_sinh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xsinhf(vd)[e], frx);
+      
+      if ((fabs(d) <= 88.5 && u0 > 1) ||
+	  (d >  88.5 && !(u0 <= 1 || (isinf(t) && t > 0))) ||
+	  (d < -88.5 && !(u0 <= 1 || (isinf(t) && t < 0)))) {
+	printf(SLEEF_ARCH " sinhf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_cosh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xcoshf(vd)[e], frx);
+      
+      if ((fabs(d) <= 88.5 && u0 > 1) || !(u0 <= 1 || (isinf(t) && t > 0))) {
+	printf(SLEEF_ARCH " coshf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_tanh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xtanhf(vd)[e], frx);
+      
+      if (u0 > 1.0001) {
+	printf(SLEEF_ARCH " tanhf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_asinh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xasinhf(vd)[e], frx);
+      
+      if ((fabs(d) < sqrt(FLT_MAX) && u0 > 1.0001) ||
+	  (d >=  sqrt(FLT_MAX) && !(u0 <= 1.0001 || (isinf(t) && t > 0))) ||
+	  (d <= -sqrt(FLT_MAX) && !(u0 <= 1.0001 || (isinf(t) && t < 0)))) {
+	printf(SLEEF_ARCH " asinhf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_acosh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xacoshf(vd)[e], frx);
+      
+      if ((fabs(d) < sqrt(FLT_MAX) && u0 > 1.0001) ||
+	  (d >=  sqrt(FLT_MAX) && !(u0 <= 1.0001 || (isinff(t) && t > 0))) ||
+	  (d <= -sqrt(FLT_MAX) && !isnan(t))) {
+	printf(SLEEF_ARCH " acoshf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_atanh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xatanhf(vd)[e], frx);
+      
+      if (u0 > 1.0001) {
+	printf(SLEEF_ARCH " atanhf arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+  }
+}
diff --git a/simd/tester2simd.c b/simd/tester2simd.c
new file mode 100644
index 00000000..f701ee91
--- /dev/null
+++ b/simd/tester2simd.c
@@ -0,0 +1,515 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpfr.h>
+#include <time.h>
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <sys/syscall.h>
+#include "sleefsimd.h"
+
+mpfr_t fra, frb, frc, frd, frw, frx, fry, frz;
+
+#define DENORMAL_DBL_MIN (4.94066e-324)
+
+double countULP(double d, mpfr_t c) {
+  double c2 = mpfr_get_d(c, GMP_RNDN);
+  if (c2 == 0 && d != 0) return 10000;
+  if (!isfinite(c2) && !isfinite(d)) return 0;
+
+  int e;
+  frexpl(mpfr_get_d(c, GMP_RNDN), &e);
+  mpfr_set_ld(frw, fmaxl(ldexpl(1.0, e-53), DENORMAL_DBL_MIN), GMP_RNDN);
+
+  mpfr_set_d(frd, d, GMP_RNDN);
+  mpfr_sub(fry, frd, c, GMP_RNDN);
+  mpfr_div(fry, fry, frw, GMP_RNDN);
+  double u = fabs(mpfr_get_d(fry, GMP_RNDN));
+
+  return u;
+}
+
+double countULP2(double d, mpfr_t c) {
+  double c2 = mpfr_get_d(c, GMP_RNDN);
+  if (c2 == 0 && d != 0) return 10000;
+  if (!isfinite(c2) && !isfinite(d)) return 0;
+
+  int e;
+  frexpl(mpfr_get_d(c, GMP_RNDN), &e);
+  mpfr_set_ld(frw, fmaxl(ldexpl(1.0, e-53), DBL_MIN), GMP_RNDN);
+
+  mpfr_set_d(frd, d, GMP_RNDN);
+  mpfr_sub(fry, frd, c, GMP_RNDN);
+  mpfr_div(fry, fry, frw, GMP_RNDN);
+  double u = fabs(mpfr_get_d(fry, GMP_RNDN));
+
+  return u;
+}
+
+typedef union {
+  double d;
+  uint64_t u64;
+  int64_t i64;
+} conv_t;
+
+double rnd_fr() {
+  conv_t c;
+  do {
+#if 1
+    syscall(SYS_getrandom, &c.u64, sizeof(c.u64), 0);
+#else
+    c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62);
+#endif
+  } while(!isfinite(c.d));
+  return c.d;
+}
+
+double rnd_zo() {
+  conv_t c;
+  do {
+#if 1
+    syscall(SYS_getrandom, &c.u64, sizeof(c.u64), 0);
+#else
+    c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62);
+#endif
+  } while(!isfinite(c.d) || c.d < -1 || 1 < c.d);
+  return c.d;
+}
+
+int main(int argc,char **argv)
+{
+  mpfr_set_default_prec(256);
+  mpfr_inits(fra, frb, frc, frd, frw, frx, fry, frz, NULL);
+
+  conv_t cd;
+  double d, t;
+  vdouble vd, vd2, vzo, vad;
+  vdouble2 sc, sc2;
+  int cnt;
+  
+  srandom(time(NULL));
+
+#if 0
+  cd.d = M_PI;
+  mpfr_set_d(frx, cd.d, GMP_RNDN);
+  cd.i64+=3;
+  printf("%g\n", countULP(cd.d, frx));
+#endif
+
+  const double rangemax = 1e+14; // 2^(24*2-1)
+  
+  for(cnt = 0;;cnt++) {
+    int e = cnt % VECTLENDP;
+    switch(cnt & 7) {
+    case 0:
+      d = (2 * (double)random() / RAND_MAX - 1) * rangemax;
+      break;
+    case 1:
+      cd.d = rint((2 * (double)random() / RAND_MAX - 1) * rangemax) * M_PI_4;
+      cd.i64 += (random() & 31) - 15;
+      d = cd.d;
+      break;
+    case 2:
+      d = (2 * (double)random() / RAND_MAX - 1) * 1e+7;
+      break;
+    case 3:
+      cd.d = rint((2 * (double)random() / RAND_MAX - 1) * 1e+7) * M_PI_4;
+      cd.i64 += (random() & 31) - 15;
+      d = cd.d;
+      break;
+    case 4:
+      d = (2 * (double)random() / RAND_MAX - 1) * 10000;
+      break;
+    case 5:
+      cd.d = rint((2 * (double)random() / RAND_MAX - 1) * 10000) * M_PI_4;
+      cd.i64 += (random() & 31) - 15;
+      d = cd.d;
+      break;
+    default:
+      d = rnd_fr();
+      break;
+    }
+
+    if (!isfinite(d)) continue;
+
+    vd[e] = d;
+    sc = xsincos(vd);
+    sc2 = xsincos_u1(vd);
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_sin(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xsin(vd)[e], frx);
+      
+      if ((fabs(d) <= rangemax && u0 > 3.5) || fabs(t) > 1 || !isfinite(t)) {
+	printf(SLEEF_ARCH " sin arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = sc.x[e], frx);
+      
+      if ((fabs(d) <= rangemax && u1 > 3.5) || fabs(t) > 1 || !isfinite(t)) {
+	printf(SLEEF_ARCH " sincos sin arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+
+      double u2 = countULP(t = xsin_u1(vd)[e], frx);
+      
+      if ((fabs(d) <= rangemax && u2 > 1) || fabs(t) > 1 || !isfinite(t)) {
+	printf(SLEEF_ARCH " sin_u1 arg=%.20g ulp=%.20g\n", d, u2);
+	fflush(stdout);
+      }
+
+      double u3 = countULP(t = sc2.x[e], frx);
+      
+      if ((fabs(d) <= rangemax && u3 > 1) || fabs(t) > 1 || !isfinite(t)) {
+	printf(SLEEF_ARCH " sincos_u1 sin arg=%.20g ulp=%.20g\n", d, u3);
+	fflush(stdout);
+      }
+    }
+
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_cos(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xcos(vd)[e], frx);
+      
+      if ((fabs(d) <= rangemax && u0 > 3.5) || fabs(t) > 1 || !isfinite(t)) {
+	printf(SLEEF_ARCH " cos arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = sc.y[e], frx);
+      
+      if ((fabs(d) <= rangemax && u1 > 3.5) || fabs(t) > 1 || !isfinite(t)) {
+	printf(SLEEF_ARCH " sincos cos arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+
+      double u2 = countULP(t = xcos_u1(vd)[e], frx);
+      
+      if ((fabs(d) <= rangemax && u2 > 1) || fabs(t) > 1 || !isfinite(t)) {
+	printf(SLEEF_ARCH " cos_u1 arg=%.20g ulp=%.20g\n", d, u2);
+	fflush(stdout);
+      }
+
+      double u3 = countULP(t = sc2.y[e], frx);
+      
+      if ((fabs(d) <= rangemax && u3 > 1) || fabs(t) > 1 || !isfinite(t)) {
+	printf(SLEEF_ARCH " sincos_u1 cos arg=%.20g ulp=%.20g\n", d, u3);
+	fflush(stdout);
+      }
+    }
+
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_tan(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xtan(vd)[e], frx);
+      
+      if ((fabs(d) < 1e+7 && u0 > 3.5) || (fabs(d) <= rangemax && u0 > 5) || isnan(t)) {
+	printf(SLEEF_ARCH " tan arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xtan_u1(vd)[e], frx);
+      
+      if ((fabs(d) <= rangemax && u1 > 1) || isnan(t)) {
+	printf(SLEEF_ARCH " tan_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+
+    d = rnd_fr();
+    double d2 = rnd_fr(), zo = rnd_zo();
+    vd[e] = d;
+    vd2[e] = d2;
+    vzo[e] = zo;
+    vad[e] = fabs(d);
+    
+    {
+      mpfr_set_d(frx, fabs(d), GMP_RNDN);
+      mpfr_log(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xlog(vad)[e], frx);
+      
+      if (u0 > 3.5) {
+	printf(SLEEF_ARCH " log arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xlog_u1(vad)[e], frx);
+      
+      if (u1 > 1) {
+	printf(SLEEF_ARCH " log_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, fabs(d), GMP_RNDN);
+      mpfr_log10(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xlog10(vad)[e], frx);
+      
+      if (u0 > 1) {
+	printf(SLEEF_ARCH " log10 arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_log1p(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xlog1p(vd)[e], frx);
+      
+      if ((-1 <= d && d <= 1e+307 && u0 > 1) ||
+	  (d < -1 && !isnan(t)) ||
+	  (d > 1e+307 && !(u0 <= 1 || isinf(t)))) {
+	printf(SLEEF_ARCH " log1p arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_exp(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xexp(vd)[e], frx);
+      
+      if (u0 > 1) {
+	printf(SLEEF_ARCH " exp arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_exp2(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xexp2(vd)[e], frx);
+      
+      if (u0 > 1) {
+	printf(SLEEF_ARCH " exp2 arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_exp10(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xexp10(vd)[e], frx);
+      
+      if (u0 > 1) {
+	printf(SLEEF_ARCH " exp10 arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_expm1(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xexpm1(vd)[e], frx);
+      
+      if (u0 > 1) {
+	printf(SLEEF_ARCH " expm1 arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_set_d(fry, d2, GMP_RNDN);
+      mpfr_pow(frx, fry, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xpow(vd2, vd)[e], frx);
+      
+      if (u0 > 1) {
+	printf(SLEEF_ARCH " pow arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_cbrt(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xcbrt(vd)[e], frx);
+      
+      if (u0 > 3.5) {
+	printf(SLEEF_ARCH " cbrt arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xcbrt_u1(vd)[e], frx);
+      
+      if (u1 > 1) {
+	printf(SLEEF_ARCH " cbrt_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, zo, GMP_RNDN);
+      mpfr_asin(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xasin(vzo)[e], frx);
+      
+      if (u0 > 3.5) {
+	printf(SLEEF_ARCH " asin arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xasin_u1(vzo)[e], frx);
+      
+      if (u1 > 1) {
+	printf(SLEEF_ARCH " asin_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+
+    {
+      mpfr_set_d(frx, zo, GMP_RNDN);
+      mpfr_acos(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xacos(vzo)[e], frx);
+      
+      if (u0 > 3.5) {
+	printf(SLEEF_ARCH " acos arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xacos_u1(vzo)[e], frx);
+      
+      if (u1 > 1) {
+	printf(SLEEF_ARCH " acos_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_atan(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xatan(vd)[e], frx);
+      
+      if (u0 > 3.5) {
+	printf(SLEEF_ARCH " atan arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP(t = xatan_u1(vd)[e], frx);
+      
+      if (u1 > 1) {
+	printf(SLEEF_ARCH " atan_u1 arg=%.20g ulp=%.20g\n", d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_set_d(fry, d2, GMP_RNDN);
+      mpfr_atan2(frx, fry, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xatan2(vd2, vd)[e], frx);
+      
+      if (u0 > 3.5) {
+	printf(SLEEF_ARCH " atan2 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0);
+	fflush(stdout);
+      }
+
+      double u1 = countULP2(t = xatan2_u1(vd2, vd)[e], frx);
+      
+      if (u1 > 1) {
+	printf(SLEEF_ARCH " atan2_u1 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u1);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_sinh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xsinh(vd)[e], frx);
+      
+      if ((fabs(d) <= 709 && u0 > 1) ||
+	  (d >  709 && !(u0 <= 1 || (isinf(t) && t > 0))) ||
+	  (d < -709 && !(u0 <= 1 || (isinf(t) && t < 0)))) {
+	printf(SLEEF_ARCH " sinh arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_cosh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xcosh(vd)[e], frx);
+      
+      if ((fabs(d) <= 709 && u0 > 1) || !(u0 <= 1 || (isinf(t) && t > 0))) {
+	printf(SLEEF_ARCH " cosh arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_tanh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xtanh(vd)[e], frx);
+      
+      if (u0 > 1) {
+	printf(SLEEF_ARCH " tanh arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_asinh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xasinh(vd)[e], frx);
+      
+      if ((fabs(d) < sqrt(DBL_MAX) && u0 > 1) ||
+	  (d >=  sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t > 0))) ||
+	  (d <= -sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t < 0)))) {
+	printf(SLEEF_ARCH " asinh arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_acosh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xacosh(vd)[e], frx);
+      
+      if ((fabs(d) < sqrt(DBL_MAX) && u0 > 1) ||
+	  (d >=  sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t > 0))) ||
+	  (d <= -sqrt(DBL_MAX) && !isnan(t))) {
+	printf(SLEEF_ARCH " acosh arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+    
+    {
+      mpfr_set_d(frx, d, GMP_RNDN);
+      mpfr_atanh(frx, frx, GMP_RNDN);
+
+      double u0 = countULP(t = xatanh(vd)[e], frx);
+      
+      if (u0 > 1) {
+	printf(SLEEF_ARCH " atanh arg=%.20g ulp=%.20g\n", d, u0);
+	fflush(stdout);
+      }
+    }
+  }
+}
diff --git a/tester/tester.c b/tester/tester.c
index a37acaeb..8626b0b7 100644
--- a/tester/tester.c
+++ b/tester/tester.c
@@ -1629,6 +1629,7 @@ void do_test() {
     boolean success = true;
     for(i=0;i<sizeof(xa)/sizeof(double) && success;i++) {
       if (!cmpDenorm(child_sin(xa[i]), sinfr(xa[i]))) {
+	fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", xa[i], child_sin(xa[i]), sinfr(xa[i]));
 	success = false;
 	break;
       }
@@ -1646,7 +1647,7 @@ void do_test() {
     for(i=0;i<sizeof(xa)/sizeof(double) && success;i++) {
       double2 q = child_sincos(xa[i]);
       if (!cmpDenorm(q.x, sinfr(xa[i]))) {
-	fprintf(stderr, "xa = %.20g, d = %.20g, c = %.20g\n", xa[i], q.x, sinfr(xa[i]));
+	fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", xa[i], q.x, sinfr(xa[i]));
 	success = false;
 	break;
       }
@@ -1663,6 +1664,7 @@ void do_test() {
     boolean success = true;
     for(i=0;i<sizeof(xa)/sizeof(double) && success;i++) {
       if (!cmpDenorm(child_cos(xa[i]), cosfr(xa[i]))) {
+	fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", xa[i], child_cos(xa[i]), cosfr(xa[i]));
 	success = false;
 	break;
       }
@@ -1696,6 +1698,7 @@ void do_test() {
     boolean success = true;
     for(i=0;i<sizeof(xa)/sizeof(double) && success;i++) {
       if (!cmpDenorm(child_tan(xa[i]), tanfr(xa[i]))) {
+	fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", xa[i], child_tan(xa[i]), tanfr(xa[i]));
 	success = false;
 	break;
       }
@@ -1707,12 +1710,12 @@ void do_test() {
   {
     fprintf(stderr, "asin denormal/nonnumber test ... ");
 
-    double xa[] = { NAN, POSITIVE_INFINITY, NEGATIVE_INFINITY, 2, -2, 1, -1, +0.0, -0.0 };
+    double xa[] = { NAN, POSITIVE_INFINITY, NEGATIVE_INFINITY, 2, -2, 1, -1, nextafter(1, 2), nextafter(-1, -2), +0.0, -0.0 };
 
     boolean success = true;
     for(i=0;i<sizeof(xa)/sizeof(double) && success;i++) {
       if (!cmpDenorm(child_asin(xa[i]), asinfr(xa[i]))) {
-	fprintf(stderr, "xa = %.20g, d = %.20g, c = %.20g\n", xa[i], child_asin(xa[i]), asinfr(xa[i]));
+	fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", xa[i], child_asin(xa[i]), asinfr(xa[i]));
 	success = false;
 	break;
       }
@@ -1724,7 +1727,7 @@ void do_test() {
   {
     fprintf(stderr, "acos denormal/nonnumber test ... ");
 
-    double xa[] = { NAN, POSITIVE_INFINITY, NEGATIVE_INFINITY, 2, -2, 1, -1, +0.0, -0.0 };
+    double xa[] = { NAN, POSITIVE_INFINITY, NEGATIVE_INFINITY, 2, -2, 1, -1, nextafter(1, 2), nextafter(-1, -2), +0.0, -0.0 };
 
     boolean success = true;
     for(i=0;i<sizeof(xa)/sizeof(double) && success;i++) {
@@ -1756,7 +1759,7 @@ void do_test() {
   {
     fprintf(stderr, "log denormal/nonnumber test ... ");
 
-    double xa[] = { NAN, POSITIVE_INFINITY, NEGATIVE_INFINITY, 0, -1, +0.0, -0.0 };
+    double xa[] = { NAN, POSITIVE_INFINITY, NEGATIVE_INFINITY, 0, nextafter(0, -1), -1, +0.0, -0.0 };
 
     boolean success = true;
     for(i=0;i<sizeof(xa)/sizeof(double) && success;i++) {
@@ -1772,7 +1775,7 @@ void do_test() {
   {
     fprintf(stderr, "exp denormal/nonnumber test ... ");
 
-    double xa[] = { NAN, POSITIVE_INFINITY, NEGATIVE_INFINITY, 10000, -10000, +0.0, -0.0 };
+    double xa[] = { NAN, POSITIVE_INFINITY, NEGATIVE_INFINITY, 10000, -10000, +0.0, -0.0, -1e+10, +1e+10 };
 
     boolean success = true;
     for(i=0;i<sizeof(xa)/sizeof(double) && success;i++) {
@@ -1972,7 +1975,7 @@ void do_test() {
   {
     fprintf(stderr, "log10 denormal/nonnumber test ... ");
 
-    double xa[] = { NAN, POSITIVE_INFINITY, NEGATIVE_INFINITY, 0, -1, +0.0, -0.0 };
+    double xa[] = { NAN, POSITIVE_INFINITY, NEGATIVE_INFINITY, 0, -1, nextafter(0, -1), +0.0, -0.0 };
 
     boolean success = true;
     for(i=0;i<sizeof(xa)/sizeof(double) && success;i++) {
@@ -1988,7 +1991,7 @@ void do_test() {
   {
     fprintf(stderr, "log1p denormal/nonnumber test ... ");
 
-    double xa[] = { NAN, POSITIVE_INFINITY, NEGATIVE_INFINITY, 0, -1, -2, +0.0, -0.0 };
+    double xa[] = { NAN, POSITIVE_INFINITY, NEGATIVE_INFINITY, 0, -1, nextafter(-1, -2), -2, +0.0, -0.0 };
 
     boolean success = true;
     for(i=0;i<sizeof(xa)/sizeof(double) && success;i++) {
@@ -2239,7 +2242,7 @@ void do_test() {
     fprintf(stderr, "ilogb : ");
     showResult(success);
   }
-
+  
   {
     double d, max = 0;
 
@@ -2248,46 +2251,41 @@ void do_test() {
       long double c = sinlfr(d);
       double u = countULP(q, c);
       max = fmax(max, u);
-      if (u > 1000) {
-	fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c));
-	goto STOP_SIN;
+      if (u > 20) {
+	fprintf(stderr, "arg = %.20g, correct = %.20g, test = %.20g, ulp = %g\n", d, (double)c, q, u);
       }
     }
 
-    for(d = -10000000;d < 10000000;d += 200.1) {
+    for(d = -1e+14;d < 1e+14;d += (1e+9 + 0.1)) {
       double q = child_sin(d);
       long double c = sinlfr(d);
-       double u = countULP(q, c);
+      double u = countULP(q, c);
       max = fmax(max, u);
-      if (u > 1000) {
-	fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c));
-	goto STOP_SIN;
+      if (u > 20) {
+	fprintf(stderr, "arg = %.20g, correct = %.20g, test = %.20g, ulp = %g\n", d, (double)c, q, u);
       }
     }
 
-    int i;
+    int64_t i;
 
-    for(i=1;i<10000;i++) {
+    for(i=(int64_t)-1e+14;i<(int64_t)1e+14;i+=(int64_t)1e+11) {
       double start = u2d(d2u(M_PI_4 * i)-20);
       double end = u2d(d2u(M_PI_4 * i)+20);
 
       for(d = start;d <= end;d = u2d(d2u(d)+1)) {
 	double q = child_sin(d);
 	long double c = sinlfr(d);
-	 double u = countULP(q, c);
+	double u = countULP(q, c);
 	max = fmax(max, u);
-	if (u > 1000) {
-	  fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c));
-	  goto STOP_SIN;
+	if (u > 20) {
+	  fprintf(stderr, "arg = %.20g, correct = %.20g, test = %.20g, ulp = %g\n", d, (double)c, q, u);
 	}
       }
     }
 
-  STOP_SIN:
-
     fprintf(stderr, "sin : %lf ... ", max);
 
-    showResult(max < 5);
+    showResult(max < 10);
   }
 
   {
@@ -2300,23 +2298,23 @@ void do_test() {
       max = fmax(max, u);
     }
 
-    for(d = -10000000;d < 10000000;d += 200.1) {
+    for(d = -1e+14;d < 1e+14;d += (1e+9 + 0.1)) {
       double q = child_cos(d);
       long double c = coslfr(d);
        double u = countULP(q, c);
       max = fmax(max, u);
     }
 
-    int i;
+    int64_t i;
 
-    for(i=1;i<10000;i++) {
+    for(i=(int64_t)-1e+14;i<(int64_t)1e+14;i+=(int64_t)1e+11) {
       double start = u2d(d2u(M_PI_4 * i)-20);
       double end = u2d(d2u(M_PI_4 * i)+20);
 
       for(d = start;d <= end;d = u2d(d2u(d)+1)) {
 	double q = child_cos(d);
 	long double c = coslfr(d);
-	 double u = countULP(q, c);
+	double u = countULP(q, c);
 	max = fmax(max, u);
       }
     }
@@ -2340,7 +2338,7 @@ void do_test() {
       }
     }
 
-    for(d = -10000000;d < 10000000;d += 200.1) {
+    for(d = -1e+14;d < 1e+14;d += (1e+9 + 0.1)) {
       double2 q = child_sincos(d);
       long double c = sinlfr(d);
       double u = fabs((q.x - c) / ulp(c));
@@ -2386,7 +2384,8 @@ void do_test() {
       max = fmax(max, u);
     }
 
-    for(d = -10000000;d < 10000000;d += 200.1) {
+
+    for(d = -1e+14;d < 1e+14;d += (1e+9 + 0.1)) {
       double2 q = child_sincos(d);
       long double c = coslfr(d);
       double u = fabs((q.y - c) / ulp(c));
@@ -2422,7 +2421,7 @@ void do_test() {
       max = fmax(max, u);
     }
 
-    for(d = -10000000;d < 10000000;d += 200.1) {
+    for(d = -1e+14;d < 1e+14;d += (1e+9 + 0.1)) {
       double q = child_tan(d);
       long double c = tanlfr(d);
        double u = countULP(q, c);
diff --git a/tester/testersp.c b/tester/testersp.c
index 1eb4ef87..08a44d66 100644
--- a/tester/testersp.c
+++ b/tester/testersp.c
@@ -1723,7 +1723,7 @@ void do_test() {
   {
     fprintf(stderr, "asinf denormal/nonnumber test ... ");
 
-    float xa[] = { NANf, POSITIVE_INFINITYf, NEGATIVE_INFINITYf, 2, -2, 1, -1, +0.0f, -0.0f };
+    float xa[] = { NANf, POSITIVE_INFINITYf, NEGATIVE_INFINITYf, 2, -2, 1, -1, nextafterf(1, 2), nextafterf(-1, -2), +0.0f, -0.0f };
 
     boolean success = true;
     for(i=0;i<sizeof(xa)/sizeof(float) && success;i++) {
@@ -1740,7 +1740,7 @@ void do_test() {
   {
     fprintf(stderr, "acosf denormal/nonnumber test ... ");
 
-    float xa[] = { NANf, POSITIVE_INFINITYf, NEGATIVE_INFINITYf, 2, -2, 1, -1, +0.0f, -0.0f };
+    float xa[] = { NANf, POSITIVE_INFINITYf, NEGATIVE_INFINITYf, 2, -2, 1, -1, nextafterf(1, 2), nextafterf(-1, -2), +0.0f, -0.0f };
 
     boolean success = true;
     for(i=0;i<sizeof(xa)/sizeof(float) && success;i++) {
@@ -1773,7 +1773,7 @@ void do_test() {
   {
     fprintf(stderr, "logf denormal/nonnumber test ... ");
 
-    float xa[] = { NANf, POSITIVE_INFINITYf, NEGATIVE_INFINITYf, 0, -1, +0.0f, -0.0f };
+    float xa[] = { NANf, POSITIVE_INFINITYf, NEGATIVE_INFINITYf, 0, flushToZero(nextafterf(0, -1)), -1, +0.0f, -0.0f };
 
     boolean success = true;
     for(i=0;i<sizeof(xa)/sizeof(float) && success;i++) {
@@ -1988,7 +1988,7 @@ void do_test() {
   {
     fprintf(stderr, "log10f denormal/nonnumber test ... ");
 
-    float xa[] = { NANf, POSITIVE_INFINITYf, NEGATIVE_INFINITYf, 0, -1, +0.0f, -0.0f };
+    float xa[] = { NANf, POSITIVE_INFINITYf, NEGATIVE_INFINITYf, 0, -1, flushToZero(nextafterf(0, -1)), +0.0f, -0.0f };
 
     boolean success = true;
     for(i=0;i<sizeof(xa)/sizeof(float) && success;i++) {
@@ -2004,7 +2004,7 @@ void do_test() {
   {
     fprintf(stderr, "log1pf denormal/nonnumber test ... ");
 
-    float xa[] = { NANf, POSITIVE_INFINITYf, NEGATIVE_INFINITYf, 0, -1, -2, +0.0f, -0.0f };
+    float xa[] = { NANf, POSITIVE_INFINITYf, NEGATIVE_INFINITYf, 0, -1, nextafterf(-1, -2), -2, +0.0f, -0.0f };
 
     boolean success = true;
     for(i=0;i<sizeof(xa)/sizeof(float) && success;i++) {
@@ -2259,7 +2259,7 @@ void do_test() {
       }
     }
 
-    for(d = -10000;d < 10000;d += 0.201) {
+    for(d = -40000;d < 40000;d += 0.201) {
       float q = child_sinf(d);
       double c = sinlfr(flushToZero(d));
       double u = countULP(q, c);
@@ -2272,7 +2272,7 @@ void do_test() {
 
     int i;
 
-    for(i=1;i<10000;i++) {
+    for(i=1;i<40000;i++) {
       float start = u2f(f2u(M_PI_4 * i)-20);
       float end = u2f(f2u(M_PI_4 * i)+20);
 
@@ -2305,7 +2305,7 @@ void do_test() {
       max = fmax(max, u);
     }
 
-    for(d = -10000;d < 10000;d += 0.201) {
+    for(d = -40000;d < 40000;d += 0.201) {
       float q = child_cosf(d);
       double c = coslfr(flushToZero(d));
       double u = countULP(q, c);
@@ -2314,7 +2314,7 @@ void do_test() {
 
     int i;
 
-    for(i=1;i<10000;i++) {
+    for(i=1;i<40000;i++) {
       float start = u2f(f2u(M_PI_4 * i)-20);
       float end = u2f(f2u(M_PI_4 * i)+20);
 
@@ -2345,7 +2345,7 @@ void do_test() {
       }
     }
 
-    for(d = -10000;d < 10000;d += 0.201) {
+    for(d = -40000;d < 40000;d += 0.201) {
       float2 q = child_sincosf(d);
       long double c = sinlfr(d);
       double u = fabs((q.x - c) / ulp(c));
@@ -2358,7 +2358,7 @@ void do_test() {
 
     int i;
 
-    for(i=1;i<10000;i++) {
+    for(i=1;i<40000;i++) {
       float start = u2f(f2u(M_PI_4 * i)-20);
       float end = u2f(f2u(M_PI_4 * i)+20);
 
@@ -2395,7 +2395,7 @@ void do_test() {
       }
     }
 
-    for(d = -10000;d < 10000;d += 0.201) {
+    for(d = -40000;d < 40000;d += 0.201) {
       float2 q = child_sincosf(d);
       double c = coslfr(d);
       double u = fabs((q.y - c) / ulp(c));
@@ -2408,7 +2408,7 @@ void do_test() {
 
     int i;
 
-    for(i=1;i<10000;i++) {
+    for(i=1;i<40000;i++) {
       float start = u2f(f2u(M_PI_4 * i)-20);
       float end = u2f(f2u(M_PI_4 * i)+20);
 
@@ -2437,7 +2437,7 @@ void do_test() {
       max = fmax(max, u);
     }
 
-    for(d = -10000;d < 10000;d += 0.201) {
+    for(d = -40000;d < 40000;d += 0.201) {
       float q = child_tanf(d);
       double c = tanlfr(flushToZero(d));
       double u = countULP(q, c);
@@ -2446,7 +2446,7 @@ void do_test() {
 
     int i;
 
-    for(i=1;i<10000;i++) {
+    for(i=1;i<40000;i++) {
       float start = u2f(f2u(M_PI_4 * i)-20);
       float end = u2f(f2u(M_PI_4 * i)+20);
 
diff --git a/tester/testerspu1.c b/tester/testerspu1.c
index c6375c0b..e99910a7 100644
--- a/tester/testerspu1.c
+++ b/tester/testerspu1.c
@@ -1713,7 +1713,7 @@ void do_test() {
   {
     fprintf(stderr, "asinf_u1 denormal/nonnumber test ... ");
 
-    float xa[] = { NANf, POSITIVE_INFINITYf, NEGATIVE_INFINITYf, 2, -2, 1, -1, +0.0f, -0.0f };
+    float xa[] = { NANf, POSITIVE_INFINITYf, NEGATIVE_INFINITYf, 2, -2, 1, -1, nextafterf(1, 2), nextafterf(-1, -2), +0.0f, -0.0f };
 
     boolean success = true;
     for(i=0;i<sizeof(xa)/sizeof(float) && success;i++) {
@@ -1730,7 +1730,7 @@ void do_test() {
   {
     fprintf(stderr, "acosf_u1 denormal/nonnumber test ... ");
 
-    float xa[] = { NANf, POSITIVE_INFINITYf, NEGATIVE_INFINITYf, 2, -2, 1, -1, +0.0f, -0.0f };
+    float xa[] = { NANf, POSITIVE_INFINITYf, NEGATIVE_INFINITYf, 2, -2, 1, -1, nextafterf(1, 2), nextafterf(-1, -2), +0.0f, -0.0f };
 
     boolean success = true;
     for(i=0;i<sizeof(xa)/sizeof(float) && success;i++) {
@@ -1763,11 +1763,12 @@ void do_test() {
   {
     fprintf(stderr, "logf_u1 denormal/nonnumber test ... ");
 
-    float xa[] = { NANf, POSITIVE_INFINITYf, NEGATIVE_INFINITYf, 0, -1, +0.0f, -0.0f };
+    float xa[] = { NANf, POSITIVE_INFINITYf, NEGATIVE_INFINITYf, 0, flushToZero(nextafterf(0, -1)), -1, +0.0f, -0.0f };
 
     boolean success = true;
     for(i=0;i<sizeof(xa)/sizeof(float) && success;i++) {
       if (!cmpDenorm(child_logf(xa[i]), logfr(xa[i]))) {
+	fprintf(stderr, "arg = %.20g, test = %.20g, correct = %.20g\n", xa[i], child_logf(xa[i]), logfr(xa[i]));
 	success = false;
 	break;
       }
@@ -2120,7 +2121,7 @@ void do_test() {
       }
     }
 
-    for(d = -10000;d < 10000;d += 0.201) {
+    for(d = -40000;d < 40000;d += 0.201) {
       float q = child_sinf(d);
       double c = sinlfr(flushToZero(d));
       double u = countULP(q, c);
@@ -2133,7 +2134,7 @@ void do_test() {
 
     int i;
 
-    for(i=1;i<10000;i++) {
+    for(i=1;i<40000;i++) {
       float start = u2f(f2u(M_PI_4 * i)-20);
       float end = u2f(f2u(M_PI_4 * i)+20);
 
@@ -2166,7 +2167,7 @@ void do_test() {
       max = fmax(max, u);
     }
 
-    for(d = -10000;d < 10000;d += 0.201) {
+    for(d = -40000;d < 40000;d += 0.201) {
       float q = child_cosf(d);
       double c = coslfr(flushToZero(d));
       double u = countULP(q, c);
@@ -2175,7 +2176,7 @@ void do_test() {
 
     int i;
 
-    for(i=1;i<10000;i++) {
+    for(i=1;i<40000;i++) {
       float start = u2f(f2u(M_PI_4 * i)-20);
       float end = u2f(f2u(M_PI_4 * i)+20);
 
@@ -2206,7 +2207,7 @@ void do_test() {
       }
     }
 
-    for(d = -10000;d < 10000;d += 0.201) {
+    for(d = -40000;d < 40000;d += 0.201) {
       float2 q = child_sincosf(d);
       long double c = sinlfr(d);
       double u = fabs((q.x - c) / ulp(c));
@@ -2219,7 +2220,7 @@ void do_test() {
 
     int i;
 
-    for(i=1;i<10000;i++) {
+    for(i=1;i<40000;i++) {
       float start = u2f(f2u(M_PI_4 * i)-20);
       float end = u2f(f2u(M_PI_4 * i)+20);
 
@@ -2256,7 +2257,7 @@ void do_test() {
       }
     }
 
-    for(d = -10000;d < 10000;d += 0.201) {
+    for(d = -40000;d < 40000;d += 0.201) {
       float2 q = child_sincosf(d);
       double c = coslfr(d);
       double u = fabs((q.y - c) / ulp(c));
@@ -2269,7 +2270,7 @@ void do_test() {
 
     int i;
 
-    for(i=1;i<10000;i++) {
+    for(i=1;i<40000;i++) {
       float start = u2f(f2u(M_PI_4 * i)-20);
       float end = u2f(f2u(M_PI_4 * i)+20);
 
@@ -2298,7 +2299,7 @@ void do_test() {
       max = fmax(max, u);
     }
 
-    for(d = -10000;d < 10000;d += 0.201) {
+    for(d = -40000;d < 40000;d += 0.201) {
       float q = child_tanf(d);
       double c = tanlfr(flushToZero(d));
       double u = countULP(q, c);
@@ -2307,7 +2308,7 @@ void do_test() {
 
     int i;
 
-    for(i=1;i<10000;i++) {
+    for(i=1;i<40000;i++) {
       float start = u2f(f2u(M_PI_4 * i)-20);
       float end = u2f(f2u(M_PI_4 * i)+20);
 
diff --git a/tester/testeru1.c b/tester/testeru1.c
index 7ed2dedb..e2337f9f 100644
--- a/tester/testeru1.c
+++ b/tester/testeru1.c
@@ -1695,7 +1695,7 @@ void do_test() {
   {
     fprintf(stderr, "asin_u1 denormal/nonnumber test ... ");
 
-    double xa[] = { NAN, POSITIVE_INFINITY, NEGATIVE_INFINITY, 2, -2, 1, -1, +0.0, -0.0 };
+    double xa[] = { NAN, POSITIVE_INFINITY, NEGATIVE_INFINITY, 2, -2, 1, -1, nextafter(1, 2), nextafter(-1, -2), +0.0, -0.0 };
 
     boolean success = true;
     for(i=0;i<sizeof(xa)/sizeof(double) && success;i++) {
@@ -1712,7 +1712,7 @@ void do_test() {
   {
     fprintf(stderr, "acos_u1 denormal/nonnumber test ... ");
 
-    double xa[] = { NAN, POSITIVE_INFINITY, NEGATIVE_INFINITY, 2, -2, 1, -1, +0.0, -0.0 };
+    double xa[] = { NAN, POSITIVE_INFINITY, NEGATIVE_INFINITY, 2, -2, 1, -1, nextafter(1, 2), nextafter(-1, -2), +0.0, -0.0 };
 
     boolean success = true;
     for(i=0;i<sizeof(xa)/sizeof(double) && success;i++) {
@@ -1746,7 +1746,7 @@ void do_test() {
   {
     fprintf(stderr, "log_u1 denormal/nonnumber test ... ");
 
-    double xa[] = { NAN, POSITIVE_INFINITY, NEGATIVE_INFINITY, 0, -1, +0.0, -0.0 };
+    double xa[] = { NAN, POSITIVE_INFINITY, NEGATIVE_INFINITY, 0, nextafter(0, -1), -1, +0.0, -0.0 };
 
     boolean success = true;
     for(i=0;i<sizeof(xa)/sizeof(double) && success;i++) {
@@ -2032,7 +2032,7 @@ void do_test() {
       }
     }
 
-    for(d = -10000000;d < 10000000;d += 200.1) {
+    for(d = -1e+14;d < 1e+14;d += (1e+9 + 0.1)) {
       double q = child_sin(d);
       long double c = sinlfr(d);
       double u = countULP(q, c);
@@ -2078,7 +2078,7 @@ void do_test() {
       max = fmax(max, u);
     }
 
-    for(d = -10000000;d < 10000000;d += 200.1) {
+    for(d = -1e+14;d < 1e+14;d += (1e+9 + 0.1)) {
       double q = child_cos(d);
       long double c = coslfr(d);
       double u = countULP(q, c);
@@ -2118,7 +2118,7 @@ void do_test() {
       }
     }
 
-    for(d = -10000000;d < 10000000;d += 200.1) {
+    for(d = -1e+14;d < 1e+14;d += (1e+9 + 0.1)) {
       double2 q = child_sincos(d);
       long double c = sinlfr(d);
       double u = fabs((q.x - c) / ulp(c));
@@ -2164,7 +2164,7 @@ void do_test() {
       max = fmax(max, u);
     }
 
-    for(d = -10000000;d < 10000000;d += 200.1) {
+    for(d = -1e+14;d < 1e+14;d += (1e+9 + 0.1)) {
       double2 q = child_sincos(d);
       long double c = coslfr(d);
       double u = fabs((q.y - c) / ulp(c));
@@ -2200,7 +2200,7 @@ void do_test() {
       max = fmax(max, u);
     }
 
-    for(d = -10000000;d < 10000000;d += 200.1) {
+    for(d = -1e+14;d < 1e+14;d += (1e+9 + 0.1)) {
       double q = child_tan(d);
       long double c = tanlfr(d);
       double u = countULP(q, c);