diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..00a27188 --- /dev/null +++ b/Makefile @@ -0,0 +1,11 @@ +test : + cd java; make test + cd purec; make test + cd simd; make testsse2 testavx + +clean : + rm -f *~ + cd java; make clean + cd purec; make clean + cd simd; make clean + cd tester; make clean diff --git a/README b/README new file mode 100644 index 00000000..c7bb3124 --- /dev/null +++ b/README @@ -0,0 +1,58 @@ +In this library, functions for evaluating some elementary functions +are implemented. The algorithm is intended for efficient evaluation +utilizing SIMD instruction sets like SSE or AVX, but it is also fast +using usual scalar operations. + +The package contains a few directories, each containing an implementation +in the corresponding language. You can run "make test" in +order to test the functions in each directory. + +The software is in the public domain. You can use the software without any +obligation. + + +Author : Naoki Shibata + +Main download page : http://shibatch.sourceforge.net/ + + + +History + +2.80 Added support for ARM NEON. Added higher accuracy single +precision functions : sinf_u1, cosf_u1, sincosf_u1, tanf_u1, asinf_u1, +acosf_u1, atanf_u1, atan2f_u1, logf_u1, and cbrtf_u1. + +2.70 Added higher accuracy functions : sin_u1, cos_u1, sincos_u1, +tan_u1, asin_u1, acos_u1, atan_u1, atan2_u1, log_u1, and +cbrt_u1. These functions evaluate the corresponding function with at +most 1 ulp of error. + +2.60 Added the remaining single precision functions : powf, sinhf, +coshf, tanhf, exp2f, exp10f, log10f, log1pf. Added support for FMA4 +(for AMD Bulldozer). Added more test cases. Fixed minor bugs (which +degraded accuracy in some rare cases). + +2.50 Added support for AVX2. SLEEF now compiles with ICC. + +2.40 Fixed incorrect denormal/nonnumber handling in ldexp, ldexpf, +sinf and cosf. Removed support for Go language. + +2.31 Added sincosf. 
+ +2.30 Added single precision functions : sinf, cosf, tanf, asinf, +acosf, atanf, logf, expf, atan2f and cbrtf. + +2.20 Added exp2, exp10, expm1, log10, log1p, and cbrt. + +2.10 asin() and acos() are back. Added ilogb() and ldexp(). Added +hyperbolic functions. Eliminated dependency on frexp, ldexp, fabs, +isnan and isinf. + +2.00 All of the algorithm has been updated. Both accuracy and speed +are improved since version 1.10. Denormal number handling is also +improved. + +1.10 AVX support is added. Accuracy tester is added. + +1.00 Initial release diff --git a/java/IUT.java b/java/IUT.java new file mode 100644 index 00000000..adbfefa2 --- /dev/null +++ b/java/IUT.java @@ -0,0 +1,296 @@ +import java.io.*; + +import org.naokishibata.sleef.*; + +public class IUT { + static long hexToLong(String s) { + long ret = 0; + for(int i=0;i>> 4) & 0x7fffffffffffffffL; + str = Character.forDigit(d, 16) + str; + } + return str; + } + + public static void main(String[] args) throws Exception { + LineNumberReader lnr = new LineNumberReader(new InputStreamReader(System.in)); + + for(;;) { + String s = lnr.readLine(); + if (s == null) break; + + if (s.startsWith("atan2 ")) { + String[] a = s.split(" "); + long y = hexToLong(a[1]); + long x = hexToLong(a[2]); + double d = FastMath.atan2(Double.longBitsToDouble(y), Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("pow ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + long y = hexToLong(a[2]); + double d = FastMath.pow(Double.longBitsToDouble(x), Double.longBitsToDouble(y)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("sincos ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + FastMath.double2 d2 = FastMath.sincos(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d2.x)) + " " + longToHex(Double.doubleToRawLongBits(d2.y))); + } else if (s.startsWith("sin ")) 
{ + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = FastMath.sin(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("cos ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = FastMath.cos(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("tan ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = FastMath.tan(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("asin ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = FastMath.asin(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("acos ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = FastMath.acos(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("atan ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = FastMath.atan(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("log ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = FastMath.log(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("exp ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = FastMath.exp(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("sinh ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = FastMath.sinh(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("cosh ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = 
FastMath.cosh(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("tanh ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = FastMath.tanh(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("asinh ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = FastMath.asinh(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("acosh ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = FastMath.acosh(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("atanh ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = FastMath.atanh(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("sqrt ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = FastMath.sqrt(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("cbrt ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = FastMath.cbrt(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("exp2 ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = FastMath.exp2(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("exp10 ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = FastMath.exp10(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("expm1 ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = FastMath.expm1(Double.longBitsToDouble(x)); + 
System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("log10 ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = FastMath.log10(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("log1p ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + double d = FastMath.log1p(Double.longBitsToDouble(x)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("ldexp ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]), y = hexToLong(a[2]); + double d = FastMath.ldexp(Double.longBitsToDouble(x), (int)Double.longBitsToDouble(y)); + System.out.println(longToHex(Double.doubleToRawLongBits(d))); + } else if (s.startsWith("sinf ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + float d = FastMath.sinf(Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("cosf ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + float d = FastMath.cosf(Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("sincosf ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + FastMath.float2 d2 = FastMath.sincosf(Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d2.x)) + " " + longToHex(Float.floatToRawIntBits(d2.y))); + } else if (s.startsWith("tanf ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + float d = FastMath.tanf(Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("asinf ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + float d = FastMath.asinf(Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("acosf ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); 
+ float d = FastMath.acosf(Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("atanf ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + float d = FastMath.atanf(Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("logf ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + float d = FastMath.logf(Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("expf ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + float d = FastMath.expf(Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("cbrtf ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + float d = FastMath.cbrtf(Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("atan2f ")) { + String[] a = s.split(" "); + long y = hexToLong(a[1]); + long x = hexToLong(a[2]); + float d = FastMath.atan2f(Float.intBitsToFloat((int)y), Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("ldexpf ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + long y = hexToLong(a[2]); + float d = FastMath.ldexpf(Float.intBitsToFloat((int)x), (int)Float.intBitsToFloat((int)y)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("powf ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + long y = hexToLong(a[2]); + float d = FastMath.powf(Float.intBitsToFloat((int)x), Float.intBitsToFloat((int)y)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("sinhf ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + float d = FastMath.sinhf(Float.intBitsToFloat((int)x)); + 
System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("coshf ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + float d = FastMath.coshf(Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("tanhf ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + float d = FastMath.tanhf(Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("asinhf ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + float d = FastMath.asinhf(Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("acoshf ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + float d = FastMath.acoshf(Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("atanhf ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + float d = FastMath.atanhf(Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("exp2f ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + float d = FastMath.exp2f(Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("exp10f ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + float d = FastMath.exp10f(Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("expm1f ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + float d = FastMath.expm1f(Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("log10f ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + float d = FastMath.log10f(Float.intBitsToFloat((int)x)); + 
System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("log1pf ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + float d = FastMath.log1pf(Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else if (s.startsWith("sqrtf ")) { + String[] a = s.split(" "); + long x = hexToLong(a[1]); + float d = (float)Math.sqrt(Float.intBitsToFloat((int)x)); + System.out.println(longToHex(Float.floatToRawIntBits(d))); + } else { + break; + } + + System.out.flush(); + } + } +} diff --git a/java/Makefile b/java/Makefile new file mode 100644 index 00000000..81a1c344 --- /dev/null +++ b/java/Makefile @@ -0,0 +1,21 @@ +IUT.class : IUT.java org/naokishibata/sleef/FastMath.java + javac IUT.java + +doc : org/naokishibata/sleef/FastMath.java + javadoc -d ./javadoc -subpackages org.naokishibata + +../tester/tester : + cd ../tester; make tester + +../tester/testersp : + cd ../tester; make testersp + +test : IUT.class ../tester/tester ../tester/testersp + ../tester/tester java -ea IUT + ../tester/testersp java -ea IUT + +clean : + rm -f *~ IUT.class + rm -rf javadoc + find org -name "*.class" -exec rm {} ";" + find org -name "*~" -exec rm {} ";" diff --git a/java/org/naokishibata/examples/FastMathTest.java b/java/org/naokishibata/examples/FastMathTest.java new file mode 100644 index 00000000..d8840161 --- /dev/null +++ b/java/org/naokishibata/examples/FastMathTest.java @@ -0,0 +1,2036 @@ +package org.naokishibata.examples; + +import static org.naokishibata.sleef.FastMath.*; + +/** A class to perform correctness and speed tests for FastMath class + * + * @author Naoki Shibata + */ +public class FastMathTest { + static boolean isnan(double d) { return d != d; } + + static boolean cmpDenorm(double x, double y) { + if (isnan(x) && isnan(y)) return true; + if (x == Double.POSITIVE_INFINITY && y == Double.POSITIVE_INFINITY) return true; + if (x == Double.NEGATIVE_INFINITY && y == Double.NEGATIVE_INFINITY) return 
true; + if (!isnan(x) && !isnan(y) && !Double.isInfinite(x) && !Double.isInfinite(y)) return true; + return false; + } + + /** Perform correctness and speed tests. The accuracy is checked + * by comparing results with the standard math library. Note that + * calculation by the standard math library also has error, and + * the reported error is basically 1 + the calculation error by + * the FastMath methods. + */ + public static void main(String[] args) throws Exception { + System.out.println(); + + // + + System.out.println("Denormal test atan2(y, x)"); + System.out.println(); + + System.out.print("If y is +0 and x is -0, +pi is returned ... "); + System.out.println((atan2(+0.0, -0.0) == Math.PI) ? "OK" : "NG"); + //System.out.println(atan2(+0.0, -0.0)); + + System.out.print("If y is -0 and x is -0, -pi is returned ... "); + System.out.println((atan2(-0.0, -0.0) == -Math.PI) ? "OK" : "NG"); + //System.out.println(atan2(-0.0, -0.0)); + + System.out.print("If y is +0 and x is +0, +0 is returned ... "); + System.out.println(isPlusZero(atan2(+0.0, +0.0)) ? "OK" : "NG"); + //System.out.println(atan2(+0.0, +0.0)); + + System.out.print("If y is -0 and x is +0, -0 is returned ... "); + System.out.println(isMinusZero(atan2(-0.0, +0.0)) ? "OK" : "NG"); + //System.out.println(atan2(-0.0, +0.0)); + + System.out.print("If y is positive infinity and x is negative infinity, +3*pi/4 is returned ... "); + System.out.println((atan2(Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY) == 3*Math.PI/4) ? "OK" : "NG"); + //System.out.println(atan2(Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY)); + + System.out.print("If y is negative infinity and x is negative infinity, -3*pi/4 is returned ... "); + System.out.println((atan2(Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY) == -3*Math.PI/4) ? 
"OK" : "NG"); + //System.out.println(atan2(Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY)); + + System.out.print("If y is positive infinity and x is positive infinity, +pi/4 is returned ... "); + System.out.println((atan2(Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY) == Math.PI/4) ? "OK" : "NG"); + //System.out.println(atan2(Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY)); + + System.out.print("If y is negative infinity and x is positive infinity, -pi/4 is returned ... "); + System.out.println((atan2(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY) == -Math.PI/4) ? "OK" : "NG"); + //System.out.println(atan2(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY)); + + { + System.out.print("If y is +0 and x is less than 0, +pi is returned ... "); + + double[] ya = { +0.0 }; + double[] xa = { -100000.5, -100000, -3, -2.5, -2, -1.5, -1.0, -0.5 }; + + boolean success = true; + for(int i=0;iSLEEF + * library. Some of the methods can be used as substitutions of the + * corresponding methods in Math class. They have slightly less + * accuracy, and some methods are faster compared to those methods in + * Math class. Please note that the methods in the standard Math class + * are JNI methods, and the SLEEF library is specialized for SIMD + * operations. 
+ */ +public class FastMath { + public static double E = Math.E; + public static double PI = Math.PI; + + public static double abs(double a) { return Math.abs(a); } + public static float abs(float a) { return Math.abs(a); } + public static int abs(int a) { return Math.abs(a); } + public static long abs(long a) { return Math.abs(a); } + + public static double ceil(double a) { return Math.ceil(a); } + public static double floor(double a) { return Math.floor(a); } + + // + + static double upper(double d) { + long l = Double.doubleToRawLongBits(d); + return Double.longBitsToDouble(l & 0xfffffffff8000000L); + } + + static double mla(double x, double y, double z) { return x * y + z; } + + static double mulsign(double x, double y) { return Math.copySign(1, y) * x; } + + // + + /** + Returns the absolute value of the argument + */ + public static double fabs(double d) { return Math.copySign(d, 1); } + + /** + Returns the larger value of the two arguments. The result is + undefined if denormal numbers are given. + */ + public static double max(double x, double y) { return x > y ? x : y; } + + /** + Checks if the argument is a NaN or not. + */ + public static boolean isnan(double d) { return d != d; } + + /** + Checks if the argument is either positive infinity or negative infinity. + */ + public static boolean isinf(double d) { return fabs(d) == Double.POSITIVE_INFINITY; } + + static boolean ispinf(double d) { return d == Double.POSITIVE_INFINITY; } + static boolean isminf(double d) { return d == Double.NEGATIVE_INFINITY; } + + /** + Returns the integer value that is closest to the argument. The + result is undefined if a denormal number is given. + */ + public static double rint(double x) { return x < 0 ? 
(int)(x - 0.5) : (int)(x + 0.5); } + + /** + Returns the result of multiplying the floating-point number x + by 2 raised to the power q + */ + public static double ldexp(double x, int q) { + int m = q >> 31; + m = (((m + q) >> 9) - m) << 7; + q = q - (m << 2); + m += 0x3ff; + m = m < 0 ? 0 : m; + m = m > 0x7ff ? 0x7ff : m; + double u = Double.longBitsToDouble(((long)m) << 52); + x = x * u * u * u * u; + u = Double.longBitsToDouble(((long)(q + 0x3ff)) << 52); + return x * u; + } + + static double pow2i(int q) { + return Double.longBitsToDouble(((long)(q + 0x3ff)) << 52); + } + + static int ilogbp1(double d) { + boolean m = d < 4.9090934652977266E-91; + d = m ? 2.037035976334486E90 * d : d; + int q = (int)(Double.doubleToRawLongBits(d) >> 52) & 0x7ff; + q = m ? q - (300 + 0x03fe) : q - 0x03fe; + return q; + } + + /** + Returns the exponent part of their argument as a signed integer + */ + public static int ilogb(double d) { + int e = ilogbp1(fabs(d)) - 1; + e = d == 0 ? -2147483648 : e; + e = d == Double.POSITIVE_INFINITY || d == Double.NEGATIVE_INFINITY ? 2147483647 : e; + return e; + } + + // + + static boolean cmpDenorm(double x, double y) { + if (isnan(x) && isnan(y)) return true; + if (x == Double.POSITIVE_INFINITY && y == Double.POSITIVE_INFINITY) return true; + if (x == Double.NEGATIVE_INFINITY && y == Double.NEGATIVE_INFINITY) return true; + if (!isnan(x) && !isnan(y) && !isinf(x) && !isinf(y)) return true; + return false; + } + + /** + Checks if the argument is +0. + */ + public static boolean isPlusZero(double x) { return x == 0 && Math.copySign(1, x) == 1; } + + /** + Checks if the argument is -0. + */ + public static boolean isMinusZero(double x) { return x == 0 && Math.copySign(1, x) == -1; } + + static double sign(double d) { return Math.copySign(1, d); } + + // + + /** + This class represents a vector of two double values. 
+ */ + public static class double2 { + public double x, y; + public double2() {} + public double2(double x, double y) { this.x = x; this.y = y; } + + public String toString() { + return "(double2:" + x + " + " + y + ")"; + } + } + + static double2 ddnormalize_d2_d2(double2 t) { + double2 s = new double2(); + + s.x = t.x + t.y; + s.y = t.x - s.x + t.y; + + return s; + } + + static double2 ddscale_d2_d2_d(double2 d, double s) { + double2 r = new double2(); + + r.x = d.x * s; + r.y = d.y * s; + + return r; + } + + static double2 ddadd2_d2_d_d(double x, double y) { + double2 r = new double2(); + + r.x = x + y; + double v = r.x - x; + r.y = (x - (r.x - v)) + (y - v); + + return r; + } + + static double2 ddadd_d2_d2_d(double2 x, double y) { + // |x| >= |y| + + double2 r = new double2(); + + //assert(isnan(x.x) || isnan(y) || fabs(x.x) >= fabs(y)); + + r.x = x.x + y; + r.y = x.x - r.x + y + x.y; + + return r; + } + + static double2 ddadd2_d2_d2_d(double2 x, double y) { + // |x| >= |y| + + double2 r = new double2(); + + r.x = x.x + y; + double v = r.x - x.x; + r.y = (x.x - (r.x - v)) + (y - v); + r.y += x.y; + + return r; + } + + static double2 ddadd_d2_d_d2(double x, double2 y) { + // |x| >= |y| + + double2 r = new double2(); + + //assert(isnan(x) || isnan(y.x) || fabs(x) >= fabs(y.x)); + + r.x = x + y.x; + r.y = x - r.x + y.x + y.y; + + return r; + } + + static double2 ddadd_d2_d2_d2(double2 x, double2 y) { + // |x| >= |y| + + double2 r = new double2(); + + //assert(isnan(x.x) || isinf(x.x) || isnan(y.x) || isinf(y.x) || fabs(x.x) >= fabs(y.x)) : "x.x = " + x.x + ", y.x = " + y.x; + + r.x = x.x + y.x; + r.y = x.x - r.x + y.x + x.y + y.y; + + return r; + } + + static double2 ddadd2_d2_d2_d2(double2 x, double2 y) { + double2 r = new double2(); + + r.x = x.x + y.x; + double v = r.x - x.x; + r.y = (x.x - (r.x - v)) + (y.x - v); + r.y += x.y + y.y; + + return r; + } + + static double2 ddsub_d2_d2_d2(double2 x, double2 y) { + // |x| >= |y| + + double2 r = new double2(); + + 
r.x = x.x - y.x; + r.y = x.x - r.x - y.x + x.y - y.y; + + return r; + } + + static double2 dddiv_d2_d2_d2(double2 n, double2 d) { + double t = 1.0 / d.x; + double dh = upper(d.x), dl = d.x - dh; + double th = upper(t ), tl = t - th; + double nhh = upper(n.x), nhl = n.x - nhh; + + double2 q = new double2(); + + q.x = n.x * t; + + double u = -q.x + nhh * th + nhh * tl + nhl * th + nhl * tl + + q.x * (1 - dh * th - dh * tl - dl * th - dl * tl); + + q.y = t * (n.y - q.x * d.y) + u; + + return q; + } + + static double2 ddmul_d2_d_d(double x, double y) { + double xh = upper(x), xl = x - xh; + double yh = upper(y), yl = y - yh; + double2 r = new double2(); + + r.x = x * y; + r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl; + + return r; + } + + static double2 ddmul_d2_d2_d(double2 x, double y) { + double xh = upper(x.x), xl = x.x - xh; + double yh = upper(y ), yl = y - yh; + double2 r = new double2(); + + r.x = x.x * y; + r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.y * y; + + return r; + } + + static double2 ddmul_d2_d2_d2(double2 x, double2 y) { + double xh = upper(x.x), xl = x.x - xh; + double yh = upper(y.x), yl = y.x - yh; + double2 r = new double2(); + + r.x = x.x * y.x; + r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.x * y.y + x.y * y.x; + + return r; + } + + static double2 ddsqu_d2_d2(double2 x) { + double xh = upper(x.x), xl = x.x - xh; + double2 r = new double2(); + + r.x = x.x * x.x; + r.y = xh * xh - r.x + (xh + xh) * xl + xl * xl + x.x * (x.y + x.y); + + return r; + } + + static double2 ddrec_d2_d(double d) { + double t = 1.0 / d; + double dh = upper(d), dl = d - dh; + double th = upper(t), tl = t - th; + double2 q = new double2(); + + q.x = t; + q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl); + + return q; + } + + static double2 ddrec_d2_d2(double2 d) { + double t = 1.0 / d.x; + double dh = upper(d.x), dl = d.x - dh; + double th = upper(t ), tl = t - th; + double2 q = new double2(); + + q.x = t; + q.y = t * (1 - dh * th - dh * tl - 
dl * th - dl * tl - d.y * t); + + return q; + } + + static double2 ddsqrt_d2_d2(double2 d) { + double t = Math.sqrt(d.x + d.y); + return ddscale_d2_d2_d(ddmul_d2_d2_d2(ddadd2_d2_d2_d2(d, ddmul_d2_d_d(t, t)), ddrec_d2_d(t)), 0.5); + } + + // + + static double atan2k(double y, double x) { + double s, t, u; + int q = 0; + + if (x < 0) { x = -x; q = -2; } + if (y > x) { t = x; x = y; y = -t; q += 1; } + + s = y / x; + t = s * s; + + u = -1.88796008463073496563746e-05; + u = u * t + (0.000209850076645816976906797); + u = u * t + (-0.00110611831486672482563471); + u = u * t + (0.00370026744188713119232403); + u = u * t + (-0.00889896195887655491740809); + u = u * t + (0.016599329773529201970117); + u = u * t + (-0.0254517624932312641616861); + u = u * t + (0.0337852580001353069993897); + u = u * t + (-0.0407629191276836500001934); + u = u * t + (0.0466667150077840625632675); + u = u * t + (-0.0523674852303482457616113); + u = u * t + (0.0587666392926673580854313); + u = u * t + (-0.0666573579361080525984562); + u = u * t + (0.0769219538311769618355029); + u = u * t + (-0.090908995008245008229153); + u = u * t + (0.111111105648261418443745); + u = u * t + (-0.14285714266771329383765); + u = u * t + (0.199999999996591265594148); + u = u * t + (-0.333333333333311110369124); + + t = u * t * s + s; + t = q * (Math.PI/2) + t; + + return t; + } + + /** + This method calculates the arc tangent of y/x in radians, using + the signs of the two arguments to determine the quadrant of the + result. The results may have maximum error of 2 ulps. + */ + public static double atan2(double y, double x) { + double r = atan2k(fabs(y), x); + + r = mulsign(r, x); + if (isinf(x) || x == 0) r = Math.PI/2 - (isinf(x) ? (sign(x) * (Math.PI /2)) : 0); + if (isinf(y) ) r = Math.PI/2 - (isinf(x) ? (sign(x) * (Math.PI*1/4)) : 0); + if ( y == 0) r = (sign(x) == -1 ? Math.PI : 0); + + return isnan(x) || isnan(y) ? 
Double.NaN : mulsign(r, y); + } + + /** + This method calculates the arc sine of x in radians. The return + value is in the range [-pi/2, pi/2]. The results may have + maximum error of 3 ulps. + */ + public static double asin(double d) { + return mulsign(atan2k(fabs(d), Math.sqrt((1+d)*(1-d))), d); + } + + /** + This method calculates the arc cosine of x in radians. The + return value is in the range [0, pi]. The results may have + maximum error of 3 ulps. + */ + public static double acos(double d) { + return mulsign(atan2k(Math.sqrt((1+d)*(1-d)), fabs(d)), d) + (d < 0 ? Math.PI : 0); + } + + /** + Returns the arc tangent of an angle. The results may have + maximum error of 2 ulps. + */ + public static double atan(double s) { + double t, u; + int q = 0; + + if (s < 0) { s = -s; q = 2; } + if (s > 1) { s = 1.0 / s; q |= 1; } + + t = s * s; + + u = -1.88796008463073496563746e-05; + u = u * t + (0.000209850076645816976906797); + u = u * t + (-0.00110611831486672482563471); + u = u * t + (0.00370026744188713119232403); + u = u * t + (-0.00889896195887655491740809); + u = u * t + (0.016599329773529201970117); + u = u * t + (-0.0254517624932312641616861); + u = u * t + (0.0337852580001353069993897); + u = u * t + (-0.0407629191276836500001934); + u = u * t + (0.0466667150077840625632675); + u = u * t + (-0.0523674852303482457616113); + u = u * t + (0.0587666392926673580854313); + u = u * t + (-0.0666573579361080525984562); + u = u * t + (0.0769219538311769618355029); + u = u * t + (-0.090908995008245008229153); + u = u * t + (0.111111105648261418443745); + u = u * t + (-0.14285714266771329383765); + u = u * t + (0.199999999996591265594148); + u = u * t + (-0.333333333333311110369124); + + t = s + s * (t * u); + + if ((q & 1) != 0) t = 1.570796326794896557998982 - t; + if ((q & 2) != 0) t = -t; + + return t; + } + + private static final double PI4_A = 0.78539816290140151978; + private static final double PI4_B = 4.9604678871439933374e-10; + private static final double 
PI4_C = 1.1258708853173288931e-18; + private static final double PI4_D = 1.7607799325916000908e-27; + + private static final double M_1_PI = 0.3183098861837906715377675267450287; + + /** + Returns the trigonometric sine of an angle. The results may + have maximum error of 2 ulps. + */ + public static double sin(double d) { + int q; + double u, s; + + u = d * M_1_PI; + q = (int)(u < 0 ? u - 0.5 : u + 0.5); + + d = mla(q, -PI4_A*4, d); + d = mla(q, -PI4_B*4, d); + d = mla(q, -PI4_C*4, d); + d = mla(q, -PI4_D*4, d); + + if ((q & 1) != 0) d = -d; + + s = d * d; + + u = -7.97255955009037868891952e-18; + u = mla(u, s, 2.81009972710863200091251e-15); + u = mla(u, s, -7.64712219118158833288484e-13); + u = mla(u, s, 1.60590430605664501629054e-10); + u = mla(u, s, -2.50521083763502045810755e-08); + u = mla(u, s, 2.75573192239198747630416e-06); + u = mla(u, s, -0.000198412698412696162806809); + u = mla(u, s, 0.00833333333333332974823815); + u = mla(u, s, -0.166666666666666657414808); + + u = mla(s, u * d, d); + + return u; + } + + /** + Returns the trigonometric cosine of an angle. The results may + have maximum error of 2 ulps. + */ + public static double cos(double d) { + int q; + double u, s; + + q = 1 + 2*(int)rint(d * M_1_PI - 0.5); + + d = mla(q, -PI4_A*2, d); + d = mla(q, -PI4_B*2, d); + d = mla(q, -PI4_C*2, d); + d = mla(q, -PI4_D*2, d); + + if ((q & 2) == 0) d = -d; + + s = d * d; + + u = -7.97255955009037868891952e-18; + u = mla(u, s, 2.81009972710863200091251e-15); + u = mla(u, s, -7.64712219118158833288484e-13); + u = mla(u, s, 1.60590430605664501629054e-10); + u = mla(u, s, -2.50521083763502045810755e-08); + u = mla(u, s, 2.75573192239198747630416e-06); + u = mla(u, s, -0.000198412698412696162806809); + u = mla(u, s, 0.00833333333333332974823815); + u = mla(u, s, -0.166666666666666657414808); + + u = mla(s, u * d, d); + + return u; + } + + /** + Returns the trigonometric sine and cosine of an angle at a + time. 
The sine and cosine of an argument is returned by the x + and y field of the return value, respectively. The results may + have maximum error of 2 ulps. + */ + public static double2 sincos(double d) { + int q; + double u, s, t; + double2 r = new double2(); + + q = (int)rint(d * (2 * M_1_PI)); + + s = d; + + s = mla(-q, PI4_A*2, s); + s = mla(-q, PI4_B*2, s); + s = mla(-q, PI4_C*2, s); + s = mla(-q, PI4_D*2, s); + + t = s; + + s = s * s; + + u = 1.58938307283228937328511e-10; + u = mla(u, s, -2.50506943502539773349318e-08); + u = mla(u, s, 2.75573131776846360512547e-06); + u = mla(u, s, -0.000198412698278911770864914); + u = mla(u, s, 0.0083333333333191845961746); + u = mla(u, s, -0.166666666666666130709393); + u = u * s * t; + + r.x = t + u; + + u = -1.13615350239097429531523e-11; + u = mla(u, s, 2.08757471207040055479366e-09); + u = mla(u, s, -2.75573144028847567498567e-07); + u = mla(u, s, 2.48015872890001867311915e-05); + u = mla(u, s, -0.00138888888888714019282329); + u = mla(u, s, 0.0416666666666665519592062); + u = mla(u, s, -0.5); + + r.y = u * s + 1; + + if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; } + if ((q & 2) != 0) { r.x = -r.x; } + if (((q+1) & 2) != 0) { r.y = -r.y; } + + if (isinf(d)) { r.x = r.y = Double.NaN; } + + return r; + } + + /** + Returns the trigonometric tangent of an angle. The results may + have maximum error of 3 ulps. 
+ */ + public static double tan(double d) { + int q; + double u, s, x; + + q = (int)rint(d * (2 * M_1_PI)); + + x = mla(q, -PI4_A*2, d); + x = mla(q, -PI4_B*2, x); + x = mla(q, -PI4_C*2, x); + x = mla(q, -PI4_D*2, x); + + s = x * x; + + if ((q & 1) != 0) x = -x; + + u = 1.01419718511083373224408e-05; + u = mla(u, s, -2.59519791585924697698614e-05); + u = mla(u, s, 5.23388081915899855325186e-05); + u = mla(u, s, -3.05033014433946488225616e-05); + u = mla(u, s, 7.14707504084242744267497e-05); + u = mla(u, s, 8.09674518280159187045078e-05); + u = mla(u, s, 0.000244884931879331847054404); + u = mla(u, s, 0.000588505168743587154904506); + u = mla(u, s, 0.00145612788922812427978848); + u = mla(u, s, 0.00359208743836906619142924); + u = mla(u, s, 0.00886323944362401618113356); + u = mla(u, s, 0.0218694882853846389592078); + u = mla(u, s, 0.0539682539781298417636002); + u = mla(u, s, 0.133333333333125941821962); + u = mla(u, s, 0.333333333333334980164153); + + u = mla(s, u * x, x); + + if ((q & 1) != 0) u = 1.0 / u; + + if (isinf(d)) u = Double.NaN; + + return u; + } + + // + + private static final double L2U = .69314718055966295651160180568695068359375; + private static final double L2L = .28235290563031577122588448175013436025525412068e-12; + private static final double R_LN2 = 1.442695040888963407359924681001892137426645954152985934135449406931; + + /** + Returns the natural logarithm of the argument. The results may + have maximum error of 3 ulps. 
+ */ + public static double log(double d) { + double x, x2, t, m; + int e, i; + + e = ilogbp1(d * 0.7071); + m = ldexp(d, -e); + + x = (m-1) / (m+1); + x2 = x * x; + + t = 0.148197055177935105296783; + t = mla(t, x2, 0.153108178020442575739679); + t = mla(t, x2, 0.181837339521549679055568); + t = mla(t, x2, 0.22222194152736701733275); + t = mla(t, x2, 0.285714288030134544449368); + t = mla(t, x2, 0.399999999989941956712869); + t = mla(t, x2, 0.666666666666685503450651); + t = mla(t, x2, 2); + + x = x * t + 0.693147180559945286226764 * e; + + if (ispinf(d)) x = Double.POSITIVE_INFINITY; + if (d < 0) x = Double.NaN; + if (d == 0) x = Double.NEGATIVE_INFINITY; + + return x; + } + + /** + Returns the value of e raised to the power of the argument. The + results may have maximum error of 1 ulps. + */ + public static double exp(double d) { + int q = (int)rint(d * R_LN2); + double s, u; + + s = mla(q, -L2U, d); + s = mla(q, -L2L, s); + + u = 2.08860621107283687536341e-09; + u = mla(u, s, 2.51112930892876518610661e-08); + u = mla(u, s, 2.75573911234900471893338e-07); + u = mla(u, s, 2.75572362911928827629423e-06); + u = mla(u, s, 2.4801587159235472998791e-05); + u = mla(u, s, 0.000198412698960509205564975); + u = mla(u, s, 0.00138888888889774492207962); + u = mla(u, s, 0.00833333333331652721664984); + u = mla(u, s, 0.0416666666666665047591422); + u = mla(u, s, 0.166666666666666851703837); + u = mla(u, s, 0.5); + + u = s * s * u + s + 1; + u = ldexp(u, q); + + if (isminf(d)) u = 0; + + return u; + } + + static double2 logk(double d) { + double2 x, x2; + double m, t; + int e; + + e = ilogbp1(d * 0.7071); + m = ldexp(d, -e); + + x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m)); + x2 = ddsqu_d2_d2(x); + + t = 0.134601987501262130076155; + t = mla(t, x2.x, 0.132248509032032670243288); + t = mla(t, x2.x, 0.153883458318096079652524); + t = mla(t, x2.x, 0.181817427573705403298686); + t = mla(t, x2.x, 0.222222231326187414840781); + t = mla(t, x2.x, 
0.285714285651261412873718); + t = mla(t, x2.x, 0.400000000000222439910458); + t = mla(t, x2.x, 0.666666666666666371239645); + + return ddadd2_d2_d2_d2(ddmul_d2_d2_d(new double2(0.693147180559945286226764, 2.319046813846299558417771e-17), e), + ddadd2_d2_d2_d2(ddscale_d2_d2_d(x, 2), ddmul_d2_d2_d(ddmul_d2_d2_d2(x2, x), t))); + } + + static double expk(double2 d) { + int q = (int)rint((d.x + d.y) * R_LN2); + double2 s, t; + double u; + + s = ddadd2_d2_d2_d(d, -q * L2U); + s = ddadd2_d2_d2_d(s, -q * L2L); + + s = ddnormalize_d2_d2(s); + + u = 2.51069683420950419527139e-08; + u = mla(u, s.x, 2.76286166770270649116855e-07); + u = mla(u, s.x, 2.75572496725023574143864e-06); + u = mla(u, s.x, 2.48014973989819794114153e-05); + u = mla(u, s.x, 0.000198412698809069797676111); + u = mla(u, s.x, 0.0013888888939977128960529); + u = mla(u, s.x, 0.00833333333332371417601081); + u = mla(u, s.x, 0.0416666666665409524128449); + u = mla(u, s.x, 0.166666666666666740681535); + u = mla(u, s.x, 0.500000000000000999200722); + + t = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(ddsqu_d2_d2(s), u)); + + t = ddadd_d2_d_d2(1, t); + + return ldexp(t.x + t.y, q); + } + + /** + Returns the value of the first argument raised to the power of + the second argument. The results may have maximum error of 1 + ulps. + */ + public static double pow(double x, double y) { + boolean yisint = (int)y == y; + boolean yisodd = (1 & (int)y) != 0 && yisint; + + double result = expk(ddmul_d2_d2_d(logk(fabs(x)), y)); + + result = isnan(result) ? Double.POSITIVE_INFINITY : result; + result *= (x >= 0 ? 1 : (!yisint ? Double.NaN : (yisodd ? -1 : 1))); + + double efx = mulsign(fabs(x) - 1, y); + if (isinf(y)) result = efx < 0 ? 0.0 : (efx == 0 ? 1.0 : Double.POSITIVE_INFINITY); + if (isinf(x) || x == 0) result = (yisodd ? sign(x) : 1) * ((x == 0 ? -y : y) < 0 ? 
0 : Double.POSITIVE_INFINITY); + if (isnan(x) || isnan(y)) result = Double.NaN; + if (y == 0 || x == 1) result = 1; + + return result; + } + + static double2 expk2(double2 d) { + int q = (int)rint((d.x + d.y) * R_LN2); + double2 s, t; + double u; + + s = ddadd2_d2_d2_d(d, q * -L2U); + s = ddadd2_d2_d2_d(s, q * -L2L); + + s = ddnormalize_d2_d2(s); + + u = 2.51069683420950419527139e-08; + u = mla(u, s.x, 2.76286166770270649116855e-07); + u = mla(u, s.x, 2.75572496725023574143864e-06); + u = mla(u, s.x, 2.48014973989819794114153e-05); + u = mla(u, s.x, 0.000198412698809069797676111); + u = mla(u, s.x, 0.0013888888939977128960529); + u = mla(u, s.x, 0.00833333333332371417601081); + u = mla(u, s.x, 0.0416666666665409524128449); + u = mla(u, s.x, 0.166666666666666740681535); + u = mla(u, s.x, 0.500000000000000999200722); + + t = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(ddsqu_d2_d2(s), u)); + + t = ddadd_d2_d_d2(1, t); + return ddscale_d2_d2_d(t, pow2i(q)); + } + + /** + Returns the hyperbolic sine of x. The results may have maximum + error of 2 ulps. + */ + public static double sinh(double x) { + double y = fabs(x); + double2 d = expk2(new double2(y, 0)); + d = ddsub_d2_d2_d2(d, ddrec_d2_d2(d)); + y = (d.x + d.y) * 0.5; + + y = abs(x) > 710 ? Double.POSITIVE_INFINITY : y; + y = isnan(y) ? Double.POSITIVE_INFINITY : y; + y = mulsign(y, x); + y = isnan(x) ? Double.NaN : y; + + return y; + } + + /** + Returns the hyperbolic cosine of x. The results may have + maximum error of 2 ulps. + */ + public static double cosh(double x) { + double y = fabs(x); + double2 d = expk2(new double2(y, 0)); + d = ddadd_d2_d2_d2(d, ddrec_d2_d2(d)); + y = (d.x + d.y) * 0.5; + + y = abs(x) > 710 ? Double.POSITIVE_INFINITY : y; + y = isnan(y) ? Double.POSITIVE_INFINITY : y; + y = isnan(x) ? Double.NaN : y; + + return y; + } + + /** + Returns the hyperbolic tangent of x. The results may have + maximum error of 2 ulps. 
+ */ + public static double tanh(double x) { + double y = fabs(x); + double2 d = expk2(new double2(y, 0)); + double2 e = dddiv_d2_d2_d2(new double2(1, 0), d); + d = dddiv_d2_d2_d2(ddadd2_d2_d2_d2(d, ddscale_d2_d2_d(e, -1)), ddadd2_d2_d2_d2(d, e)); + y = d.x + d.y; + + y = abs(x) > 18.714973875 ? 1.0 : y; + y = isnan(y) ? 1.0 : y; + y = mulsign(y, x); + y = isnan(x) ? Double.NaN : y; + + return y; + } + + static double2 logk2(double2 d) { + double2 x, x2, m; + double t; + int e; + + e = ilogbp1(d.x * 0.7071); + m = ddscale_d2_d2_d(d, pow2i(-e)); + + x = dddiv_d2_d2_d2(ddadd2_d2_d2_d(m, -1), ddadd2_d2_d2_d(m, 1)); + x2 = ddsqu_d2_d2(x); + + t = 0.134601987501262130076155; + t = mla(t, x2.x, 0.132248509032032670243288); + t = mla(t, x2.x, 0.153883458318096079652524); + t = mla(t, x2.x, 0.181817427573705403298686); + t = mla(t, x2.x, 0.222222231326187414840781); + t = mla(t, x2.x, 0.285714285651261412873718); + t = mla(t, x2.x, 0.400000000000222439910458); + t = mla(t, x2.x, 0.666666666666666371239645); + + return ddadd2_d2_d2_d2(ddmul_d2_d2_d(new double2(0.693147180559945286226764, 2.319046813846299558417771e-17), e), + ddadd2_d2_d2_d2(ddscale_d2_d2_d(x, 2), ddmul_d2_d2_d(ddmul_d2_d2_d2(x2, x), t))); + } + + /** + Returns the inverse hyperbolic sine of x. The results may have + maximum error of 2 ulps. + */ + public static double asinh(double x) { + double y = fabs(x); + double2 d = logk2(ddadd2_d2_d2_d(ddsqrt_d2_d2(ddadd2_d2_d2_d(ddmul_d2_d_d(y, y), 1)), y)); + y = d.x + d.y; + + y = isinf(x) || isnan(y) ? Double.POSITIVE_INFINITY : y; + y = mulsign(y, x); + y = isnan(x) ? Double.NaN : y; + + return y; + } + + /** + Returns the inverse hyperbolic cosine of x. The results may + have maximum error of 2 ulps. + */ + public static double acosh(double x) { + double2 d = logk2(ddadd2_d2_d2_d(ddsqrt_d2_d2(ddadd2_d2_d2_d(ddmul_d2_d_d(x, x), -1)), x)); + double y = d.x + d.y; + + y = isinf(x) || isnan(y) ? Double.POSITIVE_INFINITY : y; + y = x == 1.0 ? 
0.0 : y; + y = x < 1.0 ? Double.NaN : y; + y = isnan(x) ? Double.NaN : y; + + return y; + } + + /** + Returns the inverse hyperbolic tangent of x. The results may + have maximum error of 2 ulps. + */ + public static double atanh(double x) { + double y = fabs(x); + double2 d = logk2(dddiv_d2_d2_d2(ddadd2_d2_d_d(1, y), ddadd2_d2_d_d(1, -y))); + y = y > 1.0 ? Double.NaN : (y == 1.0 ? Double.POSITIVE_INFINITY : (d.x + d.y) * 0.5); + + y = isinf(x) || isnan(y) ? Double.NaN : y; + y = mulsign(y, x); + y = isnan(x) ? Double.NaN : y; + + return y; + } + + /** + This function performs a fused multiply-accumulate + operation. This function computes x*y+z, with a single + rounding. This implementation gives the exact result unless an + overflow occurs. + */ + public static double fma(double x, double y, double z) { + double xh = Double.longBitsToDouble((Double.doubleToRawLongBits(x) + 0x4000000) & 0xfffffffff8000000L), xl = x - xh; + double yh = Double.longBitsToDouble((Double.doubleToRawLongBits(y) + 0x4000000) & 0xfffffffff8000000L), yl = y - yh; + + double h = x * y; + double l = xh * yh - h + xl * yh + xh * yl + xl * yl; + + double h2, l2, v; + + h2 = h + z; + v = h2 - h; + l2 = (h - (h2 - v)) + (z - v) + l; + + return h2 + l2; + } + + /** + This function returns the square root of the argument. This + implementation gives the exact result(less than or equal to 0.5 + ulp of error). + */ + public static double sqrt(double d) { + double q = 1; + + if (d < 8.636168555094445E-78) { + d *= 1.157920892373162E77; + q = 2.9387358770557188E-39; + } + + // http://en.wikipedia.org/wiki/Fast_inverse_square_root + double x = Double.longBitsToDouble(0x5fe6ec85e7de30daL - (Double.doubleToRawLongBits(d + 1e-320) >> 1)); + + x = x * (1.5 - 0.5 * d * x * x); + x = x * (1.5 - 0.5 * d * x * x); + x = x * (1.5 - 0.5 * d * x * x); + + x = fma(d * x, d * x, -d) * (x * -0.5) + d * x; + + return d == Double.POSITIVE_INFINITY ? 
Double.POSITIVE_INFINITY : x * q; + } + + /** + This function returns the cube root of the argument. The + results may have maximum error of 2 ulps. + */ + public static double cbrt(double d) { + double x, y, q = 1.0; + int e, r; + + e = ilogbp1(d); + d = ldexp(d, -e); + r = (e + 6144) % 3; + q = (r == 1) ? 1.2599210498948731647672106 : q; + q = (r == 2) ? 1.5874010519681994747517056 : q; + q = ldexp(q, (e + 6144) / 3 - 2048); + + q = mulsign(q, d); + d = fabs(d); + + x = -0.640245898480692909870982; + x = x * d + 2.96155103020039511818595; + x = x * d + -5.73353060922947843636166; + x = x * d + 6.03990368989458747961407; + x = x * d + -3.85841935510444988821632; + x = x * d + 2.2307275302496609725722; + + y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0); + y = d * x * x; + y = (y - (2.0 / 3.0) * y * (y * x - 1)) * q; + + return y; + } + + /** + Returns the value of 2 raised to the power of the argument. The + results may have maximum error of 1 ulp. + */ + public static double exp2(double a) { + double u = expk(ddmul_d2_d2_d(new double2(0.69314718055994528623, 2.3190468138462995584e-17), a)); + if (a > 1023) u = Double.POSITIVE_INFINITY; + if (isminf(a)) u = 0; + return u; + } + + /** + Returns the value of 10 raised to the power of the + argument. The results may have maximum error of 1 ulp. + */ + public static double exp10(double a) { + double u = expk(ddmul_d2_d2_d(new double2(2.3025850929940459011, -2.1707562233822493508e-16), a)); + if (a > 308) u = Double.POSITIVE_INFINITY; + if (isminf(a)) u = 0; + return u; + } + + /** + Returns a value equivalent to exp(a)-1. The result is accurate + even when the value of a is close to zero. The results may have + maximum error of 1 ulp. 
+ */ + public static double expm1(double a) { + double2 d = ddadd2_d2_d2_d(expk2(new double2(a, 0)), -1.0); + double x = d.x + d.y; + if (a > 700) x = Double.POSITIVE_INFINITY; + if (a < -0.36043653389117156089696070315825181539851971360337e+2) x = -1; + return x; + } + + /** + Returns the base 10 logarithm of the argument. The results may + have maximum error of 1 ulp. + */ + public static double log10(double a) { + double2 d = ddmul_d2_d2_d2(logk(a), new double2(0.43429448190325176116, 6.6494347733425473126e-17)); + double x = d.x + d.y; + + if (ispinf(a)) x = Double.POSITIVE_INFINITY; + if (a < 0) x = Double.NaN; + if (a == 0) x = -Double.POSITIVE_INFINITY; + + return x; + } + + /** + Returns a value equivalent to log(1+a). The result is accurate + even when the value of a is close to zero. The results may have + maximum error of 1 ulp. + */ + public static double log1p(double a) { + double2 d = logk2(ddadd2_d2_d_d(a, 1)); + double x = d.x + d.y; + + if (ispinf(a)) x = Double.POSITIVE_INFINITY; + if (a < -1) x = Double.NaN; + if (a == -1) x = -Double.POSITIVE_INFINITY; + + return x; + } + + // + + /** + This class represents a vector of two float values. 
+ */ + public static class float2 { + public float x, y; + public float2() {} + public float2(float x, float y) { this.x = x; this.y = y; } + + public String toString() { + return "(float2:" + x + " + " + y + ")"; + } + } + + private static final float PI4_Af = 0.78515625f; + private static final float PI4_Bf = 0.00024187564849853515625f; + private static final float PI4_Cf = 3.7747668102383613586e-08f; + private static final float PI4_Df = 1.2816720341285448015e-12f; + + private static final float L2Uf = 0.693145751953125f; + private static final float L2Lf = 1.428606765330187045e-06f; + + private static final float R_LN2f = 1.442695040888963407359924681001892137426645954152985934135449406931f; + private static final float M_PIf = ((float)Math.PI); + + private static final float INFINITYf = Float.POSITIVE_INFINITY; + private static final float NANf = Float.NaN; + + private static float mlaf(float x, float y, float z) { return x * y + z; } + private static float mulsignf(float x, float y) { return (float)(Math.copySign(1, y) * x); } + private static float signf(float d) { return (float)Math.copySign(1, d); } + + private static float sqrtf(float f) { return (float)Math.sqrt(f); } + + private static float fabsf(float d) { return (float)Math.copySign(d, 1); } + private static float maxf(float x, float y) { return x > y ? x : y; } + private static boolean isnanf(float d) { return d != d; } + private static boolean isinff(float d) { return fabs(d) == Float.POSITIVE_INFINITY; } + + private static boolean ispinff(float d) { return d == Float.POSITIVE_INFINITY; } + private static boolean isminff(float d) { return d == Float.NEGATIVE_INFINITY; } + private static float rintf(float x) { return x < 0 ? (int)(x - 0.5f) : (int)(x + 0.5f); } + + static int floatToRawIntBits(float d) { return Float.floatToRawIntBits(d); } + static float intBitsToFloat(int i) { return Float.intBitsToFloat(i); } + + static int ilogbp1f(float d) { + boolean m = d < 5.421010862427522E-20f; + d = m ? 
1.8446744073709552E19f * d : d; + int q = (floatToRawIntBits(d) >> 23) & 0xff; + q = m ? q - (64 + 0x7e) : q - 0x7e; + return q; + } + + static float pow2if(int q) { + return intBitsToFloat(((int)(q + 0x7f)) << 23); + } + + public static float ldexpf(float x, int q) { + float u; + int m; + m = q >> 31; + m = (((m + q) >> 6) - m) << 4; + q = q - (m << 2); + m += 127; + m = m < 0 ? 0 : m; + m = m > 255 ? 255 : m; + u = intBitsToFloat(((int)m) << 23); + x = x * u * u * u * u; + u = intBitsToFloat(((int)(q + 0x7f)) << 23); + return x * u; + } + + static float upperf(float d) { + return intBitsToFloat(floatToRawIntBits(d) & 0xfffff000); + } + + static float2 df(float h, float l) { + float2 ret = new float2(); + ret.x = h; ret.y = l; + return ret; + } + + static float2 dfnormalize_f2_f2(float2 t) { + float2 s = new float2(); + + s.x = t.x + t.y; + s.y = t.x - s.x + t.y; + + return s; + } + + static float2 dfscale_f2_f2_f(float2 d, float s) { + float2 r = new float2(); + + r.x = d.x * s; + r.y = d.y * s; + + return r; + } + + static float2 dfadd2_f2_f_f(float x, float y) { + float2 r = new float2(); + + r.x = x + y; + float v = r.x - x; + r.y = (x - (r.x - v)) + (y - v); + + return r; + } + + static float2 dfadd_f2_f2_f(float2 x, float y) { + // |x| >= |y| + + float2 r = new float2(); + + r.x = x.x + y; + r.y = x.x - r.x + y + x.y; + + return r; + } + + static float2 dfadd2_f2_f2_f(float2 x, float y) { + // |x| >= |y| + + float2 r = new float2(); + + r.x = x.x + y; + float v = r.x - x.x; + r.y = (x.x - (r.x - v)) + (y - v); + r.y += x.y; + + return r; + } + + static float2 dfadd_f2_f_f2(float x, float2 y) { + // |x| >= |y| + + float2 r = new float2(); + + r.x = x + y.x; + r.y = x - r.x + y.x + y.y; + + return r; + } + + static float2 dfadd_f2_f2_f2(float2 x, float2 y) { + // |x| >= |y| + + float2 r = new float2(); + + r.x = x.x + y.x; + r.y = x.x - r.x + y.x + x.y + y.y; + + return r; + } + + static float2 dfadd2_f2_f2_f2(float2 x, float2 y) { + float2 r = new float2(); + 
+ r.x = x.x + y.x; + float v = r.x - x.x; + r.y = (x.x - (r.x - v)) + (y.x - v); + r.y += x.y + y.y; + + return r; + } + + static float2 dfsub_f2_f2_f2(float2 x, float2 y) { + // |x| >= |y| + + float2 r = new float2(); + + r.x = x.x - y.x; + r.y = x.x - r.x - y.x + x.y - y.y; + + return r; + } + + static float2 dfdiv_f2_f2_f2(float2 n, float2 d) { + float t = 1.0f / d.x; + float dh = upperf(d.x), dl = d.x - dh; + float th = upperf(t ), tl = t - th; + float nhh = upperf(n.x), nhl = n.x - nhh; + + float2 q = new float2(); + + q.x = n.x * t; + + float u = -q.x + nhh * th + nhh * tl + nhl * th + nhl * tl + + q.x * (1 - dh * th - dh * tl - dl * th - dl * tl); + + q.y = t * (n.y - q.x * d.y) + u; + + return q; + } + + static float2 dfmul_f2_f_f(float x, float y) { + float xh = upperf(x), xl = x - xh; + float yh = upperf(y), yl = y - yh; + float2 r = new float2(); + + r.x = x * y; + r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl; + + return r; + } + + static float2 dfmul_f2_f2_f(float2 x, float y) { + float xh = upperf(x.x), xl = x.x - xh; + float yh = upperf(y ), yl = y - yh; + float2 r = new float2(); + + r.x = x.x * y; + r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.y * y; + + return r; + } + + static float2 dfmul_f2_f2_f2(float2 x, float2 y) { + float xh = upperf(x.x), xl = x.x - xh; + float yh = upperf(y.x), yl = y.x - yh; + float2 r = new float2(); + + r.x = x.x * y.x; + r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.x * y.y + x.y * y.x; + + return r; + } + + static float2 dfsqu_f2_f2(float2 x) { + float xh = upperf(x.x), xl = x.x - xh; + float2 r = new float2(); + + r.x = x.x * x.x; + r.y = xh * xh - r.x + (xh + xh) * xl + xl * xl + x.x * (x.y + x.y); + + return r; + } + + static float2 dfrec_f2_f(float d) { + float t = 1.0f / d; + float dh = upperf(d), dl = d - dh; + float th = upperf(t), tl = t - th; + float2 q = new float2(); + + q.x = t; + q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl); + + return q; + } + + static float2 
dfrec_f2_f2(float2 d) { + float t = 1.0f / d.x; + float dh = upperf(d.x), dl = d.x - dh; + float th = upperf(t ), tl = t - th; + float2 q = new float2(); + + q.x = t; + q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl - d.y * t); + + return q; + } + + static float2 dfsqrt_f2_f2(float2 d) { + float t = sqrtf(d.x + d.y); + return dfscale_f2_f2_f(dfmul_f2_f2_f2(dfadd2_f2_f2_f2(d, dfmul_f2_f_f(t, t)), dfrec_f2_f(t)), 0.5f); + } + + /** + This function returns the cube root of the argument in single + precision. The results may have maximum error of 2 ulps. + */ + public static float cbrtf(float d) { + float x, y, q = 1.0f; + int e, r; + + e = ilogbp1f(d); + d = ldexpf(d, -e); + r = (e + 6144) % 3; + q = (r == 1) ? 1.2599210498948731647672106f : q; + q = (r == 2) ? 1.5874010519681994747517056f : q; + q = ldexpf(q, (e + 6144) / 3 - 2048); + + q = mulsignf(q, d); + d = fabsf(d); + + x = -0.601564466953277587890625f; + x = mlaf(x, d, 2.8208892345428466796875f); + x = mlaf(x, d, -5.532182216644287109375f); + x = mlaf(x, d, 5.898262500762939453125f); + x = mlaf(x, d, -3.8095417022705078125f); + x = mlaf(x, d, 2.2241256237030029296875f); + + y = d * x * x; + y = (y - (2.0f / 3.0f) * y * (y * x - 1.0f)) * q; + + return y; + } + + /** + Returns the trigonometric sine of an angle in single + precision. The results may have maximum error of 3 ulps. + */ + public static float sinf(float d) { + int q; + float u, s; + + q = (int)rintf(d * (float)M_1_PI); + + d = mlaf(q, -PI4_Af*4, d); + d = mlaf(q, -PI4_Bf*4, d); + d = mlaf(q, -PI4_Cf*4, d); + d = mlaf(q, -PI4_Df*4, d); + + s = d * d; + + if ((q & 1) != 0) d = -d; + + u = 2.6083159809786593541503e-06f; + u = mlaf(u, s, -0.0001981069071916863322258f); + u = mlaf(u, s, 0.00833307858556509017944336f); + u = mlaf(u, s, -0.166666597127914428710938f); + + u = mlaf(s, u * d, d); + + if (isinff(d)) { u = NANf; } + + return u; + } + + /** + Returns the trigonometric cosine of an angle in single + precision. 
The results may have maximum error of 3 ulps. + */ + public static float cosf(float d) { + int q; + float u, s; + + q = 1 + 2*(int)rintf(d * (float)M_1_PI - 0.5f); + + d = mlaf(q, -PI4_Af*2, d); + d = mlaf(q, -PI4_Bf*2, d); + d = mlaf(q, -PI4_Cf*2, d); + d = mlaf(q, -PI4_Df*2, d); + + s = d * d; + + if ((q & 2) == 0) d = -d; + + u = 2.6083159809786593541503e-06f; + u = mlaf(u, s, -0.0001981069071916863322258f); + u = mlaf(u, s, 0.00833307858556509017944336f); + u = mlaf(u, s, -0.166666597127914428710938f); + + u = mlaf(s, u * d, d); + + if (isinff(d)) { u = NANf; } + + return u; + } + + /** + Returns the trigonometric sine and cosine of an angle in single + precision at a time. The sine and cosine of an argument is + returned by the x and y field of the return value, + respectively. The results may have maximum error of 3 ulps. + */ + public static float2 sincosf(float d) { + int q; + float u, s, t; + float2 r = new float2(); + + q = (int)rintf(d * ((float)(2 * M_1_PI))); + + s = d; + + s = mlaf(q, -PI4_Af*2, s); + s = mlaf(q, -PI4_Bf*2, s); + s = mlaf(q, -PI4_Cf*2, s); + s = mlaf(q, -PI4_Df*2, s); + + t = s; + + s = s * s; + + u = -0.000195169282960705459117889f; + u = mlaf(u, s, 0.00833215750753879547119141f); + u = mlaf(u, s, -0.166666537523269653320312f); + u = u * s * t; + + r.x = t + u; + + u = -2.71811842367242206819355e-07f; + u = mlaf(u, s, 2.47990446951007470488548e-05f); + u = mlaf(u, s, -0.00138888787478208541870117f); + u = mlaf(u, s, 0.0416666641831398010253906f); + u = mlaf(u, s, -0.5f); + + r.y = u * s + 1; + + if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; } + if ((q & 2) != 0) { r.x = -r.x; } + if (((q+1) & 2) != 0) { r.y = -r.y; } + + if (isinff(d)) { r.x = r.y = NANf; } + + return r; + } + + /** + Returns the trigonometric tangent of an angle in single + precision. The results may have maximum error of 4 ulps. 
+ */ + public static float tanf(float d) { + int q; + float u, s, x; + + q = (int)rintf(d * (float)(2 * M_1_PI)); + + x = d; + + x = mlaf(q, -PI4_Af*2, x); + x = mlaf(q, -PI4_Bf*2, x); + x = mlaf(q, -PI4_Cf*2, x); + x = mlaf(q, -PI4_Df*2, x); + + s = x * x; + + if ((q & 1) != 0) x = -x; + + u = 0.00927245803177356719970703f; + u = mlaf(u, s, 0.00331984995864331722259521f); + u = mlaf(u, s, 0.0242998078465461730957031f); + u = mlaf(u, s, 0.0534495301544666290283203f); + u = mlaf(u, s, 0.133383005857467651367188f); + u = mlaf(u, s, 0.333331853151321411132812f); + + u = mlaf(s, u * x, x); + + if ((q & 1) != 0) u = 1.0f / u; + + if (isinff(d)) u = NANf; + + return u; + } + + /** + Returns the arc tangent of an angle in single precision. The + results may have maximum error of 3 ulps. + */ + public static float atanf(float s) { + float t, u; + int q = 0; + + if (s < 0) { s = -s; q = 2; } + if (s > 1) { s = 1.0f / s; q |= 1; } + + t = s * s; + + u = 0.00282363896258175373077393f; + u = mlaf(u, t, -0.0159569028764963150024414f); + u = mlaf(u, t, 0.0425049886107444763183594f); + u = mlaf(u, t, -0.0748900920152664184570312f); + u = mlaf(u, t, 0.106347933411598205566406f); + u = mlaf(u, t, -0.142027363181114196777344f); + u = mlaf(u, t, 0.199926957488059997558594f); + u = mlaf(u, t, -0.333331018686294555664062f); + + t = s + s * (t * u); + + if ((q & 1) != 0) t = 1.570796326794896557998982f - t; + if ((q & 2) != 0) t = -t; + + return t; + } + + private static float atan2kf(float y, float x) { + float s, t, u; + int q = 0; + + if (x < 0) { x = -x; q = -2; } + if (y > x) { t = x; x = y; y = -t; q += 1; } + + s = y / x; + t = s * s; + + u = 0.00282363896258175373077393f; + u = mlaf(u, t, -0.0159569028764963150024414f); + u = mlaf(u, t, 0.0425049886107444763183594f); + u = mlaf(u, t, -0.0748900920152664184570312f); + u = mlaf(u, t, 0.106347933411598205566406f); + u = mlaf(u, t, -0.142027363181114196777344f); + u = mlaf(u, t, 0.199926957488059997558594f); + u = mlaf(u, t, 
-0.333331018686294555664062f); + + t = u * t * s + s; + t = q * (float)(M_PIf/2) + t; + + return t; + } + + /** + This method calculates the arc tangent of y/x in single + precision. It uses the signs of the two arguments to determine + the quadrant of the result. The results may have maximum error + of 3 ulps. + */ + public static float atan2f(float y, float x) { + float r = atan2kf(fabsf(y), x); + + r = mulsignf(r, x); + if (isinff(x) || x == 0) r = M_PIf/2 - (isinff(x) ? (signf(x) * (float)(M_PIf /2)) : 0); + if (isinff(y) ) r = M_PIf/2 - (isinff(x) ? (signf(x) * (float)(M_PIf*1/4)) : 0); + if ( y == 0) r = (signf(x) == -1 ? M_PIf : 0); + + return isnanf(x) || isnanf(y) ? NANf : mulsignf(r, y); + } + + /** + This method calculates the arc sine of x in single + precision. The results may have maximum error of 3 ulps. + */ + public static float asinf(float d) { + return mulsignf(atan2kf(fabsf(d), sqrtf((1.0f+d)*(1.0f-d))), d); + } + + /** + This method calculates the arc cosine of x in single + precision. The results may have maximum error of 3 ulps. + */ + public static float acosf(float d) { + return mulsignf(atan2kf(sqrtf((1.0f+d)*(1.0f-d)), fabsf(d)), d) + (d < 0 ? (float)M_PIf : 0.0f); + } + + /** + Returns the natural logarithm of the argument in single + precision. The results may have maximum error of 3 ulps. + */ + public static float logf(float d) { + float x, x2, t, m; + int e; + + e = ilogbp1f(d * 0.7071f); + m = ldexpf(d, -e); + + x = (m-1.0f) / (m+1.0f); + x2 = x * x; + + t = 0.2371599674224853515625f; + t = mlaf(t, x2, 0.285279005765914916992188f); + t = mlaf(t, x2, 0.400005519390106201171875f); + t = mlaf(t, x2, 0.666666567325592041015625f); + t = mlaf(t, x2, 2.0f); + + x = x * t + 0.693147180559945286226764f * e; + + if (isinff(d)) x = INFINITYf; + if (d < 0) x = NANf; + if (d == 0) x = -INFINITYf; + + return x; + } + + /** + Returns the value of e raised to the power of the argument in + single precision. 
The results may have maximum error of 1 ulps. + */ + public static float expf(float d) { + int q = (int)rintf(d * R_LN2f); + float s, u; + + s = mlaf(q, -L2Uf, d); + s = mlaf(q, -L2Lf, s); + + u = 0.00136324646882712841033936f; + u = mlaf(u, s, 0.00836596917361021041870117f); + u = mlaf(u, s, 0.0416710823774337768554688f); + u = mlaf(u, s, 0.166665524244308471679688f); + u = mlaf(u, s, 0.499999850988388061523438f); + + u = s * s * u + s + 1.0f; + u = ldexpf(u, q); + + if (isminff(d)) u = 0; + + return u; + } + + static float expkf(float2 d) { + int q = (int)rintf((d.x + d.y) * R_LN2f); + float2 s, t; + float u; + + s = dfadd2_f2_f2_f(d, q * -L2Uf); + s = dfadd2_f2_f2_f(s, q * -L2Lf); + + s = dfnormalize_f2_f2(s); + + u = 0.00136324646882712841033936f; + u = mlaf(u, s.x, 0.00836596917361021041870117f); + u = mlaf(u, s.x, 0.0416710823774337768554688f); + u = mlaf(u, s.x, 0.166665524244308471679688f); + u = mlaf(u, s.x, 0.499999850988388061523438f); + + t = dfadd_f2_f2_f2(s, dfmul_f2_f2_f(dfsqu_f2_f2(s), u)); + + t = dfadd_f2_f_f2(1, t); + return ldexpf(t.x + t.y, q); + } + + static float2 logkf(float d) { + float2 x, x2; + float m, t; + int e; + + e = ilogbp1f(d * 0.7071f); + m = ldexpf(d, -e); + + x = dfdiv_f2_f2_f2(dfadd2_f2_f_f(-1, m), dfadd2_f2_f_f(1, m)); + x2 = dfsqu_f2_f2(x); + + t = 0.2371599674224853515625f; + t = mlaf(t, x2.x, 0.285279005765914916992188f); + t = mlaf(t, x2.x, 0.400005519390106201171875f); + t = mlaf(t, x2.x, 0.666666567325592041015625f); + + return dfadd2_f2_f2_f2(dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), e), + dfadd2_f2_f2_f2(dfscale_f2_f2_f(x, 2), dfmul_f2_f2_f(dfmul_f2_f2_f2(x2, x), t))); + } + + public static float powf(float x, float y) { + boolean yisint = (int)y == y; + boolean yisodd = (1 & (int)y) != 0 && yisint; + + float result = expkf(dfmul_f2_f2_f(logkf(fabsf(x)), y)); + + result = isnanf(result) ? INFINITYf : result; + result *= (x >= 0 ? 1 : (!yisint ? NANf : (yisodd ? 
-1 : 1))); + + float efx = mulsignf(fabsf(x) - 1, y); + if (isinff(y)) result = efx < 0 ? 0.0f : (efx == 0 ? 1.0f : INFINITYf); + if (isinff(x) || x == 0) result = (yisodd ? signf(x) : 1) * ((x == 0 ? -y : y) < 0 ? 0 : INFINITYf); + if (isnanf(x) || isnanf(y)) result = NANf; + if (y == 0 || x == 1) result = 1; + + return result; + } + + static float2 expk2f(float2 d) { + int q = (int)rintf((d.x + d.y) * R_LN2f); + float2 s, t; + float u; + + s = dfadd2_f2_f2_f(d, q * -L2Uf); + s = dfadd2_f2_f2_f(s, q * -L2Lf); + + s = dfnormalize_f2_f2(s); + + u = 0.00136324646882712841033936f; + u = mlaf(u, s.x, 0.00836596917361021041870117f); + u = mlaf(u, s.x, 0.0416710823774337768554688f); + u = mlaf(u, s.x, 0.166665524244308471679688f); + u = mlaf(u, s.x, 0.499999850988388061523438f); + + t = dfadd_f2_f2_f2(s, dfmul_f2_f2_f(dfsqu_f2_f2(s), u)); + + t = dfadd_f2_f_f2(1, t); + return dfscale_f2_f2_f(t, pow2if(q)); + } + + public static float sinhf(float x) { + float y = fabsf(x); + float2 d = expk2f(df(y, 0)); + d = dfsub_f2_f2_f2(d, dfrec_f2_f2(d)); + y = (d.x + d.y) * 0.5f; + + y = fabsf(x) > 89 ? INFINITYf : y; + y = isnanf(y) ? INFINITYf : y; + y = mulsignf(y, x); + y = isnanf(x) ? NANf : y; + + return y; + } + + public static float coshf(float x) { + float y = fabsf(x); + float2 d = expk2f(df(y, 0)); + d = dfadd_f2_f2_f2(d, dfrec_f2_f2(d)); + y = (d.x + d.y) * 0.5f; + + y = fabsf(x) > 89 ? INFINITYf : y; + y = isnanf(y) ? INFINITYf : y; + y = isnanf(x) ? NANf : y; + + return y; + } + + public static float tanhf(float x) { + float y = fabsf(x); + float2 d = expk2f(df(y, 0)); + float2 e = dfdiv_f2_f2_f2(df(1, 0), d); + d = dfdiv_f2_f2_f2(dfadd2_f2_f2_f2(d, dfscale_f2_f2_f(e, -1)), dfadd2_f2_f2_f2(d, e)); + y = d.x + d.y; + + y = fabsf(x) > 8.664339742f ? 1.0f : y; + y = isnanf(y) ? 1.0f : y; + y = mulsignf(y, x); + y = isnanf(x) ? 
NANf : y; + + return y; + } + + static float2 logk2f(float2 d) { + float2 x, x2, m; + float t; + int e; + + e = ilogbp1f(d.x * 0.7071f); + m = dfscale_f2_f2_f(d, pow2if(-e)); + + x = dfdiv_f2_f2_f2(dfadd2_f2_f2_f(m, -1), dfadd2_f2_f2_f(m, 1)); + x2 = dfsqu_f2_f2(x); + + t = 0.2371599674224853515625f; + t = mlaf(t, x2.x, 0.285279005765914916992188f); + t = mlaf(t, x2.x, 0.400005519390106201171875f); + t = mlaf(t, x2.x, 0.666666567325592041015625f); + + return dfadd2_f2_f2_f2(dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), e), + dfadd2_f2_f2_f2(dfscale_f2_f2_f(x, 2), dfmul_f2_f2_f(dfmul_f2_f2_f2(x2, x), t))); + } + + public static float asinhf(float x) { + float y = fabsf(x); + float2 d = logk2f(dfadd2_f2_f2_f(dfsqrt_f2_f2(dfadd2_f2_f2_f(dfmul_f2_f_f(y, y), 1)), y)); + y = d.x + d.y; + + y = isinff(x) || isnanf(y) ? INFINITYf : y; + y = mulsignf(y, x); + y = isnanf(x) ? NANf : y; + + return y; + } + + public static float acoshf(float x) { + float2 d = logk2f(dfadd2_f2_f2_f(dfsqrt_f2_f2(dfadd2_f2_f2_f(dfmul_f2_f_f(x, x), -1)), x)); + float y = d.x + d.y; + + y = isinff(x) || isnanf(y) ? INFINITYf : y; + y = x == 1.0f ? 0.0f : y; + y = x < 1.0f ? NANf : y; + y = isnanf(x) ? NANf : y; + + return y; + } + + public static float atanhf(float x) { + float y = fabsf(x); + float2 d = logk2f(dfdiv_f2_f2_f2(dfadd2_f2_f_f(1, y), dfadd2_f2_f_f(1, -y))); + y = y > 1.0 ? NANf : (y == 1.0 ? INFINITYf : (d.x + d.y) * 0.5f); + + y = isinff(x) || isnanf(y) ? NANf : y; + y = mulsignf(y, x); + y = isnanf(x) ? 
NANf : y; + + return y; + } + + public static float exp2f(float a) { + float u = expkf(dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), a)); + if (ispinff(a)) u = INFINITYf; + if (isminff(a)) u = 0; + return u; + } + + public static float exp10f(float a) { + float u = expkf(dfmul_f2_f2_f(df(2.3025851249694824219f, -3.1975436520781386207e-08f), a)); + if (ispinff(a)) u = INFINITYf; + if (isminff(a)) u = 0; + return u; + } + + public static float expm1f(float a) { + float2 d = dfadd2_f2_f2_f(expk2f(df(a, 0)), -1.0f); + float x = d.x + d.y; + if (a > 88.0f) x = INFINITYf; + if (a < -0.15942385152878742116596338793538061065739925620174e+2f) x = -1; + return x; + } + + public static float log10f(float a) { + float2 d = dfmul_f2_f2_f2(logkf(a), df(0.43429449200630187988f, -1.0103050118726031315e-08f)); + float x = d.x + d.y; + + if (isinff(a)) x = INFINITYf; + if (a < 0) x = NANf; + if (a == 0) x = -INFINITYf; + + return x; + } + + public static float log1pf(float a) { + float2 d = logk2f(dfadd2_f2_f_f(a, 1)); + float x = d.x + d.y; + + if (isinff(a)) x = INFINITYf; + if (a < -1) x = NANf; + if (a == -1) x = -INFINITYf; + + return x; + } +} diff --git a/purec/Makefile b/purec/Makefile new file mode 100644 index 00000000..3dc6012e --- /dev/null +++ b/purec/Makefile @@ -0,0 +1,25 @@ +CC=gcc + +iut : sleefdp.c sleefsp.c iut.c + $(CC) -Wall -DNDEBUG sleefdp.c sleefsp.c iut.c -o iut -lm + +../tester/tester : + cd ../tester; make tester + +../tester/testeru1 : + cd ../tester; make testeru1 + +../tester/testersp : + cd ../tester; make testersp + +../tester/testerspu1 : + cd ../tester; make testerspu1 + +test : iut ../tester/tester ../tester/testeru1 ../tester/testersp ../tester/testerspu1 + ../tester/tester ./iut + ../tester/testeru1 ./iut + ../tester/testersp ./iut + ../tester/testerspu1 ./iut + +clean : + rm -f *~ *.o iut diff --git a/purec/Makefile.icc b/purec/Makefile.icc new file mode 100644 index 00000000..8d583734 --- /dev/null +++ 
b/purec/Makefile.icc @@ -0,0 +1,19 @@ +CC=/opt/intel/bin/icc + +iut : sleefdp.c sleefsp.c iut.c + $(CC) -Wall -fp-model precise sleefdp.c sleefsp.c iut.c -o iut -lm + +../tester/tester : + cd ../tester; make tester + +../tester/testersp : + cd ../tester; make testersp + +test : iut ../tester/tester + ../tester/tester ./iut + +testsp : iut ../tester/testersp + ../tester/testersp ./iut + +clean : + rm -f *~ *.o iut diff --git a/purec/iut.c b/purec/iut.c new file mode 100644 index 00000000..89c30b5e --- /dev/null +++ b/purec/iut.c @@ -0,0 +1,452 @@ +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include "sleef.h" + +/* Reads bytes from fd one at a time into buf until a newline has been stored or cnt minus 1 bytes have been read, then NUL-terminates. Returns the number of bytes stored (newline included), the result of read() (0 or -1) when a read delivers no byte, or -1 when cnt < 1. Never writes outside the first cnt bytes of buf. */ +int readln(int fd, char *buf, int cnt) { + int i, rcnt = 0; + + if (cnt < 1) return -1; + + while(cnt >= 2) { + i = read(fd, buf, 1); + if (i != 1) return i; + + /* Count and step past the byte just stored, the newline included. */ + rcnt++; + cnt--; + if (*buf++ == '\n') break; + } + + /* buf now points at the first unused slot, so the terminator stays inside the buffer and no uninitialised byte is left in the string. */ + *buf = '\0'; + return rcnt; +} + +int startsWith(char *str, char *prefix) { + return strncmp(str, prefix, strlen(prefix)) == 0; +} + +double u2d(uint64_t u) { + union { + double f; + uint64_t i; + } tmp; + tmp.i = u; + return tmp.f; +} + +uint64_t d2u(double d) { + union { + double f; + uint64_t i; + } tmp; + tmp.f = d; + return tmp.i; +} + +float u2f(uint32_t u) { + union { + float f; + uint32_t i; + } tmp; + tmp.i = u; + return tmp.f; +} + +uint32_t f2u(float d) { + union { + float f; + uint32_t i; + } tmp; + tmp.f = d; + return tmp.i; +} + +#define BUFSIZE 1024 + +int main(int argc, char **argv) { + char buf[BUFSIZE]; + + //fprintf(stderr, "IUT start\n"); + + for(;;) { + if (readln(STDIN_FILENO, buf, BUFSIZE-1) < 1) break; + + //fprintf(stderr, "iut: got %s\n", buf); + + if (startsWith(buf, "sin ")) { + uint64_t u; + sscanf(buf, "sin %" PRIx64, &u); + u = d2u(xsin(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "sin_u1 ")) { + uint64_t u; + sscanf(buf, "sin_u1 %" PRIx64, &u); + u = d2u(xsin_u1(u2d(u))); + printf("%" PRIx64 "\n", u); + } else 
if (startsWith(buf, "cos ")) { + uint64_t u; + sscanf(buf, "cos %" PRIx64, &u); + u = d2u(xcos(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "cos_u1 ")) { + uint64_t u; + sscanf(buf, "cos_u1 %" PRIx64, &u); + u = d2u(xcos_u1(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "sincos ")) { + uint64_t u; + sscanf(buf, "sincos %" PRIx64, &u); + double2 x = xsincos(u2d(u)); + printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y)); + } else if (startsWith(buf, "sincos_u1 ")) { + uint64_t u; + sscanf(buf, "sincos_u1 %" PRIx64, &u); + double2 x = xsincos_u1(u2d(u)); + printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y)); + } else if (startsWith(buf, "tan ")) { + uint64_t u; + sscanf(buf, "tan %" PRIx64, &u); + u = d2u(xtan(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "tan_u1 ")) { + uint64_t u; + sscanf(buf, "tan_u1 %" PRIx64, &u); + u = d2u(xtan_u1(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "asin ")) { + uint64_t u; + sscanf(buf, "asin %" PRIx64, &u); + u = d2u(xasin(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "acos ")) { + uint64_t u; + sscanf(buf, "acos %" PRIx64, &u); + u = d2u(xacos(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "atan ")) { + uint64_t u; + sscanf(buf, "atan %" PRIx64, &u); + u = d2u(xatan(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "log ")) { + uint64_t u; + sscanf(buf, "log %" PRIx64, &u); + u = d2u(xlog(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "exp ")) { + uint64_t u; + sscanf(buf, "exp %" PRIx64, &u); + u = d2u(xexp(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "atan2 ")) { + uint64_t u, v; + sscanf(buf, "atan2 %" PRIx64 " %" PRIx64, &u, &v); + u = d2u(xatan2(u2d(u), u2d(v))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "asin_u1 ")) { + uint64_t u; + sscanf(buf, "asin_u1 %" PRIx64, &u); + u = d2u(xasin_u1(u2d(u))); + 
printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "acos_u1 ")) { + uint64_t u; + sscanf(buf, "acos_u1 %" PRIx64, &u); + u = d2u(xacos_u1(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "atan_u1 ")) { + uint64_t u; + sscanf(buf, "atan_u1 %" PRIx64, &u); + u = d2u(xatan_u1(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "atan2_u1 ")) { + uint64_t u, v; + sscanf(buf, "atan2_u1 %" PRIx64 " %" PRIx64, &u, &v); + u = d2u(xatan2_u1(u2d(u), u2d(v))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "log_u1 ")) { + uint64_t u; + sscanf(buf, "log_u1 %" PRIx64, &u); + u = d2u(xlog_u1(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "pow ")) { + uint64_t u, v; + sscanf(buf, "pow %" PRIx64 " %" PRIx64, &u, &v); + u = d2u(xpow(u2d(u), u2d(v))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "sinh ")) { + uint64_t u; + sscanf(buf, "sinh %" PRIx64, &u); + u = d2u(xsinh(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "cosh ")) { + uint64_t u; + sscanf(buf, "cosh %" PRIx64, &u); + u = d2u(xcosh(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "tanh ")) { + uint64_t u; + sscanf(buf, "tanh %" PRIx64, &u); + u = d2u(xtanh(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "asinh ")) { + uint64_t u; + sscanf(buf, "asinh %" PRIx64, &u); + u = d2u(xasinh(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "acosh ")) { + uint64_t u; + sscanf(buf, "acosh %" PRIx64, &u); + u = d2u(xacosh(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "atanh ")) { + uint64_t u; + sscanf(buf, "atanh %" PRIx64, &u); + u = d2u(xatanh(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "fma ")) { + uint64_t u, v, w; + sscanf(buf, "fma %" PRIx64 " %" PRIx64 " %" PRIx64, &u, &v, &w); + u = d2u(xfma(u2d(u), u2d(v), u2d(w))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "sqrt ")) { + uint64_t u; + 
sscanf(buf, "sqrt %" PRIx64, &u); + u = d2u(xsqrt(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "cbrt ")) { + uint64_t u; + sscanf(buf, "cbrt %" PRIx64, &u); + u = d2u(xcbrt(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "cbrt_u1 ")) { + uint64_t u; + sscanf(buf, "cbrt_u1 %" PRIx64, &u); + u = d2u(xcbrt_u1(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "exp2 ")) { + uint64_t u; + sscanf(buf, "exp2 %" PRIx64, &u); + u = d2u(xexp2(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "exp10 ")) { + uint64_t u; + sscanf(buf, "exp10 %" PRIx64, &u); + u = d2u(xexp10(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "expm1 ")) { + uint64_t u; + sscanf(buf, "expm1 %" PRIx64, &u); + u = d2u(xexpm1(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "log10 ")) { + uint64_t u; + sscanf(buf, "log10 %" PRIx64, &u); + u = d2u(xlog10(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "log1p ")) { + uint64_t u; + sscanf(buf, "log1p %" PRIx64, &u); + u = d2u(xlog1p(u2d(u))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "ldexp ")) { + uint64_t u, v; + sscanf(buf, "ldexp %" PRIx64 " %" PRIx64, &u, &v); + u = d2u(xldexp(u2d(u), (int)u2d(v))); + printf("%" PRIx64 "\n", u); + } else if (startsWith(buf, "sinf ")) { + uint32_t u; + sscanf(buf, "sinf %x", &u); + u = f2u(xsinf(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "cosf ")) { + uint32_t u; + sscanf(buf, "cosf %x", &u); + u = f2u(xcosf(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "sincosf ")) { + uint32_t u; + sscanf(buf, "sincosf %x", &u); + float2 x = xsincosf(u2f(u)); + printf("%x %x\n", f2u(x.x), f2u(x.y)); + } else if (startsWith(buf, "tanf ")) { + uint32_t u; + sscanf(buf, "tanf %x", &u); + u = f2u(xtanf(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "asinf ")) { + uint32_t u; + sscanf(buf, "asinf %x", &u); + u = f2u(xasinf(u2f(u))); + 
printf("%x\n", u); + } else if (startsWith(buf, "acosf ")) { + uint32_t u; + sscanf(buf, "acosf %x", &u); + u = f2u(xacosf(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "atanf ")) { + uint32_t u; + sscanf(buf, "atanf %x", &u); + u = f2u(xatanf(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "atan2f ")) { + uint32_t u, v; + sscanf(buf, "atan2f %x %x", &u, &v); + u = f2u(xatan2f(u2f(u), u2f(v))); + printf("%x\n", u); + } else if (startsWith(buf, "logf ")) { + uint32_t u; + sscanf(buf, "logf %x", &u); + u = f2u(xlogf(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "expf ")) { + uint32_t u; + sscanf(buf, "expf %x", &u); + u = f2u(xexpf(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "cbrtf ")) { + uint32_t u; + sscanf(buf, "cbrtf %x", &u); + u = f2u(xcbrtf(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "sqrtf ")) { + uint32_t u; + sscanf(buf, "sqrtf %x", &u); + u = f2u(sqrt(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "ldexpf ")) { + uint32_t u, v; + sscanf(buf, "ldexpf %x %x", &u, &v); + u = f2u(xldexpf(u2f(u), (int)u2f(v))); + printf("%x\n", u); + } else if (startsWith(buf, "powf ")) { + uint32_t u, v; + sscanf(buf, "powf %x %x", &u, &v); + u = f2u(xpowf(u2f(u), u2f(v))); + printf("%x\n", u); + } else if (startsWith(buf, "sinhf ")) { + uint32_t u; + sscanf(buf, "sinhf %x", &u); + u = f2u(xsinhf(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "coshf ")) { + uint32_t u; + sscanf(buf, "coshf %x", &u); + u = f2u(xcoshf(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "tanhf ")) { + uint32_t u; + sscanf(buf, "tanhf %x", &u); + u = f2u(xtanhf(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "asinhf ")) { + uint32_t u; + sscanf(buf, "asinhf %x", &u); + u = f2u(xasinhf(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "acoshf ")) { + uint32_t u; + sscanf(buf, "acoshf %x", &u); + u = f2u(xacoshf(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, 
"atanhf ")) { + uint32_t u; + sscanf(buf, "atanhf %x", &u); + u = f2u(xatanhf(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "exp2f ")) { + uint32_t u; + sscanf(buf, "exp2f %x", &u); + u = f2u(xexp2f(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "exp10f ")) { + uint32_t u; + sscanf(buf, "exp10f %x", &u); + u = f2u(xexp10f(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "expm1f ")) { + uint32_t u; + sscanf(buf, "expm1f %x", &u); + u = f2u(xexpm1f(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "log10f ")) { + uint32_t u; + sscanf(buf, "log10f %x", &u); + u = f2u(xlog10f(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "log1pf ")) { + uint32_t u; + sscanf(buf, "log1pf %x", &u); + u = f2u(xlog1pf(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "sinf_u1 ")) { + uint32_t u; + sscanf(buf, "sinf_u1 %x", &u); + u = f2u(xsinf_u1(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "cosf_u1 ")) { + uint32_t u; + sscanf(buf, "cosf_u1 %x", &u); + u = f2u(xcosf_u1(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "sincosf_u1 ")) { + uint32_t u; + sscanf(buf, "sincosf_u1 %x", &u); + float2 x = xsincosf_u1(u2f(u)); + printf("%x %x\n", f2u(x.x), f2u(x.y)); + } else if (startsWith(buf, "tanf_u1 ")) { + uint32_t u; + sscanf(buf, "tanf_u1 %x", &u); + u = f2u(xtanf_u1(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "asinf_u1 ")) { + uint32_t u; + sscanf(buf, "asinf_u1 %x", &u); + u = f2u(xasinf_u1(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "acosf_u1 ")) { + uint32_t u; + sscanf(buf, "acosf_u1 %x", &u); + u = f2u(xacosf_u1(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "atanf_u1 ")) { + uint32_t u; + sscanf(buf, "atanf_u1 %x", &u); + u = f2u(xatanf_u1(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "atan2f_u1 ")) { + uint32_t u, v; + sscanf(buf, "atan2f_u1 %x %x", &u, &v); + u = f2u(xatan2f_u1(u2f(u), u2f(v))); + printf("%x\n", u); + } else if 
(startsWith(buf, "logf_u1 ")) { + uint32_t u; + sscanf(buf, "logf_u1 %x", &u); + u = f2u(xlogf_u1(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "cbrtf_u1 ")) { + uint32_t u; + sscanf(buf, "cbrtf_u1 %x", &u); + u = f2u(xcbrtf_u1(u2f(u))); + printf("%x\n", u); + } else { + break; + } + + fflush(stdout); + } + + return 0; +} diff --git a/purec/nonnumber.h b/purec/nonnumber.h new file mode 100644 index 00000000..5d856fa9 --- /dev/null +++ b/purec/nonnumber.h @@ -0,0 +1,19 @@ +#if defined (__GNUC__) || defined (__INTEL_COMPILER) || defined (__clang__) +#ifdef INFINITY +#undef INFINITY +#endif + +#ifdef NAN +#undef NAN +#endif + +#define NAN __builtin_nan("") +#define NANf __builtin_nanf("") +#define INFINITY __builtin_inf() +#define INFINITYf __builtin_inff() +#else + +#include +#include + +#endif diff --git a/purec/sleef.h b/purec/sleef.h new file mode 100644 index 00000000..97142720 --- /dev/null +++ b/purec/sleef.h @@ -0,0 +1,100 @@ +typedef struct { + double x, y; +} double2; + +typedef struct { + float x, y; +} float2; + +double xsin(double d); +double xcos(double d); +double2 xsincos(double d); +double xtan(double d); +double xasin(double s); +double xacos(double s); +double xatan(double s); +double xatan2(double y, double x); +double xlog(double d); +double xexp(double d); +double xldexp(double x, int q); +int xilogb(double d); + +double xpow(double x, double y); +double xsinh(double x); +double xcosh(double x); +double xtanh(double x); +double xasinh(double x); +double xacosh(double x); +double xatanh(double x); + +double xfma(double x, double y, double z); +double xsqrt(double d); +double xcbrt(double d); + +double xexp2(double a); +double xexp10(double a); +double xexpm1(double a); +double xlog10(double a); +double xlog1p(double a); + +double xsin_u1(double d); +double xcos_u1(double d); +double2 xsincos_u1(double d); +double xtan_u1(double d); +double xasin_u1(double s); +double xacos_u1(double s); +double xatan_u1(double s); +double 
xatan2_u1(double y, double x); +double xlog_u1(double d); +double xexp_u1(double d); +double xpow_u1(double x, double y); +double xsinh_u1(double x); +double xcosh_u1(double x); +double xtanh_u1(double x); +double xasinh_u1(double x); +double xacosh_u1(double x); +double xatanh_u1(double x); +double xexp2_u1(double a); +double xexp10_u1(double a); +double xexpm1_u1(double a); +double xlog10_u1(double a); +double xlog1p_u1(double a); +double xcbrt_u1(double d); + +float xsinf(float d); +float xcosf(float d); +float2 xsincosf(float d); +float xtanf(float d); +float xasinf(float s); +float xacosf(float s); +float xatanf(float s); +float xatan2f(float y, float x); +float xlogf(float d); +float xexpf(float d); +float xcbrtf(float d); +float xldexpf(float x, int q); +int xilogbf(float d); + +float xpowf(float x, float y); +float xsinhf(float x); +float xcoshf(float x); +float xtanhf(float x); +float xasinhf(float x); +float xacoshf(float x); +float xatanhf(float x); +float xexp2f(float a); +float xexp10f(float a); +float xexpm1f(float a); +float xlog10f(float a); +float xlog1pf(float a); + +float xsinf_u1(float d); +float xcosf_u1(float d); +float2 xsincosf_u1(float d); +float xtanf_u1(float d); +float xasinf_u1(float s); +float xacosf_u1(float s); +float xatanf_u1(float s); +float xatan2f_u1(float y, float x); +float xlogf_u1(float d); +float xcbrtf_u1(float d); diff --git a/purec/sleefdp.c b/purec/sleefdp.c new file mode 100644 index 00000000..f91c1977 --- /dev/null +++ b/purec/sleefdp.c @@ -0,0 +1,1247 @@ +#include + +#include +#include +#include + +#include "nonnumber.h" + +#define PI4_A 0.78539816290140151978 +#define PI4_B 4.9604678871439933374e-10 +#define PI4_C 1.1258708853173288931e-18 +#define PI4_D 1.7607799325916000908e-27 + +#define M_4_PI 1.273239544735162542821171882678754627704620361328125 + +#define L2U .69314718055966295651160180568695068359375 +#define L2L .28235290563031577122588448175013436025525412068e-12 +#define R_LN2 
1.442695040888963407359924681001892137426645954152985934135449406931 + +static inline int64_t doubleToRawLongBits(double d) { + union { + double f; + int64_t i; + } tmp; + tmp.f = d; + return tmp.i; +} + +static inline double longBitsToDouble(int64_t i) { + union { + double f; + int64_t i; + } tmp; + tmp.i = i; + return tmp.f; +} + +static inline double xfabs(double x) { + return longBitsToDouble(0x7fffffffffffffffLL & doubleToRawLongBits(x)); +} + +static inline double mulsign(double x, double y) { + return longBitsToDouble(doubleToRawLongBits(x) ^ (doubleToRawLongBits(y) & (int64_t)(1ULL << 63))); /* flips the sign of x when the sign bit of y is set; the mask is built unsigned because 1LL << 63 overflows a signed long long */ +} + +static inline double sign(double d) { return mulsign(1, d); } +static inline double mla(double x, double y, double z) { return x * y + z; } +static inline double xrint(double x) { return x < 0 ? (int)(x - 0.5) : (int)(x + 0.5); } + +static inline int xisnan(double x) { return x != x; } +static inline int xisinf(double x) { return x == INFINITY || x == -INFINITY; } +static inline int xisminf(double x) { return x == -INFINITY; } +static inline int xispinf(double x) { return x == INFINITY; } + +static inline double pow2i(int q) { + return longBitsToDouble(((int64_t)(q + 0x3ff)) << 52); +} + +static inline double ldexpk(double x, int q) { + double u; + int m; + m = q >> 31; + m = (((m + q) >> 9) - m) << 7; + q = q - (m << 2); + m += 0x3ff; + m = m < 0 ? 0 : m; + m = m > 0x7ff ? 0x7ff : m; + u = longBitsToDouble(((int64_t)m) << 52); + x = x * u * u * u * u; + u = longBitsToDouble(((int64_t)(q + 0x3ff)) << 52); + return x * u; +} + +double xldexp(double x, int q) { return ldexpk(x, q); } + +static inline int ilogbp1(double d) { + int m = d < 4.9090934652977266E-91; + d = m ? 2.037035976334486E90 * d : d; + int q = (doubleToRawLongBits(d) >> 52) & 0x7ff; + q = m ? q - (300 + 0x03fe) : q - 0x03fe; + return q; +} + +int xilogb(double d) { + int e = ilogbp1(xfabs(d)) - 1; + e = d == 0 ? -2147483648 : e; + e = d == INFINITY || d == -INFINITY ? 
2147483647 : e; + return e; +} + +// + +typedef struct { + double x, y; +} double2; + +#ifndef NDEBUG +static int checkfp(double x) { + if (xisinf(x) || xisnan(x)) return 1; + return 0; +} +#endif + +static inline double upper(double d) { + return longBitsToDouble(doubleToRawLongBits(d) & 0xfffffffff8000000LL); +} + +static inline double2 dd(double h, double l) { + double2 ret; + ret.x = h; ret.y = l; + return ret; +} + +static inline double2 ddnormalize_d2_d2(double2 t) { + double2 s; + + s.x = t.x + t.y; + s.y = t.x - s.x + t.y; + + return s; +} + +static inline double2 ddscale_d2_d2_d(double2 d, double s) { + double2 r; + + r.x = d.x * s; + r.y = d.y * s; + + return r; +} + +static inline double2 ddneg_d2_d2(double2 d) { + double2 r; + + r.x = -d.x; + r.y = -d.y; + + return r; +} + +static inline double2 ddadd_d2_d_d(double x, double y) { + // |x| >= |y| + + double2 r; + +#ifndef NDEBUG + if (!(checkfp(x) || checkfp(y) || xfabs(x) >= xfabs(y))) fprintf(stderr, "[ddadd_d2_d_d : %g, %g]", x, y); +#endif + + r.x = x + y; + r.y = x - r.x + y; + + return r; +} + +static inline double2 ddadd2_d2_d_d(double x, double y) { + double2 r; + + r.x = x + y; + double v = r.x - x; + r.y = (x - (r.x - v)) + (y - v); + + return r; +} + +static inline double2 ddadd_d2_d2_d(double2 x, double y) { + // |x| >= |y| + + double2 r; + +#ifndef NDEBUG + if (!(checkfp(x.x) || checkfp(y) || xfabs(x.x) >= xfabs(y))) fprintf(stderr, "[ddadd_d2_d2_d : %g %g]", x.x, y); +#endif + + r.x = x.x + y; + r.y = x.x - r.x + y + x.y; + + return r; +} + +static inline double2 ddadd2_d2_d2_d(double2 x, double y) { + // NOTE(review): a "|x| >= |y|" precondition was stated here, apparently copied from ddadd_d2_d2_d; this body has no NDEBUG check and uses the same sum as ddadd2_d2_d_d, which states no precondition. Confirm before relying on either reading. + + double2 r; + + r.x = x.x + y; + double v = r.x - x.x; + r.y = (x.x - (r.x - v)) + (y - v); + r.y += x.y; + + return r; +} + +static inline double2 ddadd_d2_d_d2(double x, double2 y) { + // |x| >= |y| + + double2 r; + +#ifndef NDEBUG + if (!(checkfp(x) || checkfp(y.x) || xfabs(x) >= xfabs(y.x))) fprintf(stderr, "[ddadd_d2_d_d2 : %g %g]", x, y.x); +#endif + + r.x = x + y.x; + 
r.y = x - r.x + y.x + y.y; + + return r; +} + +static inline double2 ddadd2_d2_d_d2(double x, double2 y) { + double2 r; + + r.x = x + y.x; + double v = r.x - x; + r.y = (x - (r.x - v)) + (y.x - v) + y.y; + + return r; +} + +static inline double2 ddadd_d2_d2_d2(double2 x, double2 y) { + // |x| >= |y| + + double2 r; + +#ifndef NDEBUG + if (!(checkfp(x.x) || checkfp(y.x) || xfabs(x.x) >= xfabs(y.x))) fprintf(stderr, "[ddadd_d2_d2_d2 : %g %g]", x.x, y.x); +#endif + + r.x = x.x + y.x; + r.y = x.x - r.x + y.x + x.y + y.y; + + return r; +} + +static inline double2 ddadd2_d2_d2_d2(double2 x, double2 y) { + double2 r; + + r.x = x.x + y.x; + double v = r.x - x.x; + r.y = (x.x - (r.x - v)) + (y.x - v); + r.y += x.y + y.y; + + return r; +} + +static inline double2 ddsub_d2_d2_d2(double2 x, double2 y) { + // |x| >= |y| + + double2 r; + +#ifndef NDEBUG + if (!(checkfp(x.x) || checkfp(y.x) || xfabs(x.x) >= xfabs(y.x))) fprintf(stderr, "[ddsub_d2_d2_d2 : %g %g]", x.x, y.x); +#endif + + r.x = x.x - y.x; + r.y = x.x - r.x - y.x + x.y - y.y; + + return r; +} + +static inline double2 dddiv_d2_d2_d2(double2 n, double2 d) { + double t = 1.0 / d.x; + double dh = upper(d.x), dl = d.x - dh; + double th = upper(t ), tl = t - th; + double nhh = upper(n.x), nhl = n.x - nhh; + + double2 q; + + q.x = n.x * t; + + double u = -q.x + nhh * th + nhh * tl + nhl * th + nhl * tl + + q.x * (1 - dh * th - dh * tl - dl * th - dl * tl); + + q.y = t * (n.y - q.x * d.y) + u; + + return q; +} + +static inline double2 ddmul_d2_d_d(double x, double y) { + double xh = upper(x), xl = x - xh; + double yh = upper(y), yl = y - yh; + double2 r; + + r.x = x * y; + r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl; + + return r; +} + +static inline double2 ddmul_d2_d2_d(double2 x, double y) { + double xh = upper(x.x), xl = x.x - xh; + double yh = upper(y ), yl = y - yh; + double2 r; + + r.x = x.x * y; + r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.y * y; + + return r; +} + +static inline double2 
ddmul_d2_d2_d2(double2 x, double2 y) { + double xh = upper(x.x), xl = x.x - xh; + double yh = upper(y.x), yl = y.x - yh; + double2 r; + + r.x = x.x * y.x; + r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.x * y.y + x.y * y.x; + + return r; +} + +static inline double2 ddsqu_d2_d2(double2 x) { + double xh = upper(x.x), xl = x.x - xh; + double2 r; + + r.x = x.x * x.x; + r.y = xh * xh - r.x + (xh + xh) * xl + xl * xl + x.x * (x.y + x.y); + + return r; +} + +static inline double2 ddrec_d2_d(double d) { + double t = 1.0 / d; + double dh = upper(d), dl = d - dh; + double th = upper(t), tl = t - th; + double2 q; + + q.x = t; + q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl); + + return q; +} + +static inline double2 ddrec_d2_d2(double2 d) { + double t = 1.0 / d.x; + double dh = upper(d.x), dl = d.x - dh; + double th = upper(t ), tl = t - th; + double2 q; + + q.x = t; + q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl - d.y * t); + + return q; +} + +static inline double2 ddsqrt_d2_d2(double2 d) { + double t = sqrt(d.x + d.y); + return ddscale_d2_d2_d(ddmul_d2_d2_d2(ddadd2_d2_d2_d2(d, ddmul_d2_d_d(t, t)), ddrec_d2_d(t)), 0.5); +} + +// + +static inline double atan2k(double y, double x) { + double s, t, u; + int q = 0; + + if (x < 0) { x = -x; q = -2; } + if (y > x) { t = x; x = y; y = -t; q += 1; } + + s = y / x; + t = s * s; + + u = -1.88796008463073496563746e-05; + u = u * t + (0.000209850076645816976906797); + u = u * t + (-0.00110611831486672482563471); + u = u * t + (0.00370026744188713119232403); + u = u * t + (-0.00889896195887655491740809); + u = u * t + (0.016599329773529201970117); + u = u * t + (-0.0254517624932312641616861); + u = u * t + (0.0337852580001353069993897); + u = u * t + (-0.0407629191276836500001934); + u = u * t + (0.0466667150077840625632675); + u = u * t + (-0.0523674852303482457616113); + u = u * t + (0.0587666392926673580854313); + u = u * t + (-0.0666573579361080525984562); + u = u * t + (0.0769219538311769618355029); + u = u * 
t + (-0.090908995008245008229153); + u = u * t + (0.111111105648261418443745); + u = u * t + (-0.14285714266771329383765); + u = u * t + (0.199999999996591265594148); + u = u * t + (-0.333333333333311110369124); + + t = u * t * s + s; + t = q * (M_PI/2) + t; + + return t; +} + +double xatan2(double y, double x) { + double r = atan2k(xfabs(y), x); + + r = mulsign(r, x); + if (xisinf(x) || x == 0) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI /2)) : 0); + if (xisinf(y) ) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI*1/4)) : 0); + if ( y == 0) r = (sign(x) == -1 ? M_PI : 0); + + return xisnan(x) || xisnan(y) ? NAN : mulsign(r, y); +} + +double xasin(double d) { + return mulsign(atan2k(xfabs(d), sqrt((1+d)*(1-d))), d); +} + +double xacos(double d) { + return mulsign(atan2k(sqrt((1+d)*(1-d)), xfabs(d)), d) + (d < 0 ? M_PI : 0); +} + +double xatan(double s) { + double t, u; + int q = 0; + + if (s < 0) { s = -s; q = 2; } + if (s > 1) { s = 1.0 / s; q |= 1; } + + t = s * s; + + u = -1.88796008463073496563746e-05; + u = u * t + (0.000209850076645816976906797); + u = u * t + (-0.00110611831486672482563471); + u = u * t + (0.00370026744188713119232403); + u = u * t + (-0.00889896195887655491740809); + u = u * t + (0.016599329773529201970117); + u = u * t + (-0.0254517624932312641616861); + u = u * t + (0.0337852580001353069993897); + u = u * t + (-0.0407629191276836500001934); + u = u * t + (0.0466667150077840625632675); + u = u * t + (-0.0523674852303482457616113); + u = u * t + (0.0587666392926673580854313); + u = u * t + (-0.0666573579361080525984562); + u = u * t + (0.0769219538311769618355029); + u = u * t + (-0.090908995008245008229153); + u = u * t + (0.111111105648261418443745); + u = u * t + (-0.14285714266771329383765); + u = u * t + (0.199999999996591265594148); + u = u * t + (-0.333333333333311110369124); + + t = s + s * (t * u); + + if ((q & 1) != 0) t = 1.570796326794896557998982 - t; + if ((q & 2) != 0) t = -t; + + return t; +} + +static double2 atan2k_u1(double2 y, 
double2 x) { + double u; + double2 s, t; + int q = 0; + + if (x.x < 0) { x.x = -x.x; x.y = -x.y; q = -2; } + if (y.x > x.x) { t = x; x = y; y.x = -t.x; y.y = -t.y; q += 1; } + + s = dddiv_d2_d2_d2(y, x); + t = ddsqu_d2_d2(s); + t = ddnormalize_d2_d2(t); + + u = 1.06298484191448746607415e-05; + u = mla(u, t.x, -0.000125620649967286867384336); + u = mla(u, t.x, 0.00070557664296393412389774); + u = mla(u, t.x, -0.00251865614498713360352999); + u = mla(u, t.x, 0.00646262899036991172313504); + u = mla(u, t.x, -0.0128281333663399031014274); + u = mla(u, t.x, 0.0208024799924145797902497); + u = mla(u, t.x, -0.0289002344784740315686289); + u = mla(u, t.x, 0.0359785005035104590853656); + u = mla(u, t.x, -0.041848579703592507506027); + u = mla(u, t.x, 0.0470843011653283988193763); + u = mla(u, t.x, -0.0524914210588448421068719); + u = mla(u, t.x, 0.0587946590969581003860434); + u = mla(u, t.x, -0.0666620884778795497194182); + u = mla(u, t.x, 0.0769225330296203768654095); + u = mla(u, t.x, -0.0909090442773387574781907); + u = mla(u, t.x, 0.111111108376896236538123); + u = mla(u, t.x, -0.142857142756268568062339); + u = mla(u, t.x, 0.199999999997977351284817); + u = mla(u, t.x, -0.333333333333317605173818); + + t = ddmul_d2_d2_d(t, u); + t = ddmul_d2_d2_d2(s, ddadd_d2_d_d2(1, t)); + t = ddadd2_d2_d2_d2(ddmul_d2_d2_d(dd(1.570796326794896557998982, 6.12323399573676603586882e-17), q), t); + + return t; +} + +double xatan2_u1(double y, double x) { + double2 d = atan2k_u1(dd(xfabs(y), 0), dd(x, 0)); + double r = d.x + d.y; + + r = mulsign(r, x); + if (xisinf(x) || x == 0) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI /2)) : 0); + if (xisinf(y) ) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI*1/4)) : 0); + if ( y == 0) r = (sign(x) == -1 ? M_PI : 0); + + return xisnan(x) || xisnan(y) ? 
NAN : mulsign(r, y); +} + +double xasin_u1(double d) { + double2 d2 = atan2k_u1(dd(xfabs(d), 0), ddsqrt_d2_d2(ddmul_d2_d2_d2(ddadd_d2_d_d(1, d), ddadd_d2_d_d(1,-d)))); + double r = d2.x + d2.y; + if (xfabs(d) == 1) r = 1.570796326794896557998982; + return mulsign(r, d); +} + +double xacos_u1(double d) { + double2 d2 = atan2k_u1(ddsqrt_d2_d2(ddmul_d2_d2_d2(ddadd_d2_d_d(1, d), ddadd_d2_d_d(1,-d))), dd(xfabs(d), 0)); + d2 = ddscale_d2_d2_d(d2, mulsign(1, d)); + if (xfabs(d) == 1) d2 = dd(0, 0); + if (d < 0) d2 = ddadd_d2_d2_d2(dd(3.141592653589793116, 1.2246467991473532072e-16), d2); + return d2.x + d2.y; +} + +double xatan_u1(double d) { + double2 d2 = atan2k_u1(dd(xfabs(d), 0), dd(1, 0)); + double r = d2.x + d2.y; + if (xisinf(d)) r = 1.570796326794896557998982; + return mulsign(r, d); +} + +double xsin(double d) { + int q; + double u, s; + + q = (int)xrint(d * M_1_PI); + + d = mla(q, -PI4_A*4, d); + d = mla(q, -PI4_B*4, d); + d = mla(q, -PI4_C*4, d); + d = mla(q, -PI4_D*4, d); + + s = d * d; + + if ((q & 1) != 0) d = -d; + + u = -7.97255955009037868891952e-18; + u = mla(u, s, 2.81009972710863200091251e-15); + u = mla(u, s, -7.64712219118158833288484e-13); + u = mla(u, s, 1.60590430605664501629054e-10); + u = mla(u, s, -2.50521083763502045810755e-08); + u = mla(u, s, 2.75573192239198747630416e-06); + u = mla(u, s, -0.000198412698412696162806809); + u = mla(u, s, 0.00833333333333332974823815); + u = mla(u, s, -0.166666666666666657414808); + + u = mla(s, u * d, d); + + return u; +} + +double xsin_u1(double d) { + int q; + double u; + double2 s, t, x; + + q = (int)xrint(d * M_1_PI); + + s = ddadd2_d2_d_d(d, q * (-PI4_A*4)); + s = ddadd2_d2_d2_d(s, q * (-PI4_B*4)); + s = ddadd2_d2_d2_d(s, q * (-PI4_C*4)); + s = ddadd2_d2_d2_d(s, q * (-PI4_D*4)); + + t = s; + s = ddsqu_d2_d2(s); + + u = 2.72052416138529567917983e-15; + u = mla(u, s.x, -7.6429259411395447190023e-13); + u = mla(u, s.x, 1.60589370117277896211623e-10); + u = mla(u, s.x, -2.5052106814843123359368e-08); + u = 
mla(u, s.x, 2.75573192104428224777379e-06); + u = mla(u, s.x, -0.000198412698412046454654947); + u = mla(u, s.x, 0.00833333333333318056201922); + + x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(-0.166666666666666657414808, u * s.x), s)); + + x = ddmul_d2_d2_d2(t, x); + u = x.x + x.y; + + if ((q & 1) != 0) u = -u; + + return u; +} + +double xcos(double d) { + int q; + double u, s; + + q = 1 + 2*(int)xrint(d * M_1_PI - 0.5); + + d = mla(q, -PI4_A*2, d); + d = mla(q, -PI4_B*2, d); + d = mla(q, -PI4_C*2, d); + d = mla(q, -PI4_D*2, d); + + s = d * d; + + if ((q & 2) == 0) d = -d; + + u = -7.97255955009037868891952e-18; + u = mla(u, s, 2.81009972710863200091251e-15); + u = mla(u, s, -7.64712219118158833288484e-13); + u = mla(u, s, 1.60590430605664501629054e-10); + u = mla(u, s, -2.50521083763502045810755e-08); + u = mla(u, s, 2.75573192239198747630416e-06); + u = mla(u, s, -0.000198412698412696162806809); + u = mla(u, s, 0.00833333333333332974823815); + u = mla(u, s, -0.166666666666666657414808); + + u = mla(s, u * d, d); + + return u; +} + +double xcos_u1(double d) { + double u, q; + double2 s, t, x; + + d = fabs(d); + + q = mla(2, xrint(d * M_1_PI - 0.5), 1); + + s = ddadd2_d2_d_d(d, q * (-PI4_A*2)); + s = ddadd2_d2_d2_d(s, q * (-PI4_B*2)); + s = ddadd2_d2_d2_d(s, q * (-PI4_C*2)); + s = ddadd2_d2_d2_d(s, q * (-PI4_D*2)); + + t = s; + s = ddsqu_d2_d2(s); + + u = 2.72052416138529567917983e-15; + u = mla(u, s.x, -7.6429259411395447190023e-13); + u = mla(u, s.x, 1.60589370117277896211623e-10); + u = mla(u, s.x, -2.5052106814843123359368e-08); + u = mla(u, s.x, 2.75573192104428224777379e-06); + u = mla(u, s.x, -0.000198412698412046454654947); + u = mla(u, s.x, 0.00833333333333318056201922); + + x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(-0.166666666666666657414808, u * s.x), s)); + + x = ddmul_d2_d2_d2(t, x); + + u = x.x + x.y; + + if ((((int)q) & 2) == 0) u = -u; + + return u; +} + +double2 xsincos(double d) { + int q; + double u, s, t; + double2 r; + + q = 
(int)xrint(d * (2 * M_1_PI)); + + s = d; + + s = mla(-q, PI4_A*2, s); + s = mla(-q, PI4_B*2, s); + s = mla(-q, PI4_C*2, s); + s = mla(-q, PI4_D*2, s); + + t = s; + + s = s * s; + + u = 1.58938307283228937328511e-10; + u = mla(u, s, -2.50506943502539773349318e-08); + u = mla(u, s, 2.75573131776846360512547e-06); + u = mla(u, s, -0.000198412698278911770864914); + u = mla(u, s, 0.0083333333333191845961746); + u = mla(u, s, -0.166666666666666130709393); + u = u * s * t; + + r.x = t + u; + + u = -1.13615350239097429531523e-11; + u = mla(u, s, 2.08757471207040055479366e-09); + u = mla(u, s, -2.75573144028847567498567e-07); + u = mla(u, s, 2.48015872890001867311915e-05); + u = mla(u, s, -0.00138888888888714019282329); + u = mla(u, s, 0.0416666666666665519592062); + u = mla(u, s, -0.5); + + r.y = u * s + 1; + + if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; } + if ((q & 2) != 0) { r.x = -r.x; } + if (((q+1) & 2) != 0) { r.y = -r.y; } + + if (xisinf(d)) { r.x = r.y = NAN; } + + return r; +} + +double2 xsincos_u1(double d) { + int q; + double u; + double2 r, s, t, x; + + q = (int)xrint(d * (2 * M_1_PI)); + + s = ddadd2_d2_d_d(d, q * (-PI4_A*2)); + s = ddadd2_d2_d2_d(s, q * (-PI4_B*2)); + s = ddadd2_d2_d2_d(s, q * (-PI4_C*2)); + s = ddadd2_d2_d2_d(s, q * (-PI4_D*2)); + + t = s; + s = ddsqu_d2_d2(s); + s.x = s.x + s.y; + + u = 1.58938307283228937328511e-10; + u = mla(u, s.x, -2.50506943502539773349318e-08); + u = mla(u, s.x, 2.75573131776846360512547e-06); + u = mla(u, s.x, -0.000198412698278911770864914); + u = mla(u, s.x, 0.0083333333333191845961746); + u = mla(u, s.x, -0.166666666666666130709393); + + u *= s.x * t.x; + + x = ddadd_d2_d2_d(t, u); + r.x = x.x + x.y; + + u = -1.13615350239097429531523e-11; + u = mla(u, s.x, 2.08757471207040055479366e-09); + u = mla(u, s.x, -2.75573144028847567498567e-07); + u = mla(u, s.x, 2.48015872890001867311915e-05); + u = mla(u, s.x, -0.00138888888888714019282329); + u = mla(u, s.x, 0.0416666666666665519592062); + u = mla(u, s.x, -0.5); 
+ + x = ddadd_d2_d_d2(1, ddmul_d2_d_d(s.x, u)); + r.y = x.x + x.y; + + if ((q & 1) != 0) { u = r.y; r.y = r.x; r.x = u; } + if ((q & 2) != 0) { r.x = -r.x; } + if (((q+1) & 2) != 0) { r.y = -r.y; } + + if (xisinf(d)) { r.x = r.y = NAN; } + + return r; +} + +double xtan(double d) { + int q; + double u, s, x; + + q = (int)xrint(d * (2 * M_1_PI)); + + x = mla(q, -PI4_A*2, d); + x = mla(q, -PI4_B*2, x); + x = mla(q, -PI4_C*2, x); + x = mla(q, -PI4_D*2, x); + + s = x * x; + + if ((q & 1) != 0) x = -x; + + u = 1.01419718511083373224408e-05; + u = mla(u, s, -2.59519791585924697698614e-05); + u = mla(u, s, 5.23388081915899855325186e-05); + u = mla(u, s, -3.05033014433946488225616e-05); + u = mla(u, s, 7.14707504084242744267497e-05); + u = mla(u, s, 8.09674518280159187045078e-05); + u = mla(u, s, 0.000244884931879331847054404); + u = mla(u, s, 0.000588505168743587154904506); + u = mla(u, s, 0.00145612788922812427978848); + u = mla(u, s, 0.00359208743836906619142924); + u = mla(u, s, 0.00886323944362401618113356); + u = mla(u, s, 0.0218694882853846389592078); + u = mla(u, s, 0.0539682539781298417636002); + u = mla(u, s, 0.133333333333125941821962); + u = mla(u, s, 0.333333333333334980164153); + + u = mla(s, u * x, x); + + if ((q & 1) != 0) u = 1.0 / u; + + if (xisinf(d)) u = NAN; + + return u; +} + +double xtan_u1(double d) { + int q; + double u; + double2 s, t, x; + + q = (int)xrint(d * M_2_PI); + + s = ddadd2_d2_d_d(d, q * (-PI4_A*2)); + s = ddadd2_d2_d2_d(s, q * (-PI4_B*2)); + s = ddadd2_d2_d2_d(s, q * (-PI4_C*2)); + s = ddadd2_d2_d2_d(s, q * (-PI4_D*2)); + + if ((q & 1) != 0) s = ddneg_d2_d2(s); + + t = s; + s = ddsqu_d2_d2(s); + + u = 1.01419718511083373224408e-05; + u = mla(u, s.x, -2.59519791585924697698614e-05); + u = mla(u, s.x, 5.23388081915899855325186e-05); + u = mla(u, s.x, -3.05033014433946488225616e-05); + u = mla(u, s.x, 7.14707504084242744267497e-05); + u = mla(u, s.x, 8.09674518280159187045078e-05); + u = mla(u, s.x, 0.000244884931879331847054404); + u = 
mla(u, s.x, 0.000588505168743587154904506); + u = mla(u, s.x, 0.00145612788922812427978848); + u = mla(u, s.x, 0.00359208743836906619142924); + u = mla(u, s.x, 0.00886323944362401618113356); + u = mla(u, s.x, 0.0218694882853846389592078); + u = mla(u, s.x, 0.0539682539781298417636002); + u = mla(u, s.x, 0.133333333333125941821962); + + x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(0.333333333333334980164153, u * s.x), s)); + x = ddmul_d2_d2_d2(t, x); + + if ((q & 1) != 0) x = ddrec_d2_d2(x); + + u = x.x + x.y; + + return u; +} + +double xlog(double d) { + double x, x2, t, m; + int e; + + e = ilogbp1(d * 0.7071); + m = ldexpk(d, -e); + + x = (m-1) / (m+1); + x2 = x * x; + + t = 0.148197055177935105296783; + t = mla(t, x2, 0.153108178020442575739679); + t = mla(t, x2, 0.181837339521549679055568); + t = mla(t, x2, 0.22222194152736701733275); + t = mla(t, x2, 0.285714288030134544449368); + t = mla(t, x2, 0.399999999989941956712869); + t = mla(t, x2, 0.666666666666685503450651); + t = mla(t, x2, 2); + + x = x * t + 0.693147180559945286226764 * e; + + if (xisinf(d)) x = INFINITY; + if (d < 0) x = NAN; + if (d == 0) x = -INFINITY; + + return x; +} + +double xexp(double d) { + int q = (int)xrint(d * R_LN2); + double s, u; + + s = mla(q, -L2U, d); + s = mla(q, -L2L, s); + + u = 2.08860621107283687536341e-09; + u = mla(u, s, 2.51112930892876518610661e-08); + u = mla(u, s, 2.75573911234900471893338e-07); + u = mla(u, s, 2.75572362911928827629423e-06); + u = mla(u, s, 2.4801587159235472998791e-05); + u = mla(u, s, 0.000198412698960509205564975); + u = mla(u, s, 0.00138888888889774492207962); + u = mla(u, s, 0.00833333333331652721664984); + u = mla(u, s, 0.0416666666666665047591422); + u = mla(u, s, 0.166666666666666851703837); + u = mla(u, s, 0.5); + + u = s * s * u + s + 1; + u = ldexpk(u, q); + + if (xisminf(d)) u = 0; + + return u; +} + +static inline double2 logk(double d) { + double2 x, x2; + double m, t; + int e; + + e = ilogbp1(d * 0.7071); + m = ldexpk(d, -e); + + x 
= dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m)); + x2 = ddsqu_d2_d2(x); + + t = 0.134601987501262130076155; + t = mla(t, x2.x, 0.132248509032032670243288); + t = mla(t, x2.x, 0.153883458318096079652524); + t = mla(t, x2.x, 0.181817427573705403298686); + t = mla(t, x2.x, 0.222222231326187414840781); + t = mla(t, x2.x, 0.285714285651261412873718); + t = mla(t, x2.x, 0.400000000000222439910458); + t = mla(t, x2.x, 0.666666666666666371239645); + + return ddadd2_d2_d2_d2(ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e), + ddadd2_d2_d2_d2(ddscale_d2_d2_d(x, 2), ddmul_d2_d2_d(ddmul_d2_d2_d2(x2, x), t))); +} + +double xlog_u1(double d) { + double2 s = logk(d); + double x = s.x + s.y; + + if (xisinf(d)) x = INFINITY; + if (d < 0) x = NAN; + if (d == 0) x = -INFINITY; + + return x; +} + +static inline double expk(double2 d) { + int q = (int)xrint((d.x + d.y) * R_LN2); + double2 s, t; + double u; + + s = ddadd2_d2_d2_d(d, q * -L2U); + s = ddadd2_d2_d2_d(s, q * -L2L); + + s = ddnormalize_d2_d2(s); + + u = 2.51069683420950419527139e-08; + u = mla(u, s.x, 2.76286166770270649116855e-07); + u = mla(u, s.x, 2.75572496725023574143864e-06); + u = mla(u, s.x, 2.48014973989819794114153e-05); + u = mla(u, s.x, 0.000198412698809069797676111); + u = mla(u, s.x, 0.0013888888939977128960529); + u = mla(u, s.x, 0.00833333333332371417601081); + u = mla(u, s.x, 0.0416666666665409524128449); + u = mla(u, s.x, 0.166666666666666740681535); + u = mla(u, s.x, 0.500000000000000999200722); + + t = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(ddsqu_d2_d2(s), u)); + + t = ddadd_d2_d_d2(1, t); + return ldexpk(t.x + t.y, q); +} + +double xpow(double x, double y) { // x raised to the power y; the sign for x < 0 depends on whether y is an integer and whether it is odd + int yisint = xfabs(y) < 9007199254740992.0 ? (double)(long long)y == y : !xisnan(y); // every double with |y| >= 2^53 (infinity included) is an even integer; below 2^53 the long long cast cannot overflow, unlike the former (int)y + int yisodd = xfabs(y) < 9007199254740992.0 && (1 & (long long)y) != 0 && yisint; // short-circuits before the cast for huge or NaN y + + double result = expk(ddmul_d2_d2_d(logk(xfabs(x)), y)); + + result = xisnan(result) ? INFINITY : result; + result *= (x >= 0 ? 1 : (!yisint ? NAN : (yisodd ?
-1 : 1))); + + double efx = mulsign(xfabs(x) - 1, y); + if (xisinf(y)) result = efx < 0 ? 0.0 : (efx == 0 ? 1.0 : INFINITY); + if (xisinf(x) || x == 0) result = (yisodd ? sign(x) : 1) * ((x == 0 ? -y : y) < 0 ? 0 : INFINITY); + if (xisnan(x) || xisnan(y)) result = NAN; + if (y == 0 || x == 1) result = 1; + + return result; +} + +static inline double2 expk2(double2 d) { + int q = (int)xrint((d.x + d.y) * R_LN2); + double2 s, t; + double u; + + s = ddadd2_d2_d2_d(d, q * -L2U); + s = ddadd2_d2_d2_d(s, q * -L2L); + + u = 2.51069683420950419527139e-08; + u = mla(u, s.x, 2.76286166770270649116855e-07); + u = mla(u, s.x, 2.75572496725023574143864e-06); + u = mla(u, s.x, 2.48014973989819794114153e-05); + u = mla(u, s.x, 0.000198412698809069797676111); + u = mla(u, s.x, 0.0013888888939977128960529); + u = mla(u, s.x, 0.00833333333332371417601081); + u = mla(u, s.x, 0.0416666666665409524128449); + u = mla(u, s.x, 0.166666666666666740681535); + u = mla(u, s.x, 0.500000000000000999200722); + + t = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(ddsqu_d2_d2(s), u)); + + t = ddadd_d2_d_d2(1, t); + return ddscale_d2_d2_d(t, pow2i(q)); +} + +double xsinh(double x) { + double y = xfabs(x); + double2 d = expk2(dd(y, 0)); + d = ddsub_d2_d2_d2(d, ddrec_d2_d2(d)); + y = (d.x + d.y) * 0.5; + + y = xfabs(x) > 710 ? INFINITY : y; + y = xisnan(y) ? INFINITY : y; + y = mulsign(y, x); + y = xisnan(x) ? NAN : y; + + return y; +} + +double xcosh(double x) { + double y = xfabs(x); + double2 d = expk2(dd(y, 0)); + d = ddadd_d2_d2_d2(d, ddrec_d2_d2(d)); + y = (d.x + d.y) * 0.5; + + y = xfabs(x) > 710 ? INFINITY : y; + y = xisnan(y) ? INFINITY : y; + y = xisnan(x) ? NAN : y; + + return y; +} + +double xtanh(double x) { + double y = xfabs(x); + double2 d = expk2(dd(y, 0)); + double2 e = ddrec_d2_d2(d); + d = dddiv_d2_d2_d2(ddsub_d2_d2_d2(d, e), ddadd_d2_d2_d2(d, e)); + y = d.x + d.y; + + y = xfabs(x) > 18.714973875 ? 1.0 : y; + y = xisnan(y) ? 1.0 : y; + y = mulsign(y, x); + y = xisnan(x) ? 
NAN : y; + + return y; +} + +static inline double2 logk2(double2 d) { + double2 x, x2, m; + double t; + int e; + + e = ilogbp1(d.x * 0.7071); + m = ddscale_d2_d2_d(d, pow2i(-e)); + + x = dddiv_d2_d2_d2(ddadd2_d2_d2_d(m, -1), ddadd2_d2_d2_d(m, 1)); + x2 = ddsqu_d2_d2(x); + + t = 0.134601987501262130076155; + t = mla(t, x2.x, 0.132248509032032670243288); + t = mla(t, x2.x, 0.153883458318096079652524); + t = mla(t, x2.x, 0.181817427573705403298686); + t = mla(t, x2.x, 0.222222231326187414840781); + t = mla(t, x2.x, 0.285714285651261412873718); + t = mla(t, x2.x, 0.400000000000222439910458); + t = mla(t, x2.x, 0.666666666666666371239645); + + return ddadd2_d2_d2_d2(ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e), + ddadd2_d2_d2_d2(ddscale_d2_d2_d(x, 2), ddmul_d2_d2_d(ddmul_d2_d2_d2(x2, x), t))); +} + +double xasinh(double x) { + double y = xfabs(x); + double2 d = logk2(ddadd_d2_d2_d(ddsqrt_d2_d2(ddadd2_d2_d2_d(ddmul_d2_d_d(y, y), 1)), y)); + y = d.x + d.y; + + y = xisinf(x) || xisnan(y) ? INFINITY : y; + y = mulsign(y, x); + y = xisnan(x) ? NAN : y; + + return y; +} + +double xacosh(double x) { + double2 d = logk2(ddadd2_d2_d2_d(ddsqrt_d2_d2(ddadd2_d2_d2_d(ddmul_d2_d_d(x, x), -1)), x)); + double y = d.x + d.y; + + y = xisinf(x) || xisnan(y) ? INFINITY : y; + y = x == 1.0 ? 0.0 : y; + y = x < 1.0 ? NAN : y; + y = xisnan(x) ? NAN : y; + + return y; +} + +double xatanh(double x) { + double y = xfabs(x); + double2 d = logk2(dddiv_d2_d2_d2(ddadd2_d2_d_d(1, y), ddadd2_d2_d_d(1, -y))); + y = y > 1.0 ? NAN : (y == 1.0 ? INFINITY : (d.x + d.y) * 0.5); + + y = xisinf(x) || xisnan(y) ? NAN : y; + y = mulsign(y, x); + y = xisnan(x) ? 
NAN : y; + + return y; +} + +// + +double xfma(double x, double y, double z) { + union { + double f; + long long int i; + } tmp; + + tmp.f = x; + tmp.i = (tmp.i + 0x4000000) & 0xfffffffff8000000LL; + double xh = tmp.f, xl = x - xh; + + tmp.f = y; + tmp.i = (tmp.i + 0x4000000) & 0xfffffffff8000000LL; + double yh = tmp.f, yl = y - yh; + + double h = x * y; + double l = xh * yh - h + xl * yh + xh * yl + xl * yl; + + double h2, l2, v; + + h2 = h + z; + v = h2 - h; + l2 = (h - (h2 - v)) + (z - v) + l; + + return h2 + l2; +} + +double xsqrt(double d) { // max error : 0.5 ulp + double q = 1; + + if (d < 8.636168555094445E-78) { + d *= 1.157920892373162E77; + q = 2.9387358770557188E-39; + } + + // http://en.wikipedia.org/wiki/Fast_inverse_square_root + double x = longBitsToDouble(0x5fe6ec85e7de30da - (doubleToRawLongBits(d + 1e-320) >> 1)); + + x = x * (1.5 - 0.5 * d * x * x); + x = x * (1.5 - 0.5 * d * x * x); + x = x * (1.5 - 0.5 * d * x * x); + + // You can change xfma to fma if fma is correctly implemented + x = xfma(d * x, d * x, -d) * (x * -0.5) + d * x; + + return d == INFINITY ? INFINITY : x * q; +} + +double xcbrt(double d) { // max error : 2 ulps + double x, y, q = 1.0; + int e, r; + + e = ilogbp1(d); + d = ldexpk(d, -e); + r = (e + 6144) % 3; + q = (r == 1) ? 1.2599210498948731647672106 : q; + q = (r == 2) ? 1.5874010519681994747517056 : q; + q = ldexpk(q, (e + 6144) / 3 - 2048); + + q = mulsign(q, d); + d = xfabs(d); + + x = -0.640245898480692909870982; + x = x * d + 2.96155103020039511818595; + x = x * d + -5.73353060922947843636166; + x = x * d + 6.03990368989458747961407; + x = x * d + -3.85841935510444988821632; + x = x * d + 2.2307275302496609725722; + + y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0); + y = d * x * x; + y = (y - (2.0 / 3.0) * y * (y * x - 1)) * q; + + return y; +} + +double xcbrt_u1(double d) { + double x, y, z; + double2 q2 = dd(1, 0), u, v; + int e, r; + + e = ilogbp1(d); + d = ldexpk(d, -e); + r = (e + 6144) % 3; + q2 = (r == 1) ? 
dd(1.2599210498948731907, -2.5899333753005069177e-17) : q2; + q2 = (r == 2) ? dd(1.5874010519681995834, -1.0869008194197822986e-16) : q2; + + q2.x = mulsign(q2.x, d); q2.y = mulsign(q2.y, d); + d = xfabs(d); + + x = -0.640245898480692909870982; + x = x * d + 2.96155103020039511818595; + x = x * d + -5.73353060922947843636166; + x = x * d + 6.03990368989458747961407; + x = x * d + -3.85841935510444988821632; + x = x * d + 2.2307275302496609725722; + + y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0); + + z = x; + + u = ddmul_d2_d_d(x, x); + u = ddmul_d2_d2_d2(u, u); + u = ddmul_d2_d2_d(u, d); + u = ddadd2_d2_d2_d(u, -x); + y = u.x + u.y; + + y = -2.0 / 3.0 * y * z; + v = ddadd2_d2_d2_d(ddmul_d2_d_d(z, z), y); + v = ddmul_d2_d2_d(v, d); + v = ddmul_d2_d2_d2(v, q2); + z = ldexp(v.x + v.y, (e + 6144) / 3 - 2048); + + if (xisinf(d)) { z = mulsign(INFINITY, q2.x); } + if (d == 0) { z = mulsign(0, q2.x); } + + return z; +} + +double xexp2(double a) { // 2^a, computed as expk(a * ln 2) with ln 2 held as a double-double + double u = expk(ddmul_d2_d2_d(dd(0.69314718055994528623, 2.3190468138462995584e-17), a)); + if (a >= 1024) u = INFINITY; // 2^a is still a finite double for every a < 1024, so do not clamp at 1023 + if (xisminf(a)) u = 0; + return u; +} + +double xexp10(double a) { // 10^a, computed as expk(a * ln 10) with ln 10 held as a double-double + double u = expk(ddmul_d2_d2_d(dd(2.3025850929940459011, -2.1707562233822493508e-16), a)); + if (a > 308.254715559916743850652254) u = INFINITY; // threshold is log10(DBL_MAX); 10^a is finite up to here, not only up to 308 + if (xisminf(a)) u = 0; + return u; +} + +double xexpm1(double a) { // e^a - 1, with the subtraction carried out in double-double + double2 d = ddadd2_d2_d2_d(expk2(dd(a, 0)), -1.0); + double x = d.x + d.y; + if (a > 709.782712893383996732223) x = INFINITY; // threshold is ln(DBL_MAX); the former cut-off of 700 returned INFINITY for finite results + if (a < -0.36043653389117156089696070315825181539851971360337e+2) x = -1; + return x; +} + +double xlog10(double a) { + double2 d = ddmul_d2_d2_d2(logk(a), dd(0.43429448190325176116, 6.6494347733425473126e-17)); + double x = d.x + d.y; + + if (xisinf(a)) x = INFINITY; + if (a < 0) x = NAN; + if (a == 0) x = -INFINITY; + + return x; +} + +double xlog1p(double a) { + double2 d = logk2(ddadd2_d2_d_d(a, 1)); + double x = d.x + d.y; + + if (xisinf(a)) x = INFINITY; + if (a < -1) x = NAN; + if (a == -1) x = -INFINITY; + + return x; +} diff --git
a/purec/sleefsp.c b/purec/sleefsp.c new file mode 100644 index 00000000..3dc0a1a2 --- /dev/null +++ b/purec/sleefsp.c @@ -0,0 +1,1093 @@ +#include +#include +#include +#include +#include + +#include "nonnumber.h" + +#define PI4_Af 0.78515625f +#define PI4_Bf 0.00024187564849853515625f +#define PI4_Cf 3.7747668102383613586e-08f +#define PI4_Df 1.2816720341285448015e-12f + +#define L2Uf 0.693145751953125f +#define L2Lf 1.428606765330187045e-06f + +#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f +#define M_PIf ((float)M_PI) + +static inline int32_t floatToRawIntBits(float d) { + union { + float f; + int32_t i; + } tmp; + tmp.f = d; + return tmp.i; +} + +static inline float intBitsToFloat(int32_t i) { + union { + float f; + int32_t i; + } tmp; + tmp.i = i; + return tmp.f; +} + +static inline float xfabsf(float x) { + return intBitsToFloat(0x7fffffffL & floatToRawIntBits(x)); +} + +static inline float mulsignf(float x, float y) { + return intBitsToFloat(floatToRawIntBits(x) ^ (floatToRawIntBits(y) & (1 << 31))); +} + +static inline float signf(float d) { return mulsignf(1, d); } +static inline float mlaf(float x, float y, float z) { return x * y + z; } +static inline float xrintf(float x) { return x < 0 ? (int)(x - 0.5f) : (int)(x + 0.5f); } + +static inline int xisnanf(float x) { return x != x; } +static inline int xisinff(float x) { return x == INFINITYf || x == -INFINITYf; } +static inline int xisminff(float x) { return x == -INFINITYf; } +static inline int xispinff(float x) { return x == INFINITYf; } + +static inline int ilogbp1f(float d) { + int m = d < 5.421010862427522E-20f; + d = m ? 1.8446744073709552E19f * d : d; + int q = (floatToRawIntBits(d) >> 23) & 0xff; + q = m ? 
q - (64 + 0x7e) : q - 0x7e; + return q; +} + +static inline float pow2if(int q) { + return intBitsToFloat(((int32_t)(q + 0x7f)) << 23); +} + +static inline float ldexpkf(float x, int q) { + float u; + int m; + m = q >> 31; + m = (((m + q) >> 6) - m) << 4; + q = q - (m << 2); + m += 127; + m = m < 0 ? 0 : m; + m = m > 255 ? 255 : m; + u = intBitsToFloat(((int32_t)m) << 23); + x = x * u * u * u * u; + u = intBitsToFloat(((int32_t)(q + 0x7f)) << 23); + return x * u; +} + +float xldexpf(float x, int q) { return ldexpkf(x, q); } + +// + +typedef struct { + float x, y; +} float2; + +#ifndef NDEBUG +static int checkfp(float x) { + if (xisinff(x) || xisnanf(x)) return 1; + return 0; +} +#endif + +static inline float upperf(float d) { + return intBitsToFloat(floatToRawIntBits(d) & 0xfffff000); +} + +static inline float2 df(float h, float l) { + float2 ret; + ret.x = h; ret.y = l; + return ret; +} + +static inline float2 dfnormalize_f2_f2(float2 t) { + float2 s; + + s.x = t.x + t.y; + s.y = t.x - s.x + t.y; + + return s; +} + +static inline float2 dfscale_f2_f2_f(float2 d, float s) { + float2 r; + + r.x = d.x * s; + r.y = d.y * s; + + return r; +} + +static inline float2 dfneg_f2_f2(float2 d) { + float2 r; + + r.x = -d.x; + r.y = -d.y; + + return r; +} + +static inline float2 dfadd_f2_f_f(float x, float y) { + // |x| >= |y| + + float2 r; + +#ifndef NDEBUG + if (!(checkfp(x) || checkfp(y) || xfabsf(x) >= xfabsf(y))) fprintf(stderr, "[dfadd_f2_f_f : %g, %g]", x, y); +#endif + + r.x = x + y; + r.y = x - r.x + y; + + return r; +} + +static inline float2 dfadd2_f2_f_f(float x, float y) { + float2 r; + + r.x = x + y; + float v = r.x - x; + r.y = (x - (r.x - v)) + (y - v); + + return r; +} + +static inline float2 dfadd_f2_f2_f(float2 x, float y) { + // |x| >= |y| + + float2 r; + +#ifndef NDEBUG + if (!(checkfp(x.x) || checkfp(y) || xfabsf(x.x) >= xfabsf(y))) fprintf(stderr, "[dfadd_f2_f2_f : %g %g]", x.x, y); +#endif + + r.x = x.x + y; + r.y = x.x - r.x + y + x.y; + + return r; +} 
+ +static inline float2 dfadd2_f2_f2_f(float2 x, float y) { + // |x| >= |y| + + float2 r; + + r.x = x.x + y; + float v = r.x - x.x; + r.y = (x.x - (r.x - v)) + (y - v); + r.y += x.y; + + return r; +} + +static inline float2 dfadd_f2_f_f2(float x, float2 y) { + // |x| >= |y| + + float2 r; + +#ifndef NDEBUG + if (!(checkfp(x) || checkfp(y.x) || xfabsf(x) >= xfabsf(y.x))) fprintf(stderr, "[dfadd_df_f_f2 : %g %g]", x, y.x); +#endif + + r.x = x + y.x; + r.y = x - r.x + y.x + y.y; + + return r; +} + +static inline float2 dfadd_f2_f2_f2(float2 x, float2 y) { + // |x| >= |y| + + float2 r; + +#ifndef NDEBUG + if (!(checkfp(x.x) || checkfp(y.x) || xfabsf(x.x) >= xfabsf(y.x))) fprintf(stderr, "[dfadd_f2_f2_f2 : %g %g]", x.x, y.x); +#endif + + r.x = x.x + y.x; + r.y = x.x - r.x + y.x + x.y + y.y; + + return r; +} + +static inline float2 dfadd2_f2_f2_f2(float2 x, float2 y) { + float2 r; + + r.x = x.x + y.x; + float v = r.x - x.x; + r.y = (x.x - (r.x - v)) + (y.x - v); + r.y += x.y + y.y; + + return r; +} + +static inline float2 dfsub_f2_f2_f2(float2 x, float2 y) { + // |x| >= |y| + + float2 r; + +#ifndef NDEBUG + if (!(checkfp(x.x) || checkfp(y.x) || xfabsf(x.x) >= xfabsf(y.x))) fprintf(stderr, "[dfsub_f2_f2_f2 : %g %g]", x.x, y.x); +#endif + + r.x = x.x - y.x; + r.y = x.x - r.x - y.x + x.y - y.y; + + return r; +} + +static inline float2 dfdiv_f2_f2_f2(float2 n, float2 d) { + float t = 1.0f / d.x; + float dh = upperf(d.x), dl = d.x - dh; + float th = upperf(t ), tl = t - th; + float nhh = upperf(n.x), nhl = n.x - nhh; + + float2 q; + + q.x = n.x * t; + + float u = -q.x + nhh * th + nhh * tl + nhl * th + nhl * tl + + q.x * (1 - dh * th - dh * tl - dl * th - dl * tl); + + q.y = t * (n.y - q.x * d.y) + u; + + return q; +} + +static inline float2 dfmul_f2_f_f(float x, float y) { + float xh = upperf(x), xl = x - xh; + float yh = upperf(y), yl = y - yh; + float2 r; + + r.x = x * y; + r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl; + + return r; +} + +static inline float2 
dfmul_f2_f2_f(float2 x, float y) { + float xh = upperf(x.x), xl = x.x - xh; + float yh = upperf(y ), yl = y - yh; + float2 r; + + r.x = x.x * y; + r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.y * y; + + return r; +} + +static inline float2 dfmul_f2_f2_f2(float2 x, float2 y) { + float xh = upperf(x.x), xl = x.x - xh; + float yh = upperf(y.x), yl = y.x - yh; + float2 r; + + r.x = x.x * y.x; + r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.x * y.y + x.y * y.x; + + return r; +} + +static inline float2 dfsqu_f2_f2(float2 x) { + float xh = upperf(x.x), xl = x.x - xh; + float2 r; + + r.x = x.x * x.x; + r.y = xh * xh - r.x + (xh + xh) * xl + xl * xl + x.x * (x.y + x.y); + + return r; +} + +static inline float2 dfrec_f2_f(float d) { + float t = 1.0f / d; + float dh = upperf(d), dl = d - dh; + float th = upperf(t), tl = t - th; + float2 q; + + q.x = t; + q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl); + + return q; +} + +static inline float2 dfrec_f2_f2(float2 d) { + float t = 1.0f / d.x; + float dh = upperf(d.x), dl = d.x - dh; + float th = upperf(t ), tl = t - th; + float2 q; + + q.x = t; + q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl - d.y * t); + + return q; +} + +static inline float2 dfsqrt_f2_f2(float2 d) { + float t = sqrtf(d.x + d.y); + return dfscale_f2_f2_f(dfmul_f2_f2_f2(dfadd2_f2_f2_f2(d, dfmul_f2_f_f(t, t)), dfrec_f2_f(t)), 0.5f); +} + +// + +float xsinf(float d) { + int q; + float u, s; + + q = (int)xrintf(d * (float)M_1_PI); + + d = mlaf(q, -PI4_Af*4, d); + d = mlaf(q, -PI4_Bf*4, d); + d = mlaf(q, -PI4_Cf*4, d); + d = mlaf(q, -PI4_Df*4, d); + + s = d * d; + + if ((q & 1) != 0) d = -d; + + u = 2.6083159809786593541503e-06f; + u = mlaf(u, s, -0.0001981069071916863322258f); + u = mlaf(u, s, 0.00833307858556509017944336f); + u = mlaf(u, s, -0.166666597127914428710938f); + + u = mlaf(s, u * d, d); + + if (xisinff(d)) u = NANf; + + return u; +} + +float xsinf_u1(float d) { + int q; + float u; + float2 s, t, x; + + q = (int)xrintf(d * 
(float)M_1_PI); + + s = dfadd2_f2_f_f(d, q * (-PI4_Af*4)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Bf*4)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Cf*4)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Df*4)); + + t = s; + s = dfsqu_f2_f2(s); + + u = 2.6083159809786593541503e-06f; + u = mlaf(u, s.x, -0.0001981069071916863322258f); + u = mlaf(u, s.x, 0.00833307858556509017944336f); + + x = dfadd_f2_f_f2(1, dfmul_f2_f2_f2(dfadd_f2_f_f(-0.166666597127914428710938f, u * s.x), s)); + + x = dfmul_f2_f2_f2(t, x); + u = x.x + x.y; + + if ((q & 1) != 0) u = -u; + + return u; +} + +float xcosf(float d) { + int q; + float u, s; + + q = 1 + 2*(int)xrintf(d * (float)M_1_PI - 0.5f); + + d = mlaf(q, -PI4_Af*2, d); + d = mlaf(q, -PI4_Bf*2, d); + d = mlaf(q, -PI4_Cf*2, d); + d = mlaf(q, -PI4_Df*2, d); + + s = d * d; + + if ((q & 2) == 0) d = -d; + + u = 2.6083159809786593541503e-06f; + u = mlaf(u, s, -0.0001981069071916863322258f); + u = mlaf(u, s, 0.00833307858556509017944336f); + u = mlaf(u, s, -0.166666597127914428710938f); + + u = mlaf(s, u * d, d); + + if (xisinff(d)) u = NANf; + + return u; +} + +float xcosf_u1(float d) { + float u, q; + float2 s, t, x; + + d = fabsf(d); + + q = 1 + 2*(int)xrintf(d * (float)M_1_PI - 0.5f); + + s = dfadd2_f2_f_f(d, q * (-PI4_Af*2)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Bf*2)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Cf*2)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Df*2)); + + t = s; + s = dfsqu_f2_f2(s); + + u = 2.6083159809786593541503e-06f; + u = mlaf(u, s.x, -0.0001981069071916863322258f); + u = mlaf(u, s.x, 0.00833307858556509017944336f); + + x = dfadd_f2_f_f2(1, dfmul_f2_f2_f2(dfadd_f2_f_f(-0.166666597127914428710938f, u * s.x), s)); + + x = dfmul_f2_f2_f2(t, x); + u = x.x + x.y; + + if ((((int)q) & 2) == 0) u = -u; + + return u; +} + +float2 xsincosf(float d) { + int q; + float u, s, t; + float2 r; + + q = (int)xrintf(d * ((float)(2 * M_1_PI))); + + s = d; + + s = mlaf(q, -PI4_Af*2, s); + s = mlaf(q, -PI4_Bf*2, s); + s = mlaf(q, -PI4_Cf*2, s); + s = mlaf(q, -PI4_Df*2, s); + + t 
= s; + + s = s * s; + + u = -0.000195169282960705459117889f; + u = mlaf(u, s, 0.00833215750753879547119141f); + u = mlaf(u, s, -0.166666537523269653320312f); + u = u * s * t; + + r.x = t + u; + + u = -2.71811842367242206819355e-07f; + u = mlaf(u, s, 2.47990446951007470488548e-05f); + u = mlaf(u, s, -0.00138888787478208541870117f); + u = mlaf(u, s, 0.0416666641831398010253906f); + u = mlaf(u, s, -0.5f); + + r.y = u * s + 1; + + if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; } + if ((q & 2) != 0) { r.x = -r.x; } + if (((q+1) & 2) != 0) { r.y = -r.y; } + + if (xisinff(d)) { r.x = r.y = NANf; } + + return r; +} + +float2 xsincosf_u1(float d) { // returns sine in .x and cosine in .y, using float2 arithmetic for the argument reduction + int q; + float u; + float2 r, s, t, x; + + q = (int)xrintf(d * (float)(2 * M_1_PI)); + + s = dfadd2_f2_f_f(d, q * (-PI4_Af*2)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Bf*2)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Cf*2)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Df*2)); + + t = s; + s = dfsqu_f2_f2(s); + s.x = s.x + s.y; + + u = -0.000195169282960705459117889f; + u = mlaf(u, s.x, 0.00833215750753879547119141f); + u = mlaf(u, s.x, -0.166666537523269653320312f); + + u *= s.x * t.x; + + x = dfadd_f2_f2_f(t, u); + r.x = x.x + x.y; + + u = -2.71811842367242206819355e-07f; + u = mlaf(u, s.x, 2.47990446951007470488548e-05f); + u = mlaf(u, s.x, -0.00138888787478208541870117f); + u = mlaf(u, s.x, 0.0416666641831398010253906f); + u = mlaf(u, s.x, -0.5f); + + x = dfadd_f2_f_f2(1, dfmul_f2_f_f(s.x, u)); + r.y = x.x + x.y; + + if ((q & 1) != 0) { u = r.y; r.y = r.x; r.x = u; } + if ((q & 2) != 0) { r.x = -r.x; } + if (((q+1) & 2) != 0) { r.y = -r.y; } + + if (xisinff(d)) { r.x = r.y = NANf; } // NANf, matching xsincosf and the other single-precision functions in this file + + return r; +} + +float xtanf(float d) { + int q; + float u, s, x; + + q = (int)xrintf(d * (float)(2 * M_1_PI)); + + x = d; + + x = mlaf(q, -PI4_Af*2, x); + x = mlaf(q, -PI4_Bf*2, x); + x = mlaf(q, -PI4_Cf*2, x); + x = mlaf(q, -PI4_Df*2, x); + + s = x * x; + + if ((q & 1) != 0) x = -x; + + u = 0.00927245803177356719970703f; + u = mlaf(u, s,
0.00331984995864331722259521f); + u = mlaf(u, s, 0.0242998078465461730957031f); + u = mlaf(u, s, 0.0534495301544666290283203f); + u = mlaf(u, s, 0.133383005857467651367188f); + u = mlaf(u, s, 0.333331853151321411132812f); + + u = mlaf(s, u * x, x); + + if ((q & 1) != 0) u = 1.0f / u; + + if (xisinff(d)) u = NANf; + + return u; +} + +float xtanf_u1(float d) { + int q; + float u; + float2 s, t, x; + + q = (int)xrintf(d * (float)(2 * M_1_PI)); + + s = dfadd2_f2_f_f(d, q * (-PI4_Af*2)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Bf*2)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Cf*2)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Df*2)); + + if ((q & 1) != 0) s = dfneg_f2_f2(s); + + t = s; + s = dfsqu_f2_f2(s); + s = dfnormalize_f2_f2(s); + + u = 0.00446636462584137916564941f; + u = mlaf(u, s.x, -8.3920182078145444393158e-05f); + u = mlaf(u, s.x, 0.0109639242291450500488281f); + u = mlaf(u, s.x, 0.0212360303848981857299805f); + u = mlaf(u, s.x, 0.0540687143802642822265625f); + + x = dfadd_f2_f_f(0.133325666189193725585938f, u * s.x); + x = dfadd_f2_f_f2(1, dfmul_f2_f2_f2(dfadd_f2_f_f2(0.33333361148834228515625f, dfmul_f2_f2_f2(s, x)), s)); + x = dfmul_f2_f2_f2(t, x); + + if ((q & 1) != 0) x = dfrec_f2_f2(x); + + u = x.x + x.y; + + return u; +} + +float xatanf(float s) { + float t, u; + int q = 0; + + if (s < 0) { s = -s; q = 2; } + if (s > 1) { s = 1.0f / s; q |= 1; } + + t = s * s; + + u = 0.00282363896258175373077393f; + u = mlaf(u, t, -0.0159569028764963150024414f); + u = mlaf(u, t, 0.0425049886107444763183594f); + u = mlaf(u, t, -0.0748900920152664184570312f); + u = mlaf(u, t, 0.106347933411598205566406f); + u = mlaf(u, t, -0.142027363181114196777344f); + u = mlaf(u, t, 0.199926957488059997558594f); + u = mlaf(u, t, -0.333331018686294555664062f); + + t = s + s * (t * u); + + if ((q & 1) != 0) t = 1.570796326794896557998982f - t; + if ((q & 2) != 0) t = -t; + + return t; +} + +static inline float atan2kf(float y, float x) { + float s, t, u; + int q = 0; + + if (x < 0) { x = -x; q = -2; } + if 
(y > x) { t = x; x = y; y = -t; q += 1; } + + s = y / x; + t = s * s; + + u = 0.00282363896258175373077393f; + u = mlaf(u, t, -0.0159569028764963150024414f); + u = mlaf(u, t, 0.0425049886107444763183594f); + u = mlaf(u, t, -0.0748900920152664184570312f); + u = mlaf(u, t, 0.106347933411598205566406f); + u = mlaf(u, t, -0.142027363181114196777344f); + u = mlaf(u, t, 0.199926957488059997558594f); + u = mlaf(u, t, -0.333331018686294555664062f); + + t = u * t * s + s; + t = q * (float)(M_PI/2) + t; + + return t; +} + +float xatan2f(float y, float x) { + float r = atan2kf(xfabsf(y), x); + + r = mulsignf(r, x); + if (xisinff(x) || x == 0) r = M_PIf/2 - (xisinff(x) ? (signf(x) * (float)(M_PI /2)) : 0); + if (xisinff(y) ) r = M_PIf/2 - (xisinff(x) ? (signf(x) * (float)(M_PI*1/4)) : 0); + if ( y == 0) r = (signf(x) == -1 ? M_PIf : 0); + + return xisnanf(x) || xisnanf(y) ? NANf : mulsignf(r, y); +} + +float xasinf(float d) { + return mulsignf(atan2kf(fabsf(d), sqrtf((1.0f+d)*(1.0f-d))), d); +} + +float xacosf(float d) { + return mulsignf(atan2kf(sqrtf((1.0f+d)*(1.0f-d)), fabsf(d)), d) + (d < 0 ? 
(float)M_PI : 0.0f); +} + +static float2 atan2kf_u1(float2 y, float2 x) { + float u; + float2 s, t; + int q = 0; + + if (x.x < 0) { x.x = -x.x; x.y = -x.y; q = -2; } + if (y.x > x.x) { t = x; x = y; y.x = -t.x; y.y = -t.y; q += 1; } + + s = dfdiv_f2_f2_f2(y, x); + t = dfsqu_f2_f2(s); + t = dfnormalize_f2_f2(t); + + u = -0.00176397908944636583328247f; + u = mlaf(u, t.x, 0.0107900900766253471374512f); + u = mlaf(u, t.x, -0.0309564601629972457885742f); + u = mlaf(u, t.x, 0.0577365085482597351074219f); + u = mlaf(u, t.x, -0.0838950723409652709960938f); + u = mlaf(u, t.x, 0.109463557600975036621094f); + u = mlaf(u, t.x, -0.142626821994781494140625f); + u = mlaf(u, t.x, 0.199983194470405578613281f); + + //u = mlaf(u, t.x, -0.333332866430282592773438f); + //t = dfmul_f2_f2_f(t, u); + + t = dfmul_f2_f2_f2(t, dfadd_f2_f_f(-0.333332866430282592773438f, u * t.x)); + t = dfmul_f2_f2_f2(s, dfadd_f2_f_f2(1, t)); + t = dfadd2_f2_f2_f2(dfmul_f2_f2_f(df(1.5707963705062866211f, -4.3711388286737928865e-08f), q), t); + + return t; +} + +float xatan2f_u1(float y, float x) { + float2 d = atan2kf_u1(df(xfabsf(y), 0), df(x, 0)); + float r = d.x + d.y; + + r = mulsignf(r, x); + if (xisinff(x) || x == 0) r = (float)M_PI/2 - (xisinff(x) ? (signf(x) * (float)(M_PI /2)) : 0.0f); + if (xisinff(y) ) r = (float)M_PI/2 - (xisinff(x) ? (signf(x) * (float)(M_PI*1/4)) : 0.0f); + if ( y == 0) r = (signf(x) == -1 ? (float)M_PI : 0.0f); + + return xisnanf(x) || xisnanf(y) ? 
NANf : mulsignf(r, y); +} + +float xasinf_u1(float d) { + float2 d2 = atan2kf_u1(df(xfabsf(d), 0), dfsqrt_f2_f2(dfmul_f2_f2_f2(dfadd_f2_f_f(1, d), dfadd_f2_f_f(1,-d)))); + float r = d2.x + d2.y; + if (xfabsf(d) == 1) r = 1.570796326794896557998982f; + return mulsignf(r, d); +} + +float xacosf_u1(float d) { + float2 d2 = atan2kf_u1(dfsqrt_f2_f2(dfmul_f2_f2_f2(dfadd_f2_f_f(1, d), dfadd_f2_f_f(1,-d))), df(xfabsf(d), 0)); + d2 = dfscale_f2_f2_f(d2, mulsignf(1.0f, d)); + if (xfabsf(d) == 1) d2 = df(0.0f, 0.0f); + if (d < 0) d2 = dfadd_f2_f2_f2(df(3.1415927410125732422f,-8.7422776573475857731e-08f), d2); + return d2.x + d2.y; +} + +float xatanf_u1(float d) { + float2 d2 = atan2kf_u1(df(xfabsf(d), 0.0f), df(1.0f, 0.0f)); + float r = d2.x + d2.y; + if (xisinff(d)) r = 1.570796326794896557998982f; + return mulsignf(r, d); +} + +float xlogf(float d) { + float x, x2, t, m; + int e; + + e = ilogbp1f(d * 0.7071f); + m = ldexpkf(d, -e); + + x = (m-1.0f) / (m+1.0f); + x2 = x * x; + + t = 0.2371599674224853515625f; + t = mlaf(t, x2, 0.285279005765914916992188f); + t = mlaf(t, x2, 0.400005519390106201171875f); + t = mlaf(t, x2, 0.666666567325592041015625f); + t = mlaf(t, x2, 2.0f); + + x = x * t + 0.693147180559945286226764f * e; + + if (xisinff(d)) x = INFINITYf; + if (d < 0) x = NANf; + if (d == 0) x = -INFINITYf; + + return x; +} + +float xexpf(float d) { + int q = (int)xrintf(d * R_LN2f); + float s, u; + + s = mlaf(q, -L2Uf, d); + s = mlaf(q, -L2Lf, s); + + u = 0.00136324646882712841033936f; + u = mlaf(u, s, 0.00836596917361021041870117f); + u = mlaf(u, s, 0.0416710823774337768554688f); + u = mlaf(u, s, 0.166665524244308471679688f); + u = mlaf(u, s, 0.499999850988388061523438f); + + u = s * s * u + s + 1.0f; + u = ldexpkf(u, q); + + if (xisminff(d)) u = 0; + + return u; +} + +//#define L2Af 0.693145751953125 +//#define L2Bf 1.4285906217992305756e-06 +//#define L2Cf 1.619850954759360917e-11 + +static inline float expkf(float2 d) { + int q = (int)xrintf((d.x + d.y) * R_LN2f); + 
float2 s, t; + float u; + + s = dfadd2_f2_f2_f(d, q * -L2Uf); + s = dfadd2_f2_f2_f(s, q * -L2Lf); + + //s = dfadd2_f2_f2_f(d, q * -L2Af); + //s = dfadd2_f2_f2_f(s, q * -L2Bf); + //s = dfadd2_f2_f2_f(s, q * -L2Cf); + + s = dfnormalize_f2_f2(s); + + u = 0.00136324646882712841033936f; + u = mlaf(u, s.x, 0.00836596917361021041870117f); + u = mlaf(u, s.x, 0.0416710823774337768554688f); + u = mlaf(u, s.x, 0.166665524244308471679688f); + u = mlaf(u, s.x, 0.499999850988388061523438f); + + t = dfadd_f2_f2_f2(s, dfmul_f2_f2_f(dfsqu_f2_f2(s), u)); + + t = dfadd_f2_f_f2(1, t); + return ldexpkf(t.x + t.y, q); +} + +static inline float2 logkf(float d) { + float2 x, x2; + float m, t; + int e; + + e = ilogbp1f(d * 0.7071f); + m = ldexpkf(d, -e); + + x = dfdiv_f2_f2_f2(dfadd2_f2_f_f(-1, m), dfadd2_f2_f_f(1, m)); + x2 = dfsqu_f2_f2(x); + + t = 0.2371599674224853515625f; + t = mlaf(t, x2.x, 0.285279005765914916992188f); + t = mlaf(t, x2.x, 0.400005519390106201171875f); + t = mlaf(t, x2.x, 0.666666567325592041015625f); + + return dfadd2_f2_f2_f2(dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), e), + dfadd2_f2_f2_f2(dfscale_f2_f2_f(x, 2), dfmul_f2_f2_f(dfmul_f2_f2_f2(x2, x), t))); +} + +float xlogf_u1(float d) { + float2 s = logkf(d); + float x = s.x + s.y; + + if (xisinff(d)) x = INFINITYf; + if (d < 0) x = NANf; + if (d == 0) x = -INFINITYf; + + return x; +} + +static inline float2 expk2f(float2 d) { + int q = (int)xrintf((d.x + d.y) * R_LN2f); + float2 s, t; + float u; + + s = dfadd2_f2_f2_f(d, q * -L2Uf); + s = dfadd2_f2_f2_f(s, q * -L2Lf); + + u = 0.00136324646882712841033936f; + u = mlaf(u, s.x, 0.00836596917361021041870117f); + u = mlaf(u, s.x, 0.0416710823774337768554688f); + u = mlaf(u, s.x, 0.166665524244308471679688f); + u = mlaf(u, s.x, 0.499999850988388061523438f); + + t = dfadd_f2_f2_f2(s, dfmul_f2_f2_f(dfsqu_f2_f2(s), u)); + + t = dfadd_f2_f_f2(1, t); + return dfscale_f2_f2_f(t, pow2if(q)); +} + +float xpowf(float x, float y) { + int yisint = (int)y 
== y; + int yisodd = (1 & (int)y) != 0 && yisint; + + float result = expkf(dfmul_f2_f2_f(logkf(xfabsf(x)), y)); + + result = xisnanf(result) ? INFINITYf : result; + result *= (x >= 0 ? 1 : (!yisint ? NANf : (yisodd ? -1 : 1))); + + float efx = mulsignf(xfabsf(x) - 1, y); + if (xisinff(y)) result = efx < 0 ? 0.0f : (efx == 0 ? 1.0f : INFINITYf); + if (xisinff(x) || x == 0) result = (yisodd ? signf(x) : 1) * ((x == 0 ? -y : y) < 0 ? 0 : INFINITYf); + if (xisnanf(x) || xisnanf(y)) result = NANf; + if (y == 0 || x == 1) result = 1; + + return result; +} + +float xsinhf(float x) { + float y = xfabsf(x); + float2 d = expk2f(df(y, 0)); + d = dfsub_f2_f2_f2(d, dfrec_f2_f2(d)); + y = (d.x + d.y) * 0.5f; + + y = xfabsf(x) > 89 ? INFINITY : y; + y = xisnanf(y) ? INFINITYf : y; + y = mulsignf(y, x); + y = xisnanf(x) ? NANf : y; + + return y; +} + +float xcoshf(float x) { + float y = xfabsf(x); + float2 d = expk2f(df(y, 0)); + d = dfadd_f2_f2_f2(d, dfrec_f2_f2(d)); + y = (d.x + d.y) * 0.5f; + + y = xfabsf(x) > 89 ? INFINITY : y; + y = xisnanf(y) ? INFINITYf : y; + y = xisnanf(x) ? NANf : y; + + return y; +} + +float xtanhf(float x) { + float y = xfabsf(x); + float2 d = expk2f(df(y, 0)); + float2 e = dfrec_f2_f2(d); + d = dfdiv_f2_f2_f2(dfsub_f2_f2_f2(d, e), dfadd_f2_f2_f2(d, e)); + y = d.x + d.y; + + y = xfabsf(x) > 8.664339742f ? 1.0f : y; + y = xisnanf(y) ? 1.0f : y; + y = mulsignf(y, x); + y = xisnanf(x) ? 
NANf : y; + + return y; +} + +static inline float2 logk2f(float2 d) { + float2 x, x2, m; + float t; + int e; + + e = ilogbp1f(d.x * 0.7071f); + m = dfscale_f2_f2_f(d, pow2if(-e)); + + x = dfdiv_f2_f2_f2(dfadd2_f2_f2_f(m, -1), dfadd2_f2_f2_f(m, 1)); + x2 = dfsqu_f2_f2(x); + + t = 0.2371599674224853515625f; + t = mlaf(t, x2.x, 0.285279005765914916992188f); + t = mlaf(t, x2.x, 0.400005519390106201171875f); + t = mlaf(t, x2.x, 0.666666567325592041015625f); + + return dfadd2_f2_f2_f2(dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), e), + dfadd2_f2_f2_f2(dfscale_f2_f2_f(x, 2), dfmul_f2_f2_f(dfmul_f2_f2_f2(x2, x), t))); +} + +float xasinhf(float x) { + float y = xfabsf(x); + float2 d = logk2f(dfadd2_f2_f2_f(dfsqrt_f2_f2(dfadd2_f2_f2_f(dfmul_f2_f_f(y, y), 1)), y)); + y = d.x + d.y; + + y = xisinff(x) || xisnanf(y) ? INFINITYf : y; + y = mulsignf(y, x); + y = xisnanf(x) ? NANf : y; + + return y; +} + +float xacoshf(float x) { + float2 d = logk2f(dfadd2_f2_f2_f(dfsqrt_f2_f2(dfadd2_f2_f2_f(dfmul_f2_f_f(x, x), -1)), x)); + float y = d.x + d.y; + + y = xisinff(x) || xisnanf(y) ? INFINITYf : y; + y = x == 1.0f ? 0.0f : y; + y = x < 1.0f ? NANf : y; + y = xisnanf(x) ? NANf : y; + + return y; +} + +float xatanhf(float x) { + float y = xfabsf(x); + float2 d = logk2f(dfdiv_f2_f2_f2(dfadd2_f2_f_f(1, y), dfadd2_f2_f_f(1, -y))); + y = y > 1.0 ? NANf : (y == 1.0 ? INFINITYf : (d.x + d.y) * 0.5f); + + y = xisinff(x) || xisnanf(y) ? NANf : y; + y = mulsignf(y, x); + y = xisnanf(x) ? 
NANf : y; + + return y; +} + +float xexp2f(float a) { + float u = expkf(dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), a)); + if (xispinff(a)) u = INFINITYf; + if (xisminff(a)) u = 0; + return u; +} + +float xexp10f(float a) { + float u = expkf(dfmul_f2_f2_f(df(2.3025851249694824219f, -3.1975436520781386207e-08f), a)); + if (xispinff(a)) u = INFINITYf; + if (xisminff(a)) u = 0; + return u; +} + +float xexpm1f(float a) { + float2 d = dfadd2_f2_f2_f(expk2f(df(a, 0)), -1.0f); + float x = d.x + d.y; + if (a > 88.0f) x = INFINITYf; + if (a < -0.15942385152878742116596338793538061065739925620174e+2f) x = -1; + return x; +} + +float xlog10f(float a) { + float2 d = dfmul_f2_f2_f2(logkf(a), df(0.43429449200630187988f, -1.0103050118726031315e-08f)); + float x = d.x + d.y; + + if (xisinff(a)) x = INFINITYf; + if (a < 0) x = NANf; + if (a == 0) x = -INFINITYf; + + return x; +} + +float xlog1pf(float a) { + float2 d = logk2f(dfadd2_f2_f_f(a, 1)); + float x = d.x + d.y; + + if (xisinff(a)) x = INFINITYf; + if (a < -1) x = NANf; + if (a == -1) x = -INFINITYf; + + return x; +} + +float xsqrtf(float f) { return sqrtf(f); } + +float xcbrtf(float d) { + float x, y, q = 1.0f; + int e, r; + + e = ilogbp1f(d); + d = ldexpkf(d, -e); + r = (e + 6144) % 3; + q = (r == 1) ? 1.2599210498948731647672106f : q; + q = (r == 2) ? 1.5874010519681994747517056f : q; + q = ldexpkf(q, (e + 6144) / 3 - 2048); + + q = mulsignf(q, d); + d = xfabsf(d); + + x = -0.601564466953277587890625f; + x = mlaf(x, d, 2.8208892345428466796875f); + x = mlaf(x, d, -5.532182216644287109375f); + x = mlaf(x, d, 5.898262500762939453125f); + x = mlaf(x, d, -3.8095417022705078125f); + x = mlaf(x, d, 2.2241256237030029296875f); + + y = d * x * x; + y = (y - (2.0f / 3.0f) * y * (y * x - 1.0f)) * q; + + return y; +} + +float xcbrtf_u1(float d) { + float x, y, z; + float2 q2 = df(1, 0), u, v; + int e, r; + + e = ilogbp1f(d); + d = ldexpkf(d, -e); + r = (e + 6144) % 3; + q2 = (r == 1) ? 
df(1.2599210739135742188, -2.4018701694217270415e-08) : q2; + q2 = (r == 2) ? df(1.5874010324478149414, 1.9520385308169352356e-08) : q2; + + q2.x = mulsignf(q2.x, d); q2.y = mulsignf(q2.y, d); + d = xfabsf(d); + + x = -0.601564466953277587890625f; + x = mlaf(x, d, 2.8208892345428466796875f); + x = mlaf(x, d, -5.532182216644287109375f); + x = mlaf(x, d, 5.898262500762939453125f); + x = mlaf(x, d, -3.8095417022705078125f); + x = mlaf(x, d, 2.2241256237030029296875f); + + y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0f); + + z = x; + + u = dfmul_f2_f_f(x, x); + u = dfmul_f2_f2_f2(u, u); + u = dfmul_f2_f2_f(u, d); + u = dfadd2_f2_f2_f(u, -x); + y = u.x + u.y; + + y = -2.0 / 3.0 * y * z; + v = dfadd2_f2_f2_f(dfmul_f2_f_f(z, z), y); + v = dfmul_f2_f2_f(v, d); + v = dfmul_f2_f2_f2(v, q2); + z = ldexpf(v.x + v.y, (e + 6144) / 3 - 2048); + + if (xisinff(d)) { z = mulsignf(INFINITYf, q2.x); } + if (d == 0) { z = mulsignf(0, q2.x); } + + return z; +} diff --git a/simd/Makefile b/simd/Makefile new file mode 100644 index 00000000..d6bf9934 --- /dev/null +++ b/simd/Makefile @@ -0,0 +1,56 @@ +CC=gcc +OPT=-O -Wall -Wno-unused -Wno-attributes +SDE=/opt/sde-bdw-external-5.38.0-2013-01-03-lin/sde + +all : testsse2 testavx + +iutsse2 : sleefsimddp.c sleefsimdsp.c helpersse2.h iut.c + $(CC) $(OPT) -DENABLE_SSE2 -msse2 iut.c sleefsimddp.c sleefsimdsp.c -o iutsse2 -lm + +iutavx : sleefsimddp.c sleefsimdsp.c helperavx.h iut.c + $(CC) $(OPT) -DENABLE_AVX -mavx iut.c sleefsimddp.c sleefsimdsp.c -o iutavx -lm + +iutavx2 : sleefsimddp.c sleefsimdsp.c helperavx2.h iut.c + $(CC) $(OPT) -DENABLE_AVX2 -mavx2 -mfma iut.c sleefsimddp.c sleefsimdsp.c -o iutavx2 -lm + +iutfma4 : sleefsimddp.c sleefsimdsp.c helperfma4.h iut.c + $(CC) $(OPT) -DENABLE_FMA4 -mavx -mfma4 iut.c sleefsimddp.c sleefsimdsp.c -o iutfma4 -lm + +../tester/tester : + cd ../tester; make tester + +../tester/testeru1 : + cd ../tester; make testeru1 + +../tester/testersp : + cd ../tester; make testersp + +../tester/testerspu1 : 
+ cd ../tester; make testerspu1 + +testsse2 : iutsse2 ../tester/tester ../tester/testeru1 ../tester/testersp ../tester/testerspu1 + ../tester/tester ./iutsse2 + ../tester/testeru1 ./iutsse2 + ../tester/testersp ./iutsse2 + ../tester/testerspu1 ./iutsse2 + +testavx : iutavx ../tester/tester ../tester/testeru1 ../tester/testersp ../tester/testerspu1 + ../tester/tester ./iutavx + ../tester/testeru1 ./iutavx + ../tester/testersp ./iutavx + ../tester/testerspu1 ./iutavx + +testavx2 : iutavx2 ../tester/tester ../tester/testeru1 ../tester/testersp ../tester/testerspu1 + ../tester/tester $(SDE) -- ./iutavx2 + ../tester/testeru1 $(SDE) -- ./iutavx2 + ../tester/testersp $(SDE) -- ./iutavx2 + ../tester/testerspu1 $(SDE) -- ./iutavx2 + +testfma4 : iutfma4 ../tester/tester ../tester/testeru1 ../tester/testersp ../tester/testerspu1 + ../tester/tester ./iutfma4 + ../tester/testeru1 ./iutfma4 + ../tester/testersp ./iutfma4 + ../tester/testerspu1 ./iutfma4 + +clean : + rm -f *~ *.o *.s iutsse2 iutavx iutavx2 iutfma4 iutneon diff --git a/simd/Makefile.arm b/simd/Makefile.arm new file mode 100644 index 00000000..e8955822 --- /dev/null +++ b/simd/Makefile.arm @@ -0,0 +1,26 @@ +CC=arm-linux-gnueabi-gcc +OPT=-O -Wall -Wno-unused -Wno-attributes -mfloat-abi=softfp -mfpu=neon -static + +all : testneon + +iutneon : sleefsimdsp.c helperneon.h iut.c + $(CC) $(OPT) -DENABLE_NEON iut.c sleefsimdsp.c -o iutneon -lm + +../tester/tester : + cd ../tester; make tester + +../tester/testeru1 : + cd ../tester; make testeru1 + +../tester/testersp : + cd ../tester; make testersp + +../tester/testerspu1 : + cd ../tester; make testerspu1 + +testneon : iutneon ../tester/testersp ../tester/testerspu1 + ../tester/testersp --flushtozero ./iutneon + ../tester/testerspu1 --flushtozero ./iutneon + +clean : + rm -f *~ *.o *.s iutsse2 iutavx iutavx2 iutfma4 iutneon diff --git a/simd/Makefile.icc b/simd/Makefile.icc new file mode 100644 index 00000000..7a73238d --- /dev/null +++ b/simd/Makefile.icc @@ -0,0 +1,44 @@ 
+CC=/opt/intel/bin/icc +SDE=/opt/sde-bdw-external-5.38.0-2013-01-03-lin/sde + +iutsse2 : sleefsimdsp.c sleefsimddp.c helpersse2.h iut.c + $(CC) -DENABLE_SSE2 -fp-model precise -Wall -Wno-unused -O -msse2 iut.c sleefsimdsp.c sleefsimddp.c -o iutsse2 -lm + +iutavx : sleefsimdsp.c sleefsimddp.c helperavx.h iut.c + $(CC) -DENABLE_AVX -fp-model precise -Wall -Wno-unused -O -mavx iut.c sleefsimdsp.c sleefsimddp.c -o iutavx -lm + +iutavx2 : sleefsimdsp.c sleefsimddp.c helperavx2.h iut.c + $(CC) -DENABLE_AVX2 -fp-model precise -Wall -Wno-unused -O -march=core-avx2 iut.c sleefsimdsp.c sleefsimddp.c -o iutavx2 -lm + +../tester/tester : + cd ../tester; make tester + +../tester/testeru1 : + cd ../tester; make testeru1 + +../tester/testersp : + cd ../tester; make testersp + +../tester/testerspu1 : + cd ../tester; make testerspu1 + +testsse2 : iutsse2 ../tester/tester ../tester/testersp ../tester/testeru1 ../tester/testerspu1 + ../tester/tester ./iutsse2 + ../tester/testeru1 ./iutsse2 + ../tester/testersp ./iutsse2 + ../tester/testerspu1 ./iutsse2 + +testavx : iutavx ../tester/tester ../tester/testersp ../tester/testeru1 ../tester/testerspu1 + ../tester/tester ./iutavx + ../tester/testeru1 ./iutavx + ../tester/testersp ./iutavx + ../tester/testerspu1 ./iutavx + +testavx2 : iutavx2 ../tester/tester ../tester/testersp ../tester/testeru1 ../tester/testerspu1 + ../tester/tester $(SDE) -- ./iutavx2 + ../tester/testeru1 $(SDE) -- ./iutavx2 + ../tester/testersp $(SDE) -- ./iutavx2 + ../tester/testerspu1 $(SDE) -- ./iutavx2 + +clean : + rm -f *~ *.o *.s iutsse2 iutavx iutavx2 iutneon diff --git a/simd/dd.h b/simd/dd.h new file mode 100644 index 00000000..696c49cb --- /dev/null +++ b/simd/dd.h @@ -0,0 +1,342 @@ +typedef struct { + vdouble x, y; +} vdouble2; + +static INLINE vdouble2 vcast_vd2_vd_vd(vdouble h, vdouble l) { + vdouble2 ret = {h, l}; + return ret; +} + +static INLINE vdouble2 vcast_vd2_d_d(double h, double l) { + vdouble2 ret = {vcast_vd_d(h), vcast_vd_d(l)}; + return ret; 
+} + +static INLINE vdouble2 vsel_vd2_vm_vd2_vd2(vmask m, vdouble2 x, vdouble2 y) { + vdouble2 r; + r.x = vsel_vd_vm_vd_vd(m, x.x, y.x); + r.y = vsel_vd_vm_vd_vd(m, x.y, y.y); + return r; +} + +static INLINE vdouble vadd_vd_3vd(vdouble v0, vdouble v1, vdouble v2) { + return vadd_vd_vd_vd(vadd_vd_vd_vd(v0, v1), v2); +} + +static INLINE vdouble vadd_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) { + return vadd_vd_3vd(vadd_vd_vd_vd(v0, v1), v2, v3); +} + +static INLINE vdouble vadd_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) { + return vadd_vd_4vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4); +} + +static INLINE vdouble vadd_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) { + return vadd_vd_5vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5); +} + +static INLINE vdouble vadd_vd_7vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) { + return vadd_vd_6vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5, v6); +} + +static INLINE vdouble vsub_vd_3vd(vdouble v0, vdouble v1, vdouble v2) { + return vsub_vd_vd_vd(vsub_vd_vd_vd(v0, v1), v2); +} + +static INLINE vdouble vsub_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) { + return vsub_vd_3vd(vsub_vd_vd_vd(v0, v1), v2, v3); +} + +static INLINE vdouble vsub_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) { + return vsub_vd_4vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4); +} + +static INLINE vdouble vsub_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) { + return vsub_vd_5vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4, v5); +} + +// + +static INLINE vdouble2 ddneg_vd2_vd2(vdouble2 x) { + return vcast_vd2_vd_vd(vneg_vd_vd(x.x), vneg_vd_vd(x.y)); +} + +static INLINE vdouble2 ddnormalize_vd2_vd2(vdouble2 t) { + vdouble2 s; + + s.x = vadd_vd_vd_vd(t.x, t.y); + s.y = vadd_vd_vd_vd(vsub_vd_vd_vd(t.x, s.x), t.y); + + return s; +} + +static INLINE vdouble2 ddscale_vd2_vd2_vd(vdouble2 d, vdouble s) { + vdouble2 r = 
{vmul_vd_vd_vd(d.x, s), vmul_vd_vd_vd(d.y, s)}; + return r; +} + +static INLINE vdouble2 ddadd_vd2_vd_vd(vdouble x, vdouble y) { + vdouble2 r; + + r.x = vadd_vd_vd_vd(x, y); + r.y = vadd_vd_vd_vd(vsub_vd_vd_vd(x, r.x), y); + + return r; +} + +static INLINE vdouble2 ddadd2_vd2_vd_vd(vdouble x, vdouble y) { + vdouble2 r; + + r.x = vadd_vd_vd_vd(x, y); + vdouble v = vsub_vd_vd_vd(r.x, x); + r.y = vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(r.x, v)), vsub_vd_vd_vd(y, v)); + + return r; +} + +static INLINE vdouble2 ddadd_vd2_vd2_vd(vdouble2 x, vdouble y) { + vdouble2 r; + + r.x = vadd_vd_vd_vd(x.x, y); + r.y = vadd_vd_3vd(vsub_vd_vd_vd(x.x, r.x), y, x.y); + + return r; +} + +static INLINE vdouble2 ddadd2_vd2_vd2_vd(vdouble2 x, vdouble y) { + vdouble2 r; + + r.x = vadd_vd_vd_vd(x.x, y); + vdouble v = vsub_vd_vd_vd(r.x, x.x); + r.y = vadd_vd_vd_vd(vsub_vd_vd_vd(x.x, vsub_vd_vd_vd(r.x, v)), vsub_vd_vd_vd(y, v)); + r.y = vadd_vd_vd_vd(r.y, x.y); + + return r; +} + +static INLINE vdouble2 ddadd_vd2_vd_vd2(vdouble x, vdouble2 y) { + vdouble2 r; + + r.x = vadd_vd_vd_vd(x, y.x); + r.y = vadd_vd_3vd(vsub_vd_vd_vd(x, r.x), y.x, y.y); + + return r; +} + +static INLINE vdouble2 ddadd_vd2_vd2_vd2(vdouble2 x, vdouble2 y) { + // |x| >= |y| + + vdouble2 r; + + r.x = vadd_vd_vd_vd(x.x, y.x); + r.y = vadd_vd_4vd(vsub_vd_vd_vd(x.x, r.x), y.x, x.y, y.y); + + return r; +} + +static INLINE vdouble2 ddadd2_vd2_vd2_vd2(vdouble2 x, vdouble2 y) { + vdouble2 r; + + r.x = vadd_vd_vd_vd(x.x, y.x); + vdouble v = vsub_vd_vd_vd(r.x, x.x); + r.y = vadd_vd_vd_vd(vsub_vd_vd_vd(x.x, vsub_vd_vd_vd(r.x, v)), vsub_vd_vd_vd(y.x, v)); + r.y = vadd_vd_vd_vd(r.y, vadd_vd_vd_vd(x.y, y.y)); + + return r; +} + +static inline vdouble2 ddsub_vd2_vd_vd(vdouble x, vdouble y) { + // |x| >= |y| + + vdouble2 r; + + r.x = vsub_vd_vd_vd(x, y); + r.y = vsub_vd_vd_vd(vsub_vd_vd_vd(x, r.x), y); + + return r; +} + +static INLINE vdouble2 ddsub_vd2_vd2_vd2(vdouble2 x, vdouble2 y) { + // |x| >= |y| + + vdouble2 r; + + r.x = 
vsub_vd_vd_vd(x.x, y.x); + r.y = vsub_vd_vd_vd(x.x, r.x); + r.y = vsub_vd_vd_vd(r.y, y.x); + r.y = vadd_vd_vd_vd(r.y, x.y); + r.y = vsub_vd_vd_vd(r.y, y.y); + + return r; +} + +#if 0 +static inline vdouble2 ddsub_vd2_vd2_vd2(vdouble2 x, vdouble2 y) { + // |x| >= |y| + + vdouble2 r; + + r.x = vsub_vd_vd_vd(x.x, y.x); + r.y = vsub_vd_vd_vd(vadd_vd_vd_vd(vsub_vd_vd_vd(vsub_vd_vd_vd(x.x, r.x), y.x), x.y), y.y); + + return r; +} +#endif + +#ifdef ENABLE_FMA_DP +static INLINE vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) { + vdouble2 q; + vdouble t = vrec_vd_vd(d.x), u; + + q.x = vmul_vd_vd_vd(n.x, t); + u = vfmapn_vd_vd_vd_vd(t, n.x, q.x); + q.y = vfmanp_vd_vd_vd_vd(d.y, t, vfmanp_vd_vd_vd_vd(d.x, t, vcast_vd_d(1))); + q.y = vfma_vd_vd_vd_vd(q.x, q.y, vfma_vd_vd_vd_vd(n.y, t, u)); + + return q; +} + +static INLINE vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) { + vdouble2 r; + + r.x = vmul_vd_vd_vd(x, y); + r.y = vfmapn_vd_vd_vd_vd(x, y, r.x); + + return r; +} + +static INLINE vdouble2 ddsqu_vd2_vd2(vdouble2 x) { + vdouble2 r; + + r.x = vmul_vd_vd_vd(x.x, x.x); + r.y = vfma_vd_vd_vd_vd(vadd_vd_vd_vd(x.x, x.x), x.y, vfmapn_vd_vd_vd_vd(x.x, x.x, r.x)); + + return r; +} + +static INLINE vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) { + vdouble2 r; + + r.x = vmul_vd_vd_vd(x.x, y.x); + r.y = vfma_vd_vd_vd_vd(x.x, y.y, vfma_vd_vd_vd_vd(x.y, y.x, vfmapn_vd_vd_vd_vd(x.x, y.x, r.x))); + + return r; +} + +static INLINE vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) { + vdouble2 r; + + r.x = vmul_vd_vd_vd(x.x, y); + r.y = vfma_vd_vd_vd_vd(x.y, y, vfmapn_vd_vd_vd_vd(x.x, y, r.x)); + + return r; +} + +static inline vdouble2 ddrec_vd2_vd(vdouble d) { + vdouble2 q; + + q.x = vrec_vd_vd(d); + q.y = vmul_vd_vd_vd(q.x, vfmanp_vd_vd_vd_vd(d, q.x, vcast_vd_d(1))); + + return q; +} + +static INLINE vdouble2 ddrec_vd2_vd2(vdouble2 d) { + vdouble2 q; + + q.x = vrec_vd_vd(d.x); + q.y = vmul_vd_vd_vd(q.x, vfmanp_vd_vd_vd_vd(d.y, q.x, vfmanp_vd_vd_vd_vd(d.x, q.x, vcast_vd_d(1)))); + 
+ return q; +} +#else +static INLINE vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) { + vdouble t = vrec_vd_vd(d.x); + vdouble dh = vupper_vd_vd(d.x), dl = vsub_vd_vd_vd(d.x, dh); + vdouble th = vupper_vd_vd(t ), tl = vsub_vd_vd_vd(t , th); + vdouble nhh = vupper_vd_vd(n.x), nhl = vsub_vd_vd_vd(n.x, nhh); + + vdouble2 q; + + q.x = vmul_vd_vd_vd(n.x, t); + + vdouble u = vadd_vd_5vd(vsub_vd_vd_vd(vmul_vd_vd_vd(nhh, th), q.x), vmul_vd_vd_vd(nhh, tl), vmul_vd_vd_vd(nhl, th), vmul_vd_vd_vd(nhl, tl), + vmul_vd_vd_vd(q.x, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl)))); + + q.y = vmla_vd_vd_vd_vd(t, vsub_vd_vd_vd(n.y, vmul_vd_vd_vd(q.x, d.y)), u); + + return q; +} + +static INLINE vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) { + vdouble xh = vupper_vd_vd(x), xl = vsub_vd_vd_vd(x, xh); + vdouble yh = vupper_vd_vd(y), yl = vsub_vd_vd_vd(y, yh); + vdouble2 r; + + r.x = vmul_vd_vd_vd(x, y); + r.y = vadd_vd_5vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(r.x), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl)); + + return r; +} + +static INLINE vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) { + vdouble xh = vupper_vd_vd(x.x), xl = vsub_vd_vd_vd(x.x, xh); + vdouble yh = vupper_vd_vd(y ), yl = vsub_vd_vd_vd(y , yh); + vdouble2 r; + + r.x = vmul_vd_vd_vd(x.x, y); + r.y = vadd_vd_6vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(r.x), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(x.y, y)); + + return r; +} + +static INLINE vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) { + vdouble xh = vupper_vd_vd(x.x), xl = vsub_vd_vd_vd(x.x, xh); + vdouble yh = vupper_vd_vd(y.x), yl = vsub_vd_vd_vd(y.x, yh); + vdouble2 r; + + r.x = vmul_vd_vd_vd(x.x, y.x); + r.y = vadd_vd_7vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(r.x), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(x.x, y.y), vmul_vd_vd_vd(x.y, y.x)); + + return r; +} + +static INLINE vdouble2 
ddsqu_vd2_vd2(vdouble2 x) { + vdouble xh = vupper_vd_vd(x.x), xl = vsub_vd_vd_vd(x.x, xh); + vdouble2 r; + + r.x = vmul_vd_vd_vd(x.x, x.x); + r.y = vadd_vd_5vd(vmul_vd_vd_vd(xh, xh), vneg_vd_vd(r.x), vmul_vd_vd_vd(vadd_vd_vd_vd(xh, xh), xl), vmul_vd_vd_vd(xl, xl), vmul_vd_vd_vd(x.x, vadd_vd_vd_vd(x.y, x.y))); + + return r; +} + +static INLINE vdouble2 ddrec_vd2_vd(vdouble d) { + vdouble t = vrec_vd_vd(d); + vdouble dh = vupper_vd_vd(d), dl = vsub_vd_vd_vd(d, dh); + vdouble th = vupper_vd_vd(t), tl = vsub_vd_vd_vd(t, th); + vdouble2 q; + + q.x = t; + q.y = vmul_vd_vd_vd(t, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl))); + + return q; +} + +static INLINE vdouble2 ddrec_vd2_vd2(vdouble2 d) { + vdouble t = vrec_vd_vd(d.x); + vdouble dh = vupper_vd_vd(d.x), dl = vsub_vd_vd_vd(d.x, dh); + vdouble th = vupper_vd_vd(t ), tl = vsub_vd_vd_vd(t , th); + vdouble2 q; + + q.x = t; + q.y = vmul_vd_vd_vd(t, vsub_vd_6vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl), vmul_vd_vd_vd(d.y, t))); + + return q; +} +#endif + +static INLINE vdouble2 ddsqrt_vd2_vd2(vdouble2 d) { + vdouble t = vsqrt_vd_vd(vadd_vd_vd_vd(d.x, d.y)); + return ddscale_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd2(d, ddmul_vd2_vd_vd(t, t)), ddrec_vd2_vd(t)), vcast_vd_d(0.5)); +} diff --git a/simd/df.h b/simd/df.h new file mode 100644 index 00000000..19779f11 --- /dev/null +++ b/simd/df.h @@ -0,0 +1,396 @@ +typedef struct { + vfloat x, y; +} vfloat2; + +static INLINE vfloat2 vcast_vf2_vf_vf(vfloat h, vfloat l) { + vfloat2 ret = {h, l}; + return ret; +} + +static INLINE vfloat2 vcast_vf2_f_f(float h, float l) { + vfloat2 ret = {vcast_vf_f(h), vcast_vf_f(l)}; + return ret; +} + +static INLINE vfloat2 vsel_vf2_vm_vf2_vf2(vmask m, vfloat2 x, vfloat2 y) { + vfloat2 r; + r.x = vsel_vf_vm_vf_vf(m, x.x, y.x); + r.y = vsel_vf_vm_vf_vf(m, x.y, y.y); + return r; +} + +static INLINE vfloat2 
vabs_vf2_vf2(vfloat2 x) { + return vcast_vf2_vf_vf((vfloat)vxor_vm_vm_vm(vand_vm_vm_vm((vmask)vcast_vf_f(-0.0), (vmask)x.x), (vmask)x.x), + (vfloat)vxor_vm_vm_vm(vand_vm_vm_vm((vmask)vcast_vf_f(-0.0), (vmask)x.x), (vmask)x.y)); +} + +static INLINE vfloat vadd_vf_3vf(vfloat v0, vfloat v1, vfloat v2) { + return vadd_vf_vf_vf(vadd_vf_vf_vf(v0, v1), v2); +} + +static INLINE vfloat vadd_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) { + return vadd_vf_3vf(vadd_vf_vf_vf(v0, v1), v2, v3); +} + +static INLINE vfloat vadd_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) { + return vadd_vf_4vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4); +} + +static INLINE vfloat vadd_vf_6vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5) { + return vadd_vf_5vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5); +} + +static INLINE vfloat vadd_vf_7vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5, vfloat v6) { + return vadd_vf_6vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5, v6); +} + +static INLINE vfloat vsub_vf_3vf(vfloat v0, vfloat v1, vfloat v2) { + return vsub_vf_vf_vf(vsub_vf_vf_vf(v0, v1), v2); +} + +static INLINE vfloat vsub_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) { + return vsub_vf_3vf(vsub_vf_vf_vf(v0, v1), v2, v3); +} + +static INLINE vfloat vsub_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) { + return vsub_vf_4vf(vsub_vf_vf_vf(v0, v1), v2, v3, v4); +} + +// + +static INLINE vfloat2 dfneg_vf2_vf2(vfloat2 x) { + return vcast_vf2_vf_vf(vneg_vf_vf(x.x), vneg_vf_vf(x.y)); +} + +static INLINE vfloat2 dfnormalize_vf2_vf2(vfloat2 t) { + vfloat2 s; + + s.x = vadd_vf_vf_vf(t.x, t.y); + s.y = vadd_vf_vf_vf(vsub_vf_vf_vf(t.x, s.x), t.y); + + return s; +} + +static INLINE vfloat2 dfscale_vf2_vf2_vf(vfloat2 d, vfloat s) { + vfloat2 r = {vmul_vf_vf_vf(d.x, s), vmul_vf_vf_vf(d.y, s)}; + return r; +} + +static INLINE vfloat2 dfadd_vf2_vf_vf(vfloat x, vfloat y) { + vfloat2 r; + + r.x = vadd_vf_vf_vf(x, y); + r.y = 
vadd_vf_vf_vf(vsub_vf_vf_vf(x, r.x), y); + + return r; +} + +static INLINE vfloat2 dfadd2_vf2_vf_vf(vfloat x, vfloat y) { + vfloat2 r; + + r.x = vadd_vf_vf_vf(x, y); + vfloat v = vsub_vf_vf_vf(r.x, x); + r.y = vadd_vf_vf_vf(vsub_vf_vf_vf(x, vsub_vf_vf_vf(r.x, v)), vsub_vf_vf_vf(y, v)); + + return r; +} + +static INLINE vfloat2 dfadd_vf2_vf2_vf(vfloat2 x, vfloat y) { + vfloat2 r; + + r.x = vadd_vf_vf_vf(x.x, y); + r.y = vadd_vf_3vf(vsub_vf_vf_vf(x.x, r.x), y, x.y); + + return r; +} + +static INLINE vfloat2 dfadd2_vf2_vf2_vf(vfloat2 x, vfloat y) { + vfloat2 r; + + r.x = vadd_vf_vf_vf(x.x, y); + vfloat v = vsub_vf_vf_vf(r.x, x.x); + r.y = vadd_vf_vf_vf(vsub_vf_vf_vf(x.x, vsub_vf_vf_vf(r.x, v)), vsub_vf_vf_vf(y, v)); + r.y = vadd_vf_vf_vf(r.y, x.y); + + return r; +} + +static INLINE vfloat2 dfadd_vf2_vf_vf2(vfloat x, vfloat2 y) { + vfloat2 r; + + r.x = vadd_vf_vf_vf(x, y.x); + r.y = vadd_vf_3vf(vsub_vf_vf_vf(x, r.x), y.x, y.y); + + return r; +} + +static INLINE vfloat2 dfadd_vf2_vf2_vf2(vfloat2 x, vfloat2 y) { + // |x| >= |y| + + vfloat2 r; + + r.x = vadd_vf_vf_vf(x.x, y.x); + r.y = vadd_vf_4vf(vsub_vf_vf_vf(x.x, r.x), y.x, x.y, y.y); + + return r; +} + +static INLINE vfloat2 dfadd2_vf2_vf2_vf2(vfloat2 x, vfloat2 y) { + vfloat2 r; + + r.x = vadd_vf_vf_vf(x.x, y.x); + vfloat v = vsub_vf_vf_vf(r.x, x.x); + r.y = vadd_vf_vf_vf(vsub_vf_vf_vf(x.x, vsub_vf_vf_vf(r.x, v)), vsub_vf_vf_vf(y.x, v)); + r.y = vadd_vf_vf_vf(r.y, vadd_vf_vf_vf(x.y, y.y)); + + return r; +} + +static inline vfloat2 dfsub_vf2_vf_vf(vfloat x, vfloat y) { + // |x| >= |y| + + vfloat2 r; + + r.x = vsub_vf_vf_vf(x, y); + r.y = vsub_vf_vf_vf(vsub_vf_vf_vf(x, r.x), y); + + return r; +} + +static INLINE vfloat2 dfsub_vf2_vf2_vf2(vfloat2 x, vfloat2 y) { + // |x| >= |y| + + vfloat2 r; + + r.x = vsub_vf_vf_vf(x.x, y.x); + r.y = vsub_vf_vf_vf(x.x, r.x); + r.y = vsub_vf_vf_vf(r.y, y.x); + r.y = vadd_vf_vf_vf(r.y, x.y); + r.y = vsub_vf_vf_vf(r.y, y.y); + + return r; +} + +#ifdef ENABLE_FMA_SP +static INLINE vfloat2 
dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) { + vfloat2 q; + vfloat t = vrec_vf_vf(d.x), u; + + q.x = vmul_vf_vf_vf(n.x, t); + u = vfmapn_vf_vf_vf_vf(t, n.x, q.x); + q.y = vfmanp_vf_vf_vf_vf(d.y, t, vfmanp_vf_vf_vf_vf(d.x, t, vcast_vf_f(1))); + q.y = vfma_vf_vf_vf_vf(q.x, q.y, vfma_vf_vf_vf_vf(n.y, t, u)); + + return q; +} + +static INLINE vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) { + vfloat2 r; + + r.x = vmul_vf_vf_vf(x, y); + r.y = vfmapn_vf_vf_vf_vf(x, y, r.x); + + return r; +} + +static INLINE vfloat2 dfsqu_vf2_vf2(vfloat2 x) { + vfloat2 r; + + r.x = vmul_vf_vf_vf(x.x, x.x); + r.y = vfma_vf_vf_vf_vf(vadd_vf_vf_vf(x.x, x.x), x.y, vfmapn_vf_vf_vf_vf(x.x, x.x, r.x)); + + return r; +} + +static INLINE vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) { + vfloat2 r; + + r.x = vmul_vf_vf_vf(x.x, y.x); + r.y = vfma_vf_vf_vf_vf(x.x, y.y, vfma_vf_vf_vf_vf(x.y, y.x, vfmapn_vf_vf_vf_vf(x.x, y.x, r.x))); + + return r; +} + +static INLINE vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) { + vfloat2 r; + + r.x = vmul_vf_vf_vf(x.x, y); + r.y = vfma_vf_vf_vf_vf(x.y, y, vfmapn_vf_vf_vf_vf(x.x, y, r.x)); + + return r; +} + +static inline vfloat2 dfrec_vf2_vf(vfloat d) { + vfloat2 q; + + q.x = vrec_vf_vf(d); + q.y = vmul_vf_vf_vf(q.x, vfmanp_vf_vf_vf_vf(d, q.x, vcast_vf_f(1))); + + return q; +} + +static INLINE vfloat2 dfrec_vf2_vf2(vfloat2 d) { + vfloat2 q; + + q.x = vrec_vf_vf(d.x); + q.y = vmul_vf_vf_vf(q.x, vfmanp_vf_vf_vf_vf(d.y, q.x, vfmanp_vf_vf_vf_vf(d.x, q.x, vcast_vf_f(1)))); + + return q; +} +#else +static INLINE vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) { + vfloat t = vrec_vf_vf(d.x); + vfloat dh = vupper_vf_vf(d.x), dl = vsub_vf_vf_vf(d.x, dh); + vfloat th = vupper_vf_vf(t ), tl = vsub_vf_vf_vf(t , th); + vfloat nhh = vupper_vf_vf(n.x), nhl = vsub_vf_vf_vf(n.x, nhh); + + vfloat2 q; + + q.x = vmul_vf_vf_vf(n.x, t); + + //vfloat u = vadd_vf_5vf(vsub_vf_vf_vf(vmul_vf_vf_vf(nhh, th), q.x), vmul_vf_vf_vf(nhh, tl), vmul_vf_vf_vf(nhl, th), vmul_vf_vf_vf(nhl, tl), + 
//vmul_vf_vf_vf(q.x, vsub_vf_5vf(vcast_vf_f(1), vmul_vf_vf_vf(dh, th), vmul_vf_vf_vf(dh, tl), vmul_vf_vf_vf(dl, th), vmul_vf_vf_vf(dl, tl)))); + + vfloat u, w; + w = vcast_vf_f(-1); + w = vmla_vf_vf_vf_vf(dh, th, w); + w = vmla_vf_vf_vf_vf(dh, tl, w); + w = vmla_vf_vf_vf_vf(dl, th, w); + w = vmla_vf_vf_vf_vf(dl, tl, w); + w = vneg_vf_vf(w); + + u = vmla_vf_vf_vf_vf(nhh, th, vneg_vf_vf(q.x)); + u = vmla_vf_vf_vf_vf(nhh, tl, u); + u = vmla_vf_vf_vf_vf(nhl, th, u); + u = vmla_vf_vf_vf_vf(nhl, tl, u); + u = vmla_vf_vf_vf_vf(q.x, w , u); + + q.y = vmla_vf_vf_vf_vf(t, vsub_vf_vf_vf(n.y, vmul_vf_vf_vf(q.x, d.y)), u); + + return q; +} + +static INLINE vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) { + vfloat xh = vupper_vf_vf(x), xl = vsub_vf_vf_vf(x, xh); + vfloat yh = vupper_vf_vf(y), yl = vsub_vf_vf_vf(y, yh); + vfloat2 r; + + r.x = vmul_vf_vf_vf(x, y); + //r.y = vadd_vf_5vf(vmul_vf_vf_vf(xh, yh), vneg_vf_vf(r.x), vmul_vf_vf_vf(xl, yh), vmul_vf_vf_vf(xh, yl), vmul_vf_vf_vf(xl, yl)); + + vfloat t; + t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(r.x)); + t = vmla_vf_vf_vf_vf(xl, yh, t); + t = vmla_vf_vf_vf_vf(xh, yl, t); + t = vmla_vf_vf_vf_vf(xl, yl, t); + r.y = t; + + return r; +} + +static INLINE vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) { + vfloat xh = vupper_vf_vf(x.x), xl = vsub_vf_vf_vf(x.x, xh); + vfloat yh = vupper_vf_vf(y ), yl = vsub_vf_vf_vf(y , yh); + vfloat2 r; + + r.x = vmul_vf_vf_vf(x.x, y); + //r.y = vadd_vf_6vf(vmul_vf_vf_vf(xh, yh), vneg_vf_vf(r.x), vmul_vf_vf_vf(xl, yh), vmul_vf_vf_vf(xh, yl), vmul_vf_vf_vf(xl, yl), vmul_vf_vf_vf(x.y, y)); + + vfloat t; + t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(r.x)); + t = vmla_vf_vf_vf_vf(xl, yh, t); + t = vmla_vf_vf_vf_vf(xh, yl, t); + t = vmla_vf_vf_vf_vf(xl, yl, t); + t = vmla_vf_vf_vf_vf(x.y, y, t); + r.y = t; + + return r; +} + +static INLINE vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) { + vfloat xh = vupper_vf_vf(x.x), xl = vsub_vf_vf_vf(x.x, xh); + vfloat yh = vupper_vf_vf(y.x), yl = vsub_vf_vf_vf(y.x, yh); + 
vfloat2 r; + + r.x = vmul_vf_vf_vf(x.x, y.x); + //r.y = vadd_vf_7vf(vmul_vf_vf_vf(xh, yh), vneg_vf_vf(r.x), vmul_vf_vf_vf(xl, yh), vmul_vf_vf_vf(xh, yl), vmul_vf_vf_vf(xl, yl), vmul_vf_vf_vf(x.x, y.y), vmul_vf_vf_vf(x.y, y.x)); + + vfloat t; + t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(r.x)); + t = vmla_vf_vf_vf_vf(xl, yh, t); + t = vmla_vf_vf_vf_vf(xh, yl, t); + t = vmla_vf_vf_vf_vf(xl, yl, t); + t = vmla_vf_vf_vf_vf(x.x, y.y, t); + t = vmla_vf_vf_vf_vf(x.y, y.x, t); + r.y = t; + + return r; +} + +static INLINE vfloat2 dfsqu_vf2_vf2(vfloat2 x) { + vfloat xh = vupper_vf_vf(x.x), xl = vsub_vf_vf_vf(x.x, xh); + vfloat2 r; + + r.x = vmul_vf_vf_vf(x.x, x.x); + //r.y = vadd_vf_5vf(vmul_vf_vf_vf(xh, xh), vneg_vf_vf(r.x), vmul_vf_vf_vf(vadd_vf_vf_vf(xh, xh), xl), vmul_vf_vf_vf(xl, xl), vmul_vf_vf_vf(x.x, vadd_vf_vf_vf(x.y, x.y))); + + vfloat t; + t = vmla_vf_vf_vf_vf(xh, xh, vneg_vf_vf(r.x)); + t = vmla_vf_vf_vf_vf(vadd_vf_vf_vf(xh, xh), xl, t); + t = vmla_vf_vf_vf_vf(xl, xl, t); + t = vmla_vf_vf_vf_vf(x.x, vadd_vf_vf_vf(x.y, x.y), t); + r.y = t; + + return r; +} + +static INLINE vfloat2 dfrec_vf2_vf(vfloat d) { + vfloat t = vrec_vf_vf(d); + vfloat dh = vupper_vf_vf(d), dl = vsub_vf_vf_vf(d, dh); + vfloat th = vupper_vf_vf(t), tl = vsub_vf_vf_vf(t, th); + vfloat2 q; + + q.x = t; + //q.y = vmul_vf_vf_vf(t, vsub_vf_5vf(vcast_vf_f(1), vmul_vf_vf_vf(dh, th), vmul_vf_vf_vf(dh, tl), vmul_vf_vf_vf(dl, th), vmul_vf_vf_vf(dl, tl))); + + vfloat u = vcast_vf_f(-1); + u = vmla_vf_vf_vf_vf(dh, th, u); + u = vmla_vf_vf_vf_vf(dh, tl, u); + u = vmla_vf_vf_vf_vf(dl, th, u); + u = vmla_vf_vf_vf_vf(dl, tl, u); + q.y = vmul_vf_vf_vf(vneg_vf_vf(t), u); + + return q; +} + +static INLINE vfloat2 dfrec_vf2_vf2(vfloat2 d) { + vfloat t = vrec_vf_vf(d.x); + vfloat dh = vupper_vf_vf(d.x), dl = vsub_vf_vf_vf(d.x, dh); + vfloat th = vupper_vf_vf(t ), tl = vsub_vf_vf_vf(t , th); + vfloat2 q; + + q.x = t; + //q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl - d.y * t); + + vfloat u = vcast_vf_f(-1); + 
u = vmla_vf_vf_vf_vf(dh, th, u); + u = vmla_vf_vf_vf_vf(dh, tl, u); + u = vmla_vf_vf_vf_vf(dl, th, u); + u = vmla_vf_vf_vf_vf(dl, tl, u); + u = vmla_vf_vf_vf_vf(d.y, t, u); + q.y = vmul_vf_vf_vf(vneg_vf_vf(t), u); + + return q; +} +#endif + +static INLINE vfloat2 dfsqrt_vf2_vf2(vfloat2 d) { +#ifdef ENABLE_RECSQRT_SP + vfloat x = vrecsqrt_vf_vf(vadd_vf_vf_vf(d.x, d.y)); + vfloat2 r = dfmul_vf2_vf2_vf(d, x); + return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(r, dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(r, x), vcast_vf_f(-3.0))), vcast_vf_f(-0.5)); +#else + vfloat t = vsqrt_vf_vf(vadd_vf_vf_vf(d.x, d.y)); + return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf2(d, dfmul_vf2_vf_vf(t, t)), dfrec_vf2_vf(t)), vcast_vf_f(0.5)); +#endif +} diff --git a/simd/helperavx.h b/simd/helperavx.h new file mode 100644 index 00000000..426d7182 --- /dev/null +++ b/simd/helperavx.h @@ -0,0 +1,283 @@ +#ifndef __AVX__ +#error Please specify -mavx. +#endif + +#include +#include + +typedef __m256d vdouble; +typedef __m128i vint; +typedef __m256i vmask; + +typedef __m256 vfloat; +typedef struct { vint x, y; } vint2; + +// + +static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); } +static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); } +static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); } +static INLINE vdouble vcast_vd_d(double d) { return _mm256_set_pd(d, d, d, d); } +static INLINE vint vcast_vi_i(int i) { return _mm_set_epi32(i, i, i, i); } + +static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (__m256i)vd; } +static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return (__m256d)vm; } + +static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (__m256i)vf; } +static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (__m256)vm; } + +// + +static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); } +static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); } +static 
INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); } + +static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); } +static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); } +static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); } +static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); } + +static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); } +static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); } +static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); } + +// + +static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_and_pd((__m256d)x, (__m256d)y); } +static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_andnot_pd((__m256d)x, (__m256d)y); } +static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_or_pd((__m256d)x, (__m256d)y); } +static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_xor_pd((__m256d)x, (__m256d)y); } + +static INLINE vmask veq_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_EQ_OQ); } +static INLINE vmask vneq_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_NEQ_UQ); } +static INLINE vmask vlt_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_LT_OQ); } +static INLINE vmask vle_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_LE_OQ); } +static INLINE vmask vgt_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_GT_OQ); } +static INLINE vmask vge_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_GE_OQ); } + +static INLINE vmask veq_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_EQ_OQ); } +static INLINE vmask vneq_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_NEQ_UQ); } +static INLINE 
vmask vlt_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_LT_OQ); } +static INLINE vmask vle_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_LE_OQ); } +static INLINE vmask vgt_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_GT_OQ); } +static INLINE vmask vge_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_GE_OQ); } + +// + +static INLINE vfloat vcast_vf_f(float f) { return _mm256_set_ps(f, f, f, f, f, f, f, f); } + +static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); } +static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); } +static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); } +static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); } +static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); } +static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); } +static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); } +static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); } + +static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } +static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); } +static INLINE vfloat vabs_vf_vf(vfloat f) { return (vfloat)vandnot_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)f); } +static INLINE vfloat vneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)d); } + +// + +static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); } +static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); } +static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); } +static INLINE vdouble 
vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); } +static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set_pd(1, 1, 1, 1), x); } +static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); } +static INLINE vdouble vabs_vd_vd(vdouble d) { return (__m256d)_mm256_andnot_pd(_mm256_set_pd(-0.0,-0.0,-0.0,-0.0), d); } +static INLINE vdouble vneg_vd_vd(vdouble d) { return (__m256d)_mm256_xor_pd(_mm256_set_pd(-0.0,-0.0,-0.0,-0.0), d); } +static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } +static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } + +static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); } +static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); } + + +// + +static INLINE vmask veq_vm_vi_vi(vint x, vint y) { + __m256d r = _mm256_cvtepi32_pd(_mm_and_si128(_mm_cmpeq_epi32(x, y), _mm_set_epi32(1, 1, 1, 1))); + return veq_vm_vd_vd(r, _mm256_set_pd(1, 1, 1, 1)); +} + +static INLINE vdouble vsel_vd_vm_vd_vd(vmask mask, vdouble x, vdouble y) { + return (__m256d)vor_vm_vm_vm(vand_vm_vm_vm(mask, (__m256i)x), vandnot_vm_vm_vm(mask, (__m256i)y)); +} + +static INLINE vfloat vsel_vf_vm_vf_vf(vmask mask, vfloat x, vfloat y) { + return (vfloat)vor_vm_vm_vm(vand_vm_vm_vm(mask, (vmask)x), vandnot_vm_vm_vm(mask, (vmask)y)); +} + +static INLINE vint vsel_vi_vd_vd_vi_vi(vdouble d0, vdouble d1, vint x, vint y) { + __m128i mask = _mm256_cvtpd_epi32(_mm256_and_pd(_mm256_cmp_pd(d0, d1, _CMP_LT_OQ), _mm256_set_pd(1.0, 1.0, 1.0, 1.0))); + mask = _mm_cmpeq_epi32(mask, _mm_set_epi32(1, 1, 1, 1)); + return vor_vi_vi_vi(vand_vi_vi_vi(mask, x), vandnot_vi_vi_vi(mask, y)); +} + +// + +static INLINE vint2 vcast_vi2_vm(vmask vm) { + vint2 r; + r.x = _mm256_castsi256_si128(vm); + r.y = _mm256_extractf128_si256(vm, 1); + return r; +} + +static 
INLINE vmask vcast_vm_vi2(vint2 vi) { + vmask m = _mm256_castsi128_si256(vi.x); + m = _mm256_insertf128_si256(m, vi.y, 1); + return m; +} + +static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm((vmask)_mm256_cvtps_epi32(vf)); } +static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm((vmask)_mm256_cvttps_epi32(vf)); } +static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps((vmask)vcast_vm_vi2(vi)); } +static INLINE vint2 vcast_vi2_i(int i) { vint2 r; r.x = r.y = vcast_vi_i(i); return r; } + +static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vadd_vi_vi_vi(x.x, y.x); r.y = vadd_vi_vi_vi(x.y, y.y); return r; } +static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vsub_vi_vi_vi(x.x, y.x); r.y = vsub_vi_vi_vi(x.y, y.y); return r; } +static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); } + +static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vand_vi_vi_vi(x.x, y.x); r.y = vand_vi_vi_vi(x.y, y.y); return r; } +static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vandnot_vi_vi_vi(x.x, y.x); r.y = vandnot_vi_vi_vi(x.y, y.y); return r; } +static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vor_vi_vi_vi(x.x, y.x); r.y = vor_vi_vi_vi(x.y, y.y); return r; } +static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vxor_vi_vi_vi(x.x, y.x); r.y = vxor_vi_vi_vi(x.y, y.y); return r; } + +static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { vint2 r; r.x = vsll_vi_vi_i(x.x, c); r.y = vsll_vi_vi_i(x.y, c); return r; } +static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { vint2 r; r.x = vsrl_vi_vi_i(x.x, c); r.y = vsrl_vi_vi_i(x.y, c); return r; } +static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { vint2 r; r.x = vsra_vi_vi_i(x.x, c); r.y = vsra_vi_vi_i(x.y, c); return r; } + +static INLINE vmask veq_vm_vi2_vi2(vint2 x, vint2 y) { + vint2 r; + r.x = _mm_cmpeq_epi32(x.x, y.x); + r.y = 
_mm_cmpeq_epi32(x.y, y.y); + return vcast_vm_vi2(r); +} + +static INLINE vmask vgt_vm_vi2_vi2(vint2 x, vint2 y) { + vint2 r; + r.x = _mm_cmpgt_epi32(x.x, y.x); + r.y = _mm_cmpgt_epi32(x.y, y.y); + return vcast_vm_vi2(r); +} + +static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { + vint2 r; + r.x = _mm_cmpgt_epi32(x.x, y.x); + r.y = _mm_cmpgt_epi32(x.y, y.y); + return r; +} + +static INLINE vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) { + vint2 r, m2 = vcast_vi2_vm(m); + r.x = vor_vi_vi_vi(vand_vi_vi_vi(m2.x, x.x), vandnot_vi_vi_vi(m2.x, y.x)); + r.y = vor_vi_vi_vi(vand_vi_vi_vi(m2.y, x.y), vandnot_vi_vi_vi(m2.y, y.y)); + return r; +} + +// + +static INLINE double vcast_d_vd(vdouble v) { + double s[4]; + _mm256_storeu_pd(s, v); + return s[0]; +} + +static INLINE float vcast_f_vf(vfloat v) { + float s[8]; + _mm256_storeu_ps(s, v); + return s[0]; +} + +static INLINE vmask vsignbit_vm_vd(vdouble d) { + return (vmask)_mm256_and_pd(d, _mm256_set_pd(-0.0,-0.0,-0.0,-0.0)); +} + +static INLINE vdouble vsign_vd_vd(vdouble d) { + return _mm256_or_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), (vdouble)vsignbit_vm_vd(d)); +} + +static INLINE vdouble vmulsign_vd_vd_vd(vdouble x, vdouble y) { + return (__m256d)vxor_vm_vm_vm((__m256i)x, vsignbit_vm_vd(y)); +} + +static INLINE vmask visinf_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set_pd(INFINITY, INFINITY, INFINITY, INFINITY), _CMP_EQ_OQ); +} + +static INLINE vmask vispinf_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(d, _mm256_set_pd(INFINITY, INFINITY, INFINITY, INFINITY), _CMP_EQ_OQ); +} + +static INLINE vmask visminf_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(d, _mm256_set_pd(-INFINITY, -INFINITY, -INFINITY, -INFINITY), _CMP_EQ_OQ); +} + +static INLINE vmask visnan_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(d, d, _CMP_NEQ_UQ); +} + +static INLINE vdouble visinf(vdouble d) { + return _mm256_and_pd((vdouble)visinf_vm_vd(d), vsign_vd_vd(d)); +} + +static INLINE vdouble 
visinf2(vdouble d, vdouble m) { + return _mm256_and_pd((vdouble)visinf_vm_vd(d), _mm256_or_pd((vdouble)vsignbit_vm_vd(d), m)); +} + +static INLINE vdouble vpow2i_vd_vi(vint q) { + vint r; + vdouble y; + q = _mm_add_epi32(_mm_set_epi32(0x3ff, 0x3ff, 0x3ff, 0x3ff), q); + q = _mm_slli_epi32(q, 20); + r = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(1,0,0,0)); + y = _mm256_castpd128_pd256((__m128d)r); + r = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(3,2,2,2)); + y = _mm256_insertf128_pd(y, (__m128d)r, 1); + y = _mm256_and_pd(y, (__m256d)_mm256_set_epi32(0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0)); + return y; +} + +static INLINE vdouble vldexp_vd_vd_vi(vdouble x, vint q) { + vint m = _mm_srai_epi32(q, 31); + m = _mm_slli_epi32(_mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(m, q), 9), m), 7); + q = _mm_sub_epi32(q, _mm_slli_epi32(m, 2)); + m = _mm_add_epi32(_mm_set_epi32(0x3ff, 0x3ff, 0x3ff, 0x3ff), m); + m = _mm_andnot_si128(_mm_cmplt_epi32(m, _mm_set_epi32(0, 0, 0, 0)), m); + vint n = _mm_cmpgt_epi32(m, _mm_set_epi32(0x7ff, 0x7ff, 0x7ff, 0x7ff)); + m = _mm_or_si128(_mm_andnot_si128(n, m), _mm_and_si128(n, _mm_set_epi32(0x7ff, 0x7ff, 0x7ff, 0x7ff))); + m = _mm_slli_epi32(m, 20); + vint r = (__m128i)_mm_shuffle_ps((__m128)m, (__m128)m, _MM_SHUFFLE(1,0,0,0)); + vdouble y = _mm256_castpd128_pd256((__m128d)r); + r = (__m128i)_mm_shuffle_ps((__m128)m, (__m128)m, _MM_SHUFFLE(3,2,2,2)); + y = _mm256_insertf128_pd(y, (__m128d)r, 1); + y = _mm256_and_pd(y, (__m256d)_mm256_set_epi32(0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0)); + return vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, y), y), y), y), vpow2i_vd_vi(q)); +} + +static INLINE vint vilogbp1_vi_vd(vdouble d) { + vint q, r, c; + vmask m = vlt_vm_vd_vd(d, vcast_vd_d(4.9090934652977266E-91)); + d = vsel_vd_vm_vd_vd(m, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d); + c = _mm256_cvtpd_epi32(vsel_vd_vm_vd_vd(m, vcast_vd_d(300+0x3fe), 
vcast_vd_d(0x3fe))); + q = (__m128i)_mm256_castpd256_pd128(d); + q = (__m128i)_mm_shuffle_ps((__m128)q, _mm_set_ps(0, 0, 0, 0), _MM_SHUFFLE(0,0,3,1)); + r = (__m128i)_mm256_extractf128_pd(d, 1); + r = (__m128i)_mm_shuffle_ps(_mm_set_ps(0, 0, 0, 0), (__m128)r, _MM_SHUFFLE(3,1,0,0)); + q = _mm_or_si128(q, r); + q = _mm_srli_epi32(q, 20); + q = _mm_sub_epi32(q, c); + return q; +} + +static INLINE vdouble vupper_vd_vd(vdouble d) { + return (__m256d)_mm256_and_pd(d, (vdouble)_mm256_set_epi32(0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000)); +} + +static INLINE vfloat vupper_vf_vf(vfloat d) { + return (vfloat)vand_vm_vm_vm((vmask)d, _mm256_set_epi32(0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000,0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000)); +} diff --git a/simd/helperavx2.h b/simd/helperavx2.h new file mode 100644 index 00000000..fa527a49 --- /dev/null +++ b/simd/helperavx2.h @@ -0,0 +1,254 @@ +#ifndef __AVX2__ +#error Please specify -mavx2. 
+#endif + +#include +#include + +typedef __m256d vdouble; +typedef __m128i vint; +typedef __m256i vmask; + +typedef __m256 vfloat; +typedef __m256i vint2; + +#define ENABLE_FMA_DP +#define ENABLE_FMA_SP + +// + +static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); } +static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); } +static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); } +static INLINE vdouble vcast_vd_d(double d) { return _mm256_set_pd(d, d, d, d); } +static INLINE vint vcast_vi_i(int i) { return _mm_set_epi32(i, i, i, i); } + +static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (__m256i)vd; } +static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return (__m256d)vm; } + +static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (__m256i)vf; } +static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (__m256)vm; } + +// + +static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); } +static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); } +static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); } + +static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); } +static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); } +static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); } +static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); } + +static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32 (x, c); } +static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32 (x, c); } +static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32 (x, c); } + +// + +static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_and_pd((__m256d)x, (__m256d)y); } +static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_andnot_pd((__m256d)x, (__m256d)y); } +static 
INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_or_pd((__m256d)x, (__m256d)y); } +static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_xor_pd((__m256d)x, (__m256d)y); } + +static INLINE vmask veq_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_EQ_OQ); } +static INLINE vmask vneq_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_NEQ_UQ); } +static INLINE vmask vlt_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_LT_OQ); } +static INLINE vmask vle_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_LE_OQ); } +static INLINE vmask vgt_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_GT_OQ); } +static INLINE vmask vge_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_GE_OQ); } + +static INLINE vmask veq_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_EQ_OQ); } +static INLINE vmask vneq_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_NEQ_UQ); } +static INLINE vmask vlt_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_LT_OQ); } +static INLINE vmask vle_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_LE_OQ); } +static INLINE vmask vgt_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_GT_OQ); } +static INLINE vmask vge_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_GE_OQ); } + +// + +static INLINE vfloat vcast_vf_f(float f) { return _mm256_set_ps(f, f, f, f, f, f, f, f); } + +static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); } +static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); } +static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); } +static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); } +static INLINE vfloat 
vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); } +static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); } +static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); } +static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); } + +static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); } +static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmsub_ps(x, y, z); } +static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmadd_ps(x, y, z); } +static INLINE vfloat vabs_vf_vf(vfloat f) { return (vfloat)vandnot_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)f); } +static INLINE vfloat vneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)d); } + +static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); } +static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); } +static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmsub_ps(x, y, z); } +static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmadd_ps(x, y, z); } +static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmsub_ps(x, y, z); } + +// + +static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); } +static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); } +static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); } +static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); } +static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set_pd(1, 1, 1, 1), x); } +static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); } +static INLINE vdouble 
vabs_vd_vd(vdouble d) { return (__m256d)_mm256_andnot_pd(_mm256_set_pd(-0.0,-0.0,-0.0,-0.0), d); } +static INLINE vdouble vneg_vd_vd(vdouble d) { return (__m256d)_mm256_xor_pd(_mm256_set_pd(-0.0,-0.0,-0.0,-0.0), d); } +static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); } +static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmsub_pd(x, y, z); } +static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmadd_pd(x, y, z); } + +static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); } +static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); } + +static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); } +static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); } +static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmsub_pd(x, y, z); } +static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmadd_pd(x, y, z); } +static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmsub_pd(x, y, z); } + +// + +static INLINE vmask veq_vm_vi_vi(vint x, vint y) { + return _mm256_cvtepi32_epi64(_mm_cmpeq_epi32(x, y)); +} + +static INLINE vdouble vsel_vd_vm_vd_vd(vmask mask, vdouble x, vdouble y) { + return (__m256d)vor_vm_vm_vm(vand_vm_vm_vm(mask, (__m256i)x), vandnot_vm_vm_vm(mask, (__m256i)y)); +} + +static INLINE vfloat vsel_vf_vm_vf_vf(vmask mask, vfloat x, vfloat y) { + return (vfloat)vor_vm_vm_vm(vand_vm_vm_vm(mask, (vmask)x), vandnot_vm_vm_vm(mask, (vmask)y)); +} + +static INLINE vint vsel_vi_vd_vd_vi_vi(vdouble d0, vdouble d1, vint x, vint y) { + __m128i mask = _mm256_cvtpd_epi32(_mm256_and_pd(_mm256_cmp_pd(d0, d1, _CMP_LT_OQ), _mm256_set_pd(1.0, 1.0, 1.0, 1.0))); + mask = 
_mm_cmpeq_epi32(mask, _mm_set_epi32(1, 1, 1, 1)); + return vor_vi_vi_vi(vand_vi_vi_vi(mask, x), vandnot_vi_vi_vi(mask, y)); +} + +// + +static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; } +static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; } + +static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm((vmask)_mm256_cvtps_epi32(vf)); } +static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm((vmask)_mm256_cvttps_epi32(vf)); } +static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps((vmask)vcast_vm_vi2(vi)); } +static INLINE vint2 vcast_vi2_i(int i) { return _mm256_set1_epi32(i); } + +static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_add_epi32(x, y); } +static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_sub_epi32(x, y); } +static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); } + +static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_and_si256(x, y); } +static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_andnot_si256(x, y); } +static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_or_si256(x, y); } +static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_xor_si256(x, y); } + +static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return _mm256_slli_epi32(x, c); } +static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return _mm256_srli_epi32(x, c); } +static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return _mm256_srai_epi32(x, c); } + +static INLINE vmask veq_vm_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpeq_epi32(x, y); } +static INLINE vmask vgt_vm_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi32(x, y); } +static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi32(x, y); } +static INLINE vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) { return vor_vm_vm_vm(vand_vm_vm_vm(m, x), vandnot_vm_vm_vm(m, y)); } + +// + +static INLINE double 
vcast_d_vd(vdouble v) { + double s[4]; + _mm256_storeu_pd(s, v); + return s[0]; +} + +static INLINE float vcast_f_vf(vfloat v) { + float s[8]; + _mm256_storeu_ps(s, v); + return s[0]; +} + +static INLINE vmask vsignbit_vm_vd(vdouble d) { + return (vmask)_mm256_and_pd(d, _mm256_set_pd(-0.0,-0.0,-0.0,-0.0)); +} + +static INLINE vdouble vsign_vd_vd(vdouble d) { + return _mm256_or_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), (vdouble)vsignbit_vm_vd(d)); +} + +static INLINE vdouble vmulsign_vd_vd_vd(vdouble x, vdouble y) { + return (__m256d)vxor_vm_vm_vm((__m256i)x, vsignbit_vm_vd(y)); +} + +static INLINE vmask visinf_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set_pd(INFINITY, INFINITY, INFINITY, INFINITY), _CMP_EQ_OQ); +} + +static INLINE vmask vispinf_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(d, _mm256_set_pd(INFINITY, INFINITY, INFINITY, INFINITY), _CMP_EQ_OQ); +} + +static INLINE vmask visminf_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(d, _mm256_set_pd(-INFINITY, -INFINITY, -INFINITY, -INFINITY), _CMP_EQ_OQ); +} + +static INLINE vmask visnan_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(d, d, _CMP_NEQ_UQ); +} + +static INLINE vdouble visinf(vdouble d) { + return _mm256_and_pd((vdouble)visinf_vm_vd(d), vsign_vd_vd(d)); +} + +static INLINE vdouble visinf2(vdouble d, vdouble m) { + return _mm256_and_pd((vdouble)visinf_vm_vd(d), _mm256_or_pd((vdouble)vsignbit_vm_vd(d), m)); +} + +static INLINE vdouble vpow2i_vd_vi(vint q) { + vint2 r = _mm256_slli_epi64(_mm256_cvtepi32_epi64(_mm_add_epi32(_mm_set_epi32(0x3ff, 0x3ff, 0x3ff, 0x3ff), q)), 52); + r = _mm256_and_si256(r, _mm256_set_epi32(0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0)); + return (vdouble)r; +} + +static INLINE vdouble vldexp_vd_vd_vi(vdouble x, vint q) { + vint m = _mm_srai_epi32(q, 31); + m = _mm_slli_epi32(_mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(m, q), 9), m), 7); + q = _mm_sub_epi32(q, _mm_slli_epi32(m, 2)); + m = _mm_add_epi32(_mm_set_epi32(0x3ff, 
0x3ff, 0x3ff, 0x3ff), m); + m = _mm_andnot_si128(_mm_cmplt_epi32(m, _mm_set_epi32(0, 0, 0, 0)), m); + vint n = _mm_cmpgt_epi32(m, _mm_set_epi32(0x7ff, 0x7ff, 0x7ff, 0x7ff)); + m = _mm_or_si128(_mm_andnot_si128(n, m), _mm_and_si128(n, _mm_set_epi32(0x7ff, 0x7ff, 0x7ff, 0x7ff))); + vint2 r = _mm256_slli_epi64(_mm256_cvtepi32_epi64(m), 52); + vdouble y = (vdouble)_mm256_and_si256(r, _mm256_set_epi32(0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0)); + return vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, y), y), y), y), vpow2i_vd_vi(q)); +} + +static INLINE vint vilogbp1_vi_vd(vdouble d) { + vint q, r, c; + vmask m = vlt_vm_vd_vd(d, vcast_vd_d(4.9090934652977266E-91)); + d = vsel_vd_vm_vd_vd(m, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d); + c = _mm256_cvtpd_epi32(vsel_vd_vm_vd_vd(m, vcast_vd_d(300+0x3fe), vcast_vd_d(0x3fe))); + q = (__m128i)_mm256_castpd256_pd128(d); + q = (__m128i)_mm_shuffle_ps((__m128)q, _mm_set_ps(0, 0, 0, 0), _MM_SHUFFLE(0,0,3,1)); + r = (__m128i)_mm256_extractf128_pd(d, 1); + r = (__m128i)_mm_shuffle_ps(_mm_set_ps(0, 0, 0, 0), (__m128)r, _MM_SHUFFLE(3,1,0,0)); + q = _mm_or_si128(q, r); + q = _mm_srli_epi32(q, 20); + q = _mm_sub_epi32(q, c); + return q; +} + +static INLINE vdouble vupper_vd_vd(vdouble d) { + return (__m256d)_mm256_and_pd(d, (vdouble)_mm256_set_epi32(0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000)); +} + +static INLINE vfloat vupper_vf_vf(vfloat d) { + return (vfloat)vand_vm_vm_vm((vmask)d, _mm256_set_epi32(0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000,0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000)); +} diff --git a/simd/helperfma4.h b/simd/helperfma4.h new file mode 100644 index 00000000..4e4f6099 --- /dev/null +++ b/simd/helperfma4.h @@ -0,0 +1,298 @@ +#ifndef __FMA4__ +#error Please specify -mfma4. 
+#endif + +#include +#include + +typedef __m256d vdouble; +typedef __m128i vint; +typedef __m256i vmask; + +typedef __m256 vfloat; +typedef struct { vint x, y; } vint2; + +#define ENABLE_FMA_DP +#define ENABLE_FMA_SP + +// + +static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); } +static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); } +static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); } +static INLINE vdouble vcast_vd_d(double d) { return _mm256_set_pd(d, d, d, d); } +static INLINE vint vcast_vi_i(int i) { return _mm_set_epi32(i, i, i, i); } + +static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (__m256i)vd; } +static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return (__m256d)vm; } + +static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (__m256i)vf; } +static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (__m256)vm; } + +// + +static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); } +static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); } +static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); } + +static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); } +static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); } +static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); } +static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); } + +static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); } +static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); } +static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); } + +// + +static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_and_pd((__m256d)x, (__m256d)y); } +static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_andnot_pd((__m256d)x, (__m256d)y); } 
+static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_or_pd((__m256d)x, (__m256d)y); } +static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_xor_pd((__m256d)x, (__m256d)y); } + +static INLINE vmask veq_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_EQ_OQ); } +static INLINE vmask vneq_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_NEQ_UQ); } +static INLINE vmask vlt_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_LT_OQ); } +static INLINE vmask vle_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_LE_OQ); } +static INLINE vmask vgt_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_GT_OQ); } +static INLINE vmask vge_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_GE_OQ); } + +static INLINE vmask veq_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_EQ_OQ); } +static INLINE vmask vneq_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_NEQ_UQ); } +static INLINE vmask vlt_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_LT_OQ); } +static INLINE vmask vle_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_LE_OQ); } +static INLINE vmask vgt_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_GT_OQ); } +static INLINE vmask vge_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_GE_OQ); } + +// + +static INLINE vfloat vcast_vf_f(float f) { return _mm256_set_ps(f, f, f, f, f, f, f, f); } + +static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); } +static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); } +static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); } +static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); } +static INLINE vfloat 
vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); } +static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); } +static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); } +static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); } + +static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); } +static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); } +static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); } +static INLINE vfloat vabs_vf_vf(vfloat f) { return (vfloat)vandnot_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)f); } +static INLINE vfloat vneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)d); } + +static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); } +static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); } +static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); } +static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); } +static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmsub_ps(x, y, z); } + +// + +static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); } +static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); } +static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); } +static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); } +static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set_pd(1, 1, 1, 1), x); } +static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); } +static INLINE vdouble vabs_vd_vd(vdouble d) 
{ return (__m256d)_mm256_andnot_pd(_mm256_set_pd(-0.0,-0.0,-0.0,-0.0), d); } +static INLINE vdouble vneg_vd_vd(vdouble d) { return (__m256d)_mm256_xor_pd(_mm256_set_pd(-0.0,-0.0,-0.0,-0.0), d); } +static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); } +static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); } + +static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); } +static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); } + +static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); } +static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); } +static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); } +static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmacc_pd(x, y, z); } +static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmsub_pd(x, y, z); } + +// + +static INLINE vmask veq_vm_vi_vi(vint x, vint y) { + __m256d r = _mm256_cvtepi32_pd(_mm_and_si128(_mm_cmpeq_epi32(x, y), _mm_set_epi32(1, 1, 1, 1))); + return veq_vm_vd_vd(r, _mm256_set_pd(1, 1, 1, 1)); +} + +static INLINE vdouble vsel_vd_vm_vd_vd(vmask mask, vdouble x, vdouble y) { + return (__m256d)vor_vm_vm_vm(vand_vm_vm_vm(mask, (__m256i)x), vandnot_vm_vm_vm(mask, (__m256i)y)); +} + +static INLINE vfloat vsel_vf_vm_vf_vf(vmask mask, vfloat x, vfloat y) { + return (vfloat)vor_vm_vm_vm(vand_vm_vm_vm(mask, (vmask)x), vandnot_vm_vm_vm(mask, (vmask)y)); +} + +static INLINE vint vsel_vi_vd_vd_vi_vi(vdouble d0, vdouble d1, vint x, vint y) { + __m128i mask = _mm256_cvtpd_epi32(_mm256_and_pd(_mm256_cmp_pd(d0, d1, _CMP_LT_OQ), _mm256_set_pd(1.0, 1.0, 1.0, 1.0))); + mask = _mm_cmpeq_epi32(mask, _mm_set_epi32(1, 1, 1, 1)); + 
return vor_vi_vi_vi(vand_vi_vi_vi(mask, x), vandnot_vi_vi_vi(mask, y)); +} + +// + +static INLINE vint2 vcast_vi2_vm(vmask vm) { + vint2 r; + r.x = _mm256_castsi256_si128(vm); + r.y = _mm256_extractf128_si256(vm, 1); + return r; +} + +static INLINE vmask vcast_vm_vi2(vint2 vi) { + vmask m = _mm256_castsi128_si256(vi.x); + m = _mm256_insertf128_si256(m, vi.y, 1); + return m; +} + +static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm((vmask)_mm256_cvtps_epi32(vf)); } +static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm((vmask)_mm256_cvttps_epi32(vf)); } +static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps((vmask)vcast_vm_vi2(vi)); } +static INLINE vint2 vcast_vi2_i(int i) { vint2 r; r.x = r.y = vcast_vi_i(i); return r; } + +static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vadd_vi_vi_vi(x.x, y.x); r.y = vadd_vi_vi_vi(x.y, y.y); return r; } +static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vsub_vi_vi_vi(x.x, y.x); r.y = vsub_vi_vi_vi(x.y, y.y); return r; } +static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); } + +static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vand_vi_vi_vi(x.x, y.x); r.y = vand_vi_vi_vi(x.y, y.y); return r; } +static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vandnot_vi_vi_vi(x.x, y.x); r.y = vandnot_vi_vi_vi(x.y, y.y); return r; } +static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vor_vi_vi_vi(x.x, y.x); r.y = vor_vi_vi_vi(x.y, y.y); return r; } +static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vxor_vi_vi_vi(x.x, y.x); r.y = vxor_vi_vi_vi(x.y, y.y); return r; } + +static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { vint2 r; r.x = vsll_vi_vi_i(x.x, c); r.y = vsll_vi_vi_i(x.y, c); return r; } +static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { vint2 r; r.x = vsrl_vi_vi_i(x.x, c); r.y = vsrl_vi_vi_i(x.y, c); return r; } +static 
INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { vint2 r; r.x = vsra_vi_vi_i(x.x, c); r.y = vsra_vi_vi_i(x.y, c); return r; } + +static INLINE vmask veq_vm_vi2_vi2(vint2 x, vint2 y) { + vint2 r; + r.x = _mm_cmpeq_epi32(x.x, y.x); + r.y = _mm_cmpeq_epi32(x.y, y.y); + return vcast_vm_vi2(r); +} + +static INLINE vmask vgt_vm_vi2_vi2(vint2 x, vint2 y) { + vint2 r; + r.x = _mm_cmpgt_epi32(x.x, y.x); + r.y = _mm_cmpgt_epi32(x.y, y.y); + return vcast_vm_vi2(r); +} + +static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { + vint2 r; + r.x = _mm_cmpgt_epi32(x.x, y.x); + r.y = _mm_cmpgt_epi32(x.y, y.y); + return r; +} + +static INLINE vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) { + vint2 r, m2 = vcast_vi2_vm(m); + r.x = vor_vi_vi_vi(vand_vi_vi_vi(m2.x, x.x), vandnot_vi_vi_vi(m2.x, y.x)); + r.y = vor_vi_vi_vi(vand_vi_vi_vi(m2.y, x.y), vandnot_vi_vi_vi(m2.y, y.y)); + return r; +} + +// + +static INLINE double vcast_d_vd(vdouble v) { + double s[4]; + _mm256_storeu_pd(s, v); + return s[0]; +} + +static INLINE float vcast_f_vf(vfloat v) { + float s[8]; + _mm256_storeu_ps(s, v); + return s[0]; +} + +static INLINE vmask vsignbit_vm_vd(vdouble d) { + return (vmask)_mm256_and_pd(d, _mm256_set_pd(-0.0,-0.0,-0.0,-0.0)); +} + +static INLINE vdouble vsign_vd_vd(vdouble d) { + return _mm256_or_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), (vdouble)vsignbit_vm_vd(d)); +} + +static INLINE vdouble vmulsign_vd_vd_vd(vdouble x, vdouble y) { + return (__m256d)vxor_vm_vm_vm((__m256i)x, vsignbit_vm_vd(y)); +} + +static INLINE vmask visinf_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set_pd(INFINITY, INFINITY, INFINITY, INFINITY), _CMP_EQ_OQ); +} + +static INLINE vmask vispinf_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(d, _mm256_set_pd(INFINITY, INFINITY, INFINITY, INFINITY), _CMP_EQ_OQ); +} + +static INLINE vmask visminf_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(d, _mm256_set_pd(-INFINITY, -INFINITY, -INFINITY, -INFINITY), _CMP_EQ_OQ); +} + +static INLINE 
vmask visnan_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(d, d, _CMP_NEQ_UQ); +} + +static INLINE vdouble visinf(vdouble d) { + return _mm256_and_pd((vdouble)visinf_vm_vd(d), vsign_vd_vd(d)); +} + +static INLINE vdouble visinf2(vdouble d, vdouble m) { + return _mm256_and_pd((vdouble)visinf_vm_vd(d), _mm256_or_pd((vdouble)vsignbit_vm_vd(d), m)); +} + +static INLINE vdouble vpow2i_vd_vi(vint q) { + vint r; + vdouble y; + q = _mm_add_epi32(_mm_set_epi32(0x3ff, 0x3ff, 0x3ff, 0x3ff), q); + q = _mm_slli_epi32(q, 20); + r = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(1,0,0,0)); + y = _mm256_castpd128_pd256((__m128d)r); + r = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(3,2,2,2)); + y = _mm256_insertf128_pd(y, (__m128d)r, 1); + y = _mm256_and_pd(y, (__m256d)_mm256_set_epi32(0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0)); + return y; +} + +static INLINE vdouble vldexp_vd_vd_vi(vdouble x, vint q) { + vint m = _mm_srai_epi32(q, 31); + m = _mm_slli_epi32(_mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(m, q), 9), m), 7); + q = _mm_sub_epi32(q, _mm_slli_epi32(m, 2)); + m = _mm_add_epi32(_mm_set_epi32(0x3ff, 0x3ff, 0x3ff, 0x3ff), m); + m = _mm_andnot_si128(_mm_cmplt_epi32(m, _mm_set_epi32(0, 0, 0, 0)), m); + vint n = _mm_cmpgt_epi32(m, _mm_set_epi32(0x7ff, 0x7ff, 0x7ff, 0x7ff)); + m = _mm_or_si128(_mm_andnot_si128(n, m), _mm_and_si128(n, _mm_set_epi32(0x7ff, 0x7ff, 0x7ff, 0x7ff))); + m = _mm_slli_epi32(m, 20); + vint r = (__m128i)_mm_shuffle_ps((__m128)m, (__m128)m, _MM_SHUFFLE(1,0,0,0)); + vdouble y = _mm256_castpd128_pd256((__m128d)r); + r = (__m128i)_mm_shuffle_ps((__m128)m, (__m128)m, _MM_SHUFFLE(3,2,2,2)); + y = _mm256_insertf128_pd(y, (__m128d)r, 1); + y = _mm256_and_pd(y, (__m256d)_mm256_set_epi32(0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0)); + return vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, y), y), y), y), vpow2i_vd_vi(q)); +} + +static INLINE vint vilogbp1_vi_vd(vdouble d) { + vint q, 
r, c; + vmask m = vlt_vm_vd_vd(d, vcast_vd_d(4.9090934652977266E-91)); + d = vsel_vd_vm_vd_vd(m, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d); + c = _mm256_cvtpd_epi32(vsel_vd_vm_vd_vd(m, vcast_vd_d(300+0x3fe), vcast_vd_d(0x3fe))); + q = (__m128i)_mm256_castpd256_pd128(d); + q = (__m128i)_mm_shuffle_ps((__m128)q, _mm_set_ps(0, 0, 0, 0), _MM_SHUFFLE(0,0,3,1)); + r = (__m128i)_mm256_extractf128_pd(d, 1); + r = (__m128i)_mm_shuffle_ps(_mm_set_ps(0, 0, 0, 0), (__m128)r, _MM_SHUFFLE(3,1,0,0)); + q = _mm_or_si128(q, r); + q = _mm_srli_epi32(q, 20); + q = _mm_sub_epi32(q, c); + return q; +} + +static INLINE vdouble vupper_vd_vd(vdouble d) { + return (__m256d)_mm256_and_pd(d, (vdouble)_mm256_set_epi32(0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000)); +} + +static INLINE vfloat vupper_vf_vf(vfloat d) { + return (vfloat)vand_vm_vm_vm((vmask)d, _mm256_set_epi32(0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000,0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000)); +} diff --git a/simd/helperneon.h b/simd/helperneon.h new file mode 100644 index 00000000..d1248db6 --- /dev/null +++ b/simd/helperneon.h @@ -0,0 +1,145 @@ +#ifndef __ARM_NEON__ +#error Please specify -mfpu=neon. 
+#endif + +#include +#include + +typedef int32x4_t vint; +typedef uint32x4_t vmask; + +typedef float32x4_t vfloat; +typedef int32x4_t vint2; + +// + +static INLINE vint vcast_vi_i(int i) { return vdupq_n_s32(i); } + +static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; } +static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; } + +// + +static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return vaddq_s32(x, y); } +static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return vsubq_s32(x, y); } +static INLINE vint vneg_vi_vi(vint e) { return vnegq_s32(e); } + +static INLINE vint vand_vi_vi_vi(vint x, vint y) { return vandq_s32(x, y); } +static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return vbicq_s32(y, x); } +static INLINE vint vor_vi_vi_vi(vint x, vint y) { return vorrq_s32(x, y); } +static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return veorq_s32(x, y); } + +static INLINE vint vsll_vi_vi_i(vint x, int c) { return (int32x4_t) vshlq_n_u32((uint32x4_t)x, c); } +static INLINE vint vsrl_vi_vi_i(vint x, int c) { return (int32x4_t) vshrq_n_u32((uint32x4_t)x, c); } +static INLINE vint vsra_vi_vi_i(vint x, int c) { return vshrq_n_s32(x, c); } + +// + +static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); } +static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vbicq_u32(y, x); } +static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x, y); } +static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x, y); } + +static INLINE vmask veq_vm_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); } +static INLINE vmask vneq_vm_vf_vf(vfloat x, vfloat y) { return vmvnq_u32(vceqq_f32(x, y)); } +static INLINE vmask vlt_vm_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); } +static INLINE vmask vle_vm_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); } +static INLINE vmask vgt_vm_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); } +static INLINE vmask vge_vm_vf_vf(vfloat x, vfloat 
y) { return vcgeq_f32(x, y); } + +// + +static INLINE vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); } + +static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return vaddq_f32(x, y); } +static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return vsubq_f32(x, y); } +static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return vmulq_f32(x, y); } +static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlaq_f32(z, x, y); } +static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlsq_f32(z, x, y); } + +static INLINE vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); } +static INLINE vfloat vneg_vf_vf(vfloat f) { return vnegq_f32(f); } +static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vmaxq_f32(x, y); } +static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vminq_f32(x, y); } + +static INLINE vfloat vsel_vf_vm_vf_vf(vmask mask, vfloat x, vfloat y) { + return (vfloat)vbslq_u32(mask, (vmask)x, (vmask)y); +} + +static INLINE vfloat vrec_vf_vf(vfloat d) { + float32x4_t x = vrecpeq_f32(d); + x = vmulq_f32(x, vrecpsq_f32(d, x)); + return vmlsq_f32(vaddq_f32(x, x), vmulq_f32(x, x), d); +} + +static INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) { + float32x4_t x = vrecpeq_f32(d); + x = vmulq_f32(x, vrecpsq_f32(d, x)); + float32x4_t t = vmulq_f32(n, x); + return vmlsq_f32(vaddq_f32(t, t), vmulq_f32(t, x), d); +} + +static INLINE vfloat vsqrt_vf_vf(vfloat d) { + float32x4_t x = vrsqrteq_f32(d); + x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x))); + float32x4_t u = vmulq_f32(x, d); + u = vmlaq_f32(u, vmlsq_f32(d, u, u), vmulq_f32(x, vdupq_n_f32(0.5))); + return (float32x4_t)vbicq_u32((uint32x4_t)u, vceqq_f32(d, vdupq_n_f32(0.0f))); +} + +static INLINE vfloat vrecsqrt_vf_vf(vfloat d) { + float32x4_t x = vrsqrteq_f32(d); + x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x))); + return vmlaq_f32(x, vmlsq_f32(vdupq_n_f32(1), x, vmulq_f32(x, d)), vmulq_f32(x, vdupq_n_f32(0.5))); +} + +#define 
ENABLE_RECSQRT_SP + +// + +static INLINE vmask veq_vm_vi_vi(vint x, vint y) { return vceqq_s32(x, y); } + +// + +static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; } +static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; } + +static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); } + +static INLINE vint2 vrint_vi2_vf(vfloat d) { + //return vcvtq_s32_f32(vrndqn_f32(d)); + return vcvtq_s32_f32(vaddq_f32(d, (float32x4_t)vorrq_u32(vandq_u32((uint32x4_t)d, (uint32x4_t)vdupq_n_f32(-0.0f)), (uint32x4_t)vdupq_n_f32(0.5f)))); +} + +static INLINE vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); } +static INLINE vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); } + +static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vadd_vi_vi_vi(x, y); } +static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsub_vi_vi_vi(x, y); } +static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vneg_vi_vi(e); } /* returns vint2 like the other *_vi2 helpers (same underlying int32x4_t) */ + +static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vand_vi_vi_vi(x, y); } +static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vandnot_vi_vi_vi(x, y); } +static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vor_vi_vi_vi(x, y); } +static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return vxor_vi_vi_vi(x, y); } + +static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return vsll_vi_vi_i(x, c); } +static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return vsrl_vi_vi_i(x, c); } +static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return vsra_vi_vi_i(x, c); } + +static INLINE vmask veq_vm_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); } +static INLINE vmask vgt_vm_vi2_vi2(vint2 x, vint2 y) { return vcgtq_s32(x, y); } /* per-lane signed x > y as a mask; strict compare (was vcgeq_s32, i.e. >=), matching _mm_cmpgt_epi32 in the x86 helpers */ +static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vcgtq_s32(x, y); } /* same strict x > y compare, result reinterpreted as vint2 */ +static INLINE vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) { return (vint2)vbslq_u32(m, (vmask)x, (vmask)y); } + +// + +static INLINE float vcast_f_vf(vfloat v) { +
float p[4]; + vst1q_f32 (p, v); + return p[0]; +} + +static INLINE vfloat vupper_vf_vf(vfloat d) { + return (vfloat)vandq_s32((vint)d, vdupq_n_s32(0xfffff000)); +} diff --git a/simd/helpersse2.h b/simd/helpersse2.h new file mode 100644 index 00000000..0801b496 --- /dev/null +++ b/simd/helpersse2.h @@ -0,0 +1,235 @@ +#ifndef __SSE2__ +#error Please specify -msse2. +#endif + +#include +#include + +typedef __m128d vdouble; +typedef __m128i vint; +typedef __m128i vmask; + +typedef __m128 vfloat; +typedef __m128i vint2; + +// + +static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); } +static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); } +static INLINE vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); } +static INLINE vdouble vcast_vd_d(double d) { return _mm_set_pd(d, d); } +static INLINE vint vcast_vi_i(int i) { return _mm_set_epi32(0, 0, i, i); } + +static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (__m128i)vd; } +static INLINE vdouble vreinterpret_vd_vm(vint vm) { return (__m128d)vm; } + +static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (__m128i)vf; } +static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (__m128)vm; } + +// + +static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); } +static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); } +static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); } + +static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); } +static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); } +static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); } +static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); } + +static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); } +static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); } +static INLINE vint 
vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); } + +// + +static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return _mm_and_si128(x, y); } +static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); } +static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return _mm_or_si128(x, y); } +static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); } + +static INLINE vmask veq_vm_vd_vd(vdouble x, vdouble y) { return (__m128i)_mm_cmpeq_pd(x, y); } +static INLINE vmask vneq_vm_vd_vd(vdouble x, vdouble y) { return (__m128i)_mm_cmpneq_pd(x, y); } +static INLINE vmask vlt_vm_vd_vd(vdouble x, vdouble y) { return (__m128i)_mm_cmplt_pd(x, y); } +static INLINE vmask vle_vm_vd_vd(vdouble x, vdouble y) { return (__m128i)_mm_cmple_pd(x, y); } +static INLINE vmask vgt_vm_vd_vd(vdouble x, vdouble y) { return (__m128i)_mm_cmpgt_pd(x, y); } +static INLINE vmask vge_vm_vd_vd(vdouble x, vdouble y) { return (__m128i)_mm_cmpge_pd(x, y); } + +static INLINE vmask veq_vm_vf_vf(vfloat x, vfloat y) { return (__m128i)_mm_cmpeq_ps(x, y); } +static INLINE vmask vneq_vm_vf_vf(vfloat x, vfloat y) { return (__m128i)_mm_cmpneq_ps(x, y); } +static INLINE vmask vlt_vm_vf_vf(vfloat x, vfloat y) { return (__m128i)_mm_cmplt_ps(x, y); } +static INLINE vmask vle_vm_vf_vf(vfloat x, vfloat y) { return (__m128i)_mm_cmple_ps(x, y); } +static INLINE vmask vgt_vm_vf_vf(vfloat x, vfloat y) { return (__m128i)_mm_cmpgt_ps(x, y); } +static INLINE vmask vge_vm_vf_vf(vfloat x, vfloat y) { return (__m128i)_mm_cmpge_ps(x, y); } + +// + +static INLINE vfloat vcast_vf_f(float f) { return _mm_set_ps(f, f, f, f); } + +static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_add_ps(x, y); } +static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm_sub_ps(x, y); } +static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm_mul_ps(x, y); } +static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm_div_ps(x, y); } +static 
INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); } +static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm_sqrt_ps(x); } +static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm_max_ps(x, y); } +static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm_min_ps(x, y); } + +static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } +static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); } +static INLINE vfloat vabs_vf_vf(vfloat f) { return (vfloat)vandnot_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)f); } +static INLINE vfloat vneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)d); } + +// + +static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_add_pd(x, y); } +static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm_sub_pd(x, y); } +static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm_mul_pd(x, y); } +static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm_div_pd(x, y); } +static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm_div_pd(_mm_set_pd(1, 1), x); } +static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm_sqrt_pd(x); } +static INLINE vdouble vabs_vd_vd(vdouble d) { return (__m128d)_mm_andnot_pd(_mm_set_pd(-0.0,-0.0), d); } +static INLINE vdouble vneg_vd_vd(vdouble d) { return (__m128d)_mm_xor_pd(_mm_set_pd(-0.0,-0.0), d); } +static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } +static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } + +static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm_max_pd(x, y); } +static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm_min_pd(x, y); } + +// + +static INLINE vmask veq_vm_vi_vi(vint 
x, vint y) { + __m128 s = (__m128)_mm_cmpeq_epi32(x, y); + return (__m128i)_mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 0, 0)); +} + +static INLINE vdouble vsel_vd_vm_vd_vd(vmask mask, vdouble x, vdouble y) { + return (__m128d)vor_vm_vm_vm(vand_vm_vm_vm(mask, (__m128i)x), vandnot_vm_vm_vm(mask, (__m128i)y)); +} + +static INLINE vfloat vsel_vf_vm_vf_vf(vmask mask, vfloat x, vfloat y) { + return (vfloat)vor_vm_vm_vm(vand_vm_vm_vm(mask, (vmask)x), vandnot_vm_vm_vm(mask, (vmask)y)); +} + +static INLINE vint vsel_vi_vd_vd_vi_vi(vdouble d0, vdouble d1, vint x, vint y) { + vmask mask = (vmask)_mm_cmpeq_ps(_mm_cvtpd_ps((vdouble)vlt_vm_vd_vd(d0, d1)), _mm_set_ps(0, 0, 0, 0)); + return vor_vi_vi_vi(vandnot_vi_vi_vi(mask, x), vand_vi_vi_vi(mask, y)); +} + +// + +static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; } +static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; } + +static INLINE vint2 vrint_vi2_vf(vfloat vf) { return _mm_cvtps_epi32(vf); } +static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return _mm_cvttps_epi32(vf); } +static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm_cvtepi32_ps(vcast_vm_vi2(vi)); } +static INLINE vint2 vcast_vi2_i(int i) { return _mm_set_epi32(i, i, i, i); } + +static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vadd_vi_vi_vi(x, y); } +static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsub_vi_vi_vi(x, y); } +static INLINE vint vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); } + +static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vand_vi_vi_vi(x, y); } +static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vandnot_vi_vi_vi(x, y); } +static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vor_vi_vi_vi(x, y); } +static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return vxor_vi_vi_vi(x, y); } + +static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return vsll_vi_vi_i(x, c); } +static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return 
vsrl_vi_vi_i(x, c); } +static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return vsra_vi_vi_i(x, c); } + +static INLINE vmask veq_vm_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); } +static INLINE vmask vgt_vm_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); } +static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); } +static INLINE vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) { return vor_vm_vm_vm(vand_vm_vm_vm(m, x), vandnot_vm_vm_vm(m, y)); } + +// + +static INLINE double vcast_d_vd(vdouble v) { + double s[2]; + _mm_storeu_pd(s, v); + return s[0]; +} + +static INLINE float vcast_f_vf(vfloat v) { + float s[4]; + _mm_storeu_ps(s, v); + return s[0]; +} + +static INLINE vmask vsignbit_vm_vd(vdouble d) { + return _mm_and_si128((__m128i)d, _mm_set_epi32(0x80000000, 0x0, 0x80000000, 0x0)); +} + +static INLINE vdouble vsign_vd_vd(vdouble d) { + return (__m128d)_mm_or_si128((__m128i)_mm_set_pd(1, 1), _mm_and_si128((__m128i)d, _mm_set_epi32(0x80000000, 0x0, 0x80000000, 0x0))); +} + +static INLINE vdouble vmulsign_vd_vd_vd(vdouble x, vdouble y) { + return (__m128d)vxor_vi_vi_vi((__m128i)x, vsignbit_vm_vd(y)); +} + +static INLINE vmask visinf_vm_vd(vdouble d) { + return (vmask)_mm_cmpeq_pd(vabs_vd_vd(d), _mm_set_pd(INFINITY, INFINITY)); +} + +static INLINE vmask vispinf_vm_vd(vdouble d) { + return (vmask)_mm_cmpeq_pd(d, _mm_set_pd(INFINITY, INFINITY)); +} + +static INLINE vmask visminf_vm_vd(vdouble d) { + return (vmask)_mm_cmpeq_pd(d, _mm_set_pd(-INFINITY, -INFINITY)); +} + +static INLINE vmask visnan_vm_vd(vdouble d) { + return (vmask)_mm_cmpneq_pd(d, d); +} + +static INLINE vdouble visinf(vdouble d) { + return (__m128d)_mm_and_si128(visinf_vm_vd(d), _mm_or_si128(vsignbit_vm_vd(d), (__m128i)_mm_set_pd(1, 1))); +} + +static INLINE vdouble visinf2(vdouble d, vdouble m) { + return (__m128d)_mm_and_si128(visinf_vm_vd(d), _mm_or_si128(vsignbit_vm_vd(d), (__m128i)m)); +} + +// + +static INLINE vdouble vpow2i_vd_vi(vint 
q) { + q = _mm_add_epi32(_mm_set_epi32(0x0, 0x0, 0x3ff, 0x3ff), q); + q = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(1,3,0,3)); + return (__m128d)_mm_slli_epi32(q, 20); +} + +static INLINE vdouble vldexp_vd_vd_vi(vdouble x, vint q) { + vint m = _mm_srai_epi32(q, 31); + m = _mm_slli_epi32(_mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(m, q), 9), m), 7); + q = _mm_sub_epi32(q, _mm_slli_epi32(m, 2)); + m = _mm_add_epi32(_mm_set_epi32(0x0, 0x0, 0x3ff, 0x3ff), m); + m = _mm_andnot_si128(_mm_cmplt_epi32(m, _mm_set_epi32(0, 0, 0, 0)), m); + vint n = _mm_cmpgt_epi32(m, _mm_set_epi32(0x0, 0x0, 0x7ff, 0x7ff)); + m = _mm_or_si128(_mm_andnot_si128(n, m), _mm_and_si128(n, _mm_set_epi32(0x0, 0x0, 0x7ff, 0x7ff))); + m = (__m128i)_mm_shuffle_ps((__m128)m, (__m128)m, _MM_SHUFFLE(1,3,0,3)); + vdouble y = (__m128d)_mm_slli_epi32(m, 20); + return vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, y), y), y), y), vpow2i_vd_vi(q)); +} + +static INLINE vint vilogbp1_vi_vd(vdouble d) { + vint m = vlt_vm_vd_vd(d, vcast_vd_d(4.9090934652977266E-91)); + d = vsel_vd_vm_vd_vd(m, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d); + __m128i q = _mm_and_si128((__m128i)d, _mm_set_epi32(((1 << 12)-1) << 20, 0, ((1 << 12)-1) << 20, 0)); + q = _mm_srli_epi32(q, 20); + q = vor_vm_vm_vm(vand_vm_vm_vm (m, _mm_sub_epi32(q, _mm_set_epi32(300 + 0x3fe, 0, 300 + 0x3fe, 0))), + vandnot_vm_vm_vm(m, _mm_sub_epi32(q, _mm_set_epi32( 0x3fe, 0, 0x3fe, 0)))); + q = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(0,0,3,1)); + return q; +} + +static INLINE vdouble vupper_vd_vd(vdouble d) { + return (__m128d)_mm_and_si128((__m128i)d, _mm_set_epi32(0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000)); +} + +static INLINE vfloat vupper_vf_vf(vfloat d) { + return (__m128)_mm_and_si128((__m128i)d, _mm_set_epi32(0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000)); +} diff --git a/simd/iut.c b/simd/iut.c new file mode 100644 index 00000000..d85d8c40 --- /dev/null +++ b/simd/iut.c @@ 
-0,0 +1,1730 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include "sleefsimd.h" + +static jmp_buf sigjmp; + +static void sighandler(int signum) { + longjmp(sigjmp, 1); +} + +int detectFeature() { + signal(SIGILL, sighandler); + + if (setjmp(sigjmp) == 0) { +#ifdef ENABLE_DP + double s[VECTLENDP]; + int i; + for(i=0;i= 2) { + i = read(fd, buf, 1); + if (i != 1) return i; + + if (*buf == '\n') break; + + rcnt++; + buf++; + cnt--; + } + + *++buf = '\0'; + rcnt++; + return rcnt; +} + +int startsWith(char *str, char *prefix) { + return strncmp(str, prefix, strlen(prefix)) == 0; +} + +double u2d(uint64_t u) { + union { + double f; + uint64_t i; + } tmp; + tmp.i = u; + return tmp.f; +} + +uint64_t d2u(double d) { + union { + double f; + uint64_t i; + } tmp; + tmp.f = d; + return tmp.i; +} + +float u2f(uint32_t u) { + union { + float f; + uint32_t i; + } tmp; + tmp.i = u; + return tmp.f; +} + +uint32_t f2u(float d) { + union { + float f; + uint32_t i; + } tmp; + tmp.f = d; + return tmp.i; +} + +typedef struct { + double x, y; +} double2; + +#ifdef ENABLE_DP + +double xxsin(double d) { + double s[VECTLENDP]; + int i; + for(i=0;i +#include + +#endif diff --git a/simd/sleefsimd.h b/simd/sleefsimd.h new file mode 100644 index 00000000..c9418689 --- /dev/null +++ b/simd/sleefsimd.h @@ -0,0 +1,218 @@ +#include + +// ******** SSE2 ******** + +#ifdef ENABLE_SSE2 +#include + +#define VECTLENDP 2 +#define VECTLENSP 4 + +typedef __m128d vdouble; +typedef __m128i vint; + +typedef __m128 vfloat; +typedef __m128i vint2; + +static vdouble vloadu(double *p) { return _mm_loadu_pd(p); } +static void vstoreu(double *p, vdouble v) { _mm_storeu_pd(p, v); } + +static vfloat vloaduf(float *p) { return _mm_loadu_ps(p); } +static void vstoreuf(float *p, vfloat v) { _mm_storeu_ps(p, v); } + +static vint2 vloadui2(int32_t *p) { return (vint2)_mm_loadu_si128((__m128i *)p); } +static 
void vstoreui2(int32_t *p, vint2 v) { _mm_storeu_si128((__m128i *)p, (__m128i)v); } + +#define ENABLE_DP +#define ENABLE_SP +#endif + + +// ******** AVX ******** + +#if defined(ENABLE_AVX) || defined(ENABLE_FMA4) +#include + +#define VECTLENDP 4 +#define VECTLENSP 8 + +typedef __m256d vdouble; +typedef __m128i vint; + +typedef __m256 vfloat; +typedef struct { + vint x, y; +} vint2; + +static vdouble vloadu(double *p) { return _mm256_loadu_pd(p); } +static void vstoreu(double *p, vdouble v) { return _mm256_storeu_pd(p, v); } + +static vfloat vloaduf(float *p) { return _mm256_loadu_ps(p); } +static void vstoreuf(float *p, vfloat v) { return _mm256_storeu_ps(p, v); } + +static vint2 vloadui2(int32_t *p) { + vint2 r; + r.x = _mm_loadu_si128((__m128i *) p ); + r.y = _mm_loadu_si128((__m128i *)(p + 4)); + return r; +} + +static void vstoreui2(int32_t *p, vint2 v) { + _mm_storeu_si128((__m128i *) p , v.x); + _mm_storeu_si128((__m128i *)(p + 4), v.y); +} + +#define ENABLE_DP +#define ENABLE_SP +#endif + + +// ******** AVX2 ******** + +#ifdef ENABLE_AVX2 +#include + +#define VECTLENDP 4 +#define VECTLENSP 8 + +typedef __m256d vdouble; +typedef __m128i vint; + +typedef __m256 vfloat; +typedef __m256i vint2; + +static vdouble vloadu(double *p) { return _mm256_loadu_pd(p); } +static void vstoreu(double *p, vdouble v) { return _mm256_storeu_pd(p, v); } + +static vfloat vloaduf(float *p) { return _mm256_loadu_ps(p); } +static void vstoreuf(float *p, vfloat v) { return _mm256_storeu_ps(p, v); } + +static vint2 vloadui2(int32_t *p) { return _mm256_loadu_si256((__m256i const *)p); } +static void vstoreui2(int32_t *p, vint2 v) { return _mm256_storeu_si256((__m256i *)p, v); } + +#define ENABLE_DP +#define ENABLE_SP +#endif + + +// ******** ARM NEON ******** + +#ifdef ENABLE_NEON +#include + +#define VECTLENDP 2 +#define VECTLENSP 4 + +//typedef __m128d vdouble; +typedef int32x4_t vint; +typedef uint32x4_t vmask; + +typedef float32x4_t vfloat; +typedef int32x4_t vint2; + +//static 
vdouble vloadu(double *p) { return _mm_loadu_pd(p); } +//static void vstoreu(double *p, vdouble v) { _mm_storeu_pd(p, v); } + +static vfloat vloaduf(float *p) { return vld1q_f32(p); } +static void vstoreuf(float *p, vfloat v) { vst1q_f32(p, v); } + +static vint2 vloadui2(int32_t *p) { return (vint2)vld1q_s32(p); } +static void vstoreui2(int32_t *p, vint2 v) { vst1q_s32(p, v); } + +#define ENABLE_SP +#endif + + +#ifdef ENABLE_DP +typedef struct { + vdouble x, y; +} vdouble2; + +vdouble xldexp(vdouble x, vint q); +vint xilogb(vdouble d); + +vdouble xsin(vdouble d); +vdouble xcos(vdouble d); +vdouble2 xsincos(vdouble d); +vdouble xtan(vdouble d); +vdouble xasin(vdouble s); +vdouble xacos(vdouble s); +vdouble xatan(vdouble s); +vdouble xatan2(vdouble y, vdouble x); +vdouble xlog(vdouble d); +vdouble xexp(vdouble d); +vdouble xpow(vdouble x, vdouble y); + +vdouble xsinh(vdouble d); +vdouble xcosh(vdouble d); +vdouble xtanh(vdouble d); +vdouble xasinh(vdouble s); +vdouble xacosh(vdouble s); +vdouble xatanh(vdouble s); + +vdouble xcbrt(vdouble d); + +vdouble xexp2(vdouble a); +vdouble xexp10(vdouble a); +vdouble xexpm1(vdouble a); +vdouble xlog10(vdouble a); +vdouble xlog1p(vdouble a); + +vdouble xsin_u1(vdouble d); +vdouble xcos_u1(vdouble d); +vdouble2 xsincos_u1(vdouble d); +vdouble xtan_u1(vdouble d); +vdouble xasin_u1(vdouble s); +vdouble xacos_u1(vdouble s); +vdouble xatan_u1(vdouble s); +vdouble xatan2_u1(vdouble y, vdouble x); +vdouble xlog_u1(vdouble d); +vdouble xcbrt_u1(vdouble d); +#endif + +// + +#ifdef ENABLE_SP +typedef struct { + vfloat x, y; +} vfloat2; + +vfloat xldexpf(vfloat x, vint2 q); + +vfloat xsinf(vfloat d); +vfloat xcosf(vfloat d); +vfloat2 xsincosf(vfloat d); +vfloat xtanf(vfloat d); +vfloat xasinf(vfloat s); +vfloat xacosf(vfloat s); +vfloat xatanf(vfloat s); +vfloat xatan2f(vfloat y, vfloat x); +vfloat xlogf(vfloat d); +vfloat xexpf(vfloat d); +vfloat xcbrtf(vfloat s); +vfloat xsqrtf(vfloat s); + +vfloat xpowf(vfloat x, vfloat y); +vfloat 
xsinhf(vfloat x); +vfloat xcoshf(vfloat x); +vfloat xtanhf(vfloat x); +vfloat xasinhf(vfloat x); +vfloat xacoshf(vfloat x); +vfloat xatanhf(vfloat x); +vfloat xexp2f(vfloat a); +vfloat xexp10f(vfloat a); +vfloat xexpm1f(vfloat a); +vfloat xlog10f(vfloat a); +vfloat xlog1pf(vfloat a); + +vfloat xsinf_u1(vfloat d); +vfloat xcosf_u1(vfloat d); +vfloat2 xsincosf_u1(vfloat d); +vfloat xtanf_u1(vfloat d); +vfloat xasinf_u1(vfloat s); +vfloat xacosf_u1(vfloat s); +vfloat xatanf_u1(vfloat s); +vfloat xatan2f_u1(vfloat y, vfloat x); +vfloat xlogf_u1(vfloat d); +vfloat xcbrtf_u1(vfloat s); +#endif diff --git a/simd/sleefsimddp.c b/simd/sleefsimddp.c new file mode 100644 index 00000000..c3cd51ec --- /dev/null +++ b/simd/sleefsimddp.c @@ -0,0 +1,1022 @@ +#include +#include + +#if defined (__GNUC__) || defined (__INTEL_COMPILER) || defined (__clang__) +#define INLINE __attribute__((always_inline)) +#else +#define INLINE inline +#endif + +#include "nonnumber.h" + +#ifdef ENABLE_SSE2 +#include "helpersse2.h" +#endif + +#ifdef ENABLE_AVX +#include "helperavx.h" +#endif + +#ifdef ENABLE_AVX2 +#include "helperavx2.h" +#endif + +#ifdef ENABLE_FMA4 +#include "helperfma4.h" +#endif + +// + +#include "dd.h" + +// + +#define PI4_A 0.78539816290140151978 +#define PI4_B 4.9604678871439933374e-10 +#define PI4_C 1.1258708853173288931e-18 +#define PI4_D 1.7607799325916000908e-27 + +#define M_4_PI 1.273239544735162542821171882678754627704620361328125 + +#define L2U .69314718055966295651160180568695068359375 +#define L2L .28235290563031577122588448175013436025525412068e-12 +#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931 + +// + +#define PI4_Af 0.78515625f +#define PI4_Bf 0.00024187564849853515625f +#define PI4_Cf 3.7747668102383613586e-08f +#define PI4_Df 1.2816720341285448015e-12f + +#define L2Uf 0.693145751953125f +#define L2Lf 1.428606765330187045e-06f +#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f + +// + +vdouble 
xldexp(vdouble x, vint q) { return vldexp_vd_vd_vi(x, q); } + +vint xilogb(vdouble d) { + vdouble e = vcast_vd_vi(vsub_vi_vi_vi(vilogbp1_vi_vd(vabs_vd_vd(d)), vcast_vi_i(1))); + e = vsel_vd_vm_vd_vd(veq_vm_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-2147483648.0), e); + e = vsel_vd_vm_vd_vd(veq_vm_vd_vd(vabs_vd_vd(d), vcast_vd_d(INFINITY)), vcast_vd_d(2147483647), e); + return vrint_vi_vd(e); +} + +vdouble xsin(vdouble d) { + vint q; + vdouble u, s; + + q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI))); + + u = vcast_vd_vi(q); + d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_A*4), d); + d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_B*4), d); + d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_C*4), d); + d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_D*4), d); + + s = vmul_vd_vd_vd(d, d); + + d = (vdouble)vxor_vm_vm_vm(vand_vm_vm_vm(veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1)), (vmask)vcast_vd_d(-0.0)), (vmask)d); + + u = vcast_vd_d(-7.97255955009037868891952e-18); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.81009972710863200091251e-15)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-7.64712219118158833288484e-13)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1.60590430605664501629054e-10)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.50521083763502045810755e-08)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573192239198747630416e-06)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000198412698412696162806809)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00833333333333332974823815)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808)); + + u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, d), d); + + return u; +} + +vdouble xsin_u1(vdouble d) { + vint q; + vdouble u; + vdouble2 s, t, x; + + q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI))); + u = vcast_vd_vi(q); + + s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_A*4))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_B*4))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_C*4))); + s = 
ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_D*4))); + + t = s; + s = ddsqu_vd2_vd2(s); + + u = vcast_vd_d(2.72052416138529567917983e-15); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-7.6429259411395447190023e-13)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(1.60589370117277896211623e-10)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.5052106814843123359368e-08)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.75573192104428224777379e-06)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.000198412698412046454654947)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00833333333333318056201922)); + + x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, s.x)), s)); + + x = ddmul_vd2_vd2_vd2(t, x); + u = vadd_vd_vd_vd(x.x, x.y); + + u = (vdouble)vxor_vm_vm_vm(vand_vm_vm_vm(veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1)), (vmask)vcast_vd_d(-0.0)), (vmask)u); + + return u; +} + +vdouble xcos(vdouble d) { + vint q; + vdouble u, s; + + q = vrint_vi_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5))); + q = vadd_vi_vi_vi(vadd_vi_vi_vi(q, q), vcast_vi_i(1)); + + u = vcast_vd_vi(q); + d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_A*2), d); + d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_B*2), d); + d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_C*2), d); + d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_D*2), d); + + s = vmul_vd_vd_vd(d, d); + + d = (vdouble)vxor_vm_vm_vm(vand_vm_vm_vm(veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(0)), (vmask)vcast_vd_d(-0.0)), (vmask)d); + + u = vcast_vd_d(-7.97255955009037868891952e-18); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.81009972710863200091251e-15)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-7.64712219118158833288484e-13)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1.60590430605664501629054e-10)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.50521083763502045810755e-08)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573192239198747630416e-06)); + u = 
vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000198412698412696162806809)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00833333333333332974823815)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808)); + + u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, d), d); + + return u; +} + +vdouble xcos_u1(vdouble d) { + vint q; + vdouble u; + vdouble2 s, t, x; + + q = vrint_vi_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5))); + q = vadd_vi_vi_vi(vadd_vi_vi_vi(q, q), vcast_vi_i(1)); + u = vcast_vd_vi(q); + + s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_A*2))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_B*2))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_C*2))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_D*2))); + + t = s; + s = ddsqu_vd2_vd2(s); + + u = vcast_vd_d(2.72052416138529567917983e-15); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-7.6429259411395447190023e-13)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(1.60589370117277896211623e-10)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.5052106814843123359368e-08)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.75573192104428224777379e-06)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.000198412698412046454654947)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00833333333333318056201922)); + + x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, s.x)), s)); + + x = ddmul_vd2_vd2_vd2(t, x); + u = vadd_vd_vd_vd(x.x, x.y); + + u = (vdouble)vxor_vm_vm_vm(vand_vm_vm_vm(veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(0)), (vmask)vcast_vd_d(-0.0)), (vmask)u); + + return u; +} + +vdouble2 xsincos(vdouble d) { + vint q; + vmask m; + vdouble u, s, t, rx, ry; + vdouble2 r; + + q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_2_PI))); + + s = d; + + u = vcast_vd_vi(q); + s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_A*2), s); + s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_B*2), s); + s = 
vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_C*2), s); + s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_D*2), s); + + t = s; + + s = vmul_vd_vd_vd(s, s); + + u = vcast_vd_d(1.58938307283228937328511e-10); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.50506943502539773349318e-08)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573131776846360512547e-06)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000198412698278911770864914)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0083333333333191845961746)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666130709393)); + u = vmul_vd_vd_vd(vmul_vd_vd_vd(u, s), t); + + rx = vadd_vd_vd_vd(t, u); + + u = vcast_vd_d(-1.13615350239097429531523e-11); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.08757471207040055479366e-09)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.75573144028847567498567e-07)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.48015872890001867311915e-05)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.00138888888888714019282329)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0416666666666665519592062)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.5)); + + ry = vmla_vd_vd_vd_vd(s, u, vcast_vd_d(1)); + + m = veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(0)); + r.x = vsel_vd_vm_vd_vd(m, rx, ry); + r.y = vsel_vd_vm_vd_vd(m, ry, rx); + + m = veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(2)); + r.x = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vm_vm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x))); + + m = veq_vm_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2)); + r.y = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vm_vm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y))); + + m = visinf_vm_vd(d); + r.x = (vdouble)vor_vm_vm_vm(m, (vmask)r.x); + r.y = (vdouble)vor_vm_vm_vm(m, (vmask)r.y); + + return r; +} + +vdouble2 xsincos_u1(vdouble d) { + vint q; + vmask m; + vdouble u, rx, ry; + vdouble2 r, s, t, x; + + q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI))); 
+ u = vcast_vd_vi(q); + + s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_A*2))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_B*2))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_C*2))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_D*2))); + + t = s; + + s = ddsqu_vd2_vd2(s); + s.x = vadd_vd_vd_vd(s.x, s.y); + + u = vcast_vd_d(1.58938307283228937328511e-10); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.50506943502539773349318e-08)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.75573131776846360512547e-06)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.000198412698278911770864914)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0083333333333191845961746)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.166666666666666130709393)); + + u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(s.x, t.x)); + + x = ddadd_vd2_vd2_vd(t, u); + rx = vadd_vd_vd_vd(x.x, x.y); + + u = vcast_vd_d(-1.13615350239097429531523e-11); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.08757471207040055479366e-09)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.75573144028847567498567e-07)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.48015872890001867311915e-05)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.00138888888888714019282329)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0416666666666665519592062)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.5)); + + x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd_vd(s.x, u)); + ry = vadd_vd_vd_vd(x.x, x.y); + + m = veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(0)); + r.x = vsel_vd_vm_vd_vd(m, rx, ry); + r.y = vsel_vd_vm_vd_vd(m, ry, rx); + + m = veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(2)); + r.x = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vm_vm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x))); + + m = veq_vm_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2)); + r.y = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vm_vm(m, 
vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y))); + + m = visinf_vm_vd(d); + r.x = (vdouble)vor_vm_vm_vm(m, (vmask)r.x); + r.y = (vdouble)vor_vm_vm_vm(m, (vmask)r.y); + + return r; +} + +vdouble xtan(vdouble d) { + vint q; + vdouble u, s, x; + vmask m; + + q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_2_PI))); + + u = vcast_vd_vi(q); + x = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_A*2), d); + x = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_B*2), x); + x = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_C*2), x); + x = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_D*2), x); + + s = vmul_vd_vd_vd(x, x); + + m = veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1)); + x = (vdouble)vxor_vm_vm_vm(vand_vm_vm_vm(m, (vmask)vcast_vd_d(-0.0)), (vmask)x); + + u = vcast_vd_d(1.01419718511083373224408e-05); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.59519791585924697698614e-05)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(5.23388081915899855325186e-05)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-3.05033014433946488225616e-05)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(7.14707504084242744267497e-05)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(8.09674518280159187045078e-05)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000244884931879331847054404)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000588505168743587154904506)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00145612788922812427978848)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00359208743836906619142924)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00886323944362401618113356)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0218694882853846389592078)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0539682539781298417636002)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.133333333333125941821962)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.333333333333334980164153)); + + u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, x), x); + + u = vsel_vd_vm_vd_vd(m, vrec_vd_vd(u), u); + + u = (vdouble)vor_vm_vm_vm(visinf_vm_vd(d), (vmask)u); + + return u; +} + +vdouble 
xtan_u1(vdouble d) { + vint q; + vdouble u; + vdouble2 s, t, x; + vmask m; + + q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_2_PI))); + u = vcast_vd_vi(q); + + s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_A*2))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_B*2))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_C*2))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_D*2))); + + m = veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1)); + vmask n = vand_vm_vm_vm(m, (vmask)vcast_vd_d(-0.0)); + s.x = (vdouble)vxor_vm_vm_vm((vmask)s.x, n); + s.y = (vdouble)vxor_vm_vm_vm((vmask)s.y, n); + + t = s; + s = ddsqu_vd2_vd2(s); + + u = vcast_vd_d(1.01419718511083373224408e-05); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.59519791585924697698614e-05)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(5.23388081915899855325186e-05)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-3.05033014433946488225616e-05)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(7.14707504084242744267497e-05)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(8.09674518280159187045078e-05)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.000244884931879331847054404)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.000588505168743587154904506)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00145612788922812427978848)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00359208743836906619142924)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00886323944362401618113356)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0218694882853846389592078)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0539682539781298417636002)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.133333333333125941821962)); + + x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(0.333333333333334980164153), vmul_vd_vd_vd(u, s.x)), s)); + x = ddmul_vd2_vd2_vd2(t, x); + + x = vsel_vd2_vm_vd2_vd2(m, ddrec_vd2_vd2(x), x); + + u = vadd_vd_vd_vd(x.x, x.y); + + return u; +} + +static INLINE 
vdouble atan2k(vdouble y, vdouble x) { + vdouble s, t, u; + vint q; + vmask p; + + q = vsel_vi_vd_vd_vi_vi(x, vcast_vd_d(0), vcast_vi_i(-2), vcast_vi_i(0)); + x = vabs_vd_vd(x); + + q = vsel_vi_vd_vd_vi_vi(x, y, vadd_vi_vi_vi(q, vcast_vi_i(1)), q); + p = vlt_vm_vd_vd(x, y); + s = vsel_vd_vm_vd_vd(p, vneg_vd_vd(x), y); + t = vmax_vd_vd_vd(x, y); + + s = vdiv_vd_vd_vd(s, t); + t = vmul_vd_vd_vd(s, s); + + u = vcast_vd_d(-1.88796008463073496563746e-05); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.000209850076645816976906797)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.00110611831486672482563471)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.00370026744188713119232403)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.00889896195887655491740809)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.016599329773529201970117)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0254517624932312641616861)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0337852580001353069993897)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0407629191276836500001934)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0466667150077840625632675)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0523674852303482457616113)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0587666392926673580854313)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0666573579361080525984562)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0769219538311769618355029)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.090908995008245008229153)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.111111105648261418443745)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.14285714266771329383765)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.199999999996591265594148)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.333333333333311110369124)); + + t = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(t, u), s); + t = vmla_vd_vd_vd_vd(vcast_vd_vi(q), vcast_vd_d(M_PI/2), t); + + return t; +} + +static INLINE vdouble2 atan2k_u1(vdouble2 y, vdouble2 x) { + vdouble u; + vdouble2 s, t; + vint q; + vmask p; + + q = 
vsel_vi_vd_vd_vi_vi(x.x, vcast_vd_d(0), vcast_vi_i(-2), vcast_vi_i(0)); + p = vlt_vm_vd_vd(x.x, vcast_vd_d(0)); + p = vand_vm_vm_vm(p, (vmask)vcast_vd_d(-0.0)); + x.x = (vdouble)vxor_vm_vm_vm((vmask)x.x, p); + x.y = (vdouble)vxor_vm_vm_vm((vmask)x.y, p); + + q = vsel_vi_vd_vd_vi_vi(x.x, y.x, vadd_vi_vi_vi(q, vcast_vi_i(1)), q); + p = vlt_vm_vd_vd(x.x, y.x); + s = vsel_vd2_vm_vd2_vd2(p, ddneg_vd2_vd2(x), y); + t = vsel_vd2_vm_vd2_vd2(p, y, x); + + s = dddiv_vd2_vd2_vd2(s, t); + t = ddsqu_vd2_vd2(s); + t = ddnormalize_vd2_vd2(t); + + u = vcast_vd_d(1.06298484191448746607415e-05); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.000125620649967286867384336)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.00070557664296393412389774)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.00251865614498713360352999)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.00646262899036991172313504)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0128281333663399031014274)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.0208024799924145797902497)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0289002344784740315686289)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.0359785005035104590853656)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.041848579703592507506027)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.0470843011653283988193763)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0524914210588448421068719)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.0587946590969581003860434)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0666620884778795497194182)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.0769225330296203768654095)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0909090442773387574781907)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.111111108376896236538123)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.142857142756268568062339)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.199999999997977351284817)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.333333333333317605173818)); + + t = 
ddmul_vd2_vd2_vd(t, u); + t = ddmul_vd2_vd2_vd2(s, ddadd_vd2_vd_vd2(vcast_vd_d(1), t)); + t = ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd(vcast_vd2_d_d(1.570796326794896557998982, 6.12323399573676603586882e-17), vcast_vd_vi(q)), t); + + return t; +} + +vdouble xatan2(vdouble y, vdouble x) { + vdouble r = atan2k(vabs_vd_vd(y), x); + + r = vmulsign_vd_vd_vd(r, x); + r = vsel_vd_vm_vd_vd(vor_vm_vm_vm(visinf_vm_vd(x), veq_vm_vd_vd(x, vcast_vd_d(0))), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/2), x))), r); + r = vsel_vd_vm_vd_vd(visinf_vm_vd(y), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/4), x))), r); + r = vsel_vd_vm_vd_vd(veq_vm_vd_vd(y, vcast_vd_d(0.0)), (vdouble)vand_vm_vm_vm(veq_vm_vd_vd(vsign_vd_vd(x), vcast_vd_d(-1.0)), (vmask)vcast_vd_d(M_PI)), r); + + r = (vdouble)vor_vm_vm_vm(vor_vm_vm_vm(visnan_vm_vd(x), visnan_vm_vd(y)), (vmask)vmulsign_vd_vd_vd(r, y)); + return r; +} + +vdouble xatan2_u1(vdouble y, vdouble x) { + vdouble2 d = atan2k_u1(vcast_vd2_vd_vd(vabs_vd_vd(y), vcast_vd_d(0)), vcast_vd2_vd_vd(x, vcast_vd_d(0))); + vdouble r = vadd_vd_vd_vd(d.x, d.y); + + r = vmulsign_vd_vd_vd(r, x); + r = vsel_vd_vm_vd_vd(vor_vm_vm_vm(visinf_vm_vd(x), veq_vm_vd_vd(x, vcast_vd_d(0))), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/2), x))), r); + r = vsel_vd_vm_vd_vd(visinf_vm_vd(y), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/4), x))), r); + r = vsel_vd_vm_vd_vd(veq_vm_vd_vd(y, vcast_vd_d(0.0)), (vdouble)vand_vm_vm_vm(veq_vm_vd_vd(vsign_vd_vd(x), vcast_vd_d(-1.0)), (vmask)vcast_vd_d(M_PI)), r); + + r = (vdouble)vor_vm_vm_vm(vor_vm_vm_vm(visnan_vm_vd(x), visnan_vm_vd(y)), (vmask)vmulsign_vd_vd_vd(r, y)); + return r; +} + +vdouble xasin(vdouble d) { + vdouble x, y; + x = vadd_vd_vd_vd(vcast_vd_d(1), d); + y = vsub_vd_vd_vd(vcast_vd_d(1), d); + x = vmul_vd_vd_vd(x, y); + x = vsqrt_vd_vd(x); + x = (vdouble)vor_vm_vm_vm(visnan_vm_vd(x), 
(vmask)atan2k(vabs_vd_vd(d), x)); + return vmulsign_vd_vd_vd(x, d); +} + +vdouble xasin_u1(vdouble d) { + vdouble2 d2 = atan2k_u1(vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), ddsqrt_vd2_vd2(ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(1), d), ddsub_vd2_vd_vd(vcast_vd_d(1), d)))); + vdouble r = vadd_vd_vd_vd(d2.x, d2.y); + r = vsel_vd_vm_vd_vd(veq_vm_vd_vd(vabs_vd_vd(d), vcast_vd_d(1)), vcast_vd_d(1.570796326794896557998982), r); + return vmulsign_vd_vd_vd(r, d); +} + +vdouble xacos(vdouble d) { + vdouble x, y; + x = vadd_vd_vd_vd(vcast_vd_d(1), d); + y = vsub_vd_vd_vd(vcast_vd_d(1), d); + x = vmul_vd_vd_vd(x, y); + x = vsqrt_vd_vd(x); + x = vmulsign_vd_vd_vd(atan2k(x, vabs_vd_vd(d)), d); + y = (vdouble)vand_vm_vm_vm(vlt_vm_vd_vd(d, vcast_vd_d(0)), (vmask)vcast_vd_d(M_PI)); + x = vadd_vd_vd_vd(x, y); + return x; +} + +vdouble xacos_u1(vdouble d) { + vdouble2 d2 = atan2k_u1(ddsqrt_vd2_vd2(ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(1), d), ddsub_vd2_vd_vd(vcast_vd_d(1), d))), vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0))); + d2 = ddscale_vd2_vd2_vd(d2, vmulsign_vd_vd_vd(vcast_vd_d(1), d)); + + vmask m; + m = vneq_vm_vd_vd(vabs_vd_vd(d), vcast_vd_d(1)); + d2.x = (vdouble)vand_vm_vm_vm(m, (vmask)d2.x); + d2.y = (vdouble)vand_vm_vm_vm(m, (vmask)d2.y); + m = vlt_vm_vd_vd(d, vcast_vd_d(0)); + d2 = vsel_vd2_vm_vd2_vd2(m, ddadd_vd2_vd2_vd2(vcast_vd2_d_d(3.141592653589793116, 1.2246467991473532072e-16), d2), d2); + + return vadd_vd_vd_vd(d2.x, d2.y); +} + +vdouble xatan_u1(vdouble d) { + vdouble2 d2 = atan2k_u1(vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), vcast_vd2_d_d(1, 0)); + vdouble r = vadd_vd_vd_vd(d2.x, d2.y); + r = vsel_vd_vm_vd_vd(visinf_vm_vd(d), vcast_vd_d(1.570796326794896557998982), r); + return vmulsign_vd_vd_vd(r, d); +} + +vdouble xatan(vdouble s) { + vdouble t, u; + vint q; + + q = vsel_vi_vd_vd_vi_vi(s, vcast_vd_d(0), vcast_vi_i(2), vcast_vi_i(0)); + s = vabs_vd_vd(s); + + q = vsel_vi_vd_vd_vi_vi(vcast_vd_d(1), s, vadd_vi_vi_vi(q, vcast_vi_i(1)), q); + 
s = vsel_vd_vm_vd_vd(vlt_vm_vd_vd(vcast_vd_d(1), s), vrec_vd_vd(s), s); + + t = vmul_vd_vd_vd(s, s); + + u = vcast_vd_d(-1.88796008463073496563746e-05); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.000209850076645816976906797)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.00110611831486672482563471)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.00370026744188713119232403)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.00889896195887655491740809)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.016599329773529201970117)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0254517624932312641616861)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0337852580001353069993897)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0407629191276836500001934)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0466667150077840625632675)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0523674852303482457616113)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0587666392926673580854313)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0666573579361080525984562)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0769219538311769618355029)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.090908995008245008229153)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.111111105648261418443745)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.14285714266771329383765)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.199999999996591265594148)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.333333333333311110369124)); + + t = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(t, u), s); + + t = vsel_vd_vm_vd_vd(veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1)), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), t), t); + t = (vdouble)vxor_vm_vm_vm(vand_vm_vm_vm(veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(2)), (vmask)vcast_vd_d(-0.0)), (vmask)t); + + return t; +} + +vdouble xlog(vdouble d) { + vdouble x, x2; + vdouble t, m; + vint e; + + e = vilogbp1_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(0.7071))); + m = vldexp_vd_vd_vi(d, vneg_vi_vi(e)); + + x = 
vdiv_vd_vd_vd(vadd_vd_vd_vd(vcast_vd_d(-1), m), vadd_vd_vd_vd(vcast_vd_d(1), m)); + x2 = vmul_vd_vd_vd(x, x); + + t = vcast_vd_d(0.148197055177935105296783); + t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.153108178020442575739679)); + t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.181837339521549679055568)); + t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.22222194152736701733275)); + t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.285714288030134544449368)); + t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.399999999989941956712869)); + t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.666666666666685503450651)); + t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(2)); + + x = vmla_vd_vd_vd_vd(x, t, vmul_vd_vd_vd(vcast_vd_d(0.693147180559945286226764), vcast_vd_vi(e))); + + x = vsel_vd_vm_vd_vd(vispinf_vm_vd(d), vcast_vd_d(INFINITY), x); + x = (vdouble)vor_vm_vm_vm(vgt_vm_vd_vd(vcast_vd_d(0), d), (vmask)x); + x = vsel_vd_vm_vd_vd(veq_vm_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-INFINITY), x); + + return x; +} + +vdouble xexp(vdouble d) { + vint q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(R_LN2))); + vdouble s, u; + + s = vmla_vd_vd_vd_vd(vcast_vd_vi(q), vcast_vd_d(-L2U), d); + s = vmla_vd_vd_vd_vd(vcast_vd_vi(q), vcast_vd_d(-L2L), s); + + u = vcast_vd_d(2.08860621107283687536341e-09); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.51112930892876518610661e-08)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573911234900471893338e-07)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75572362911928827629423e-06)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.4801587159235472998791e-05)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000198412698960509205564975)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00138888888889774492207962)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00833333333331652721664984)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0416666666666665047591422)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.166666666666666851703837)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.5)); + + u = vadd_vd_vd_vd(vcast_vd_d(1), 
vmla_vd_vd_vd_vd(vmul_vd_vd_vd(s, s), u, s)); + + u = vldexp_vd_vd_vi(u, q); + + u = (vdouble)vandnot_vm_vm_vm(visminf_vm_vd(d), (vmask)u); + + return u; +} + +static INLINE vdouble2 logk(vdouble d) { + vdouble2 x, x2; + vdouble t, m; + vint e; + + e = vilogbp1_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(0.7071))); + m = vldexp_vd_vd_vi(d, vneg_vi_vi(e)); + + x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m)); + x2 = ddsqu_vd2_vd2(x); + + t = vcast_vd_d(0.134601987501262130076155); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.132248509032032670243288)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.153883458318096079652524)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.181817427573705403298686)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.222222231326187414840781)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.285714285651261412873718)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.400000000000222439910458)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.666666666666666371239645)); + + return ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd(vcast_vd2_vd_vd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)), + vcast_vd_vi(e)), + ddadd2_vd2_vd2_vd2(ddscale_vd2_vd2_vd(x, vcast_vd_d(2)), ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(x2, x), t))); +} + +vdouble xlog_u1(vdouble d) { + vdouble2 s = logk(d); + vdouble x = vadd_vd_vd_vd(s.x, s.y); + + x = vsel_vd_vm_vd_vd(vispinf_vm_vd(d), vcast_vd_d(INFINITY), x); + x = (vdouble)vor_vm_vm_vm(vgt_vm_vd_vd(vcast_vd_d(0), d), (vmask)x); + x = vsel_vd_vm_vd_vd(veq_vm_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-INFINITY), x); + + return x; +} + +static INLINE vdouble expk(vdouble2 d) { + vdouble u = vmul_vd_vd_vd(vadd_vd_vd_vd(d.x, d.y), vcast_vd_d(R_LN2)); + vint q = vrint_vi_vd(u); + vdouble2 s, t; + + s = ddadd2_vd2_vd2_vd(d, vmul_vd_vd_vd(vcast_vd_vi(q), vcast_vd_d(-L2U))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(vcast_vd_vi(q), vcast_vd_d(-L2L))); + + s = ddnormalize_vd2_vd2(s); + + u 
= vcast_vd_d(2.51069683420950419527139e-08); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.76286166770270649116855e-07)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.75572496725023574143864e-06)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.48014973989819794114153e-05)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.000198412698809069797676111)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0013888888939977128960529)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00833333333332371417601081)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0416666666665409524128449)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.166666666666666740681535)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.500000000000000999200722)); + + t = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd(ddsqu_vd2_vd2(s), u)); + + t = ddadd_vd2_vd_vd2(vcast_vd_d(1), t); + u = vadd_vd_vd_vd(t.x, t.y); + u = vldexp_vd_vd_vi(u, q); + + return u; +} + +vdouble xpow(vdouble x, vdouble y) { +#if 1 + vmask yisnint = vneq_vm_vd_vd(vcast_vd_vi(vrint_vi_vd(y)), y); + vmask yisodd = vandnot_vm_vm_vm(yisnint, veq_vm_vi_vi(vand_vi_vi_vi(vrint_vi_vd(y), vcast_vi_i(1)), vcast_vi_i(1))); + + vdouble result = expk(ddmul_vd2_vd2_vd(logk(vabs_vd_vd(x)), y)); + + result = vmul_vd_vd_vd(result, + vsel_vd_vm_vd_vd(vgt_vm_vd_vd(x, vcast_vd_d(0)), + vcast_vd_d(1), + (vdouble)vor_vm_vm_vm(yisnint, (vmask)vsel_vd_vm_vd_vd(yisodd, vcast_vd_d(-1.0), vcast_vd_d(1))))); + + vdouble efx = (vdouble)vxor_vm_vm_vm((vmask)vsub_vd_vd_vd(vabs_vd_vd(x), vcast_vd_d(1)), vsignbit_vm_vd(y)); + + result = vsel_vd_vm_vd_vd(visinf_vm_vd(y), + (vdouble)vandnot_vm_vm_vm(vlt_vm_vd_vd(efx, vcast_vd_d(0.0)), + (vmask)vsel_vd_vm_vd_vd(veq_vm_vd_vd(efx, vcast_vd_d(0.0)), + vcast_vd_d(1.0), + vcast_vd_d(INFINITY))), + result); + + result = vsel_vd_vm_vd_vd(vor_vm_vm_vm(visinf_vm_vd(x), veq_vm_vd_vd(x, vcast_vd_d(0.0))), + vmul_vd_vd_vd(vsel_vd_vm_vd_vd(yisodd, vsign_vd_vd(x), vcast_vd_d(1.0)), + (vdouble)vandnot_vm_vm_vm(vlt_vm_vd_vd(vsel_vd_vm_vd_vd(veq_vm_vd_vd(x, 
vcast_vd_d(0.0)), vneg_vd_vd(y), y), vcast_vd_d(0.0)), + (vmask)vcast_vd_d(INFINITY))), + result); + + result = (vdouble)vor_vm_vm_vm(vor_vm_vm_vm(visnan_vm_vd(x), visnan_vm_vd(y)), (vmask)result); + + result = vsel_vd_vm_vd_vd(vor_vm_vm_vm(veq_vm_vd_vd(y, vcast_vd_d(0)), veq_vm_vd_vd(x, vcast_vd_d(1))), vcast_vd_d(1), result); + + return result; +#else + return expk(ddmul_vd2_vd2_vd(logk(x), y)); +#endif +} + +static INLINE vdouble2 expk2(vdouble2 d) { + vdouble u = vmul_vd_vd_vd(vadd_vd_vd_vd(d.x, d.y), vcast_vd_d(R_LN2)); + vint q = vrint_vi_vd(u); + vdouble2 s, t; + + s = ddadd2_vd2_vd2_vd(d, vmul_vd_vd_vd(vcast_vd_vi(q), vcast_vd_d(-L2U))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(vcast_vd_vi(q), vcast_vd_d(-L2L))); + + u = vcast_vd_d(2.51069683420950419527139e-08); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.76286166770270649116855e-07)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.75572496725023574143864e-06)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.48014973989819794114153e-05)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.000198412698809069797676111)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0013888888939977128960529)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00833333333332371417601081)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0416666666665409524128449)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.166666666666666740681535)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.500000000000000999200722)); + + t = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd(ddsqu_vd2_vd2(s), u)); + + t = ddadd_vd2_vd_vd2(vcast_vd_d(1), t); + + return ddscale_vd2_vd2_vd(t, vpow2i_vd_vi(q)); +} + +vdouble xsinh(vdouble x) { + vdouble y = vabs_vd_vd(x); + vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0))); + d = ddsub_vd2_vd2_vd2(d, ddrec_vd2_vd2(d)); + y = vmul_vd_vd_vd(vadd_vd_vd_vd(d.x, d.y), vcast_vd_d(0.5)); + + y = vsel_vd_vm_vd_vd(vor_vm_vm_vm(vgt_vm_vd_vd(vabs_vd_vd(x), vcast_vd_d(710)), visnan_vm_vd(y)), vcast_vd_d(INFINITY), y); + y = vmulsign_vd_vd_vd(y, x); + 
y = (vdouble)vor_vm_vm_vm(visnan_vm_vd(x), (vmask)y); + + return y; +} + +vdouble xcosh(vdouble x) { + vdouble y = vabs_vd_vd(x); + vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0))); + d = ddadd_vd2_vd2_vd2(d, ddrec_vd2_vd2(d)); + y = vmul_vd_vd_vd(vadd_vd_vd_vd(d.x, d.y), vcast_vd_d(0.5)); + + y = vsel_vd_vm_vd_vd(vor_vm_vm_vm(vgt_vm_vd_vd(vabs_vd_vd(x), vcast_vd_d(710)), visnan_vm_vd(y)), vcast_vd_d(INFINITY), y); + y = (vdouble)vor_vm_vm_vm(visnan_vm_vd(x), (vmask)y); + + return y; +} + +vdouble xtanh(vdouble x) { + vdouble y = vabs_vd_vd(x); + vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0))); + vdouble2 e = ddrec_vd2_vd2(d); + d = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd2_vd2(d, ddneg_vd2_vd2(e)), ddadd2_vd2_vd2_vd2(d, e)); + y = vadd_vd_vd_vd(d.x, d.y); + + y = vsel_vd_vm_vd_vd(vor_vm_vm_vm(vgt_vm_vd_vd(vabs_vd_vd(x), vcast_vd_d(18.714973875)), visnan_vm_vd(y)), vcast_vd_d(1.0), y); + y = vmulsign_vd_vd_vd(y, x); + y = (vdouble)vor_vm_vm_vm(visnan_vm_vd(x), (vmask)y); + + return y; +} + +static INLINE vdouble2 logk2(vdouble2 d) { + vdouble2 x, x2, m; + vdouble t; + vint e; + + e = vilogbp1_vi_vd(vmul_vd_vd_vd(d.x, vcast_vd_d(0.7071))); + m = ddscale_vd2_vd2_vd(d, vpow2i_vd_vi(vneg_vi_vi(e))); + + x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(m, vcast_vd_d(-1)), ddadd2_vd2_vd2_vd(m, vcast_vd_d(1))); + x2 = ddsqu_vd2_vd2(x); + + t = vcast_vd_d(0.134601987501262130076155); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.132248509032032670243288)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.153883458318096079652524)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.181817427573705403298686)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.222222231326187414840781)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.285714285651261412873718)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.400000000000222439910458)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.666666666666666371239645)); + + return 
ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd(vcast_vd2_vd_vd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)), + vcast_vd_vi(e)), + ddadd2_vd2_vd2_vd2(ddscale_vd2_vd2_vd(x, vcast_vd_d(2)), ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(x2, x), t))); +} + +vdouble xasinh(vdouble x) { + vdouble y = vabs_vd_vd(x); + vdouble2 d = logk2(ddadd2_vd2_vd2_vd(ddsqrt_vd2_vd2(ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd(y, y), vcast_vd_d(1))), y)); + y = vadd_vd_vd_vd(d.x, d.y); + + y = vsel_vd_vm_vd_vd(vor_vm_vm_vm(visinf_vm_vd(x), visnan_vm_vd(y)), vcast_vd_d(INFINITY), y); + y = vmulsign_vd_vd_vd(y, x); + y = (vdouble)vor_vm_vm_vm(visnan_vm_vd(x), (vmask)y); + + return y; +} + +vdouble xacosh(vdouble x) { + vdouble2 d = logk2(ddadd2_vd2_vd2_vd(ddsqrt_vd2_vd2(ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd(x, x), vcast_vd_d(-1))), x)); + vdouble y = vadd_vd_vd_vd(d.x, d.y); + + y = vsel_vd_vm_vd_vd(vor_vm_vm_vm(visinf_vm_vd(x), visnan_vm_vd(y)), vcast_vd_d(INFINITY), y); + y = (vdouble)vandnot_vm_vm_vm(veq_vm_vd_vd(x, vcast_vd_d(1.0)), (vmask)y); + + y = (vdouble)vor_vm_vm_vm(vlt_vm_vd_vd(x, vcast_vd_d(1.0)), (vmask)y); + y = (vdouble)vor_vm_vm_vm(visnan_vm_vd(x), (vmask)y); + + return y; +} + +vdouble xatanh(vdouble x) { + vdouble y = vabs_vd_vd(x); + vdouble2 d = logk2(dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(1), y), ddadd2_vd2_vd_vd(vcast_vd_d(1), vneg_vd_vd(y)))); + y = (vdouble)vor_vm_vm_vm(vgt_vm_vd_vd(y, vcast_vd_d(1.0)), (vmask)vsel_vd_vm_vd_vd(veq_vm_vd_vd(y, vcast_vd_d(1.0)), vcast_vd_d(INFINITY), vmul_vd_vd_vd(vadd_vd_vd_vd(d.x, d.y), vcast_vd_d(0.5)))); + + y = (vdouble)vor_vm_vm_vm(vor_vm_vm_vm(visinf_vm_vd(x), visnan_vm_vd(y)), (vmask)y); + + y = vmulsign_vd_vd_vd(y, x); + y = (vdouble)vor_vm_vm_vm(visnan_vm_vd(x), (vmask)y); + + return y; +} + +vdouble xcbrt(vdouble d) { + vdouble x, y, q = vcast_vd_d(1.0); + vint e, qu, re; + vdouble t; + + e = vilogbp1_vi_vd(vabs_vd_vd(d)); + d = vldexp_vd_vd_vi(d, vneg_vi_vi(e)); + + t = vadd_vd_vd_vd(vcast_vd_vi(e), 
vcast_vd_d(6144)); + qu = vtruncate_vi_vd(vmul_vd_vd_vd(t, vcast_vd_d(1.0/3.0))); + re = vtruncate_vi_vd(vsub_vd_vd_vd(t, vmul_vd_vd_vd(vcast_vd_vi(qu), vcast_vd_d(3)))); + + q = vsel_vd_vm_vd_vd(veq_vm_vi_vi(re, vcast_vi_i(1)), vcast_vd_d(1.2599210498948731647672106), q); + q = vsel_vd_vm_vd_vd(veq_vm_vi_vi(re, vcast_vi_i(2)), vcast_vd_d(1.5874010519681994747517056), q); + q = vldexp_vd_vd_vi(q, vsub_vi_vi_vi(qu, vcast_vi_i(2048))); + + q = vmulsign_vd_vd_vd(q, d); + + d = vabs_vd_vd(d); + + x = vcast_vd_d(-0.640245898480692909870982); + x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.96155103020039511818595)); + x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-5.73353060922947843636166)); + x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(6.03990368989458747961407)); + x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-3.85841935510444988821632)); + x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.2307275302496609725722)); + + y = vmul_vd_vd_vd(x, x); y = vmul_vd_vd_vd(y, y); x = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vmlapn_vd_vd_vd_vd(d, y, x), vcast_vd_d(1.0 / 3.0))); + y = vmul_vd_vd_vd(vmul_vd_vd_vd(d, x), x); + y = vmul_vd_vd_vd(vsub_vd_vd_vd(y, vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(2.0 / 3.0), y), vmla_vd_vd_vd_vd(y, x, vcast_vd_d(-1.0)))), q); + + return y; +} + +vdouble xcbrt_u1(vdouble d) { + vdouble x, y, z, t; + vdouble2 q2 = vcast_vd2_d_d(1, 0), u, v; + vint e, qu, re; + + e = vilogbp1_vi_vd(vabs_vd_vd(d)); + d = vldexp_vd_vd_vi(d, vneg_vi_vi(e)); + + t = vadd_vd_vd_vd(vcast_vd_vi(e), vcast_vd_d(6144)); + qu = vtruncate_vi_vd(vmul_vd_vd_vd(t, vcast_vd_d(1.0/3.0))); + re = vtruncate_vi_vd(vsub_vd_vd_vd(t, vmul_vd_vd_vd(vcast_vd_vi(qu), vcast_vd_d(3)))); + + q2 = vsel_vd2_vm_vd2_vd2(veq_vm_vi_vi(re, vcast_vi_i(1)), vcast_vd2_d_d(1.2599210498948731907, -2.5899333753005069177e-17), q2); + q2 = vsel_vd2_vm_vd2_vd2(veq_vm_vi_vi(re, vcast_vi_i(2)), vcast_vd2_d_d(1.5874010519681995834, -1.0869008194197822986e-16), q2); + + q2.x = vmulsign_vd_vd_vd(q2.x, d); q2.y = vmulsign_vd_vd_vd(q2.y, d); + d = vabs_vd_vd(d); + 
+ x = vcast_vd_d(-0.640245898480692909870982); + x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.96155103020039511818595)); + x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-5.73353060922947843636166)); + x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(6.03990368989458747961407)); + x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-3.85841935510444988821632)); + x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.2307275302496609725722)); + + y = vmul_vd_vd_vd(x, x); y = vmul_vd_vd_vd(y, y); x = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vmlapn_vd_vd_vd_vd(d, y, x), vcast_vd_d(1.0 / 3.0))); + + z = x; + + u = ddmul_vd2_vd_vd(x, x); + u = ddmul_vd2_vd2_vd2(u, u); + u = ddmul_vd2_vd2_vd(u, d); + u = ddadd2_vd2_vd2_vd(u, vneg_vd_vd(x)); + y = vadd_vd_vd_vd(u.x, u.y); + + y = vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(-2.0 / 3.0), y), z); + v = ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd(z, z), y); + v = ddmul_vd2_vd2_vd(v, d); + v = ddmul_vd2_vd2_vd2(v, q2); + z = vldexp_vd_vd_vi(vadd_vd_vd_vd(v.x, v.y), vsub_vi_vi_vi(qu, vcast_vi_i(2048))); + + z = vsel_vd_vm_vd_vd(visinf_vm_vd(d), vmulsign_vd_vd_vd(vcast_vd_d(INFINITY), q2.x), z); + z = vsel_vd_vm_vd_vd(veq_vm_vd_vd(d, vcast_vd_d(0)), (vdouble)vsignbit_vm_vd(q2.x), z); + + return z; +} + +vdouble xexp2(vdouble a) { + vdouble u = expk(ddmul_vd2_vd2_vd(vcast_vd2_vd_vd(vcast_vd_d(0.69314718055994528623), vcast_vd_d(2.3190468138462995584e-17)), a)); + u = vsel_vd_vm_vd_vd(vgt_vm_vd_vd(a, vcast_vd_d(1023)), vcast_vd_d(INFINITY), u); + u = (vdouble)vandnot_vm_vm_vm(visminf_vm_vd(a), (vmask)u); + return u; +} + +vdouble xexp10(vdouble a) { + vdouble u = expk(ddmul_vd2_vd2_vd(vcast_vd2_vd_vd(vcast_vd_d(2.3025850929940459011), vcast_vd_d(-2.1707562233822493508e-16)), a)); + u = vsel_vd_vm_vd_vd(vgt_vm_vd_vd(a, vcast_vd_d(308)), vcast_vd_d(INFINITY), u); + u = (vdouble)vandnot_vm_vm_vm(visminf_vm_vd(a), (vmask)u); + return u; +} + +vdouble xexpm1(vdouble a) { + vdouble2 d = ddadd2_vd2_vd2_vd(expk2(vcast_vd2_vd_vd(a, vcast_vd_d(0))), vcast_vd_d(-1.0)); + vdouble x = vadd_vd_vd_vd(d.x, d.y); + x = 
vsel_vd_vm_vd_vd(vgt_vm_vd_vd(a, vcast_vd_d(700)), vcast_vd_d(INFINITY), x); + x = vsel_vd_vm_vd_vd(vlt_vm_vd_vd(a, vcast_vd_d(-0.36043653389117156089696070315825181539851971360337e+2)), vcast_vd_d(-1), x); + return x; +} + +vdouble xlog10(vdouble a) { + vdouble2 d = ddmul_vd2_vd2_vd2(logk(a), vcast_vd2_vd_vd(vcast_vd_d(0.43429448190325176116), vcast_vd_d(6.6494347733425473126e-17))); + vdouble x = vadd_vd_vd_vd(d.x, d.y); + + x = vsel_vd_vm_vd_vd(vispinf_vm_vd(a), vcast_vd_d(INFINITY), x); + x = (vdouble)vor_vm_vm_vm(vgt_vm_vd_vd(vcast_vd_d(0), a), (vmask)x); + x = vsel_vd_vm_vd_vd(veq_vm_vd_vd(a, vcast_vd_d(0)), vcast_vd_d(-INFINITY), x); + + return x; +} + +vdouble xlog1p(vdouble a) { + vdouble2 d = logk2(ddadd2_vd2_vd_vd(a, vcast_vd_d(1))); + vdouble x = vadd_vd_vd_vd(d.x, d.y); + + x = vsel_vd_vm_vd_vd(vispinf_vm_vd(a), vcast_vd_d(INFINITY), x); + x = (vdouble)vor_vm_vm_vm(vgt_vm_vd_vd(vcast_vd_d(-1.0), a), (vmask)x); + x = vsel_vd_vm_vd_vd(veq_vm_vd_vd(a, vcast_vd_d(-1)), vcast_vd_d(-INFINITY), x); + + return x; +} diff --git a/simd/sleefsimdsp.c b/simd/sleefsimdsp.c new file mode 100644 index 00000000..9dc00a28 --- /dev/null +++ b/simd/sleefsimdsp.c @@ -0,0 +1,1005 @@ +#include +#include + +#if defined (__GNUC__) || defined (__INTEL_COMPILER) || defined (__clang__) +#define INLINE __attribute__((always_inline)) +#else +#define INLINE inline +#endif + +#include "nonnumber.h" + +#ifdef ENABLE_SSE2 +#include "helpersse2.h" +#endif + +#ifdef ENABLE_AVX +#include "helperavx.h" +#endif + +#ifdef ENABLE_AVX2 +#include "helperavx2.h" +#endif + +#ifdef ENABLE_FMA4 +#include "helperfma4.h" +#endif + +#ifdef ENABLE_NEON +#include "helperneon.h" +#endif + +// + +#include "df.h" + +// + +#define PI4_Af 0.78515625f +#define PI4_Bf 0.00024187564849853515625f +#define PI4_Cf 3.7747668102383613586e-08f +#define PI4_Df 1.2816720341285448015e-12f + +#define L2Uf 0.693145751953125f +#define L2Lf 1.428606765330187045e-06f +#define R_LN2f 
1.442695040888963407359924681001892137426645954152985934135449406931f + +// + +static INLINE vint2 vsel_vi2_vf_vf_vi2_vi2(vfloat f0, vfloat f1, vint2 x, vint2 y) { + vint2 m2 = vcast_vi2_vm(vlt_vm_vf_vf(f0, f1)); + return vor_vi2_vi2_vi2(vand_vi2_vi2_vi2(m2, x), vandnot_vi2_vi2_vi2(m2, y)); +} + +static INLINE vmask vsignbit_vm_vf(vfloat f) { + return vand_vm_vm_vm((vmask)f, (vmask)vcast_vf_f(-0.0f)); +} + +static INLINE vfloat vmulsign_vf_vf_vf(vfloat x, vfloat y) { + return (vfloat)vxor_vm_vm_vm((vmask)x, vsignbit_vm_vf(y)); +} + +static INLINE vfloat vsign_vf_vf(vfloat f) { + return (vfloat)vor_vm_vm_vm((vmask)vcast_vf_f(1.0f), vand_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)f)); +} + +static INLINE vmask visinf_vm_vf(vfloat d) { return veq_vm_vf_vf(vabs_vf_vf(d), vcast_vf_f(INFINITYf)); } +static INLINE vmask vispinf_vm_vf(vfloat d) { return veq_vm_vf_vf(d, vcast_vf_f(INFINITYf)); } +static INLINE vmask visminf_vm_vf(vfloat d) { return veq_vm_vf_vf(d, vcast_vf_f(-INFINITYf)); } +static INLINE vmask visnan_vm_vf(vfloat d) { return vneq_vm_vf_vf(d, d); } +static INLINE vfloat visinf2_vf_vf_vm(vfloat d, vfloat m) { return (vfloat)vand_vm_vm_vm(visinf_vm_vf(d), vor_vm_vm_vm(vsignbit_vm_vf(d), (vmask)m)); } +static INLINE vfloat visinff(vfloat d) { return visinf2_vf_vf_vm(d, vcast_vf_f(1.0f)); } + +static INLINE vint2 vilogbp1_vi2_vf(vfloat d) { + vmask m = vlt_vm_vf_vf(d, vcast_vf_f(5.421010862427522E-20f)); + d = vsel_vf_vm_vf_vf(m, vmul_vf_vf_vf(vcast_vf_f(1.8446744073709552E19f), d), d); + vint2 q = vand_vi2_vi2_vi2(vsrl_vi2_vi2_i(vcast_vi2_vm(vreinterpret_vm_vf(d)), 23), vcast_vi2_i(0xff)); + q = vsub_vi2_vi2_vi2(q, vsel_vi2_vm_vi2_vi2(m, vcast_vi2_i(64 + 0x7e), vcast_vi2_i(0x7e))); + return q; +} + +static INLINE vfloat vpow2i_vf_vi2(vint2 q) { + return (vfloat)vcast_vm_vi2(vsll_vi2_vi2_i(vadd_vi2_vi2_vi2(q, vcast_vi2_i(0x7f)), 23)); +} + +static INLINE vfloat vldexp_vf_vf_vi2(vfloat x, vint2 q) { + vfloat u; + vint2 m = vsra_vi2_vi2_i(q, 31); + m = 
vsll_vi2_vi2_i(vsub_vi2_vi2_vi2(vsra_vi2_vi2_i(vadd_vi2_vi2_vi2(m, q), 6), m), 4); + q = vsub_vi2_vi2_vi2(q, vsll_vi2_vi2_i(m, 2)); + m = vadd_vi2_vi2_vi2(m, vcast_vi2_i(0x7f)); + m = vand_vi2_vi2_vi2(vgt_vi2_vi2_vi2(m, vcast_vi2_i(0)), m); + vint2 n = vgt_vi2_vi2_vi2(m, vcast_vi2_i(0xff)); + m = vor_vi2_vi2_vi2(vandnot_vi2_vi2_vi2(n, m), vand_vi2_vi2_vi2(n, vcast_vi2_i(0xff))); + u = vreinterpret_vf_vm(vcast_vm_vi2(vsll_vi2_vi2_i(m, 23))); + x = vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(x, u), u), u), u); + u = vreinterpret_vf_vm(vcast_vm_vi2(vsll_vi2_vi2_i(vadd_vi2_vi2_vi2(q, vcast_vi2_i(0x7f)), 23))); + return vmul_vf_vf_vf(x, u); +} + +vfloat xldexpf(vfloat x, vint2 q) { return vldexp_vf_vf_vi2(x, q); } + +vfloat xsinf(vfloat d) { + vint2 q; + vfloat u, s; + + q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI))); + u = vcast_vf_vi2(q); + + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Af*4), d); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*4), d); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*4), d); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Df*4), d); + + s = vmul_vf_vf_vf(d, d); + + d = (vfloat)vxor_vm_vm_vm(vand_vm_vm_vm(veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), (vmask)vcast_vf_f(-0.0f)), (vmask)d); + + u = vcast_vf_f(2.6083159809786593541503e-06f); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.0001981069071916863322258f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f)); + + u = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(u, d), d); + + u = (vfloat)vor_vm_vm_vm(visinf_vm_vf(d), (vmask)u); + + return u; +} + +vfloat xcosf(vfloat d) { + vint2 q; + vfloat u, s; + + q = vrint_vi2_vf(vsub_vf_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI)), vcast_vf_f(0.5f))); + q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vcast_vi2_i(1)); + + u = vcast_vf_vi2(q); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2), d); + d = vmla_vf_vf_vf_vf(u, 
vcast_vf_f(-PI4_Bf*2), d); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2), d); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2), d); + + s = vmul_vf_vf_vf(d, d); + + d = (vfloat)vxor_vm_vm_vm(vand_vm_vm_vm(veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0)), (vmask)vcast_vf_f(-0.0f)), (vmask)d); + + u = vcast_vf_f(2.6083159809786593541503e-06f); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.0001981069071916863322258f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f)); + + u = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(u, d), d); + + u = (vfloat)vor_vm_vm_vm(visinf_vm_vf(d), (vmask)u); + + return u; +} + +vfloat2 xsincosf(vfloat d) { + vint2 q; + vmask m; + vfloat u, s, t, rx, ry; + vfloat2 r; + + q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_2_PI))); + + s = d; + + u = vcast_vf_vi2(q); + s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2), s); + s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2), s); + s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2), s); + s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2), s); + + t = s; + + s = vmul_vf_vf_vf(s, s); + + u = vcast_vf_f(-0.000195169282960705459117889f); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833215750753879547119141f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666537523269653320312f)); + u = vmul_vf_vf_vf(vmul_vf_vf_vf(u, s), t); + + rx = vadd_vf_vf_vf(t, u); + + u = vcast_vf_f(-2.71811842367242206819355e-07f); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(2.47990446951007470488548e-05f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.00138888787478208541870117f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416666641831398010253906f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.5)); + + ry = vmla_vf_vf_vf_vf(s, u, vcast_vf_f(1)); + + m = veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(0)); + r.x = vsel_vf_vm_vf_vf(m, rx, ry); + r.y = vsel_vf_vm_vf_vf(m, ry, rx); + + m = veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, 
vcast_vi2_i(2)), vcast_vi2_i(2)); + r.x = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.x))); + + m = veq_vm_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2)); + r.y = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y))); + + m = visinf_vm_vf(d); + + r.x = (vfloat)vor_vm_vm_vm(m, (vmask)r.x); + r.y = (vfloat)vor_vm_vm_vm(m, (vmask)r.y); + + return r; +} + +vfloat xtanf(vfloat d) { + vint2 q; + vmask m; + vfloat u, s, x; + + q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)(2 * M_1_PI)))); + + x = d; + + u = vcast_vf_vi2(q); + x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2), x); + x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2), x); + x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2), x); + x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2), x); + + s = vmul_vf_vf_vf(x, x); + + m = veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)); + x = (vfloat)vxor_vm_vm_vm(vand_vm_vm_vm(m, (vmask)vcast_vf_f(-0.0f)), (vmask)x); + + u = vcast_vf_f(0.00927245803177356719970703f); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00331984995864331722259521f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0242998078465461730957031f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0534495301544666290283203f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.133383005857467651367188f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.333331853151321411132812f)); + + u = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(u, x), x); + + u = vsel_vf_vm_vf_vf(m, vrec_vf_vf(u), u); + + u = (vfloat)vor_vm_vm_vm(visinf_vm_vf(d), (vmask)u); + + return u; +} + +vfloat xsinf_u1(vfloat d) { + vint2 q; + vfloat u; + vfloat2 s, t, x; + + q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(M_1_PI))); + u = vcast_vf_vi2(q); + + s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Af*4))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*4))); + s = 
dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*4))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Df*4))); + + t = s; + s = dfsqu_vf2_vf2(s); + + u = vcast_vf_f(2.6083159809786593541503e-06f); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(-0.0001981069071916863322258f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.00833307858556509017944336f)); + + x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf(vcast_vf_f(-0.166666597127914428710938f), vmul_vf_vf_vf(u, s.x)), s)); + + x = dfmul_vf2_vf2_vf2(t, x); + u = vadd_vf_vf_vf(x.x, x.y); + + u = (vfloat)vxor_vm_vm_vm(vand_vm_vm_vm(veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), (vmask)vcast_vf_f(-0.0)), (vmask)u); + + return u; +} + +vfloat xcosf_u1(vfloat d) { + vint2 q; + vfloat u; + vfloat2 s, t, x; + + q = vrint_vi2_vf(vmla_vf_vf_vf_vf(d, vcast_vf_f(M_1_PI), vcast_vf_f(-0.5))); + q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vcast_vi2_i(1)); + u = vcast_vf_vi2(q); + + s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2))); + + t = s; + s = dfsqu_vf2_vf2(s); + + u = vcast_vf_f(2.6083159809786593541503e-06f); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(-0.0001981069071916863322258f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.00833307858556509017944336f)); + + x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf(vcast_vf_f(-0.166666597127914428710938f), vmul_vf_vf_vf(u, s.x)), s)); + + x = dfmul_vf2_vf2_vf2(t, x); + u = vadd_vf_vf_vf(x.x, x.y); + + u = (vfloat)vxor_vm_vm_vm(vand_vm_vm_vm(veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0)), (vmask)vcast_vf_f(-0.0)), (vmask)u); + + return u; +} + +vfloat2 xsincosf_u1(vfloat d) { + vint2 q; + vmask m; + vfloat u, rx, ry; + vfloat2 r, s, t, x; + + q = 
vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(2 * M_1_PI))); + u = vcast_vf_vi2(q); + + s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2))); + + t = s; + + s = dfsqu_vf2_vf2(s); + s.x = vadd_vf_vf_vf(s.x, s.y); + + u = vcast_vf_f(-0.000195169282960705459117889f); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.00833215750753879547119141f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(-0.166666537523269653320312f)); + + u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(s.x, t.x)); + + x = dfadd_vf2_vf2_vf(t, u); + rx = vadd_vf_vf_vf(x.x, x.y); + + u = vcast_vf_f(-2.71811842367242206819355e-07f); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(2.47990446951007470488548e-05f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(-0.00138888787478208541870117f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.0416666641831398010253906f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(-0.5)); + + x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf_vf(s.x, u)); + ry = vadd_vf_vf_vf(x.x, x.y); + + m = veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(0)); + r.x = vsel_vf_vm_vf_vf(m, rx, ry); + r.y = vsel_vf_vm_vf_vf(m, ry, rx); + + m = veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)); + r.x = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.x))); + + m = veq_vm_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2)); + r.y = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y))); + + m = visinf_vm_vf(d); + r.x = (vfloat)vor_vm_vm_vm(m, (vmask)r.x); + r.y = (vfloat)vor_vm_vm_vm(m, (vmask)r.y); + + return r; +} + +vfloat xtanf_u1(vfloat d) { + vint2 q; + vfloat u; + vfloat2 s, t, x; + vmask m; + + q = 
vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(M_2_PI))); + u = vcast_vf_vi2(q); + + s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2))); + + m = veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)); + vmask n = vand_vm_vm_vm(m, (vmask)vcast_vf_f(-0.0)); + s.x = (vfloat)vxor_vm_vm_vm((vmask)s.x, n); + s.y = (vfloat)vxor_vm_vm_vm((vmask)s.y, n); + + t = s; + s = dfsqu_vf2_vf2(s); + s = dfnormalize_vf2_vf2(s); + + u = vcast_vf_f(0.00446636462584137916564941f); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(-8.3920182078145444393158e-05f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.0109639242291450500488281f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.0212360303848981857299805f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.0540687143802642822265625f)); + + x = dfadd_vf2_vf_vf(vcast_vf_f(0.133325666189193725585938f), vmul_vf_vf_vf(u, s.x)); + x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf2(vcast_vf_f(0.33333361148834228515625f), dfmul_vf2_vf2_vf2(s, x)), s)); + x = dfmul_vf2_vf2_vf2(t, x); + + x = vsel_vf2_vm_vf2_vf2(m, dfrec_vf2_vf2(x), x); + + u = vadd_vf_vf_vf(x.x, x.y); + + return u; +} + +vfloat xatanf(vfloat d) { + vfloat s, t, u; + vint2 q; + + q = vsel_vi2_vf_vf_vi2_vi2(d, vcast_vf_f(0.0f), vcast_vi2_i(2), vcast_vi2_i(0)); + s = vabs_vf_vf(d); + + q = vsel_vi2_vf_vf_vi2_vi2(vcast_vf_f(1.0f), s, vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), q); + s = vsel_vf_vm_vf_vf(vlt_vm_vf_vf(vcast_vf_f(1.0f), s), vrec_vf_vf(s), s); + + t = vmul_vf_vf_vf(s, s); + + u = vcast_vf_f(0.00282363896258175373077393f); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.0159569028764963150024414f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.0425049886107444763183594f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.0748900920152664184570312f)); + u = 
vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.106347933411598205566406f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.142027363181114196777344f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.199926957488059997558594f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.333331018686294555664062f)); + + t = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(t, u), s); + + t = vsel_vf_vm_vf_vf(veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vsub_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), t), t); + + t = (vfloat)vxor_vm_vm_vm(vand_vm_vm_vm(veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), (vmask)vcast_vf_f(-0.0f)), (vmask)t); + +#ifdef __ARM_NEON__ + t = vsel_vf_vm_vf_vf(visinf_vm_vf(d), vmulsign_vf_vf_vf(vcast_vf_f(1.5874010519681994747517056f), d), t); +#endif + + return t; +} + +static INLINE vfloat atan2kf(vfloat y, vfloat x) { + vfloat s, t, u; + vint2 q; + vmask p; + + q = vsel_vi2_vf_vf_vi2_vi2(x, vcast_vf_f(0.0f), vcast_vi2_i(-2), vcast_vi2_i(0)); + x = vabs_vf_vf(x); + + q = vsel_vi2_vf_vf_vi2_vi2(x, y, vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), q); + p = vlt_vm_vf_vf(x, y); + s = vsel_vf_vm_vf_vf(p, vneg_vf_vf(x), y); + t = vmax_vf_vf_vf(x, y); + + s = vdiv_vf_vf_vf(s, t); + t = vmul_vf_vf_vf(s, s); + + u = vcast_vf_f(0.00282363896258175373077393f); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.0159569028764963150024414f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.0425049886107444763183594f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.0748900920152664184570312f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.106347933411598205566406f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.142027363181114196777344f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.199926957488059997558594f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.333331018686294555664062f)); + + t = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(t, u), s); + t = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f((float)(M_PI/2)), t); + + return t; +} + +vfloat xatan2f(vfloat y, vfloat x) { + vfloat r = atan2kf(vabs_vf_vf(y), x); + + 
r = vmulsign_vf_vf_vf(r, x); + r = vsel_vf_vm_vf_vf(vor_vm_vm_vm(visinf_vm_vf(x), veq_vm_vf_vf(x, vcast_vf_f(0.0f))), vsub_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), visinf2_vf_vf_vm(x, vmulsign_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), x))), r); + r = vsel_vf_vm_vf_vf(visinf_vm_vf(y), vsub_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), visinf2_vf_vf_vm(x, vmulsign_vf_vf_vf(vcast_vf_f((float)(M_PI/4)), x))), r); + + r = vsel_vf_vm_vf_vf(veq_vm_vf_vf(y, vcast_vf_f(0.0f)), (vfloat)vand_vm_vm_vm(veq_vm_vf_vf(vsign_vf_vf(x), vcast_vf_f(-1.0f)), (vmask)vcast_vf_f((float)M_PI)), r); + + r = (vfloat)vor_vm_vm_vm(vor_vm_vm_vm(visnan_vm_vf(x), visnan_vm_vf(y)), (vmask)vmulsign_vf_vf_vf(r, y)); + return r; +} + +vfloat xasinf(vfloat d) { + vfloat x, y; + x = vadd_vf_vf_vf(vcast_vf_f(1.0f), d); + y = vsub_vf_vf_vf(vcast_vf_f(1.0f), d); + x = vmul_vf_vf_vf(x, y); + x = vsqrt_vf_vf(x); + x = (vfloat)vor_vm_vm_vm(visnan_vm_vf(x), (vmask)atan2kf(vabs_vf_vf(d), x)); + return vmulsign_vf_vf_vf(x, d); +} + +vfloat xacosf(vfloat d) { + vfloat x, y; + x = vadd_vf_vf_vf(vcast_vf_f(1.0f), d); + y = vsub_vf_vf_vf(vcast_vf_f(1.0f), d); + x = vmul_vf_vf_vf(x, y); + x = vsqrt_vf_vf(x); + x = vmulsign_vf_vf_vf(atan2kf(x, vabs_vf_vf(d)), d); + y = (vfloat)vand_vm_vm_vm(vlt_vm_vf_vf(d, vcast_vf_f(0.0f)), (vmask)vcast_vf_f((float)M_PI)); + x = vadd_vf_vf_vf(x, y); + return x; +} + +// + +static INLINE vfloat2 atan2kf_u1(vfloat2 y, vfloat2 x) { + vfloat u; + vfloat2 s, t; + vint2 q; + vmask p; + + q = vsel_vi2_vf_vf_vi2_vi2(x.x, vcast_vf_f(0), vcast_vi2_i(-2), vcast_vi2_i(0)); + p = vlt_vm_vf_vf(x.x, vcast_vf_f(0)); + p = vand_vm_vm_vm(p, (vmask)vcast_vf_f(-0.0)); + x.x = (vfloat)vxor_vm_vm_vm((vmask)x.x, p); + x.y = (vfloat)vxor_vm_vm_vm((vmask)x.y, p); + + q = vsel_vi2_vf_vf_vi2_vi2(x.x, y.x, vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), q); + p = vlt_vm_vf_vf(x.x, y.x); + s = vsel_vf2_vm_vf2_vf2(p, dfneg_vf2_vf2(x), y); + t = vsel_vf2_vm_vf2_vf2(p, y, x); + + s = dfdiv_vf2_vf2_vf2(s, t); + t = dfsqu_vf2_vf2(s); + t = 
dfnormalize_vf2_vf2(t); + + u = vcast_vf_f(-0.00176397908944636583328247f); + u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(0.0107900900766253471374512f)); + u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(-0.0309564601629972457885742f)); + u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(0.0577365085482597351074219f)); + u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(-0.0838950723409652709960938f)); + u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(0.109463557600975036621094f)); + u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(-0.142626821994781494140625f)); + u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(0.199983194470405578613281f)); + + //u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(-0.333332866430282592773438f)); + //t = dfmul_vf2_vf2_vf(t, u); + + t = dfmul_vf2_vf2_vf2(t, dfadd_vf2_vf_vf(vcast_vf_f(-0.333332866430282592773438f), vmul_vf_vf_vf(u, t.x))); + t = dfmul_vf2_vf2_vf2(s, dfadd_vf2_vf_vf2(vcast_vf_f(1), t)); + t = dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf(vcast_vf2_f_f(1.5707963705062866211f, -4.3711388286737928865e-08f), vcast_vf_vi2(q)), t); + + return t; +} + +vfloat xatan2f_u1(vfloat y, vfloat x) { + vfloat2 d = atan2kf_u1(vcast_vf2_vf_vf(vabs_vf_vf(y), vcast_vf_f(0)), vcast_vf2_vf_vf(x, vcast_vf_f(0))); + vfloat r = vadd_vf_vf_vf(d.x, d.y); + + r = vmulsign_vf_vf_vf(r, x); + r = vsel_vf_vm_vf_vf(vor_vm_vm_vm(visinf_vm_vf(x), veq_vm_vf_vf(x, vcast_vf_f(0))), vsub_vf_vf_vf(vcast_vf_f(M_PI/2), visinf2_vf_vf_vm(x, vmulsign_vf_vf_vf(vcast_vf_f(M_PI/2), x))), r); + r = vsel_vf_vm_vf_vf(visinf_vm_vf(y), vsub_vf_vf_vf(vcast_vf_f(M_PI/2), visinf2_vf_vf_vm(x, vmulsign_vf_vf_vf(vcast_vf_f(M_PI/4), x))), r); + r = vsel_vf_vm_vf_vf(veq_vm_vf_vf(y, vcast_vf_f(0.0)), (vfloat)vand_vm_vm_vm(veq_vm_vf_vf(vsign_vf_vf(x), vcast_vf_f(-1.0)), (vmask)vcast_vf_f(M_PI)), r); + + r = (vfloat)vor_vm_vm_vm(vor_vm_vm_vm(visnan_vm_vf(x), visnan_vm_vf(y)), (vmask)vmulsign_vf_vf_vf(r, y)); + return r; +} + +vfloat xasinf_u1(vfloat d) { + vfloat2 d2 = atan2kf_u1(vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0)), 
dfsqrt_vf2_vf2(dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf(vcast_vf_f(1), d), dfsub_vf2_vf_vf(vcast_vf_f(1), d)))); + vfloat r = vadd_vf_vf_vf(d2.x, d2.y); + r = vsel_vf_vm_vf_vf(veq_vm_vf_vf(vabs_vf_vf(d), vcast_vf_f(1)), vcast_vf_f(1.570796326794896557998982), r); + return vmulsign_vf_vf_vf(r, d); +} + +vfloat xacosf_u1(vfloat d) { + vfloat2 d2 = atan2kf_u1(dfsqrt_vf2_vf2(dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf(vcast_vf_f(1), d), dfsub_vf2_vf_vf(vcast_vf_f(1), d))), vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0))); + d2 = dfscale_vf2_vf2_vf(d2, vmulsign_vf_vf_vf(vcast_vf_f(1), d)); + + vmask m; + m = vneq_vm_vf_vf(vabs_vf_vf(d), vcast_vf_f(1)); + d2.x = (vfloat)vand_vm_vm_vm(m, (vmask)d2.x); + d2.y = (vfloat)vand_vm_vm_vm(m, (vmask)d2.y); + m = vlt_vm_vf_vf(d, vcast_vf_f(0)); + d2 = vsel_vf2_vm_vf2_vf2(m, dfadd_vf2_vf2_vf2(vcast_vf2_f_f(3.1415927410125732422f,-8.7422776573475857731e-08f), d2), d2); + + return vadd_vf_vf_vf(d2.x, d2.y); +} + +vfloat xatanf_u1(vfloat d) { + vfloat2 d2 = atan2kf_u1(vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0)), vcast_vf2_f_f(1, 0)); + vfloat r = vadd_vf_vf_vf(d2.x, d2.y); + r = vsel_vf_vm_vf_vf(visinf_vm_vf(d), vcast_vf_f(1.570796326794896557998982), r); + return vmulsign_vf_vf_vf(r, d); +} + +// + +vfloat xlogf(vfloat d) { + vfloat x, x2, t, m; + vint2 e; + + e = vilogbp1_vi2_vf(x = vmul_vf_vf_vf(d, vcast_vf_f(0.7071f))); + m = vldexp_vf_vf_vi2(d, vneg_vi2_vi2(e)); + d = x; + + x = vdiv_vf_vf_vf(vadd_vf_vf_vf(vcast_vf_f(-1.0f), m), vadd_vf_vf_vf(vcast_vf_f(1.0f), m)); + x2 = vmul_vf_vf_vf(x, x); + + t = vcast_vf_f(0.2371599674224853515625f); + t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.285279005765914916992188f)); + t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.400005519390106201171875f)); + t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.666666567325592041015625f)); + t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(2.0f)); + + x = vmla_vf_vf_vf_vf(x, t, vmul_vf_vf_vf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e))); + + x = vsel_vf_vm_vf_vf(vispinf_vm_vf(d), 
vcast_vf_f(INFINITYf), x); + x = (vfloat)vor_vm_vm_vm(vgt_vm_vf_vf(vcast_vf_f(0), d), (vmask)x); + x = vsel_vf_vm_vf_vf(veq_vm_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-INFINITYf), x); + + return x; +} + +vfloat xexpf(vfloat d) { + vint2 q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(R_LN2f))); + vfloat s, u; + + s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf), d); + s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf), s); + + u = vcast_vf_f(0.00136324646882712841033936f); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00836596917361021041870117f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416710823774337768554688f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.166665524244308471679688f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.499999850988388061523438f)); + + u = vadd_vf_vf_vf(vcast_vf_f(1.0f), vmla_vf_vf_vf_vf(vmul_vf_vf_vf(s, s), u, s)); + + u = vldexp_vf_vf_vi2(u, q); + + u = (vfloat)vandnot_vm_vm_vm(visminf_vm_vf(d), (vmask)u); + + return u; +} + +#ifdef __ARM_NEON__ +vfloat xsqrtf(vfloat d) { + vfloat e = (vfloat)vadd_vi2_vi2_vi2(vcast_vi2_i(0x20000000), vand_vi2_vi2_vi2(vcast_vi2_i(0x7f000000), vsrl_vi2_vi2_i((vint2)d, 1))); + vfloat m = (vfloat)vadd_vi2_vi2_vi2(vcast_vi2_i(0x3f000000), vand_vi2_vi2_vi2(vcast_vi2_i(0x01ffffff), (vint2)d)); + float32x4_t x = vrsqrteq_f32(m); + x = vmulq_f32(x, vrsqrtsq_f32(m, vmulq_f32(x, x))); + float32x4_t u = vmulq_f32(x, m); + u = vmlaq_f32(u, vmlsq_f32(m, u, u), vmulq_f32(x, vdupq_n_f32(0.5))); + e = (vfloat)vandnot_vm_vm_vm(veq_vm_vf_vf(d, vcast_vf_f(0)), (vmask)e); + u = vmul_vf_vf_vf(e, u); + + u = vsel_vf_vm_vf_vf(visinf_vm_vf(d), vcast_vf_f(INFINITYf), u); + u = (vfloat)vor_vm_vm_vm(vor_vm_vm_vm(visnan_vm_vf(d), vlt_vm_vf_vf(d, vcast_vf_f(0))), (vmask)u); + u = vmulsign_vf_vf_vf(u, d); + + return u; +} +#else +vfloat xsqrtf(vfloat d) { return vsqrt_vf_vf(d); } +#endif + +vfloat xcbrtf(vfloat d) { + vfloat x, y, q = vcast_vf_f(1.0), t; + vint2 e, qu, re; + + e = vilogbp1_vi2_vf(vabs_vf_vf(d)); + d = 
vldexp_vf_vf_vi2(d, vneg_vi2_vi2(e)); + + t = vadd_vf_vf_vf(vcast_vf_vi2(e), vcast_vf_f(6144)); + qu = vtruncate_vi2_vf(vmul_vf_vf_vf(t, vcast_vf_f(1.0f/3.0f))); + re = vtruncate_vi2_vf(vsub_vf_vf_vf(t, vmul_vf_vf_vf(vcast_vf_vi2(qu), vcast_vf_f(3)))); + + q = vsel_vf_vm_vf_vf(veq_vm_vi2_vi2(re, vcast_vi2_i(1)), vcast_vf_f(1.2599210498948731647672106f), q); + q = vsel_vf_vm_vf_vf(veq_vm_vi2_vi2(re, vcast_vi2_i(2)), vcast_vf_f(1.5874010519681994747517056f), q); + q = vldexp_vf_vf_vi2(q, vsub_vi2_vi2_vi2(qu, vcast_vi2_i(2048))); + + q = vmulsign_vf_vf_vf(q, d); + d = vabs_vf_vf(d); + + x = vcast_vf_f(-0.601564466953277587890625f); + x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.8208892345428466796875f)); + x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-5.532182216644287109375f)); + x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(5.898262500762939453125f)); + x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-3.8095417022705078125f)); + x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.2241256237030029296875f)); + + y = vmul_vf_vf_vf(vmul_vf_vf_vf(d, x), x); + y = vmul_vf_vf_vf(vsub_vf_vf_vf(y, vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(2.0f / 3.0f), y), vmla_vf_vf_vf_vf(y, x, vcast_vf_f(-1.0f)))), q); + + return y; +} + +vfloat xcbrtf_u1(vfloat d) { + vfloat x, y, z, t; + vfloat2 q2 = vcast_vf2_f_f(1, 0), u, v; + vint2 e, qu, re; + + e = vilogbp1_vi2_vf(vabs_vf_vf(d)); + d = vldexp_vf_vf_vi2(d, vneg_vi2_vi2(e)); + + t = vadd_vf_vf_vf(vcast_vf_vi2(e), vcast_vf_f(6144)); + qu = vtruncate_vi2_vf(vmul_vf_vf_vf(t, vcast_vf_f(1.0/3.0))); + re = vtruncate_vi2_vf(vsub_vf_vf_vf(t, vmul_vf_vf_vf(vcast_vf_vi2(qu), vcast_vf_f(3)))); + + q2 = vsel_vf2_vm_vf2_vf2(veq_vm_vi2_vi2(re, vcast_vi2_i(1)), vcast_vf2_f_f(1.2599210739135742188f, -2.4018701694217270415e-08), q2); + q2 = vsel_vf2_vm_vf2_vf2(veq_vm_vi2_vi2(re, vcast_vi2_i(2)), vcast_vf2_f_f(1.5874010324478149414f, 1.9520385308169352356e-08), q2); + + q2.x = vmulsign_vf_vf_vf(q2.x, d); q2.y = vmulsign_vf_vf_vf(q2.y, d); + d = vabs_vf_vf(d); + + x = 
vcast_vf_f(-0.601564466953277587890625f); + x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.8208892345428466796875f)); + x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-5.532182216644287109375f)); + x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(5.898262500762939453125f)); + x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-3.8095417022705078125f)); + x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.2241256237030029296875f)); + + y = vmul_vf_vf_vf(x, x); y = vmul_vf_vf_vf(y, y); x = vsub_vf_vf_vf(x, vmul_vf_vf_vf(vmlanp_vf_vf_vf_vf(d, y, x), vcast_vf_f(-1.0 / 3.0))); + + z = x; + + u = dfmul_vf2_vf_vf(x, x); + u = dfmul_vf2_vf2_vf2(u, u); + u = dfmul_vf2_vf2_vf(u, d); + u = dfadd2_vf2_vf2_vf(u, vneg_vf_vf(x)); + y = vadd_vf_vf_vf(u.x, u.y); + + y = vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(-2.0 / 3.0), y), z); + v = dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf(z, z), y); + v = dfmul_vf2_vf2_vf(v, d); + v = dfmul_vf2_vf2_vf2(v, q2); + z = vldexp_vf_vf_vi2(vadd_vf_vf_vf(v.x, v.y), vsub_vi2_vi2_vi2(qu, vcast_vi2_i(2048))); + + z = vsel_vf_vm_vf_vf(visinf_vm_vf(d), vmulsign_vf_vf_vf(vcast_vf_f(INFINITY), q2.x), z); + z = vsel_vf_vm_vf_vf(veq_vm_vf_vf(d, vcast_vf_f(0)), (vfloat)vsignbit_vm_vf(q2.x), z); + + return z; +} + +static INLINE vfloat2 logkf(vfloat d) { + vfloat2 x, x2; + vfloat t, m; + vint2 e; + + e = vilogbp1_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(0.7071f))); + m = vldexp_vf_vf_vi2(d, vneg_vi2_vi2(e)); + + x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(-1), m), dfadd2_vf2_vf_vf(vcast_vf_f(1), m)); + x2 = dfsqu_vf2_vf2(x); + + t = vcast_vf_f(0.2371599674224853515625f); + t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.285279005765914916992188f)); + t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.400005519390106201171875f)); + t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.666666567325592041015625f)); + + return dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf(vcast_vf2_vf_vf(vcast_vf_f(0.69314718246459960938f), vcast_vf_f(-1.904654323148236017e-09f)), + vcast_vf_vi2(e)), + dfadd2_vf2_vf2_vf2(dfscale_vf2_vf2_vf(x, vcast_vf_f(2)), 
dfmul_vf2_vf2_vf(dfmul_vf2_vf2_vf2(x2, x), t))); +} + +vfloat xlogf_u1(vfloat d) { + vfloat2 s = logkf(d); + vfloat x = vadd_vf_vf_vf(s.x, s.y); + + x = vsel_vf_vm_vf_vf(vispinf_vm_vf(d), vcast_vf_f(INFINITY), x); +#ifdef __ARM_NEON__ + x = vsel_vf_vm_vf_vf(vlt_vm_vf_vf(d, vcast_vf_f(1e-37f)), vcast_vf_f(-INFINITY), x); +#else + x = vsel_vf_vm_vf_vf(veq_vm_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-INFINITY), x); +#endif + x = (vfloat)vor_vm_vm_vm(vgt_vm_vf_vf(vcast_vf_f(0), d), (vmask)x); + + return x; +} + +static INLINE vfloat expkf(vfloat2 d) { + vfloat u = vmul_vf_vf_vf(vadd_vf_vf_vf(d.x, d.y), vcast_vf_f(R_LN2f)); + vint2 q = vrint_vi2_vf(u); + vfloat2 s, t; + + s = dfadd2_vf2_vf2_vf(d, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf))); + + s = dfnormalize_vf2_vf2(s); + + u = vcast_vf_f(0.00136324646882712841033936f); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.00836596917361021041870117f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.0416710823774337768554688f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.166665524244308471679688f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.499999850988388061523438f)); + + t = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfsqu_vf2_vf2(s), u)); + + t = dfadd_vf2_vf_vf2(vcast_vf_f(1), t); + u = vadd_vf_vf_vf(t.x, t.y); + u = vldexp_vf_vf_vi2(u, q); + + return u; +} + +vfloat xpowf(vfloat x, vfloat y) { +#if 1 + vmask yisnint = vneq_vm_vf_vf(vcast_vf_vi2(vrint_vi2_vf(y)), y); + vmask yisodd = vandnot_vm_vm_vm(yisnint, veq_vm_vi2_vi2(vand_vi2_vi2_vi2(vrint_vi2_vf(y), vcast_vi2_i(1)), vcast_vi2_i(1))); + + vfloat result = expkf(dfmul_vf2_vf2_vf(logkf(vabs_vf_vf(x)), y)); + + result = vmul_vf_vf_vf(result, + vsel_vf_vm_vf_vf(vgt_vm_vf_vf(x, vcast_vf_f(0)), + vcast_vf_f(1), + (vfloat)vor_vm_vm_vm(yisnint, (vmask)vsel_vf_vm_vf_vf(yisodd, vcast_vf_f(-1), vcast_vf_f(1))))); + + vfloat efx = (vfloat)vxor_vm_vm_vm((vmask)vsub_vf_vf_vf(vabs_vf_vf(x), vcast_vf_f(1)), 
vsignbit_vm_vf(y)); + + result = vsel_vf_vm_vf_vf(visinf_vm_vf(y), + (vfloat)vandnot_vm_vm_vm(vlt_vm_vf_vf(efx, vcast_vf_f(0.0f)), + (vmask)vsel_vf_vm_vf_vf(veq_vm_vf_vf(efx, vcast_vf_f(0.0f)), + vcast_vf_f(1.0f), + vcast_vf_f(INFINITYf))), + result); + + result = vsel_vf_vm_vf_vf(vor_vm_vm_vm(visinf_vm_vf(x), veq_vm_vf_vf(x, vcast_vf_f(0))), + vmul_vf_vf_vf(vsel_vf_vm_vf_vf(yisodd, vsign_vf_vf(x), vcast_vf_f(1)), + (vfloat)vandnot_vm_vm_vm(vlt_vm_vf_vf(vsel_vf_vm_vf_vf(veq_vm_vf_vf(x, vcast_vf_f(0)), vneg_vf_vf(y), y), vcast_vf_f(0)), + (vmask)vcast_vf_f(INFINITYf))), + result); + + result = (vfloat)vor_vm_vm_vm(vor_vm_vm_vm(visnan_vm_vf(x), visnan_vm_vf(y)), (vmask)result); + + result = vsel_vf_vm_vf_vf(vor_vm_vm_vm(veq_vm_vf_vf(y, vcast_vf_f(0)), veq_vm_vf_vf(x, vcast_vf_f(1))), vcast_vf_f(1), result); + + return result; +#else + return expkf(dfmul_vf2_vf2_vf(logkf(x), y)); +#endif +} + +static INLINE vfloat2 expk2f(vfloat2 d) { + vfloat u = vmul_vf_vf_vf(vadd_vf_vf_vf(d.x, d.y), vcast_vf_f(R_LN2f)); + vint2 q = vrint_vi2_vf(u); + vfloat2 s, t; + + s = dfadd2_vf2_vf2_vf(d, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf))); + + u = vcast_vf_f(0.00136324646882712841033936f); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.00836596917361021041870117f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.0416710823774337768554688f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.166665524244308471679688f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.499999850988388061523438f)); + + t = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfsqu_vf2_vf2(s), u)); + + t = dfadd_vf2_vf_vf2(vcast_vf_f(1), t); + + return dfscale_vf2_vf2_vf(t, vpow2i_vf_vi2(q)); +} + +vfloat xsinhf(vfloat x) { + vfloat y = vabs_vf_vf(x); + vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0))); + d = dfsub_vf2_vf2_vf2(d, dfrec_vf2_vf2(d)); + y = vmul_vf_vf_vf(vadd_vf_vf_vf(d.x, d.y), vcast_vf_f(0.5)); + + y = 
vsel_vf_vm_vf_vf(vor_vm_vm_vm(vgt_vm_vf_vf(vabs_vf_vf(x), vcast_vf_f(89)), + visnan_vm_vf(y)), vcast_vf_f(INFINITYf), y); + y = vmulsign_vf_vf_vf(y, x); + y = (vfloat)vor_vm_vm_vm(visnan_vm_vf(x), (vmask)y); + + return y; +} + +vfloat xcoshf(vfloat x) { + vfloat y = vabs_vf_vf(x); + vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0))); + d = dfadd_vf2_vf2_vf2(d, dfrec_vf2_vf2(d)); + y = vmul_vf_vf_vf(vadd_vf_vf_vf(d.x, d.y), vcast_vf_f(0.5)); + + y = vsel_vf_vm_vf_vf(vor_vm_vm_vm(vgt_vm_vf_vf(vabs_vf_vf(x), vcast_vf_f(89)), + visnan_vm_vf(y)), vcast_vf_f(INFINITYf), y); + y = (vfloat)vor_vm_vm_vm(visnan_vm_vf(x), (vmask)y); + + return y; +} + +vfloat xtanhf(vfloat x) { + vfloat y = vabs_vf_vf(x); + vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0))); + vfloat2 e = dfrec_vf2_vf2(d); + d = dfdiv_vf2_vf2_vf2(dfadd_vf2_vf2_vf2(d, dfneg_vf2_vf2(e)), dfadd_vf2_vf2_vf2(d, e)); + y = vadd_vf_vf_vf(d.x, d.y); + + y = vsel_vf_vm_vf_vf(vor_vm_vm_vm(vgt_vm_vf_vf(vabs_vf_vf(x), vcast_vf_f(8.664339742f)), + visnan_vm_vf(y)), vcast_vf_f(1.0f), y); + y = vmulsign_vf_vf_vf(y, x); + y = (vfloat)vor_vm_vm_vm(visnan_vm_vf(x), (vmask)y); + + return y; +} + +static INLINE vfloat2 logk2f(vfloat2 d) { + vfloat2 x, x2, m; + vfloat t; + vint2 e; + + e = vilogbp1_vi2_vf(vmul_vf_vf_vf(d.x, vcast_vf_f(0.7071))); + m = dfscale_vf2_vf2_vf(d, vpow2i_vf_vi2(vneg_vi2_vi2(e))); + + x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf2_vf(m, vcast_vf_f(-1)), dfadd2_vf2_vf2_vf(m, vcast_vf_f(1))); + x2 = dfsqu_vf2_vf2(x); + + t = vcast_vf_f(0.2371599674224853515625f); + t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.285279005765914916992188f)); + t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.400005519390106201171875f)); + t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.666666567325592041015625f)); + + return dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf(vcast_vf2_vf_vf(vcast_vf_f(0.69314718246459960938f), vcast_vf_f(-1.904654323148236017e-09f)), + vcast_vf_vi2(e)), + dfadd2_vf2_vf2_vf2(dfscale_vf2_vf2_vf(x, vcast_vf_f(2)), 
dfmul_vf2_vf2_vf(dfmul_vf2_vf2_vf2(x2, x), t))); +} + +vfloat xasinhf(vfloat x) { + vfloat y = vabs_vf_vf(x); + vfloat2 d = logk2f(dfadd_vf2_vf2_vf(dfsqrt_vf2_vf2(dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf(y, y), vcast_vf_f(1))), y)); + y = vadd_vf_vf_vf(d.x, d.y); + + y = vsel_vf_vm_vf_vf(vor_vm_vm_vm(visinf_vm_vf(x), visnan_vm_vf(y)), vcast_vf_f(INFINITYf), y); + y = vmulsign_vf_vf_vf(y, x); + y = (vfloat)vor_vm_vm_vm(visnan_vm_vf(x), (vmask)y); + + return y; +} + +vfloat xacoshf(vfloat x) { + vfloat2 d = logk2f(dfadd2_vf2_vf2_vf(dfsqrt_vf2_vf2(dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf(x, x), vcast_vf_f(-1))), x)); + vfloat y = vadd_vf_vf_vf(d.x, d.y); + + y = vsel_vf_vm_vf_vf(vor_vm_vm_vm(visinf_vm_vf(x), visnan_vm_vf(y)), vcast_vf_f(INFINITYf), y); + + y = (vfloat)vandnot_vm_vm_vm(veq_vm_vf_vf(x, vcast_vf_f(1.0f)), (vmask)y); + + y = (vfloat)vor_vm_vm_vm(vlt_vm_vf_vf(x, vcast_vf_f(1.0f)), (vmask)y); + y = (vfloat)vor_vm_vm_vm(visnan_vm_vf(x), (vmask)y); + + return y; +} + +vfloat xatanhf(vfloat x) { + vfloat y = vabs_vf_vf(x); + vfloat2 d = logk2f(dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(1), y), dfadd2_vf2_vf_vf(vcast_vf_f(1), vneg_vf_vf(y)))); + y = (vfloat)vor_vm_vm_vm(vgt_vm_vf_vf(y, vcast_vf_f(1.0)), (vmask)vsel_vf_vm_vf_vf(veq_vm_vf_vf(y, vcast_vf_f(1.0)), vcast_vf_f(INFINITYf), vmul_vf_vf_vf(vadd_vf_vf_vf(d.x, d.y), vcast_vf_f(0.5)))); + + y = (vfloat)vor_vm_vm_vm(vor_vm_vm_vm(visinf_vm_vf(x), visnan_vm_vf(y)), (vmask)y); + y = vmulsign_vf_vf_vf(y, x); + y = (vfloat)vor_vm_vm_vm(visnan_vm_vf(x), (vmask)y); + + return y; +} + +vfloat xexp2f(vfloat a) { + vfloat u = expkf(dfmul_vf2_vf2_vf(vcast_vf2_vf_vf(vcast_vf_f(0.69314718246459960938f), vcast_vf_f(-1.904654323148236017e-09f)), a)); +#ifdef __ARM_NEON__ + u = vsel_vf_vm_vf_vf(vgt_vm_vf_vf(a, vcast_vf_f(127.0f)), vcast_vf_f(INFINITYf), u); +#else + u = vsel_vf_vm_vf_vf(vispinf_vm_vf(a), vcast_vf_f(INFINITYf), u); +#endif + u = (vfloat)vandnot_vm_vm_vm(visminf_vm_vf(a), (vmask)u); + return u; +} + +vfloat 
xexp10f(vfloat a) { + vfloat u = expkf(dfmul_vf2_vf2_vf(vcast_vf2_vf_vf(vcast_vf_f(2.3025851249694824219f), vcast_vf_f(-3.1975436520781386207e-08f)), a)); +#ifdef __ARM_NEON__ + u = vsel_vf_vm_vf_vf(vgt_vm_vf_vf(a, vcast_vf_f(38.0f)), vcast_vf_f(INFINITYf), u); +#else + u = vsel_vf_vm_vf_vf(vispinf_vm_vf(a), vcast_vf_f(INFINITYf), u); +#endif + u = (vfloat)vandnot_vm_vm_vm(visminf_vm_vf(a), (vmask)u); + return u; +} + +vfloat xexpm1f(vfloat a) { + vfloat2 d = dfadd2_vf2_vf2_vf(expk2f(vcast_vf2_vf_vf(a, vcast_vf_f(0))), vcast_vf_f(-1.0)); + vfloat x = vadd_vf_vf_vf(d.x, d.y); + x = vsel_vf_vm_vf_vf(vgt_vm_vf_vf(a, vcast_vf_f(88.0f)), vcast_vf_f(INFINITYf), x); + x = vsel_vf_vm_vf_vf(vlt_vm_vf_vf(a, vcast_vf_f(-0.15942385152878742116596338793538061065739925620174e+2f)), vcast_vf_f(-1), x); + return x; +} + +vfloat xlog10f(vfloat a) { + vfloat2 d = dfmul_vf2_vf2_vf2(logkf(a), vcast_vf2_vf_vf(vcast_vf_f(0.43429449200630187988f), vcast_vf_f(-1.0103050118726031315e-08f))); + vfloat x = vadd_vf_vf_vf(d.x, d.y); + + x = vsel_vf_vm_vf_vf(vispinf_vm_vf(a), vcast_vf_f(INFINITYf), x); + x = (vfloat)vor_vm_vm_vm(vgt_vm_vf_vf(vcast_vf_f(0), a), (vmask)x); + x = vsel_vf_vm_vf_vf(veq_vm_vf_vf(a, vcast_vf_f(0)), vcast_vf_f(-INFINITYf), x); + + return x; +} + +vfloat xlog1pf(vfloat a) { + vfloat2 d = logk2f(dfadd2_vf2_vf_vf(a, vcast_vf_f(1))); + vfloat x = vadd_vf_vf_vf(d.x, d.y); + + x = vsel_vf_vm_vf_vf(vispinf_vm_vf(a), vcast_vf_f(INFINITYf), x); + x = (vfloat)vor_vm_vm_vm(vgt_vm_vf_vf(vcast_vf_f(-1), a), (vmask)x); + x = vsel_vf_vm_vf_vf(veq_vm_vf_vf(a, vcast_vf_f(-1)), vcast_vf_f(-INFINITYf), x); + + return x; +} diff --git a/tester/Makefile b/tester/Makefile new file mode 100644 index 00000000..8787cd11 --- /dev/null +++ b/tester/Makefile @@ -0,0 +1,16 @@ +all : tester testersp testeru1 testerspu1 + +tester : tester.c + gcc -Wall tester.c -lm -lmpfr -o tester + +testeru1 : testeru1.c + gcc -Wall testeru1.c -lm -lmpfr -o testeru1 + +testersp : testersp.c + gcc -Wall testersp.c 
-lm -lmpfr -o testersp + +testerspu1 : testerspu1.c + gcc -Wall testerspu1.c -lm -lmpfr -o testerspu1 + +clean : + rm -f *~ *.o tester testersp testeru1 testerspu1 diff --git a/tester/nonnumber.h b/tester/nonnumber.h new file mode 100644 index 00000000..5d856fa9 --- /dev/null +++ b/tester/nonnumber.h @@ -0,0 +1,19 @@ +#if defined (__GNUC__) || defined (__INTEL_COMPILER) || defined (__clang__) +#ifdef INFINITY +#undef INFINITY +#endif + +#ifdef NAN +#undef NAN +#endif + +#define NAN __builtin_nan("") +#define NANf __builtin_nanf("") +#define INFINITY __builtin_inf() +#define INFINITYf __builtin_inff() +#else + +#include +#include + +#endif diff --git a/tester/tester.c b/tester/tester.c new file mode 100644 index 00000000..858303ca --- /dev/null +++ b/tester/tester.c @@ -0,0 +1,2851 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include "nonnumber.h" + +#define POSITIVE_INFINITY INFINITY +#define NEGATIVE_INFINITY (-INFINITY) + +typedef int boolean; + +#define true 1 +#define false 0 + +void stop(char *mes) { + fprintf(stderr, "%s\n", mes); + abort(); +} + +int readln(int fd, char *buf, int cnt) { + int i, rcnt = 0; + + if (cnt < 1) return -1; + + while(cnt >= 2) { + i = read(fd, buf, 1); + if (i != 1) return i; + + if (*buf == '\n') break; + + rcnt++; + buf++; + cnt--; + } + + *++buf = '\0'; + rcnt++; + return rcnt; +} + +int ptoc[2], ctop[2]; +int pid; + +void startChild(const char *path, char *const argv[]) { + pipe(ptoc); + pipe(ctop); + + pid = fork(); + + assert(pid != -1); + + if (pid == 0) { + // child process + char buf0[1], buf1[1]; + int i; + + close(ptoc[1]); + close(ctop[0]); + + i = dup2(ptoc[0], fileno(stdin)); + assert(i != -1); + + i = dup2(ctop[1], fileno(stdout)); + assert(i != -1); + + setvbuf(stdin, buf0, _IONBF,0); + setvbuf(stdout, buf1, _IONBF,0); + + fflush(stdin); + fflush(stdout); + + execvp(path, argv); + + fprintf(stderr, "execvp in startChild : 
%s\n", strerror(errno)); + + assert(0); + } + + // parent process + + close(ptoc[0]); + close(ctop[1]); +} + +double u2d(uint64_t u) { + union { + double f; + uint64_t i; + } tmp; + tmp.i = u; + return tmp.f; +} + +uint64_t d2u(double d) { + union { + double f; + uint64_t i; + } tmp; + tmp.f = d; + return tmp.i; +} + +// + +boolean isPlusZero(double x) { return x == 0 && copysign(1, x) == 1; } +boolean isMinusZero(double x) { return x == 0 && copysign(1, x) == -1; } +boolean xisnan(double x) { return x != x; } +double sign(double d) { return d < 0 ? -1 : 1; } + +boolean cmpDenorm(double x, double y) { + if (xisnan(x) && xisnan(y)) return true; + if (xisnan(x) || xisnan(y)) return false; + if (isinf(x) != isinf(y)) return false; + if (x == POSITIVE_INFINITY && y == POSITIVE_INFINITY) return true; + if (x == NEGATIVE_INFINITY && y == NEGATIVE_INFINITY) return true; + if (y == 0) { + if (isPlusZero(x) && isPlusZero(y)) return true; + if (isMinusZero(x) && isMinusZero(y)) return true; + return false; + } + if (!xisnan(x) && !xisnan(y) && !isinf(x) && !isinf(y)) return sign(x) == sign(y); + return false; +} + +long double ulp(long double x) { + x = fabsl(x); + int exp; + + if (x == 0) { + return DBL_MIN; + } else { + frexpl(x, &exp); + } + + return fmax(ldexp(1.0, exp-53), DBL_MIN); +} + +double countULP(long double x, long double y) { + double fx = x; + double fy = y; + if (xisnan(fx) && xisnan(fy)) return 0; + if (xisnan(fx) || xisnan(fy)) return 10000; + if (isinf(fx)) { + if (sign(fx) == sign(fy) && fabs(fy) > 1e+300) return 0; // Relaxed infinity handling + return 10001; + } + if (fx == POSITIVE_INFINITY && fy == POSITIVE_INFINITY) return 0; + if (fx == NEGATIVE_INFINITY && fy == NEGATIVE_INFINITY) return 0; + if (fy == 0) { + if (fx == 0) return 0; + return 10002; + } + if (!xisnan(fx) && !xisnan(fy) && !isinf(fx) && !isinf(fy)) { + return fabs((x - y) / ulp(y)); + } + return 10003; +} + +// + +double sinfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + 
mpfr_set_d(frx, d, GMP_RNDN); + mpfr_sin(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double sinlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_sin(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double cosfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cos(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double coslfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_cos(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double tanfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_tan(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double tanlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_tan(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double asinfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_asin(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double asinlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_asin(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double acosfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_acos(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double acoslfr(long double d) { + 
mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_acos(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double atanfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_atan(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double atanlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_atan(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double atan2fr(double y, double x) { + mpfr_t frx, fry; + mpfr_inits(frx, fry, NULL); + mpfr_set_d(fry, y, GMP_RNDN); + mpfr_set_d(frx, x, GMP_RNDN); + mpfr_atan2(frx, fry, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, fry, NULL); + return ret; +} + +long double atan2lfr(long double y, long double x) { + mpfr_t frx, fry; + mpfr_inits(frx, fry, NULL); + mpfr_set_ld(fry, y, GMP_RNDN); + mpfr_set_ld(frx, x, GMP_RNDN); + mpfr_atan2(frx, fry, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, fry, NULL); + return ret; +} + +double logfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_log(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double loglfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_log(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double expfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double explfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, 
d, GMP_RNDN); + mpfr_exp(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double powfr(double x, double y) { + mpfr_t frx, fry; + mpfr_inits(frx, fry, NULL); + mpfr_set_d(frx, x, GMP_RNDN); + mpfr_set_d(fry, y, GMP_RNDN); + mpfr_pow(frx, frx, fry, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, fry, NULL); + return ret; +} + +long double powlfr(long double x, long double y) { + mpfr_t frx, fry; + mpfr_inits(frx, fry, NULL); + mpfr_set_ld(frx, x, GMP_RNDN); + mpfr_set_ld(fry, y, GMP_RNDN); + mpfr_pow(frx, frx, fry, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, fry, NULL); + return ret; +} + +double sinhfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_sinh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double coshfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cosh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double tanhfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_tanh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double asinhfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_asinh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double acoshfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_acosh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double atanhfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_atanh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + 
mpfr_clears(frx, NULL); + return ret; +} + +long double sinhlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_sinh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double coshlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_cosh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double tanhlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_tanh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double asinhlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_asinh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double acoshlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_acosh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double atanhlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_atanh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double sqrtlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_sqrt(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double sqrtfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_sqrt(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double cbrtfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + 
mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cbrt(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double cbrtlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_cbrt(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double exp2fr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp2(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double exp2lfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_exp2(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double exp10fr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp10(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double exp10lfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_exp10(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double expm1fr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_expm1(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double expm1lfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_expm1(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double log10fr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_log10(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double 
log10lfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_log10(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double log1pfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_log1p(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double log1plfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_log1p(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +// + +typedef struct { + double x, y; +} double2; + +double child_sin(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "sin %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_sin"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_cos(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "cos %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_cos"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double2 child_sincos(double x) { + char str[256]; + uint64_t u, v; + + sprintf(str, "sincos %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_sincos"); + sscanf(str, "%" PRIx64 " %" PRIx64, &u, &v); + + double2 ret; + ret.x = u2d(u); + ret.y = u2d(v); + return ret; +} + +double child_tan(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "tan %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_tan"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_asin(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "asin %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 
255) < 1) stop("child_asin"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_acos(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "acos %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_acos"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_atan(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "atan %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_atan"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_atan2(double y, double x) { + char str[256]; + uint64_t u; + + sprintf(str, "atan2 %" PRIx64 " %" PRIx64 "\n", d2u(y), d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_atan2"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_log(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "log %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_log"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_exp(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "exp %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_exp"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_pow(double x, double y) { + char str[256]; + uint64_t u; + + sprintf(str, "pow %" PRIx64 " %" PRIx64 "\n", d2u(x), d2u(y)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_pow"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_sinh(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "sinh %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_sinh"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_cosh(double x) { + char str[256]; + uint64_t u; + + 
sprintf(str, "cosh %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_cosh"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_tanh(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "tanh %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_tanh"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_asinh(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "asinh %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_asinh"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_acosh(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "acosh %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_acosh"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_atanh(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "atanh %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_atanh"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_sqrt(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "sqrt %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_sqrt"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_cbrt(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "cbrt %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_cbrt"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_exp2(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "exp2 %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_exp2"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double 
child_exp10(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "exp10 %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_exp10"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_expm1(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "expm1 %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_expm1"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_log10(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "log10 %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_log10"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_log1p(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "log1p %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_log1p"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_ldexp(double x, int q) { + char str[256]; + uint64_t u; + + sprintf(str, "ldexp %" PRIx64 " %" PRIx64 "\n", d2u(x), d2u(q)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_ldexp"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +int allTestsPassed = 1; + +void showResult(int success) { + if (!success) allTestsPassed = 0; + fprintf(stderr, "%s\n", success ? " OK" : " NG **************"); +} + +void do_test() { + int i, j; + + fprintf(stderr, "Denormal/nonnumber test atan2(y, x)\n\n"); + + fprintf(stderr, "If y is +0 and x is -0, +pi is returned ... "); + showResult(child_atan2(+0.0, -0.0) == M_PI); + + fprintf(stderr, "If y is -0 and x is -0, -pi is returned ... "); + showResult(child_atan2(-0.0, -0.0) == -M_PI); + + fprintf(stderr, "If y is +0 and x is +0, +0 is returned ... "); + showResult(isPlusZero(child_atan2(+0.0, +0.0))); + + fprintf(stderr, "If y is -0 and x is +0, -0 is returned ... 
"); + showResult(isMinusZero(child_atan2(-0.0, +0.0))); + + fprintf(stderr, "If y is positive infinity and x is negative infinity, +3*pi/4 is returned ... "); + showResult(child_atan2(POSITIVE_INFINITY, NEGATIVE_INFINITY) == 3*M_PI/4); + + fprintf(stderr, "If y is negative infinity and x is negative infinity, -3*pi/4 is returned ... "); + showResult(child_atan2(NEGATIVE_INFINITY, NEGATIVE_INFINITY) == -3*M_PI/4); + + fprintf(stderr, "If y is positive infinity and x is positive infinity, +pi/4 is returned ... "); + showResult(child_atan2(POSITIVE_INFINITY, POSITIVE_INFINITY) == M_PI/4); + + fprintf(stderr, "If y is negative infinity and x is positive infinity, -pi/4 is returned ... "); + showResult(child_atan2(NEGATIVE_INFINITY, POSITIVE_INFINITY) == -M_PI/4); + + { + fprintf(stderr, "If y is +0 and x is less than 0, +pi is returned ... "); + + double ya[] = { +0.0 }; + double xa[] = { -100000.5, -100000, -3, -2.5, -2, -1.5, -1.0, -0.5 }; + + boolean success = true; + + for(i=0;i 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SIN; + } + } + + for(d = -10000000;d < 10000000;d += 200.1) { + double q = child_sin(d); + long double c = sinlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SIN; + } + } + + int i; + + for(i=1;i<10000;i++) { + double start = u2d(d2u(M_PI_4 * i)-20); + double end = u2d(d2u(M_PI_4 * i)+20); + + for(d = start;d <= end;d = u2d(d2u(d)+1)) { + double q = child_sin(d); + long double c = sinlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SIN; + } + } + } + + STOP_SIN: + + fprintf(stderr, "sin : %lf ... 
", max); + + showResult(max < 5); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_cos(d); + long double c = coslfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = -10000000;d < 10000000;d += 200.1) { + double q = child_cos(d); + long double c = coslfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + int i; + + for(i=1;i<10000;i++) { + double start = u2d(d2u(M_PI_4 * i)-20); + double end = u2d(d2u(M_PI_4 * i)+20); + + for(d = start;d <= end;d = u2d(d2u(d)+1)) { + double q = child_cos(d); + long double c = coslfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + } + + fprintf(stderr, "cos : %lf ... ", max); + + showResult(max < 5); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double2 q = child_sincos(d); + long double c = sinlfr(d); + double u = fabs((q.x - c) / ulp(c)); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q.x, (double)c, d, (double)ulp(c)); + goto STOP_SIN2; + } + } + + for(d = -10000000;d < 10000000;d += 200.1) { + double2 q = child_sincos(d); + long double c = sinlfr(d); + double u = fabs((q.x - c) / ulp(c)); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q.x, (double)c, d, (double)ulp(c)); + goto STOP_SIN2; + } + } + + int i; + + for(i=1;i<10000;i++) { + double start = u2d(d2u(M_PI_4 * i)-20); + double end = u2d(d2u(M_PI_4 * i)+20); + + for(d = start;d <= end;d = u2d(d2u(d)+1)) { + double2 q = child_sincos(d); + long double c = sinlfr(d); + double u = fabs((q.x - c) / ulp(c)); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q.x, (double)c, d, (double)ulp(c)); + goto STOP_SIN2; + } + } + } + + STOP_SIN2: + + fprintf(stderr, "sin in sincos : %lf ... 
", max); + + showResult(max < 5); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double2 q = child_sincos(d); + long double c = coslfr(d); + double u = fabs((q.y - c) / ulp(c)); + max = fmax(max, u); + } + + for(d = -10000000;d < 10000000;d += 200.1) { + double2 q = child_sincos(d); + long double c = coslfr(d); + double u = fabs((q.y - c) / ulp(c)); + max = fmax(max, u); + } + + int i; + + for(i=1;i<10000;i++) { + double start = u2d(d2u(M_PI_4 * i)-20); + double end = u2d(d2u(M_PI_4 * i)+20); + + for(d = start;d <= end;d = u2d(d2u(d)+1)) { + double2 q = child_sincos(d); + long double c = coslfr(d); + double u = fabs((q.y - c) / ulp(c)); + max = fmax(max, u); + } + } + + fprintf(stderr, "cos in sincos : %lf ... ", max); + + showResult(max < 5); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_tan(d); + long double c = tanlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = -10000000;d < 10000000;d += 200.1) { + double q = child_tan(d); + long double c = tanlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + int i; + + for(i=1;i<10000;i++) { + double start = u2d(d2u(M_PI_4 * i)-20); + double end = u2d(d2u(M_PI_4 * i)+20); + + for(d = start;d <= end;d = u2d(d2u(d)+1)) { + double q = child_tan(d); + long double c = tanlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + } + + fprintf(stderr, "tan : %lf ... ", max); + + showResult(max < 5); + } + + { + double d, max = 0; + + for(d = -1;d < 1;d += 0.00002) { + double q = child_asin(d); + long double c = asinlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ASIN; + } + } + + STOP_ASIN: + + fprintf(stderr, "asin : %lf ... 
", max); + + showResult(max < 5); + } + + { + double d, max = 0; + + for(d = -1;d < 1;d += 0.00002) { + double q = child_acos(d); + long double c = acoslfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ACOS; + } + } + + STOP_ACOS: + + fprintf(stderr, "acos : %lf ... ", max); + + showResult(max < 5); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_atan(d); + long double c = atanlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %g, d = %g\n", q, d); + goto STOP_ATAN; + } + } + + for(d = -10000;d < 10000;d += 0.2) { + double q = child_atan(d); + long double c = atanlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %g, d = %g\n", q, d); + goto STOP_ATAN; + } + } + + STOP_ATAN: + + fprintf(stderr, "atan : %lf ... ", max); + + showResult(max < 5); + } + + { + double x, y, max = 0; + + for(y = -10;y < 10;y += 0.05) { + for(x = -10;x < 10;x += 0.05) { + double q = child_atan2(y, x); + long double c = atan2lfr(y, x); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %g, y = %g, x = %g\n", q, y, x); + goto STOP_ATAN2; + } + } + } + + for(y = -100;y < 100;y += 0.51) { + for(x = -100;x < 100;x += 0.51) { + double q = child_atan2(y, x); + long double c = atan2lfr(y, x); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %g, y = %g, x = %g\n", q, y, x); + goto STOP_ATAN2; + } + } + } + + STOP_ATAN2: + + fprintf(stderr, "atan2 : %lf ... 
", max); + + showResult(max < 5); + } + + { + double d, max = 0; + + for(d = 0.0001;d < 10;d += 0.0001) { + double q = child_log(d); + long double c = loglfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = 0.0001;d < 10000;d += 0.1) { + double q = child_log(d); + long double c = loglfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + int i; + for(i = -1000;i <= 1000;i++) { + d = pow(2.1, i); + double q = child_log(d); + long double c = loglfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + fprintf(stderr, "log : %lf ... ", max); + + showResult(max < 5); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_exp(d); + long double c = explfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = -1000;d < 1000;d += 0.1) { + double q = child_exp(d); + long double c = explfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + fprintf(stderr, "exp : %lf ... ", max); + + showResult(max < 1); + } + + { + double x, y, max = 0; + + for(y = 0.1;y < 100;y += 0.2) { + for(x = -100;x < 100;x += 0.2) { + double q = child_pow(x, y); + long double c = powlfr(x, y); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %g, x = %g, y = %g\n", q, x, y); + goto STOP_POW; + } + } + } + + double d; + for(d = -1000;d < 1000;d += 0.1) { + double q = child_pow(2.1, d); + long double c = powlfr(2.1, d); + double u = countULP(q, c); + max = fmax(max, u); + + if (u > 1000) { + fprintf(stderr, "q = %g, c = %g, d = %g\n", q, (double)c, d); + goto STOP_POW; + } + } + + STOP_POW: + + fprintf(stderr, "pow : %lf ... 
", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -10000;d < 10000;d += 0.2) { + double q = child_cbrt(d); + long double c = cbrtlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_CBRT; + } + } + + int i; + for(i = -1000;i <= 1000;i++) { + d = pow(2.1, i); + double q = child_cbrt(d); + long double c = cbrtlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_CBRT; + } + } + + STOP_CBRT: + + fprintf(stderr, "cbrt : %lf ... ", max); + + showResult(max < 5); + } + +#if 0 + { + double d, max = 0; + + for(d = 0;d < 20000;d += 0.2) { + double q = child_sqrt(d); + long double c = sqrtlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SQRT; + } + } + + STOP_SQRT: + + fprintf(stderr, "sqrt : %lf ... ", max); + + showResult(max < 5); + } +#endif + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_sinh(d); + long double c = sinhlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SINH; + } + } + + for(d = -1000;d < 1000;d += 0.02) { + double q = child_sinh(d); + long double c = sinhlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SINH; + } + } + + STOP_SINH: + + fprintf(stderr, "sinh : %lf ... 
", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_cosh(d); + long double c = coshlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_COSH; + } + } + + for(d = -1000;d < 1000;d += 0.02) { + double q = child_cosh(d); + long double c = coshlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_COSH; + } + } + + STOP_COSH: + + fprintf(stderr, "cosh : %lf ... ", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_tanh(d); + long double c = tanhlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_TANH; + } + } + + for(d = -1000;d < 1000;d += 0.02) { + double q = child_tanh(d); + long double c = tanhlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_TANH; + } + } + + STOP_TANH: + + fprintf(stderr, "tanh : %lf ... 
", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_asinh(d); + long double c = asinhlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ASINH; + } + } + + for(d = -1000;d < 1000;d += 0.02) { + double q = child_asinh(d); + long double c = asinhlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ASINH; + } + } + + STOP_ASINH: + + fprintf(stderr, "asinh : %lf ... ", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = 1;d < 10;d += 0.0002) { + double q = child_acosh(d); + long double c = acoshlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ACOSH; + } + } + + for(d = 1;d < 1000;d += 0.02) { + double q = child_acosh(d); + long double c = acoshlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ACOSH; + } + } + + STOP_ACOSH: + + fprintf(stderr, "acosh : %lf ... 
", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_atanh(d); + long double c = atanhlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ATANH; + } + } + + for(d = -1000;d < 1000;d += 0.02) { + double q = child_atanh(d); + long double c = atanhlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ATANH; + } + } + + STOP_ATANH: + + fprintf(stderr, "atanh : %lf ... ", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_exp2(d); + long double c = exp2lfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = -1000;d < 1000;d += 0.02) { + double q = child_exp2(d); + long double c = exp2lfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + fprintf(stderr, "exp2 : %lf ... ", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_exp10(d); + long double c = exp10lfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = -300;d < 300;d += 0.01) { + double q = child_exp10(d); + long double c = exp10lfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + fprintf(stderr, "exp10 : %lf ... 
", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_expm1(d); + long double c = expm1lfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 5) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\n", q, (double)c, d); + goto STOP_EXPM1; + } + } + + for(d = -1000;d < 1000;d += 0.021) { + double q = child_expm1(d); + long double c = expm1lfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 5) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\n", q, (double)c, d); + goto STOP_EXPM1; + } + } + + for(d = 0;d < 300;d += 0.021) { + double d2 = pow(10, -d); + double q = child_expm1(d2); + long double c = expm1lfr(d2); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 5) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\n", q, (double)c, d); + goto STOP_EXPM1; + } + } + + for(d = 0;d < 300;d += 0.021) { + double d2 = -pow(10, -d); + double q = child_expm1(d2); + long double c = expm1lfr(d2); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 5) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\n", q, (double)c, d); + goto STOP_EXPM1; + } + } + + STOP_EXPM1: + + fprintf(stderr, "expm1 : %lf ... ", max); + + showResult(max < 5); + } + + { + double d, max = 0; + + for(d = 0.0001;d < 10;d += 0.0001) { + double q = child_log10(d); + long double c = log10lfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = 0.0001;d < 10000;d += 0.1) { + double q = child_log10(d); + long double c = log10lfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + fprintf(stderr, "log10 : %lf ... 
", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = 0.0001;d < 10;d += 0.0001) { + double q = child_log1p(d); + long double c = log1plfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = 0.0001;d < 10000;d += 0.1) { + double q = child_log1p(d); + long double c = log1plfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = 0;d < 300;d += 0.02) { + double d2 = pow(10, -d); + double q = child_log1p(d2); + long double c = log1plfr(d2); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = 0;d < 300;d += 0.02) { + double d2 = -pow(10, -d); + double q = child_log1p(d2); + long double c = log1plfr(d2); + double u = countULP(q, c); + max = fmax(max, u); + } + + fprintf(stderr, "log1p : %lf ... ", max); + + showResult(max < 1); + } +} + +int main(int argc, char **argv) { + char *argv2[argc]; + int i; + + for(i=1;i +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include "nonnumber.h" + +#define POSITIVE_INFINITYf ((float)INFINITY) +#define NEGATIVE_INFINITYf (-(float)INFINITY) +#define M_PIf ((float)M_PI) + +#define POSITIVE_INFINITY (INFINITY) +#define NEGATIVE_INFINITY (-INFINITY) + +typedef int boolean; + +#define true 1 +#define false 0 + +int enableFlushToZero = 0; + +void stop(char *mes) { + fprintf(stderr, "%s\n", mes); + abort(); +} + +int readln(int fd, char *buf, int cnt) { + int i, rcnt = 0; + + if (cnt < 1) return -1; + + while(cnt >= 2) { + i = read(fd, buf, 1); + if (i != 1) return i; + + if (*buf == '\n') break; + + rcnt++; + buf++; + cnt--; + } + + *++buf = '\0'; + rcnt++; + return rcnt; +} + +int ptoc[2], ctop[2]; +int pid; + +void startChild(const char *path, char *const argv[]) { + pipe(ptoc); + pipe(ctop); + + pid = fork(); + + assert(pid != -1); + + if (pid == 0) { + // child process + char buf0[1], buf1[1]; + int i; + + close(ptoc[1]); + close(ctop[0]); + + i = dup2(ptoc[0], fileno(stdin)); + assert(i != 
-1); + + i = dup2(ctop[1], fileno(stdout)); + assert(i != -1); + + setvbuf(stdin, buf0, _IONBF,0); + setvbuf(stdout, buf1, _IONBF,0); + + fflush(stdin); + fflush(stdout); + + execvp(path, argv); + + assert(0); + } + + // parent process + + close(ptoc[0]); + close(ctop[1]); +} + +float u2f(uint32_t u) { + union { + float f; + uint32_t i; + } tmp; + tmp.i = u; + return tmp.f; +} + +uint32_t f2u(float d) { + union { + float f; + uint32_t i; + } tmp; + tmp.f = d; + return tmp.i; +} + +// + +boolean isPlusZerof(float x) { return x == 0 && copysignf(1, x) == 1; } +boolean isMinusZerof(float x) { return x == 0 && copysignf(1, x) == -1; } +boolean xisnanf(float x) { return x != x; } +float signf(float d) { return d < 0 ? -1 : 1; } + +boolean isPlusZero(double x) { return x == 0 && copysign(1, x) == 1; } +boolean isMinusZero(double x) { return x == 0 && copysign(1, x) == -1; } +boolean xisnan(double x) { return x != x; } + +double flushToZero(double y) { + if (enableFlushToZero && fabs(y) < 1.2e-38) y = copysign(0.0, y); + return y; +} + +boolean cmpDenorm(float x, float y) { + y = flushToZero(y); + if (xisnanf(x) && xisnanf(y)) return true; + if (xisnanf(x) || xisnanf(y)) return false; + if (isinf(x) != isinf(y)) return false; + if (x == POSITIVE_INFINITYf && y == POSITIVE_INFINITYf) return true; + if (x == NEGATIVE_INFINITYf && y == NEGATIVE_INFINITYf) return true; + if (y == 0) { + if (isPlusZerof(x) && isPlusZerof(y)) return true; + if (isMinusZerof(x) && isMinusZerof(y)) return true; + return false; + } + if (!xisnanf(x) && !xisnanf(y) && !isinf(x) && !isinf(y)) return signf(x) == signf(y); + return false; +} + +double ulp(double x) { + x = fabsf(x); + int exp; + + if (x == 0) { + return FLT_MIN; + } else { + frexpf(x, &exp); + } + + return fmaxf(ldexpf(1.0, exp-24), FLT_MIN); +} + +double countULP(double x, double y) { + x = flushToZero(x); + y = flushToZero(y); + float fx = (float)x; + float fy = (float)y; + if (xisnan(fx) && xisnan(fy)) return 0; + if (xisnan(fx) || 
xisnan(fy)) return 10000; + if (isinf(fx)) { + if (signf(fx) == signf(fy) && fabs(fy) > 1e+37) return 0; // Relaxed infinity handling + return 10001; + } + if (fx == POSITIVE_INFINITY && fy == POSITIVE_INFINITY) return 0; + if (fx == NEGATIVE_INFINITY && fy == NEGATIVE_INFINITY) return 0; + if (fy == 0) { + if (fx == 0) return 0; + return 10002; + } + if (!xisnan(fx) && !xisnan(fy) && !isinf(fx) && !isinf(fy)) { + return fabs((x - y) / ulp(y)); + } + return 10003; +} + +// + +double sinfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_sin(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double sinlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_sin(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double cosfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cos(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double coslfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_cos(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double tanfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_tan(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double tanlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_tan(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double asinfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_asin(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, 
GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double asinlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_asin(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double acosfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_acos(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double acoslfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_acos(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double atanfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_atan(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double atanlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_atan(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double atan2fr(double y, double x) { + mpfr_t frx, fry; + mpfr_inits(frx, fry, NULL); + mpfr_set_d(fry, y, GMP_RNDN); + mpfr_set_d(frx, x, GMP_RNDN); + mpfr_atan2(frx, fry, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, fry, NULL); + return ret; +} + +long double atan2lfr(long double y, long double x) { + mpfr_t frx, fry; + mpfr_inits(frx, fry, NULL); + mpfr_set_ld(fry, y, GMP_RNDN); + mpfr_set_ld(frx, x, GMP_RNDN); + mpfr_atan2(frx, fry, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, fry, NULL); + return ret; +} + +double logfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_log(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; 
+} + +long double loglfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_log(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double expfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double explfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_exp(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double powfr(double x, double y) { + mpfr_t frx, fry; + mpfr_inits(frx, fry, NULL); + mpfr_set_d(frx, x, GMP_RNDN); + mpfr_set_d(fry, y, GMP_RNDN); + mpfr_pow(frx, frx, fry, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, fry, NULL); + return ret; +} + +long double powlfr(long double x, long double y) { + mpfr_t frx, fry; + mpfr_inits(frx, fry, NULL); + mpfr_set_ld(frx, x, GMP_RNDN); + mpfr_set_ld(fry, y, GMP_RNDN); + mpfr_pow(frx, frx, fry, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, fry, NULL); + return ret; +} + +double sinhfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_sinh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double coshfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cosh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double tanhfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_tanh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double asinhfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + 
mpfr_set_d(frx, d, GMP_RNDN); + mpfr_asinh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double acoshfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_acosh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double atanhfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_atanh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double sinhlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_sinh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double coshlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_cosh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double tanhlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_tanh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double asinhlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_asinh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double acoshlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_acosh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double atanhlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_atanh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + 
return ret; +} + +long double sqrtlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_sqrt(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double sqrtfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_sqrt(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double cbrtfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cbrt(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double cbrtlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_cbrt(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double exp2fr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp2(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double exp2lfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_exp2(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double exp10fr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp10(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double exp10lfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_exp10(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double expm1fr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_expm1(frx, frx, GMP_RNDN); + double ret = 
mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double expm1lfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_expm1(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double log10fr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_log10(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double log10lfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_log10(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double log1pfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_log1p(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double log1plfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_log1p(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +// + +typedef struct { + float x, y; +} float2; + +float child_sinf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "sinf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_sinf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_cosf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "cosf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_cosf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float2 child_sincosf(float x) { + char str[256]; + uint32_t u, v; + + sprintf(str, "sincosf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_sincosf"); + sscanf(str, "%x %x", &u, &v); + + 
float2 ret; + ret.x = u2f(u); + ret.y = u2f(v); + return ret; +} + +float child_tanf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "tanf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_tanf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_asinf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "asinf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_asinf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_acosf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "acosf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_acosf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_atanf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "atanf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_atanf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_atan2f(float y, float x) { + char str[256]; + uint32_t u; + + sprintf(str, "atan2f %x %x\n", f2u(y), f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_atan2f"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_logf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "logf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_logf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_expf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "expf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_expf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_powf(float x, float y) { + char str[256]; + uint32_t u; + + sprintf(str, "powf %x %x\n", f2u(x), f2u(y)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_powf"); + 
sscanf(str, "%x", &u); + return u2f(u); +} + +float child_sinhf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "sinhf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_sinhf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_coshf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "coshf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_coshf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_tanhf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "tanhf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_tanhf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_asinhf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "asinhf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_asinhf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_acoshf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "acoshf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_acoshf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_atanhf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "atanhf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_atanhf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_sqrtf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "sqrtf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_sqrtf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_cbrtf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "cbrtf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_cbrtf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float 
child_exp2f(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "exp2f %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_exp2f"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_exp10f(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "exp10f %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_exp10f"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_expm1f(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "expm1f %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_expm1f"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_log10f(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "log10f %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_log10f"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_log1pf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "log1pf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_log1pf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_ldexpf(float x, int q) { + char str[256]; + uint32_t u; + + sprintf(str, "ldexpf %x %x\n", f2u(x), f2u(q)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_powf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +int allTestsPassed = 1; + +void showResult(int success) { + if (!success) allTestsPassed = 0; + fprintf(stderr, "%s\n", success ? " OK" : " NG **************"); +} + +void do_test() { + int i, j; + + fprintf(stderr, "Denormal/nonnumber test atan2f(y, x)\n\n"); + + fprintf(stderr, "If y is +0 and x is -0, +pi is returned ... "); + showResult(child_atan2f(+0.0, -0.0) == M_PIf); + + fprintf(stderr, "If y is -0 and x is -0, -pi is returned ... 
"); + showResult(child_atan2f(-0.0, -0.0) == -M_PIf); + + fprintf(stderr, "If y is +0 and x is +0, +0 is returned ... "); + showResult(isPlusZerof(child_atan2f(+0.0, +0.0))); + + fprintf(stderr, "If y is -0 and x is +0, -0 is returned ... "); + showResult(isMinusZerof(child_atan2f(-0.0, +0.0))); + + fprintf(stderr, "If y is positive infinity and x is negative infinity, +3*pi/4 is returned ... "); + showResult(child_atan2f(POSITIVE_INFINITYf, NEGATIVE_INFINITYf) == 3*M_PIf/4); + + fprintf(stderr, "If y is negative infinity and x is negative infinity, -3*pi/4 is returned ... "); + showResult(child_atan2f(NEGATIVE_INFINITYf, NEGATIVE_INFINITYf) == -3*M_PIf/4); + + fprintf(stderr, "If y is positive infinity and x is positive infinity, +pi/4 is returned ... "); + showResult(child_atan2f(POSITIVE_INFINITYf, POSITIVE_INFINITYf) == M_PIf/4); + + fprintf(stderr, "If y is negative infinity and x is positive infinity, -pi/4 is returned ... "); + showResult(child_atan2f(NEGATIVE_INFINITYf, POSITIVE_INFINITYf) == -M_PIf/4); + + { + fprintf(stderr, "If y is +0 and x is less than 0, +pi is returned ... 
"); + + float ya[] = { +0.0 }; + float xa[] = { -100000.5, -100000, -3, -2.5, -2, -1.5, -1.0, -0.5 }; + + boolean success = true; + + for(i=0;i 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SIN; + } + } + + for(d = -10000;d < 10000;d += 0.201) { + float q = child_sinf(d); + double c = sinlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SIN; + } + } + + int i; + + for(i=1;i<10000;i++) { + float start = u2f(f2u(M_PI_4 * i)-20); + float end = u2f(f2u(M_PI_4 * i)+20); + + for(d = start;d <= end;d = u2f(f2u(d)+1)) { + float q = child_sinf(d); + double c = sinlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SIN; + } + } + } + + STOP_SIN: + + fprintf(stderr, "sinf : %lf ... ", max); + + showResult(max < 5); + } + + { + float d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + float q = child_cosf(d); + double c = coslfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = -10000;d < 10000;d += 0.201) { + float q = child_cosf(d); + double c = coslfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + } + + int i; + + for(i=1;i<10000;i++) { + float start = u2f(f2u(M_PI_4 * i)-20); + float end = u2f(f2u(M_PI_4 * i)+20); + + for(d = start;d <= end;d = u2f(f2u(d)+1)) { + float q = child_cosf(d); + double c = coslfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + } + } + + fprintf(stderr, "cosf : %lf ... 
", max); + + showResult(max < 5); + } + + { + float d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + float2 q = child_sincosf(d); + long double c = sinlfr(d); + double u = fabs((q.x - c) / ulp(c)); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q.x, (double)c, d, (double)ulp(c)); + goto STOP_SIN2; + } + } + + for(d = -10000;d < 10000;d += 0.201) { + float2 q = child_sincosf(d); + long double c = sinlfr(d); + double u = fabs((q.x - c) / ulp(c)); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q.x, (double)c, d, (double)ulp(c)); + goto STOP_SIN2; + } + } + + int i; + + for(i=1;i<10000;i++) { + float start = u2f(f2u(M_PI_4 * i)-20); + float end = u2f(f2u(M_PI_4 * i)+20); + + for(d = start;d <= end;d = u2f(f2u(d)+1)) { + float2 q = child_sincosf(d); + double c = sinlfr(d); + double u = fabs((q.x - c) / ulp(c)); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q.x, (double)c, d, (double)ulp(c)); + goto STOP_SIN2; + } + } + } + + STOP_SIN2: + + fprintf(stderr, "sin in sincosf : %lf ... 
", max); + + showResult(max < 5); + } + + { + float d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + float2 q = child_sincosf(d); + double c = coslfr(d); + double u = fabs((q.y - c) / ulp(c)); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q.y, (double)c, d, (double)ulp(c)); + goto STOP_COS2; + } + } + + for(d = -10000;d < 10000;d += 0.201) { + float2 q = child_sincosf(d); + double c = coslfr(d); + double u = fabs((q.y - c) / ulp(c)); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q.y, (double)c, d, (double)ulp(c)); + goto STOP_COS2; + } + } + + int i; + + for(i=1;i<10000;i++) { + float start = u2f(f2u(M_PI_4 * i)-20); + float end = u2f(f2u(M_PI_4 * i)+20); + + for(d = start;d <= end;d = u2f(f2u(d)+1)) { + float2 q = child_sincosf(d); + double c = coslfr(d); + double u = fabs((q.y - c) / ulp(c)); + max = fmax(max, u); + } + } + + STOP_COS2: + + fprintf(stderr, "cos in sincosf : %lf ... ", max); + + showResult(max < 5); + } + + { + float d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + float q = child_tanf(d); + double c = tanlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = -10000;d < 10000;d += 0.201) { + float q = child_tanf(d); + double c = tanlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + } + + int i; + + for(i=1;i<10000;i++) { + float start = u2f(f2u(M_PI_4 * i)-20); + float end = u2f(f2u(M_PI_4 * i)+20); + + for(d = start;d <= end;d = u2f(f2u(d)+1)) { + float q = child_tanf(d); + double c = tanlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + } + } + + fprintf(stderr, "tanf : %lf ... 
", max); + + showResult(max < 5); + } + + { + float d, max = 0; + + for(d = -1;d < 1;d += 0.00002) { + float q = child_asinf(d); + double c = asinlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ASIN; + } + } + + STOP_ASIN: + + fprintf(stderr, "asinf : %lf ... ", max); + + showResult(max < 5); + } + + { + float d, max = 0; + + for(d = -1;d < 1;d += 0.00002) { + float q = child_acosf(d); + double c = acoslfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ACOS; + } + } + + STOP_ACOS: + + fprintf(stderr, "acosf : %lf ... ", max); + + showResult(max < 5); + } + + { + float d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + float q = child_atanf(d); + double c = atanlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %g, d = %g\n", q, d); + goto STOP_ATAN; + } + } + + for(d = -10000;d < 10000;d += 0.201) { + float q = child_atanf(d); + double c = atanlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %g, d = %g\n", q, d); + goto STOP_ATAN; + } + } + + STOP_ATAN: + + fprintf(stderr, "atanf : %lf ... 
", max); + + showResult(max < 5); + } + + { + float x, y, max = 0; + + for(y = -10;y < 10;y += 0.051) { + for(x = -10;x < 10;x += 0.052) { + float q = child_atan2f(y, x); + double c = atan2lfr(flushToZero(y), flushToZero(x)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %g, y = %g, x = %g\n", q, y, x); + goto STOP_ATAN2; + } + } + } + + for(y = -100;y < 100;y += 0.51) { + for(x = -100;x < 100;x += 0.52) { + float q = child_atan2f(y, x); + double c = atan2lfr(flushToZero(y), flushToZero(x)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %g, y = %g, x = %g\n", q, y, x); + goto STOP_ATAN2; + } + } + } + + STOP_ATAN2: + + fprintf(stderr, "atan2f : %lf ... ", max); + + showResult(max < 5); + } + + { + float d, max = 0; + + for(d = 0.0001;d < 10;d += 0.0001) { + float q = child_logf(d); + double c = loglfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_LOG; + } + } + + for(d = 0.0001;d < 10000;d += 0.1) { + float q = child_logf(d); + double c = loglfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_LOG; + } + } + + int i; + for(i = -1000;i <= 1000;i++) { + d = pow(1.1, i); + float q = child_logf(d); + double c = loglfr(flushToZero(d)); + double u = countULP(q, c); + if (flushToZero(d * 0.1) == 0.0 && q == NEGATIVE_INFINITYf) u = 0; + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_LOG; + } + } + + STOP_LOG: + + fprintf(stderr, "logf : %lf ... 
", max); + + showResult(max < 5); + } + + { + float d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + float q = child_expf(d); + double c = explfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = -1000;d < 1000;d += 0.1) { + float q = child_expf(d); + double c = explfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + } + + fprintf(stderr, "expf : %lf ... ", max); + + showResult(max < 1); + } + + { + float x, y, max = 0; + + for(y = 0.1;y < 100;y += 0.21) { + for(x = -100;x < 100;x += 0.22) { + float q = child_powf(x, y); + double c = powlfr(flushToZero(x), flushToZero(y)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 10) { + fprintf(stderr, "q = %g, c = %g, x = %g, y = %g\n", q, c, x, y); + goto STOP_POW; + } + } + } + + float d; + for(d = -1000;d < 1000;d += 0.1) { + float q = child_powf(2.1f, d); + double c = powlfr(2.1f, flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + + if (u > 10) { + fprintf(stderr, "q = %g, c = %g, d = %g\n", q, c, d); + goto STOP_POW; + } + } + + STOP_POW: + + fprintf(stderr, "powf : %lf ... ", max); + + showResult(max < 1); + } + + { + float d, max = 0; + + for(d = 0;d < 20000;d += 0.2) { + float q = child_sqrtf(d); + double c = sqrtlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SQRT; + } + } + + int i; + for(i = -1000;i <= 1000;i++) { + d = pow(1.1, i); + float q = child_sqrtf(d); + double c = sqrtlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SQRT; + } + } + + STOP_SQRT: + + fprintf(stderr, "sqrtf : %lf ... 
", max); + + showResult(max < 5); + } + + { + float d, max = 0; + + for(d = -10000;d < 10000;d += 0.2) { + float q = child_cbrtf(d); + double c = cbrtlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_CBRT; + } + } + + int i; + for(i = -1000;i <= 1000;i++) { + d = pow(1.1, i); + float q = child_cbrtf(d); + double c = cbrtlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_CBRT; + } + } + + STOP_CBRT: + + fprintf(stderr, "cbrtf : %lf ... ", max); + + showResult(max < 5); + } + + { + float d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + float q = child_sinhf(d); + double c = sinhlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SINH; + } + } + + for(d = -100;d < 100;d += 0.02) { + float q = child_sinhf(d); + double c = sinhlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SINH; + } + } + + STOP_SINH: + + fprintf(stderr, "sinhf : %lf ... 
", max); + + showResult(max < 1); + } + + { + float d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + float q = child_coshf(d); + double c = coshlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_COSH; + } + } + + for(d = -100;d < 100;d += 0.02) { + float q = child_coshf(d); + double c = coshlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_COSH; + } + } + + STOP_COSH: + + fprintf(stderr, "coshf : %lf ... ", max); + + showResult(max < 1); + } + + { + float d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + float q = child_tanhf(d); + double c = tanhlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_TANH; + } + } + + STOP_TANH: + + fprintf(stderr, "tanhf : %lf ... ", max); + + showResult(max < 1); + } + + { + float d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + float q = child_asinhf(d); + double c = asinhlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ASINH; + } + } + + for(d = -1000;d < 1000;d += 0.02) { + float q = child_asinhf(d); + double c = asinhlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ASINH; + } + } + + STOP_ASINH: + + fprintf(stderr, "asinhf : %lf ... 
", max); + + showResult(max < 1); + } + + { + float d, max = 0; + + for(d = 1;d < 10;d += 0.0002) { + float q = child_acoshf(d); + double c = acoshlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ACOSH; + } + } + + for(d = 1;d < 1000;d += 0.02) { + float q = child_acoshf(d); + double c = acoshlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ACOSH; + } + } + + STOP_ACOSH: + + fprintf(stderr, "acoshf : %lf ... ", max); + + showResult(max < 1); + } + + { + float d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + float q = child_atanhf(d); + double c = atanhlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ATANH; + } + } + + for(d = -1000;d < 1000;d += 0.023) { + float q = child_atanhf(d); + double c = atanhlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ATANH; + } + } + + STOP_ATANH: + + fprintf(stderr, "atanhf : %lf ... ", max); + + showResult(max < 1); + } + + { + float d, max = 0; + + for(d = 0.0001;d < 10;d += 0.0001) { + float q = child_log10f(d); + double c = log10lfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = 0.0001;d < 10000;d += 0.1) { + float q = child_log10f(d); + double c = log10lfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + } + + fprintf(stderr, "log10f : %lf ... 
", max); + + showResult(max < 1); + } + + + { + float d, max = 0; + + for(d = 0.0001;d < 10;d += 0.0001) { + float q = child_log1pf(d); + double c = log1plfr(flushToZero(d)); + double u = countULP(q, c); + if (u > 10) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_LOG1P; + } + max = fmax(max, u); + } + + for(d = 0.0001;d < 10000;d += 0.1) { + float q = child_log1pf(d); + double c = log1plfr(flushToZero(d)); + double u = countULP(q, c); + if (u > 10) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_LOG1P; + } + max = fmax(max, u); + } + + for(d = 0;d < 300;d += 0.02) { + float d2 = pow(10, -d); + float q = child_log1pf(d2); + double c = log1plfr(flushToZero(d2)); + double u = countULP(q, c); + if (flushToZero(d2 * 0.1) == 0.0 && q == 0.0) u = 0; + if (u > 10) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d2, (double)ulp(c)); + goto STOP_LOG1P; + } + max = fmax(max, u); + } + + for(d = 0;d < 300;d += 0.02) { + float d2 = -pow(10, -d); + float q = child_log1pf(d2); + double c = log1plfr(flushToZero(d2)); + double u = countULP(q, c); + if (flushToZero(d2 * 0.1) == 0.0 && q == 0.0) u = 0; + if (u > 10) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d2, (double)ulp(c)); + goto STOP_LOG1P; + } + max = fmax(max, u); + } + + STOP_LOG1P: + + fprintf(stderr, "log1pf : %lf ... 
", max); + + showResult(max < 1); + } + + { + float d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + float q = child_exp2f(d); + double c = exp2lfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_EXP2; + } + } + + for(d = -120;d < 1000;d += 0.023) { + float q = child_exp2f(d); + double c = exp2lfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_EXP2; + } + } + + STOP_EXP2: + + fprintf(stderr, "exp2f : %lf ... ", max); + + showResult(max < 1); + } + + { + float d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + float q = child_exp10f(d); + double c = exp10lfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_EXP10; + } + } + + for(d = -35;d < 1000;d += 0.023) { + float q = child_exp10f(d); + double c = exp10lfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_EXP10; + } + } + + STOP_EXP10: + + fprintf(stderr, "exp10f : %lf ... 
", max); + + showResult(max < 1); + } + + { + float d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + float q = child_expm1f(d); + double c = expm1lfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_EXPM1; + } + } + + for(d = -1000;d < 1000;d += 0.023) { + float q = child_expm1f(d); + double c = expm1lfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_EXPM1; + } + } + + for(d = 0;d < 300;d += 0.02) { + float d2 = pow(10, -d); + float q = child_expm1f(d2); + double c = expm1lfr(flushToZero(d2)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_EXPM1; + } + } + + for(d = 0;d < 300;d += 0.02) { + float d2 = -pow(10, -d); + float q = child_expm1f(d2); + double c = expm1lfr(flushToZero(d2)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_EXPM1; + } + } + + STOP_EXPM1: + + fprintf(stderr, "expm1f : %lf ... 
", max); + + showResult(max < 5); + } +} + +int main(int argc, char **argv) { + char *argv2[argc]; + int i, a2s; + + for(a2s=1;a2s +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include "nonnumber.h" + +#define POSITIVE_INFINITYf ((float)INFINITY) +#define NEGATIVE_INFINITYf (-(float)INFINITY) +#define M_PIf ((float)M_PI) + +#define POSITIVE_INFINITY (INFINITY) +#define NEGATIVE_INFINITY (-INFINITY) + +typedef int boolean; + +#define true 1 +#define false 0 + +int enableFlushToZero = 0; + +void stop(char *mes) { + fprintf(stderr, "%s\n", mes); + abort(); +} + +int readln(int fd, char *buf, int cnt) { + int i, rcnt = 0; + + if (cnt < 1) return -1; + + while(cnt >= 2) { + i = read(fd, buf, 1); + if (i != 1) return i; + + if (*buf == '\n') break; + + rcnt++; + buf++; + cnt--; + } + + *++buf = '\0'; + rcnt++; + return rcnt; +} + +int ptoc[2], ctop[2]; +int pid; + +void startChild(const char *path, char *const argv[]) { + pipe(ptoc); + pipe(ctop); + + pid = fork(); + + assert(pid != -1); + + if (pid == 0) { + // child process + char buf0[1], buf1[1]; + int i; + + close(ptoc[1]); + close(ctop[0]); + + i = dup2(ptoc[0], fileno(stdin)); + assert(i != -1); + + i = dup2(ctop[1], fileno(stdout)); + assert(i != -1); + + setvbuf(stdin, buf0, _IONBF,0); + setvbuf(stdout, buf1, _IONBF,0); + + fflush(stdin); + fflush(stdout); + + execvp(path, argv); + + assert(0); + } + + // parent process + + close(ptoc[0]); + close(ctop[1]); +} + +float u2f(uint32_t u) { + union { + float f; + uint32_t i; + } tmp; + tmp.i = u; + return tmp.f; +} + +uint32_t f2u(float d) { + union { + float f; + uint32_t i; + } tmp; + tmp.f = d; + return tmp.i; +} + +// + +boolean isPlusZerof(float x) { return x == 0 && copysignf(1, x) == 1; } +boolean isMinusZerof(float x) { return x == 0 && copysignf(1, x) == -1; } +boolean xisnanf(float x) { return x != x; } +float signf(float d) { return d < 0 ? 
-1 : 1; } + +boolean isPlusZero(double x) { return x == 0 && copysign(1, x) == 1; } +boolean isMinusZero(double x) { return x == 0 && copysign(1, x) == -1; } +boolean xisnan(double x) { return x != x; } + +double flushToZero(double y) { + if (enableFlushToZero && fabs(y) < 1.2e-38) y = copysign(0.0, y); + return y; +} + +boolean cmpDenorm(float x, float y) { + y = flushToZero(y); + if (xisnanf(x) && xisnanf(y)) return true; + if (xisnanf(x) || xisnanf(y)) return false; + if (isinf(x) != isinf(y)) return false; + if (x == POSITIVE_INFINITYf && y == POSITIVE_INFINITYf) return true; + if (x == NEGATIVE_INFINITYf && y == NEGATIVE_INFINITYf) return true; + if (y == 0) { + if (isPlusZerof(x) && isPlusZerof(y)) return true; + if (isMinusZerof(x) && isMinusZerof(y)) return true; + return false; + } + if (!xisnanf(x) && !xisnanf(y) && !isinf(x) && !isinf(y)) return signf(x) == signf(y); + return false; +} + +double ulp(double x) { + x = fabsf(x); + int exp; + + if (x == 0) { + return FLT_MIN; + } else { + frexpf(x, &exp); + } + + return fmaxf(ldexpf(1.0, exp-24), FLT_MIN); +} + +double countULP(double x, double y) { + y = flushToZero(y); + float fx = (float)x; + float fy = (float)y; + if (xisnan(fx) && xisnan(fy)) return 0; + if (xisnan(fx) || xisnan(fy)) return 10000; + if (isinf(fx)) { + if (signf(fx) == signf(fy) && fabs(fy) > 1e+37) return 0; // Relaxed infinity handling + return 10001; + } + if (fx == POSITIVE_INFINITY && fy == POSITIVE_INFINITY) return 0; + if (fx == NEGATIVE_INFINITY && fy == NEGATIVE_INFINITY) return 0; + if (fy == 0) { + if (fx == 0) return 0; + return 10002; + } + if (!xisnan(fx) && !xisnan(fy) && !isinf(fx) && !isinf(fy)) { + return fabs((x - y) / ulp(y)); + } + return 10003; +} + +// + +double sinfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_sin(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double sinlfr(long double d) { + mpfr_t 
frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_sin(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double cosfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cos(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double coslfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_cos(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double tanfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_tan(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double tanlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_tan(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double asinfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_asin(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double asinlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_asin(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double acosfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_acos(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double acoslfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_acos(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + 
+double atanfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_atan(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double atanlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_atan(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double atan2fr(double y, double x) { + mpfr_t frx, fry; + mpfr_inits(frx, fry, NULL); + mpfr_set_d(fry, y, GMP_RNDN); + mpfr_set_d(frx, x, GMP_RNDN); + mpfr_atan2(frx, fry, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, fry, NULL); + return ret; +} + +long double atan2lfr(long double y, long double x) { + mpfr_t frx, fry; + mpfr_inits(frx, fry, NULL); + mpfr_set_ld(fry, y, GMP_RNDN); + mpfr_set_ld(frx, x, GMP_RNDN); + mpfr_atan2(frx, fry, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, fry, NULL); + return ret; +} + +double logfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_log(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double loglfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_log(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double expfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double explfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_exp(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double powfr(double x, double y) { + mpfr_t frx, fry; + 
mpfr_inits(frx, fry, NULL); + mpfr_set_d(frx, x, GMP_RNDN); + mpfr_set_d(fry, y, GMP_RNDN); + mpfr_pow(frx, frx, fry, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, fry, NULL); + return ret; +} + +long double powlfr(long double x, long double y) { + mpfr_t frx, fry; + mpfr_inits(frx, fry, NULL); + mpfr_set_ld(frx, x, GMP_RNDN); + mpfr_set_ld(fry, y, GMP_RNDN); + mpfr_pow(frx, frx, fry, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, fry, NULL); + return ret; +} + +double sinhfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_sinh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double coshfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cosh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double tanhfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_tanh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double asinhfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_asinh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double acoshfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_acosh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double atanhfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_atanh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double sinhlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_sinh(frx, frx, GMP_RNDN); + long double ret = 
mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double coshlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_cosh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double tanhlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_tanh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double asinhlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_asinh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double acoshlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_acosh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double atanhlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_atanh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double sqrtlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_sqrt(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double sqrtfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_sqrt(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double cbrtfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cbrt(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double cbrtlfr(long double d) { + mpfr_t frx; + 
mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_cbrt(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double exp2fr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp2(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double exp2lfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_exp2(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double exp10fr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp10(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double exp10lfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_exp10(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double expm1fr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_expm1(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double expm1lfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_expm1(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double log10fr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_log10(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double log10lfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_log10(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return 
ret; +} + +double log1pfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_log1p(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double log1plfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_log1p(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +// + +typedef struct { + float x, y; +} float2; + +float child_sinf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "sinf_u1 %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_sinf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_cosf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "cosf_u1 %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_cosf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float2 child_sincosf(float x) { + char str[256]; + uint32_t u, v; + + sprintf(str, "sincosf_u1 %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_sincosf"); + sscanf(str, "%x %x", &u, &v); + + float2 ret; + ret.x = u2f(u); + ret.y = u2f(v); + return ret; +} + +float child_tanf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "tanf_u1 %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_tanf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_asinf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "asinf_u1 %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_asinf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_acosf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "acosf_u1 %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_acosf"); 
+ sscanf(str, "%x", &u); + return u2f(u); +} + +float child_atanf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "atanf_u1 %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_atanf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_atan2f(float y, float x) { + char str[256]; + uint32_t u; + + sprintf(str, "atan2f_u1 %x %x\n", f2u(y), f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_atan2f"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_logf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "logf_u1 %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_logf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_expf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "expf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_expf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_powf(float x, float y) { + char str[256]; + uint32_t u; + + sprintf(str, "powf %x %x\n", f2u(x), f2u(y)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_powf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_sinhf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "sinhf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_sinhf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_coshf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "coshf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_coshf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_tanhf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "tanhf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_tanhf"); + sscanf(str, "%x", 
&u); + return u2f(u); +} + +float child_asinhf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "asinhf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_asinhf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_acoshf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "acoshf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_acoshf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_atanhf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "atanhf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_atanhf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_sqrtf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "sqrtf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_sqrtf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_cbrtf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "cbrtf_u1 %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_cbrtf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_exp2f(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "exp2f %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_exp2f"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_exp10f(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "exp10f %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_exp10f"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_expm1f(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "expm1f %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_expm1f"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float 
child_log10f(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "log10f %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_log10f"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_log1pf(float x) { + char str[256]; + uint32_t u; + + sprintf(str, "log1pf %x\n", f2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_log1pf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +float child_ldexpf(float x, int q) { + char str[256]; + uint32_t u; + + sprintf(str, "ldexpf %x %x\n", f2u(x), f2u(q)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_powf"); + sscanf(str, "%x", &u); + return u2f(u); +} + +int allTestsPassed = 1; + +void showResult(int success) { + if (!success) allTestsPassed = 0; + fprintf(stderr, "%s\n", success ? " OK" : " NG **************"); +} + +void do_test() { + int i, j; + + fprintf(stderr, "Denormal/nonnumber test atan2f_u1(y, x)\n\n"); + + fprintf(stderr, "If y is +0 and x is -0, +pi is returned ... "); + showResult(child_atan2f(+0.0, -0.0) == M_PIf); + + fprintf(stderr, "If y is -0 and x is -0, -pi is returned ... "); + showResult(child_atan2f(-0.0, -0.0) == -M_PIf); + + fprintf(stderr, "If y is +0 and x is +0, +0 is returned ... "); + showResult(isPlusZerof(child_atan2f(+0.0, +0.0))); + + fprintf(stderr, "If y is -0 and x is +0, -0 is returned ... "); + showResult(isMinusZerof(child_atan2f(-0.0, +0.0))); + + fprintf(stderr, "If y is positive infinity and x is negative infinity, +3*pi/4 is returned ... "); + showResult(child_atan2f(POSITIVE_INFINITYf, NEGATIVE_INFINITYf) == 3*M_PIf/4); + + fprintf(stderr, "If y is negative infinity and x is negative infinity, -3*pi/4 is returned ... "); + showResult(child_atan2f(NEGATIVE_INFINITYf, NEGATIVE_INFINITYf) == -3*M_PIf/4); + + fprintf(stderr, "If y is positive infinity and x is positive infinity, +pi/4 is returned ... 
"); + showResult(child_atan2f(POSITIVE_INFINITYf, POSITIVE_INFINITYf) == M_PIf/4); + + fprintf(stderr, "If y is negative infinity and x is positive infinity, -pi/4 is returned ... "); + showResult(child_atan2f(NEGATIVE_INFINITYf, POSITIVE_INFINITYf) == -M_PIf/4); + + { + fprintf(stderr, "If y is +0 and x is less than 0, +pi is returned ... "); + + float ya[] = { +0.0 }; + float xa[] = { -100000.5, -100000, -3, -2.5, -2, -1.5, -1.0, -0.5 }; + + boolean success = true; + + for(i=0;i 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SIN; + } + } + + for(d = -10000;d < 10000;d += 0.201) { + float q = child_sinf(d); + double c = sinlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SIN; + } + } + + int i; + + for(i=1;i<10000;i++) { + float start = u2f(f2u(M_PI_4 * i)-20); + float end = u2f(f2u(M_PI_4 * i)+20); + + for(d = start;d <= end;d = u2f(f2u(d)+1)) { + float q = child_sinf(d); + double c = sinlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SIN; + } + } + } + + STOP_SIN: + + fprintf(stderr, "sinf_u1 : %lf ... 
", max); + + showResult(max < 1); + } + + { + float d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + float q = child_cosf(d); + double c = coslfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = -10000;d < 10000;d += 0.201) { + float q = child_cosf(d); + double c = coslfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + } + + int i; + + for(i=1;i<10000;i++) { + float start = u2f(f2u(M_PI_4 * i)-20); + float end = u2f(f2u(M_PI_4 * i)+20); + + for(d = start;d <= end;d = u2f(f2u(d)+1)) { + float q = child_cosf(d); + double c = coslfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + } + } + + fprintf(stderr, "cosf_u1 : %lf ... ", max); + + showResult(max < 1); + } + + { + float d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + float2 q = child_sincosf(d); + long double c = sinlfr(d); + double u = fabs((q.x - c) / ulp(c)); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q.x, (double)c, d, (double)ulp(c)); + goto STOP_SIN2; + } + } + + for(d = -10000;d < 10000;d += 0.201) { + float2 q = child_sincosf(d); + long double c = sinlfr(d); + double u = fabs((q.x - c) / ulp(c)); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q.x, (double)c, d, (double)ulp(c)); + goto STOP_SIN2; + } + } + + int i; + + for(i=1;i<10000;i++) { + float start = u2f(f2u(M_PI_4 * i)-20); + float end = u2f(f2u(M_PI_4 * i)+20); + + for(d = start;d <= end;d = u2f(f2u(d)+1)) { + float2 q = child_sincosf(d); + double c = sinlfr(d); + double u = fabs((q.x - c) / ulp(c)); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q.x, (double)c, d, (double)ulp(c)); + goto STOP_SIN2; + } + } + } + + STOP_SIN2: + + fprintf(stderr, "sin in sincosf_u1 : %lf ... 
", max); + + showResult(max < 1); + } + + { + float d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + float2 q = child_sincosf(d); + double c = coslfr(d); + double u = fabs((q.y - c) / ulp(c)); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q.y, (double)c, d, (double)ulp(c)); + goto STOP_COS2; + } + } + + for(d = -10000;d < 10000;d += 0.201) { + float2 q = child_sincosf(d); + double c = coslfr(d); + double u = fabs((q.y - c) / ulp(c)); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q.y, (double)c, d, (double)ulp(c)); + goto STOP_COS2; + } + } + + int i; + + for(i=1;i<10000;i++) { + float start = u2f(f2u(M_PI_4 * i)-20); + float end = u2f(f2u(M_PI_4 * i)+20); + + for(d = start;d <= end;d = u2f(f2u(d)+1)) { + float2 q = child_sincosf(d); + double c = coslfr(d); + double u = fabs((q.y - c) / ulp(c)); + max = fmax(max, u); + } + } + + STOP_COS2: + + fprintf(stderr, "cos in sincosf_u1 : %lf ... ", max); + + showResult(max < 1); + } + + { + float d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + float q = child_tanf(d); + double c = tanlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = -10000;d < 10000;d += 0.201) { + float q = child_tanf(d); + double c = tanlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + } + + int i; + + for(i=1;i<10000;i++) { + float start = u2f(f2u(M_PI_4 * i)-20); + float end = u2f(f2u(M_PI_4 * i)+20); + + for(d = start;d <= end;d = u2f(f2u(d)+1)) { + float q = child_tanf(d); + double c = tanlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + } + } + + fprintf(stderr, "tanf_u1 : %lf ... 
", max); + + showResult(max < 1); + } + + { + float d, max = 0; + + for(d = -1;d < 1;d += 0.00002) { + float q = child_asinf(d); + double c = asinlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ASIN; + } + } + + STOP_ASIN: + + fprintf(stderr, "asinf_u1 : %lf ... ", max); + + showResult(max < 1); + } + + { + float d, max = 0; + + for(d = -1;d < 1;d += 0.00002) { + float q = child_acosf(d); + double c = acoslfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ACOS; + } + } + + STOP_ACOS: + + fprintf(stderr, "acosf_u1 : %lf ... ", max); + + showResult(max < 1); + } + + { + float d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + float q = child_atanf(d); + double c = atanlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %g, d = %g\n", q, d); + goto STOP_ATAN; + } + } + + for(d = -10000;d < 10000;d += 0.201) { + float q = child_atanf(d); + double c = atanlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %g, d = %g\n", q, d); + goto STOP_ATAN; + } + } + + STOP_ATAN: + + fprintf(stderr, "atanf_u1 : %lf ... 
", max); + + showResult(max < 1); + } + + { + float x, y, max = 0; + + for(y = -10;y < 10;y += 0.051) { + for(x = -10;x < 10;x += 0.052) { + float q = child_atan2f(y, x); + double c = atan2lfr(flushToZero(y), flushToZero(x)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %g, y = %g, x = %g\n", q, y, x); + goto STOP_ATAN2; + } + } + } + + for(y = -100;y < 100;y += 0.51) { + for(x = -100;x < 100;x += 0.52) { + float q = child_atan2f(y, x); + double c = atan2lfr(flushToZero(y), flushToZero(x)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %g, y = %g, x = %g\n", q, y, x); + goto STOP_ATAN2; + } + } + } + + STOP_ATAN2: + + fprintf(stderr, "atan2f_u1 : %lf ... ", max); + + showResult(max < 1); + } + + { + float d, max = 0; + + for(d = 0.0001;d < 10;d += 0.0001) { + float q = child_logf(d); + double c = loglfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_LOG; + } + } + + for(d = 0.0001;d < 10000;d += 0.1) { + float q = child_logf(d); + double c = loglfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_LOG; + } + } + + int i; + for(i = -1000;i <= 1000;i++) { + d = pow(1.1, i); + float q = child_logf(d); + double c = loglfr(flushToZero(d)); + double u = countULP(q, c); + if (flushToZero(d * 0.1) == 0.0 && q == NEGATIVE_INFINITYf) u = 0; + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\ni=%d\n", q, (double)c, d, (double)ulp(c), i); + goto STOP_LOG; + } + } + + STOP_LOG: + + fprintf(stderr, "logf_u1 : %lf ... 
", max); + + showResult(max < 1); + } + + { + float d, max = 0; + + for(d = -10000;d < 10000;d += 0.2) { + float q = child_cbrtf(d); + double c = cbrtlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_CBRT; + } + } + + int i; + for(i = -1000;i <= 1000;i++) { + d = pow(1.1, i); + float q = child_cbrtf(d); + double c = cbrtlfr(flushToZero(d)); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_CBRT; + } + } + + STOP_CBRT: + + fprintf(stderr, "cbrtf_u1 : %lf ... ", max); + + showResult(max < 1); + } +} + +int main(int argc, char **argv) { + char *argv2[argc]; + int i, a2s; + + for(a2s=1;a2s +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include "nonnumber.h" + +#define POSITIVE_INFINITY INFINITY +#define NEGATIVE_INFINITY (-INFINITY) + +typedef int boolean; + +#define true 1 +#define false 0 + +void stop(char *mes) { + fprintf(stderr, "%s\n", mes); + abort(); +} + +int readln(int fd, char *buf, int cnt) { + int i, rcnt = 0; + + if (cnt < 1) return -1; + + while(cnt >= 2) { + i = read(fd, buf, 1); + if (i != 1) return i; + + if (*buf == '\n') break; + + rcnt++; + buf++; + cnt--; + } + + *++buf = '\0'; + rcnt++; + return rcnt; +} + +int ptoc[2], ctop[2]; +int pid; + +void startChild(const char *path, char *const argv[]) { + pipe(ptoc); + pipe(ctop); + + pid = fork(); + + assert(pid != -1); + + if (pid == 0) { + // child process + char buf0[1], buf1[1]; + int i; + + close(ptoc[1]); + close(ctop[0]); + + i = dup2(ptoc[0], fileno(stdin)); + assert(i != -1); + + i = dup2(ctop[1], fileno(stdout)); + assert(i != -1); + + setvbuf(stdin, buf0, _IONBF,0); + setvbuf(stdout, buf1, _IONBF,0); + + fflush(stdin); + fflush(stdout); 
+ + execvp(path, argv); + + fprintf(stderr, "execvp in startChild : %s\n", strerror(errno)); + + assert(0); + } + + // parent process + + close(ptoc[0]); + close(ctop[1]); +} + +double u2d(uint64_t u) { + union { + double f; + uint64_t i; + } tmp; + tmp.i = u; + return tmp.f; +} + +uint64_t d2u(double d) { + union { + double f; + uint64_t i; + } tmp; + tmp.f = d; + return tmp.i; +} + +// + +boolean isPlusZero(double x) { return x == 0 && copysign(1, x) == 1; } +boolean isMinusZero(double x) { return x == 0 && copysign(1, x) == -1; } +boolean xisnan(double x) { return x != x; } +double sign(double d) { return d < 0 ? -1 : 1; } + +boolean cmpDenorm(double x, double y) { + if (xisnan(x) && xisnan(y)) return true; + if (xisnan(x) || xisnan(y)) return false; + if (isinf(x) != isinf(y)) return false; + if (x == POSITIVE_INFINITY && y == POSITIVE_INFINITY) return true; + if (x == NEGATIVE_INFINITY && y == NEGATIVE_INFINITY) return true; + if (y == 0) { + if (isPlusZero(x) && isPlusZero(y)) return true; + if (isMinusZero(x) && isMinusZero(y)) return true; + return false; + } + if (!xisnan(x) && !xisnan(y) && !isinf(x) && !isinf(y)) return sign(x) == sign(y); + return false; +} + +long double ulp(long double x) { + x = fabsl(x); + int exp; + + if (x == 0) { + return DBL_MIN; + } else { + frexpl(x, &exp); + } + + return fmax(ldexp(1.0, exp-53), DBL_MIN); +} + +double countULP(long double x, long double y) { + double fx = x; + double fy = y; + if (xisnan(fx) && xisnan(fy)) return 0; + if (xisnan(fx) || xisnan(fy)) return 10000; + if (isinf(fx)) { + if (sign(fx) == sign(fy) && fabs(fy) > 1e+300) return 0; // Relaxed infinity handling + return 10001; + } + if (fx == POSITIVE_INFINITY && fy == POSITIVE_INFINITY) return 0; + if (fx == NEGATIVE_INFINITY && fy == NEGATIVE_INFINITY) return 0; + if (fy == 0) { + if (fx == 0) return 0; + return 10002; + } + if (!xisnan(fx) && !xisnan(fy) && !isinf(fx) && !isinf(fy)) { + return fabs((x - y) / ulp(y)); + } + return 10003; +} + +// + 
+double sinfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_sin(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double sinlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_sin(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double cosfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cos(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double coslfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_cos(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double tanfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_tan(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double tanlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_tan(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double asinfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_asin(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double asinlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_asin(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double acosfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_acos(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, 
NULL); + return ret; +} + +long double acoslfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_acos(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double atanfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_atan(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double atanlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_atan(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double atan2fr(double y, double x) { + mpfr_t frx, fry; + mpfr_inits(frx, fry, NULL); + mpfr_set_d(fry, y, GMP_RNDN); + mpfr_set_d(frx, x, GMP_RNDN); + mpfr_atan2(frx, fry, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, fry, NULL); + return ret; +} + +long double atan2lfr(long double y, long double x) { + mpfr_t frx, fry; + mpfr_inits(frx, fry, NULL); + mpfr_set_ld(fry, y, GMP_RNDN); + mpfr_set_ld(frx, x, GMP_RNDN); + mpfr_atan2(frx, fry, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, fry, NULL); + return ret; +} + +double logfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_log(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double loglfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_log(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double expfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double explfr(long 
double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_exp(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double powfr(double x, double y) { + mpfr_t frx, fry; + mpfr_inits(frx, fry, NULL); + mpfr_set_d(frx, x, GMP_RNDN); + mpfr_set_d(fry, y, GMP_RNDN); + mpfr_pow(frx, frx, fry, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, fry, NULL); + return ret; +} + +long double powlfr(long double x, long double y) { + mpfr_t frx, fry; + mpfr_inits(frx, fry, NULL); + mpfr_set_ld(frx, x, GMP_RNDN); + mpfr_set_ld(fry, y, GMP_RNDN); + mpfr_pow(frx, frx, fry, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, fry, NULL); + return ret; +} + +double sinhfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_sinh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double coshfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cosh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double tanhfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_tanh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double asinhfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_asinh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double acoshfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_acosh(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double atanhfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_atanh(frx, 
frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double sinhlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_sinh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double coshlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_cosh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double tanhlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_tanh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double asinhlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_asinh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double acoshlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_acosh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double atanhlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_atanh(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double sqrtlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_sqrt(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double sqrtfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_sqrt(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double 
cbrtfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_cbrt(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double cbrtlfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_cbrt(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double exp2fr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp2(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double exp2lfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_exp2(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double exp10fr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_exp10(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double exp10lfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_exp10(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double expm1fr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_expm1(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double expm1lfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_expm1(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double log10fr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_log10(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + 
mpfr_clears(frx, NULL); + return ret; +} + +long double log10lfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_log10(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +double log1pfr(double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_d(frx, d, GMP_RNDN); + mpfr_log1p(frx, frx, GMP_RNDN); + double ret = mpfr_get_d(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +long double log1plfr(long double d) { + mpfr_t frx; + mpfr_inits(frx, NULL); + mpfr_set_ld(frx, d, GMP_RNDN); + mpfr_log1p(frx, frx, GMP_RNDN); + long double ret = mpfr_get_ld(frx, GMP_RNDN); + mpfr_clears(frx, NULL); + return ret; +} + +// + +typedef struct { + double x, y; +} double2; + +double child_sin(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "sin_u1 %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_sin"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_cos(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "cos_u1 %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_cos"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double2 child_sincos(double x) { + char str[256]; + uint64_t u, v; + + sprintf(str, "sincos_u1 %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_sincos"); + sscanf(str, "%" PRIx64 " %" PRIx64, &u, &v); + + double2 ret; + ret.x = u2d(u); + ret.y = u2d(v); + return ret; +} + +double child_tan(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "tan_u1 %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_tan"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_asin(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "asin_u1 %" PRIx64 "\n", 
d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_asin_"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_acos(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "acos_u1 %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_acos"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_atan(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "atan_u1 %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_atan"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_atan2(double y, double x) { + char str[256]; + uint64_t u; + + sprintf(str, "atan2_u1 %" PRIx64 " %" PRIx64 "\n", d2u(y), d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_atan2"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_log(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "log_u1 %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_log"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_exp(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "exp %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_exp"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_pow(double x, double y) { + char str[256]; + uint64_t u; + + sprintf(str, "pow %" PRIx64 " %" PRIx64 "\n", d2u(x), d2u(y)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_pow"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_sinh(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "sinh %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_sinh"); + sscanf(str, "%" PRIx64, &u); + 
return u2d(u); +} + +double child_cosh(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "cosh %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_cosh"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_tanh(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "tanh %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_tanh"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_asinh(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "asinh %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_asinh"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_acosh(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "acosh %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_acosh"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_atanh(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "atanh %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_atanh"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_sqrt(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "sqrt %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_sqrt"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_cbrt(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "cbrt_u1 %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_cbrt"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_exp2(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "exp2 %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 
255) < 1) stop("child_exp2"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_exp10(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "exp10 %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_exp10"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_expm1(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "expm1 %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_expm1"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_log10(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "log10 %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_log10"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_log1p(double x) { + char str[256]; + uint64_t u; + + sprintf(str, "log1p %" PRIx64 "\n", d2u(x)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_log1p"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +double child_ldexp(double x, int q) { + char str[256]; + uint64_t u; + + sprintf(str, "ldexp %" PRIx64 " %" PRIx64 "\n", d2u(x), d2u(q)); + write(ptoc[1], str, strlen(str)); + if (readln(ctop[0], str, 255) < 1) stop("child_ldexp"); + sscanf(str, "%" PRIx64, &u); + return u2d(u); +} + +int allTestsPassed = 1; + +void showResult(int success) { + if (!success) allTestsPassed = 0; + fprintf(stderr, "%s\n", success ? " OK" : " NG **************"); +} + +void do_test() { + int i, j; + + fprintf(stderr, "Denormal/nonnumber test atan2_u1(y, x)\n\n"); + + fprintf(stderr, "If y is +0 and x is -0, +pi is returned ... "); + showResult(child_atan2(+0.0, -0.0) == M_PI); + + fprintf(stderr, "If y is -0 and x is -0, -pi is returned ... "); + showResult(child_atan2(-0.0, -0.0) == -M_PI); + + fprintf(stderr, "If y is +0 and x is +0, +0 is returned ... 
"); + showResult(isPlusZero(child_atan2(+0.0, +0.0))); + + fprintf(stderr, "If y is -0 and x is +0, -0 is returned ... "); + showResult(isMinusZero(child_atan2(-0.0, +0.0))); + + fprintf(stderr, "If y is positive infinity and x is negative infinity, +3*pi/4 is returned ... "); + showResult(child_atan2(POSITIVE_INFINITY, NEGATIVE_INFINITY) == 3*M_PI/4); + + fprintf(stderr, "If y is negative infinity and x is negative infinity, -3*pi/4 is returned ... "); + showResult(child_atan2(NEGATIVE_INFINITY, NEGATIVE_INFINITY) == -3*M_PI/4); + + fprintf(stderr, "If y is positive infinity and x is positive infinity, +pi/4 is returned ... "); + showResult(child_atan2(POSITIVE_INFINITY, POSITIVE_INFINITY) == M_PI/4); + + fprintf(stderr, "If y is negative infinity and x is positive infinity, -pi/4 is returned ... "); + showResult(child_atan2(NEGATIVE_INFINITY, POSITIVE_INFINITY) == -M_PI/4); + + { + fprintf(stderr, "If y is +0 and x is less than 0, +pi is returned ... "); + + double ya[] = { +0.0 }; + double xa[] = { -100000.5, -100000, -3, -2.5, -2, -1.5, -1.0, -0.5 }; + + boolean success = true; + + for(i=0;i 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SIN; + } + } + + for(d = -10000000;d < 10000000;d += 200.1) { + double q = child_sin(d); + long double c = sinlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SIN; + } + } + + int i; + + for(i=1;i<10000;i++) { + double start = u2d(d2u(M_PI_4 * i)-20); + double end = u2d(d2u(M_PI_4 * i)+20); + + for(d = start;d <= end;d = u2d(d2u(d)+1)) { + double q = child_sin(d); + long double c = sinlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SIN; + } + } + } + + STOP_SIN: + + fprintf(stderr, 
"sin_u1 : %lf ... ", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_cos(d); + long double c = coslfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = -10000000;d < 10000000;d += 200.1) { + double q = child_cos(d); + long double c = coslfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + int i; + + for(i=1;i<10000;i++) { + double start = u2d(d2u(M_PI_4 * i)-20); + double end = u2d(d2u(M_PI_4 * i)+20); + + for(d = start;d <= end;d = u2d(d2u(d)+1)) { + double q = child_cos(d); + long double c = coslfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + } + + fprintf(stderr, "cos_u1 : %lf ... ", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double2 q = child_sincos(d); + long double c = sinlfr(d); + double u = fabs((q.x - c) / ulp(c)); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q.x, (double)c, d, (double)ulp(c)); + goto STOP_SIN2; + } + } + + for(d = -10000000;d < 10000000;d += 200.1) { + double2 q = child_sincos(d); + long double c = sinlfr(d); + double u = fabs((q.x - c) / ulp(c)); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q.x, (double)c, d, (double)ulp(c)); + goto STOP_SIN2; + } + } + + int i; + + for(i=1;i<10000;i++) { + double start = u2d(d2u(M_PI_4 * i)-20); + double end = u2d(d2u(M_PI_4 * i)+20); + + for(d = start;d <= end;d = u2d(d2u(d)+1)) { + double2 q = child_sincos(d); + long double c = sinlfr(d); + double u = fabs((q.x - c) / ulp(c)); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q.x, (double)c, d, (double)ulp(c)); + goto STOP_SIN2; + } + } + } + + STOP_SIN2: + + fprintf(stderr, "sin in sincos_u1 : %lf ... 
", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double2 q = child_sincos(d); + long double c = coslfr(d); + double u = fabs((q.y - c) / ulp(c)); + max = fmax(max, u); + } + + for(d = -10000000;d < 10000000;d += 200.1) { + double2 q = child_sincos(d); + long double c = coslfr(d); + double u = fabs((q.y - c) / ulp(c)); + max = fmax(max, u); + } + + int i; + + for(i=1;i<10000;i++) { + double start = u2d(d2u(M_PI_4 * i)-20); + double end = u2d(d2u(M_PI_4 * i)+20); + + for(d = start;d <= end;d = u2d(d2u(d)+1)) { + double2 q = child_sincos(d); + long double c = coslfr(d); + double u = fabs((q.y - c) / ulp(c)); + max = fmax(max, u); + } + } + + fprintf(stderr, "cos in sincos_u1 : %lf ... ", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_tan(d); + long double c = tanlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = -10000000;d < 10000000;d += 200.1) { + double q = child_tan(d); + long double c = tanlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + int i; + + for(i=1;i<10000;i++) { + double start = u2d(d2u(M_PI_4 * i)-20); + double end = u2d(d2u(M_PI_4 * i)+20); + + for(d = start;d <= end;d = u2d(d2u(d)+1)) { + double q = child_tan(d); + long double c = tanlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + } + + fprintf(stderr, "tan_u1 : %lf ... ", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -1;d < 1;d += 0.00002) { + double q = child_asin(d); + long double c = asinlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ASIN; + } + } + + STOP_ASIN: + + fprintf(stderr, "asin_u1 : %lf ... 
", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -1;d < 1;d += 0.00002) { + double q = child_acos(d); + long double c = acoslfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ACOS; + } + } + + STOP_ACOS: + + fprintf(stderr, "acos_u1 : %lf ... ", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_atan(d); + long double c = atanlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %g, d = %g\n", q, d); + goto STOP_ATAN; + } + } + + for(d = -10000;d < 10000;d += 0.2) { + double q = child_atan(d); + long double c = atanlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %g, d = %g\n", q, d); + goto STOP_ATAN; + } + } + + STOP_ATAN: + + fprintf(stderr, "atan_u1 : %lf ... ", max); + + showResult(max < 1); + } + + { + double x, y, max = 0; + + for(y = -10;y < 10;y += 0.05) { + for(x = -10;x < 10;x += 0.05) { + double q = child_atan2(y, x); + long double c = atan2lfr(y, x); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %g, y = %g, x = %g\n", q, y, x); + goto STOP_ATAN2; + } + } + } + + for(y = -100;y < 100;y += 0.51) { + for(x = -100;x < 100;x += 0.51) { + double q = child_atan2(y, x); + long double c = atan2lfr(y, x); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %g, y = %g, x = %g\n", q, y, x); + goto STOP_ATAN2; + } + } + } + + STOP_ATAN2: + + fprintf(stderr, "atan2_u1 : %lf ... 
", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = 0.0001;d < 10;d += 0.0001) { + double q = child_log(d); + long double c = loglfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = 0.0001;d < 10000;d += 0.1) { + double q = child_log(d); + long double c = loglfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + int i; + for(i = -1000;i <= 1000;i++) { + d = pow(2.1, i); + double q = child_log(d); + long double c = loglfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + fprintf(stderr, "log_u1 : %lf ... ", max); + + showResult(max < 1); + } + +#if 0 + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_exp(d); + long double c = explfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = -1000;d < 1000;d += 0.02) { + double q = child_exp(d); + long double c = explfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + fprintf(stderr, "exp : %lf ... ", max); + + showResult(max < 1); + } +#endif + +#if 0 + { + double x, y, max = 0; + + for(y = 0.1;y < 100;y += 0.2) { + for(x = -100;x < 100;x += 0.2) { + double q = child_pow(x, y); + long double c = powlfr(x, y); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %g, x = %g, y = %g\n", q, x, y); + goto STOP_POW; + } + } + } + + STOP_POW: + + fprintf(stderr, "pow : %lf ... ", max); + + showResult(max < 1); + } + +#if 0 + { + double d, max = 0; + + for(d = 0;d < 20000;d += 0.2) { + double q = child_sqrt(d); + long double c = sqrtlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SQRT; + } + } + + STOP_SQRT: + + fprintf(stderr, "sqrt : %lf ... 
", max); + + showResult(max < 1); + } +#endif + + { + double d, max = 0; + + for(d = -10000;d < 10000;d += 0.2) { + double q = child_cbrt(d); + long double c = cbrtlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_CBRT; + } + } + + int i; + for(i = -1000;i <= 1000;i++) { + d = pow(2.1, i); + double q = child_cbrt(d); + long double c = cbrtlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_CBRT; + } + } + + STOP_CBRT: + + fprintf(stderr, "cbrt_u1 : %lf ... ", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_sinh(d); + long double c = sinhlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SINH; + } + } + + for(d = -1000;d < 1000;d += 0.02) { + double q = child_sinh(d); + long double c = sinhlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_SINH; + } + } + + STOP_SINH: + + fprintf(stderr, "sinh : %lf ... 
", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_cosh(d); + long double c = coshlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_COSH; + } + } + + for(d = -1000;d < 1000;d += 0.02) { + double q = child_cosh(d); + long double c = coshlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_COSH; + } + } + + STOP_COSH: + + fprintf(stderr, "cosh : %lf ... ", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_tanh(d); + long double c = tanhlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_TANH; + } + } + + for(d = -1000;d < 1000;d += 0.02) { + double q = child_tanh(d); + long double c = tanhlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_TANH; + } + } + + STOP_TANH: + + fprintf(stderr, "tanh : %lf ... 
", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_asinh(d); + long double c = asinhlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ASINH; + } + } + + for(d = -1000;d < 1000;d += 0.02) { + double q = child_asinh(d); + long double c = asinhlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ASINH; + } + } + + STOP_ASINH: + + fprintf(stderr, "asinh : %lf ... ", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = 1;d < 10;d += 0.0002) { + double q = child_acosh(d); + long double c = acoshlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ACOSH; + } + } + + for(d = 1;d < 1000;d += 0.02) { + double q = child_acosh(d); + long double c = acoshlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ACOSH; + } + } + + STOP_ACOSH: + + fprintf(stderr, "acosh : %lf ... 
", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_atanh(d); + long double c = atanhlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ATANH; + } + } + + for(d = -1000;d < 1000;d += 0.02) { + double q = child_atanh(d); + long double c = atanhlfr(d); + double u = countULP(q, c); + max = fmax(max, u); + if (u > 1000) { + fprintf(stderr, "q = %.20g\nc = %.20g\nd = %.20g\nulp = %g\n", q, (double)c, d, (double)ulp(c)); + goto STOP_ATANH; + } + } + + STOP_ATANH: + + fprintf(stderr, "atanh : %lf ... ", max); + + showResult(max < 1); + } +#endif + +#if 0 + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_exp2(d); + long double c = exp2lfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = -1000;d < 1000;d += 0.02) { + double q = child_exp2(d); + long double c = exp2lfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + fprintf(stderr, "exp2 : %lf ... ", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_exp10(d); + long double c = exp10lfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = -300;d < 300;d += 0.01) { + double q = child_exp10(d); + long double c = exp10lfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + fprintf(stderr, "exp10 : %lf ... 
", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = -10;d < 10;d += 0.0002) { + double q = child_expm1(d); + long double c = expm1lfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = -1000;d < 1000;d += 0.02) { + double q = child_expm1(d); + long double c = expm1lfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = 0;d < 300;d += 0.02) { + double d2 = pow(10, -d); + double q = child_expm1(d2); + long double c = expm1lfr(d2); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = 0;d < 300;d += 0.02) { + double d2 = -pow(10, -d); + double q = child_expm1(d2); + long double c = expm1lfr(d2); + double u = countULP(q, c); + max = fmax(max, u); + } + + fprintf(stderr, "expm1 : %lf ... ", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = 0.0001;d < 10;d += 0.0001) { + double q = child_log10(d); + long double c = log10lfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = 0.0001;d < 10000;d += 0.1) { + double q = child_log10(d); + long double c = log10lfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + fprintf(stderr, "log10 : %lf ... ", max); + + showResult(max < 1); + } + + { + double d, max = 0; + + for(d = 0.0001;d < 10;d += 0.0001) { + double q = child_log1p(d); + long double c = log1plfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = 0.0001;d < 10000;d += 0.1) { + double q = child_log1p(d); + long double c = log1plfr(d); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = 0;d < 300;d += 0.02) { + double d2 = pow(10, -d); + double q = child_log1p(d2); + long double c = log1plfr(d2); + double u = countULP(q, c); + max = fmax(max, u); + } + + for(d = 0;d < 300;d += 0.02) { + double d2 = -pow(10, -d); + double q = child_log1p(d2); + long double c = log1plfr(d2); + double u = countULP(q, c); + max = fmax(max, u); + } + + fprintf(stderr, "log1p : %lf ... 
", max); + + showResult(max < 1); + } +#endif +} + +int main(int argc, char **argv) { + char *argv2[argc]; + int i; + + for(i=1;i