diff --git a/Makefile b/Makefile
index 703faec3..97a75d4e 100644
--- a/Makefile
+++ b/Makefile
@@ -77,6 +77,7 @@ distclean : clean
+"$(MAKE)" --directory=include distclean
+"$(MAKE)" --directory=lib distclean
+"$(MAKE)" --directory=src distclean
+ rm -rf debian/libsleef3
# rm -f debian/debhelper-build-stamp
# rm -f debian/files
# rm -f debian/libsleef3.debhelper.log
diff --git a/README.md b/README.md
index 28f76782..acbfcab3 100644
--- a/README.md
+++ b/README.md
@@ -1,117 +1,15 @@
[![Build Status](https://travis-ci.org/shibatch/sleef.svg?branch=master)](https://travis-ci.org/shibatch/sleef)
In this library, functions for evaluating some elementary functions
-are implemented. The algorithm is intended for efficient evaluation
-utilizing SIMD instruction sets like SSE or AVX, but it is also fast
-using usual scalar operations.
-
-The package contains a few directories in which implementation in the
-corresponding languages are contained. You can run "make test" in
-order to test the functions in each directory.
+are implemented. The library also includes DFT subroutines.
The software is distributed under the Boost Software License, Version
1.0. See accompanying file LICENSE.txt or copy at
http://www.boost.org/LICENSE_1_0.txt.
Contributions to this project are accepted under the same license.
-Copyright Naoki Shibata and contributors 2010 - 2017.
-
-
-Main download page : http://shibatch.sourceforge.net/
-
---
-
-Compiling library with Microsoft Visual C++
-
-Below is the instruction for compiling SLEEF with Microsoft Visual
-C++. Only 64bit architecture is supported. Only DLLs are built.
-
-
-1. Install Visual Studio 2015 or later, along with Cygwin
-2. Copy vcvars64.bat to a working directory.
- This file is usually in the following directory.
- C:\Program Files (x86)\MSVCCommunity2015\VC\bin\amd64
-3. Add the following line at the end of vcvars64.bat
- if "%SHELL%"=="/bin/bash" c:\cygwin64\bin\bash.exe
-4. Execute vcvars64.bat within the Cygwin bash shell.
-5. Go to sleef-3.X directory
-6. Run "make -f Makefile.vc"
-
---
-
-
-History
-
-3.0
-* New API is defined
-* Functions for DFT are added
-* sincospi functions are added
-* gencoef now supports single, extended and quad precision in addition to double precision
-* Linux, Windows and Mac OS X are supported
-* GCC, Clang, Intel Compiler, Microsoft Visual C++ are supported
-* The library can be compiled as DLLs
-* Files needed for creating a debian package are now included
-
-
-2.121
-* Renamed LICENSE_1_0.txt to LICENSE.txt
-2.120
-* Relicensed to Boost Software License Version 1.0
-
-2.110
-* The valid range of argument is extended for trig functions
-* Specification of each functions regarding to the domain and accuracy is added
-* A coefficient generation tool is added
-* New testing tools are introduced
-* Following functions returned incorrect values when the argument is very large or small : exp, pow, asinh, acosh
-* SIMD xsin and xcos returned values more than 1 when FMA is enabled
-* Pure C cbrt returned incorrect values when the argument is negative
-* tan_u1 returned values with more than 1 ulp of error on rare occasions
-* Removed support for Java language(because no one seems using this)
-
-2.100 Added support for AVX-512F and Clang Extended Vectors.
-
-2.90 Added ilogbf. All the reported bugs(listed below) are fixed.
-* Log function returned incorrect values when the argument is very small.
-* Signs of returned values were incorrect when the argument is signed zero.
-* Tester incorrectly counted ULP in some cases.
-* ilogb function returned incorrect values in some cases.
-
-2.80 Added support for ARM NEON. Added higher accuracy single
-precision functions : sinf_u1, cosf_u1, sincosf_u1, tanf_u1, asinf_u1,
-acosf_u1, atanf_u1, atan2f_u1, logf_u1, and cbrtf_u1.
-
-2.70 Added higher accuracy functions : sin_u1, cos_u1, sincos_u1,
-tan_u1, asin_u1, acos_u1, atan_u1, atan2_u1, log_u1, and
-cbrt_u1. These functions evaluate the corresponding function with at
-most 1 ulp of error.
-
-2.60 Added the remaining single precision functions : powf, sinhf,
-coshf, tanhf, exp2f, exp10f, log10f, log1pf. Added support for FMA4
-(for AMD Bulldozer). Added more test cases. Fixed minor bugs (which
-degraded accuracy in some rare cases).
-
-2.50 Added support for AVX2. SLEEF now compiles with ICC.
-
-2.40 Fixed incorrect denormal/nonnumber handling in ldexp, ldexpf,
-sinf and cosf. Removed support for Go language.
-
-2.31 Added sincosf.
-
-2.30 Added single precision functions : sinf, cosf, tanf, asinf,
-acosf, atanf, logf, expf, atan2f and cbrtf.
-
-2.20 Added exp2, exp10, expm1, log10, log1p, and cbrt.
-
-2.10 asin() and acos() are back. Added ilogb() and ldexp(). Added
-hyperbolic functions. Eliminated dependency on frexp, ldexp, fabs,
-isnan and isinf.
-
-2.00 All of the algorithm has been updated. Both accuracy and speed
-are improved since version 1.10. Denormal number handling is also
-improved.
-
-1.10 AVX support is added. Accuracy tester is added.
+Copyright Naoki Shibata and contributors 2010 - 2017.
-1.00 Initial release
+Main Page : http://sleef.org/
+GitHub Repo : https://github.com/shibatch/sleef
diff --git a/config.mk b/config.mk
index 844fd07f..957e68cb 100644
--- a/config.mk
+++ b/config.mk
@@ -1,5 +1,5 @@
export SOVERSION=3
-export MINORVERSION=0
+export MINORVERSION=1
export MAXBUTWIDTH=3
diff --git a/debian/changelog b/debian/changelog
index 58993818..307ca863 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,74 +1,5 @@
-libsleef3 (3.1-2) xenial; urgency=medium
-
- * hypot is now ok
- * Added nextafter, frexp and fmod to sleedp.c
-
- -- Naoki Shibata Sleef_float32x4_t_2 Description
+Sleef_float32x4_t_2 is a data type for storing two float32x4_t values,
+which is defined in sleef.h as follows:
+ Vectorized single precision sine function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sinf_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision sine function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sinf_u35. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision cosine function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_cosf_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision cosine function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_cosf_u35. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision combined sine and cosine function with 0.506 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sincos_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision combined sine and cosine function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sincosf_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision combined sine and cosine function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sincosf_u35. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision sine function with 0.506 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sinpif_u05. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision cosine function with 0.506 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_cospif_u05. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision combined sine and cosine function with 0.506 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sincospif_u05. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision combined sine and cosine function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sincospif_u35. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision tangent function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_tanf_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision tangent function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_tanf_u35. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision power function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_powf_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision natural logarithmic function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_logf_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision natural logarithmic function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_logf_u35. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision base-10 logarithmic function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_log10f_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision logarithm of one plus argument with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_log1pf_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision base-e exponential function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_expf_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision base-2 exponential function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_exp2f_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision base-10 exponential function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_exp10f_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision base-e exponential function minus 1 with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_expm1f_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision square root function with 0.5001 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sqrtf_u05. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision square root function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sqrtf_u35. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision cubic root function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_cbrtf_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision cubic root function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_cbrtf_u35. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision 2D Euclidean distance function with 0.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_hypotf_u05. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision 2D Euclidean distance function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_hypotf_u35. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision arc sine function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_asinf_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision arc sine function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_asinf_u35. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision arc cosine function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_acosf_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision arc cosine function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_acosf_u35. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision arc tangent function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_atanf_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision arc tangent function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_atanf_u35. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision arc tangent function of two variables with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_atan2f_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision arc tangent function of two variables with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_atan2f_u35. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision hyperbolic sine function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sinhf_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision hyperbolic cosine function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_coshf_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision hyperbolic tangent function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_tanhf_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision inverse hyperbolic sine function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_asinhf_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision inverse hyperbolic cosine function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_acoshf_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision inverse hyperbolic tangent function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_atanhf_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision error function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_erff_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision complementary error function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_erfcf_u15. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision gamma function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_tgammaf_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision log gamma function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_lgammaf_u10. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision function for rounding to integer towards zero Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_truncf. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision function for rounding to integer towards negative infinity Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_floorf. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision function for rounding to integer towards positive infinity Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_ceilf. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision function for rounding to nearest integer Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_roundf. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision function for rounding to nearest integer Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_rintf. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision function for fused multiply-accumulation Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_fmaf. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision FP remainder Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_fmodf. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision function for obtaining fractional component of an FP number Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_frfrexpf. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision signed integral and fractional values Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_modff. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision function for calculating the absolute value Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_fabsf. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision function for copying signs Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_copysignf. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision function for determining maximum of two values Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_fmaxf. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision function for determining minimum of two values Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_fminf. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision function to calculate positive difference of two values Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_fdimf. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Vectorized single precision function for obtaining the next representable FP value Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_nextafterf. This function may be less accurate than the scalar function since AArch32 NEON is not IEEE 754-compliant.
+ Sleef_float32x4_t_2 Description
+Sleef_float32x4_t_2 is a data type for storing two float32x4_t values,
+which is defined in sleef.h as follows:
+ Sleef_float64x2_t_2 Description
+Sleef_float64x2_t_2 is a data type for storing two float64x2_t values,
+which is defined in sleef.h as follows:
+ Vectorized double precision sine function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sin_u10 with the same accuracy specification.
+ Vectorized single precision sine function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sinf_u10 with the same accuracy specification.
+ Vectorized double precision sine function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sin_u35 with the same accuracy specification.
+ Vectorized single precision sine function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sinf_u35 with the same accuracy specification.
+ Vectorized double precision cosine function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_cos_u10 with the same accuracy specification.
+ Vectorized single precision cosine function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_cosf_u10 with the same accuracy specification.
+ Vectorized double precision cosine function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_cos_u35 with the same accuracy specification.
+ Vectorized single precision cosine function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_cosf_u35 with the same accuracy specification.
+ Vectorized double precision combined sine and cosine function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sincos_u10 with the same accuracy specification.
+ Vectorized single precision combined sine and cosine function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sincosf_u10 with the same accuracy specification.
+ Vectorized double precision combined sine and cosine function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sincos_u35 with the same accuracy specification.
+ Vectorized single precision combined sine and cosine function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sincosf_u35 with the same accuracy specification.
+ Vectorized double precision sine function with 0.506 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sinpi_u05 with the same accuracy specification.
+ Vectorized single precision sine function with 0.506 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sinpif_u05 with the same accuracy specification.
+ Vectorized double precision cosine function with 0.506 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_cospi_u05 with the same accuracy specification.
+ Vectorized single precision cosine function with 0.506 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_cospif_u05 with the same accuracy specification.
+ Vectorized double precision combined sine and cosine function with 0.506 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sincospi_u05 with the same accuracy specification.
+ Vectorized single precision combined sine and cosine function with 0.506 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sincospif_u05 with the same accuracy specification.
+ Vectorized double precision combined sine and cosine function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sincospi_u35 with the same accuracy specification.
+ Vectorized single precision combined sine and cosine function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sincospif_u35 with the same accuracy specification.
+ Vectorized double precision tangent function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_tan_u10 with the same accuracy specification.
+ Vectorized single precision tangent function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_tanf_u10 with the same accuracy specification.
+ Vectorized double precision tangent function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_tan_u35 with the same accuracy specification.
+ Vectorized single precision tangent function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_tanf_u35 with the same accuracy specification.
+ Vectorized double precision power function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_pow_u10 with the same accuracy specification.
+ Vectorized single precision power function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_powf_u10 with the same accuracy specification.
+ Vectorized double precision natural logarithmic function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_log_u10 with the same accuracy specification.
+ Vectorized single precision natural logarithmic function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_logf_u10 with the same accuracy specification.
+ Vectorized double precision natural logarithmic function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_log_u35 with the same accuracy specification.
+ Vectorized single precision natural logarithmic function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_logf_u35 with the same accuracy specification.
+ Vectorized double precision base-10 logarithmic function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_log10_u10 with the same accuracy specification.
+ Vectorized single precision base-10 logarithmic function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_log10f_u10 with the same accuracy specification.
+ Vectorized double precision logarithm of one plus argument with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_log1p_u10 with the same accuracy specification.
+ Vectorized single precision logarithm of one plus argument with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_log1pf_u10 with the same accuracy specification.
+ Vectorized double precision base-e exponential function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_exp_u10 with the same accuracy specification.
+ Vectorized single precision base-e exponential function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_expf_u10 with the same accuracy specification.
+ Vectorized double precision base-2 exponential function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_exp2_u10 with the same accuracy specification.
+ Vectorized single precision base-2 exponential function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_exp2f_u10 with the same accuracy specification.
+ Vectorized double precision base-10 exponential function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_exp10_u10 with the same accuracy specification.
+ Vectorized single precision base-10 exponential function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_exp10f_u10 with the same accuracy specification.
+ Vectorized double precision base-e exponential function minus 1 with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_expm1_u10 with the same accuracy specification.
+ Vectorized single precision base-e exponential function minus 1 with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_expm1f_u10 with the same accuracy specification.
+ Vectorized double precision square root function with 0.5001 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sqrt_u05 with the same accuracy specification.
+ Vectorized single precision square root function with 0.5001 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sqrtf_u05 with the same accuracy specification.
+ Vectorized double precision square root function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sqrt_u35 with the same accuracy specification.
+ Vectorized single precision square root function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sqrtf_u35 with the same accuracy specification.
+ Vectorized double precision cubic root function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_cbrt_u10 with the same accuracy specification.
+ Vectorized single precision cubic root function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_cbrtf_u10 with the same accuracy specification.
+ Vectorized double precision cubic root function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_cbrt_u35 with the same accuracy specification.
+ Vectorized single precision cubic root function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_cbrtf_u35 with the same accuracy specification.
+ Vectorized double precision 2D Euclidean distance function with 0.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_hypot_u05 with the same accuracy specification.
+ Vectorized single precision 2D Euclidean distance function with 0.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_hypotf_u05 with the same accuracy specification.
+ Vectorized double precision 2D Euclidean distance function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_hypot_u35 with the same accuracy specification.
+ Vectorized single precision 2D Euclidean distance function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_hypotf_u35 with the same accuracy specification.
+ Vectorized double precision arc sine function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_asin_u10 with the same accuracy specification.
+ Vectorized single precision arc sine function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_asinf_u10 with the same accuracy specification.
+ Vectorized double precision arc sine function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_asin_u35 with the same accuracy specification.
+ Vectorized single precision arc sine function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_asinf_u35 with the same accuracy specification.
+ Vectorized double precision arc cosine function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_acos_u10 with the same accuracy specification.
+ Vectorized single precision arc cosine function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_acosf_u10 with the same accuracy specification.
+ Vectorized double precision arc cosine function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_acos_u35 with the same accuracy specification.
+ Vectorized single precision arc cosine function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_acosf_u35 with the same accuracy specification.
+ Vectorized double precision arc tangent function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_atan_u10 with the same accuracy specification.
+ Vectorized single precision arc tangent function with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_atanf_u10 with the same accuracy specification.
+ Vectorized double precision arc tangent function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_atan_u35 with the same accuracy specification.
+ Vectorized single precision arc tangent function with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_atanf_u35 with the same accuracy specification.
+ Vectorized double precision arc tangent function of two variables with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_atan2_u10 with the same accuracy specification.
+ Vectorized single precision arc tangent function of two variables with 1.0 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_atan2f_u10 with the same accuracy specification.
+ Vectorized double precision arc tangent function of two variables with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_atan2_u35 with the same accuracy specification.
+ Vectorized single precision arc tangent function of two variables with 3.5 ULP error bound Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_atan2f_u35 with the same accuracy specification.
+ Vectorized double precision hyperbolic sine function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sinh_u10 with the same accuracy specification.
+ Vectorized single precision hyperbolic sine function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_sinhf_u10 with the same accuracy specification.
+ Vectorized double precision hyperbolic cosine function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_cosh_u10 with the same accuracy specification.
+ Vectorized single precision hyperbolic cosine function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_coshf_u10 with the same accuracy specification.
+ Vectorized double precision hyperbolic tangent function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_tanh_u10 with the same accuracy specification.
+ Vectorized single precision hyperbolic tangent function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_tanhf_u10 with the same accuracy specification.
+ Vectorized double precision inverse hyperbolic sine function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_asinh_u10 with the same accuracy specification.
+ Vectorized single precision inverse hyperbolic sine function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_asinhf_u10 with the same accuracy specification.
+ Vectorized double precision inverse hyperbolic cosine function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_acosh_u10 with the same accuracy specification.
+ Vectorized single precision inverse hyperbolic cosine function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_acoshf_u10 with the same accuracy specification.
+ Vectorized double precision inverse hyperbolic tangent function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_atanh_u10 with the same accuracy specification.
+ Vectorized single precision inverse hyperbolic tangent function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_atanhf_u10 with the same accuracy specification.
+ Vectorized double precision error function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_erf_u10 with the same accuracy specification.
+ Vectorized single precision error function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_erff_u10 with the same accuracy specification.
+ Vectorized double precision complementary error function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_erfc_u15 with the same accuracy specification.
+ Vectorized single precision complementary error function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_erfcf_u15 with the same accuracy specification.
+ Vectorized double precision gamma function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_tgamma_u10 with the same accuracy specification.
+ Vectorized single precision gamma function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_tgammaf_u10 with the same accuracy specification.
+ Vectorized double precision log gamma function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_lgamma_u10 with the same accuracy specification.
+ Vectorized single precision log gamma function Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_lgammaf_u10 with the same accuracy specification.
+ Vectorized double precision function for rounding to integer towards zero Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_trunc with the same accuracy specification.
+ Vectorized single precision function for rounding to integer towards zero Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_truncf with the same accuracy specification.
+ Vectorized double precision function for rounding to integer towards negative infinity Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_floor with the same accuracy specification.
+ Vectorized single precision function for rounding to integer towards negative infinity Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_floorf with the same accuracy specification.
+ Vectorized double precision function for rounding to integer towards positive infinity Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_ceil with the same accuracy specification.
+ Vectorized single precision function for rounding to integer towards positive infinity Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_ceilf with the same accuracy specification.
+ Vectorized double precision function for rounding to nearest integer Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_round with the same accuracy specification.
+ Vectorized single precision function for rounding to nearest integer Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_roundf with the same accuracy specification.
+ Vectorized double precision function for rounding to nearest integer Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_rint with the same accuracy specification.
+ Vectorized single precision function for rounding to nearest integer Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_rintf with the same accuracy specification.
+ Vectorized double precision function for fused multiply-accumulation Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_fma with the same accuracy specification.
+ Vectorized single precision function for fused multiply-accumulation Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_fmaf with the same accuracy specification.
+ Vectorized double precision FP remainder Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_fmod with the same accuracy specification.
+ Vectorized single precision FP remainder Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_fmodf with the same accuracy specification.
+ Vectorized double precision function for multiplying by integral power of 2 Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_ldexp with the same accuracy specification.
+ Vectorized double precision function for obtaining fractional component of an FP number Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_frfrexp with the same accuracy specification.
+ Vectorized single precision function for obtaining fractional component of an FP number Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_frfrexpf with the same accuracy specification.
+ Vectorized double precision function for obtaining integral component of an FP number Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_expfrexp with the same accuracy specification.
+ Vectorized double precision function for getting integer exponent Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_ilogb with the same accuracy specification.
+ Vectorized double precision signed integral and fractional values Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_modf with the same accuracy specification.
+ Vectorized single precision signed integral and fractional values Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_modff with the same accuracy specification.
+ Vectorized double precision function for calculating the absolute value Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_fabs with the same accuracy specification.
+ Vectorized single precision function for calculating the absolute value Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_fabsf with the same accuracy specification.
+ Vectorized double precision function for copying signs Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_copysign with the same accuracy specification.
+ Vectorized single precision function for copying signs Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_copysignf with the same accuracy specification.
+ Vectorized double precision function for determining maximum of two values Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_fmax with the same accuracy specification.
+ Vectorized single precision function for determining maximum of two values Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_fmaxf with the same accuracy specification.
+ Vectorized double precision function for determining minimum of two values Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_fmin with the same accuracy specification.
+ Vectorized single precision function for determining minimum of two values Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_fminf with the same accuracy specification.
+ Vectorized double precision function to calculate positive difference of two values Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_fdim with the same accuracy specification.
+ Vectorized single precision function to calculate positive difference of two values Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_fdimf with the same accuracy specification.
+ Vectorized double precision function for obtaining the next representable FP value Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_nextafter with the same accuracy specification.
+ Vectorized single precision function for obtaining the next representable FP value Synopsis
+#include <sleef.h> Description
+This is the vectorized function of Sleef_nextafterf with the same accuracy specification.
+
+ Fig. 7.1 shows simplified code of our dispatcher. There is only
+ one exported function mainFunc. When
+ mainFunc is called for the first
+ time, dispatcherMain is called internally,
+ since funcPtr is initialized to the pointer to
+ dispatcherMain (line 14). It then detects if the
+ CPU supports SSE 4.1 (line 7), and
+ rewrites funcPtr to a pointer to the function
+ that utilizes SSE 4.1 or SSE 2, depending on the result of CPU
+ feature detection (line 10). When
+ mainFunc is called for the second time, it does
+ not execute the
+ dispatcherMain. It just executes the function
+ pointed to by the pointer stored in funcPtr during
+ the execution of
+ dispatcherMain.
+
+ There are a few advantages in our dispatcher. The first advantage is
+ that it does not require any compiler-specific extension. The second
+ advantage is simplicity. There are only 18 lines of simple
+ code. Since the dispatchers are completely separated for each
+ function, there is not much room for bugs to get in.
+
+ The third advantage is low overhead. You might think that the
+ overhead is one function call including execution of prologue and
+ epilogue. However, since modern compilers eliminate redundant
+ execution of the prologue, epilogue and return instruction, the
+ actual overhead is just one jmp instruction. This is very fast since
+ it is not conditional.
+
+ The fourth advantage is thread safety. There is only one variable
+ shared among threads, which is funcPtr. There are
+ only two possible values for this pointer variable. The first value
+ is the pointer to the dispatcherMain, and the
+ second value is the pointer to either funcSSE2
+ or funcSSE4, depending on the availability of
+ extensions. Once funcPtr is substituted with the
+ pointer to funcSSE2
+ or funcSSE4, it will not be changed in the
+ future. It is obvious that the code works in all the cases.
+
+ Fig. 7.1: Simplified code of our dispatcher
+
+ ULP stands for "unit in the last place", which is sometimes used for
+ measuring accuracy of calculations. 1 ULP is basically the distance
+ between the two closest floating point numbers, which depends on the
+ exponent of the FP number. The accuracy of calculations by reputable
+ math libraries is usually between 0.5 and 1 ULP. Here, the accuracy
+ means the largest error of calculation, which only happens in the
+ worst case. SLEEF math library provides multiple accuracy choices
+ for some math functions. Many functions have 3.5-ULP and 1-ULP
+ versions, and 3.5-ULP versions are significantly faster than 1-ULP
+ versions. If you care more about execution speed than accuracy, it
+ is advised to use the 3.5-ULP versions along with -ffast-math or
+ "unsafe math optimization" options for the compiler.
+
+ In IEEE 754 standard, underflow does not happen abruptly when the
+ exponent becomes zero. Instead, denormal numbers are produced which
+ have less precision, and this is sometimes called gradual
+ underflow. On some implementations which are not IEEE-754 conformant,
+ flush-to-zero mode is used since it is easier to implement. In
+ flush-to-zero mode, numbers smaller than the smallest normalized
+ number cannot be represented, and they are replaced with zero. Because
+ of this, the accuracy of calculation may be influenced in some
+ cases. The smallest normalized number can be referred to with
+ DBL_MIN for double precision, and FLT_MIN for single precision. The
+ naming of these macros is a little bit confusing because DBL_MIN is
+ not the smallest double precision number.
+
+ The sincospi series of functions evaluates sin(
+ πa ) and cos(
+ πa ) simultaneously. These functions are
+ added to SLEEF as of version 3.0. There are a few reasons that I
+ added these functions.
+
+ C standards include specifications for functions that evaluate
+ trigonometric functions. In order to do calculations for evaluating
+ these functions, reduction of an argument is required. This involves
+ a multiple precision multiplication with π,
+ which requires many operations of addition and multiplication. This
+ is slow especially if accurate evaluation is required. By designing
+ the function in a way that the argument is pre-multiplied
+ by π, this reduction can be eliminated. This
+ leads to faster and more accurate evaluation.
+
+ The second reason is that sincospi functions are handy for
+ implementing an FFT library. FFT libraries need to evaluate
+ trigonometric functions for generating twiddle factors that are used in
+ the butterfly operations. Since the butterfly operations are
+ repeatedly applied, the error in twiddle factors accumulates. Thus, we
+ want to make the error in twiddle factors as small as possible. In an
+ FFT of power-of-two size, twiddle factors are
+ sin( πm /
+ 2^n ) where m
+ and n are integers. If we just use the usual
+ trigonometric functions defined in the C standards with the
+ same precision as that used for butterfly operations, we already
+ have error when calculating arguments, since
+ πm / 2^n cannot
+ be represented as a floating point value without error. On the
+ other hand, if we use sincospi function, the argument can be
+ accurately represented by a radix 2 FP number. Thus, we can
+ calculate twiddle factors with better accuracy.
+
+ The third reason is that sinpi is needed internally to implement
+ gamma functions.
+
+ It is a soup ladle.
+
+
+
+
+
+ These graphs show comparison of the execution time between
+ SLEEF-3.1 and Intel
+ SVML.
+
+ The execution time of each function is measured by executing each
+ function 10^8 times and taking the average time. Each time a
+ function is executed, a uniformly distributed random number is set
+ to each element of the argument vector (each element is set to a
+ different value). The ranges of the random number for each
+ function are shown below. Argument vectors are generated before
+ the measurement, and the time to generate random argument vectors
+ is not included in the execution time.
+
+ The accuracy of SVML functions can be chosen by compiler options,
+ not the function names. "-fimf-max-error=1.0" option is specified
+ to icc to obtain the 1-ulp-accuracy results, and
+ "-fimf-max-error=5.0" option is used for the 5-ulp-accuracy
+ results.
+
+ Those results are measured on a PC with Intel Core i7-6700 CPU @
+ 3.40GHz with Turbo Boost turned off. The CPU should be always
+ running at 3.4GHz during the measurement.
+
+ Click graphs to magnify.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ You need to install libmpfr and OpenMP (libmpfr is only required to
+ build the tester, and it is not linked to the library). Change
+ directory to sleef-3.X directory and run make. The built headers
+ and libraries will be located under include and lib directories.
+
+ You can run make install using sudo command to install the library
+ and header. Those files are installed under /usr/lib and
+ /usr/include. You can run make uninstall to uninstall those files.
+
+ Fig. 2.1: Commands for compiling SLEEF
+
+Below are the instructions for compiling SLEEF with Microsoft Visual C++
+2015.
+
+ Now, let's try compiling the source code shown in Fig. 2.2.
+
+ Fig. 2.2: Source code for testing
+
+ Fig. 2.3 shows typical commands for compiling and executing the hello
+ code on Linux computers.
+
+ Fig. 2.3: Commands for compiling and executing hellox86.c
+
+ You may need to set LD_LIBRARY_PATH environment variable
+ appropriately. If you are trying to execute the program on Mac OSX
+ or Windows, try copying the DLLs to the current directory.
+
+
I now explain how to use this DFT library by referring to an example
source code shown below.
This source code is
@@ -113,8 +118,13 @@
+ Fig. 4.1: Test code for DFT subroutines
+
+
As shown in the first line, you can compile the source code with the
following command, after you install the library.
+
This program takes one integer argument n. It executes
forward complex transform with size 2n using a
naive transform and the library. If the two results match, it prints
OK.
+
For the first execution, this program takes a few seconds to
finish. This is because the library measures computation speed with
- many different configurations to find the best exectuion plan. The
+ many different configurations to find the best execution plan. The
best plan is saved to "plan.txt", as specified in line 28. Later
executions will finish instantly as the library reads the plan from
this file. Instead of specifying the file name in the program, the
@@ -142,7 +152,7 @@
specified at line 30.
+
This library executes transforms using the most suitable SIMD
instructions available on the computer, in addition to
multi-threading. In order to make the computation efficient, the
@@ -157,21 +167,21 @@
memory region yourself, and pass the pointer to the library.
- The real and imaginary parts of the kth number
- are stored in (2k)-th and
- (2k+1)-th elements of the input and output array,
+
+ The real and imaginary parts of the kth number
+ are stored in (2k)-th and
+ (2k+1)-th elements of the input and output array,
respectively. At line 54, the transform is executed by the
library. You can specify the same array as the input and output.
+
Under src/dft-tester directory, there are other examples showing how
to execute transforms in a way that you get equivalent results to
other libraries.
SLEEF Documentation - Math library reference
+
+Table of contents
+
+
+
+
+Data types for AArch32 architecture
+
+typedef struct {
+ float32x4_t x, y;
+} Sleef_float32x4_t_2;
+
+
+
+Trigonometric Functions
+
+
+
+float32x4_t Sleef_sinf4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_sinf4_u35neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_cosf4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_cosf4_u35neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+Sleef_float64x2_t_2 Sleef_sincosd2_u10neon(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+Sleef_float32x4_t_2 Sleef_sincosf4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+Sleef_float32x4_t_2 Sleef_sincosf4_u35neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_sinpif4_u05neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_cospif4_u05neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+Sleef_float32x4_t_2 Sleef_sincospif4_u05neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+Sleef_float32x4_t_2 Sleef_sincospif4_u35neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_tanf4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_tanf4_u35neon(float32x4_t a);
+
+Link with -lsleef.
+Power, exponential, and logarithmic function
+
+
+
+float32x4_t Sleef_powf4_u10neon(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_logf4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_logf4_u35neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_log10f4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_log1pf4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_expf4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_exp2f4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_exp10f4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_expm1f4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_sqrtf4_u05neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_sqrtf4_u35neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_cbrtf4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_cbrtf4_u35neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_hypotf4_u05neon(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_hypotf4_u35neon(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+Inverse Trigonometric Functions
+
+
+
+float32x4_t Sleef_asinf4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_asinf4_u35neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_acosf4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_acosf4_u35neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_atanf4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_atanf4_u35neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_atan2f4_u10neon(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_atan2f4_u35neon(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+Hyperbolic function and inverse hyperbolic function
+
+
+
+float32x4_t Sleef_sinhf4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_coshf4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_tanhf4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_asinhf4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_acoshf4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_atanhf4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+Error and gamma function
+
+
+
+float32x4_t Sleef_erff4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_erfcf4_u15neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_tgammaf4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_lgammaf4_u10neon(float32x4_t a);
+
+Link with -lsleef.
+Nearest integer function
+
+
+
+float32x4_t Sleef_truncf4_neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_floorf4_neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_ceilf4_neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_roundf4_neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_rintf4_neon(float32x4_t a);
+
+Link with -lsleef.
+Other function
+
+
+
+float32x4_t Sleef_fmaf4_neon(float32x4_t a, float32x4_t b, float32x4_t c);
+
+Link with -lsleef.
+
+
+
+
+float32x4_t Sleef_fmodf4_neon(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_frfrexpf4_neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+
+Sleef_float32x4_t_2 Sleef_modff4_neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_fabsf4_neon(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_copysignf4_neon(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_fmaxf4_neon(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_fminf4_neon(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_fdimf4_neon(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_nextafterf4_neon(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+SLEEF Documentation - Math library reference
+
+Table of contents
+
+
+
+
+
+
+
+
+ Data types for AArch64 architecture
+
+typedef struct {
+ float32x4_t x, y;
+} Sleef_float32x4_t_2;
+
+
+
+
+typedef struct {
+ float64x2_t x, y;
+} Sleef_float64x2_t_2;
+
+
+Trigonometric Functions
+
+
+
+float64x2_t Sleef_sind2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_sinf4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_sind2_u35advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_sinf4_u35advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_cosd2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_cosf4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_cosd2_u35advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_cosf4_u35advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+Sleef_float64x2_t_2 Sleef_sincosd2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+Sleef_float32x4_t_2 Sleef_sincosf4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+Sleef_float64x2_t_2 Sleef_sincosd2_u35advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+Sleef_float32x4_t_2 Sleef_sincosf4_u35advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_sinpid2_u05advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_sinpif4_u05advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_cospid2_u05advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_cospif4_u05advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+Sleef_float64x2_t_2 Sleef_sincospid2_u05advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+Sleef_float32x4_t_2 Sleef_sincospif4_u05advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+Sleef_float64x2_t_2 Sleef_sincospid2_u35advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+Sleef_float32x4_t_2 Sleef_sincospif4_u35advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_tand2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_tanf4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_tand2_u35advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_tanf4_u35advsimd(float32x4_t a);
+
+Link with -lsleef.
+Power, exponential, and logarithmic function
+
+
+
+float64x2_t Sleef_powd2_u10advsimd(float64x2_t a, float64x2_t b);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_powf4_u10advsimd(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_logd2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_logf4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_logd2_u35advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_logf4_u35advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_log10d2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_log10f4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_log1pd2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_log1pf4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_expd2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_expf4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_exp2d2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_exp2f4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_exp10d2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_exp10f4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_expm1d2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_expm1f4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_sqrtd2_u05advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_sqrtf4_u05advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_sqrtd2_u35advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_sqrtf4_u35advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_cbrtd2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_cbrtf4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_cbrtd2_u35advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_cbrtf4_u35advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_hypotd2_u05advsimd(float64x2_t a, float64x2_t b);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_hypotf4_u05advsimd(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_hypotd2_u35advsimd(float64x2_t a, float64x2_t b);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_hypotf4_u35advsimd(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+Inverse Trigonometric Functions
+
+
+
+float64x2_t Sleef_asind2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_asinf4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_asind2_u35advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_asinf4_u35advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_acosd2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_acosf4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_acosd2_u35advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_acosf4_u35advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_atand2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_atanf4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_atand2_u35advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_atanf4_u35advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_atan2d2_u10advsimd(float64x2_t a, float64x2_t b);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_atan2f4_u10advsimd(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_atan2d2_u35advsimd(float64x2_t a, float64x2_t b);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_atan2f4_u35advsimd(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+Hyperbolic function and inverse hyperbolic function
+
+
+
+float64x2_t Sleef_sinhd2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_sinhf4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_coshd2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_coshf4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_tanhd2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_tanhf4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_asinhd2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_asinhf4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_acoshd2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_acoshf4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_atanhd2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_atanhf4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+Error and gamma function
+
+
+
+float64x2_t Sleef_erfd2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_erff4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_erfcd2_u15advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_erfcf4_u15advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_tgammad2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_tgammaf4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_lgammad2_u10advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_lgammaf4_u10advsimd(float32x4_t a);
+
+Link with -lsleef.
+Nearest integer function
+
+
+
+float64x2_t Sleef_truncd2_advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_truncf4_advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_floord2_advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_floorf4_advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_ceild2_advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_ceilf4_advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_roundd2_advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_roundf4_advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_rintd2_advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_rintf4_advsimd(float32x4_t a);
+
+Link with -lsleef.
+Other function
+
+
+
+float64x2_t Sleef_fmad2_advsimd(float64x2_t a, float64x2_t b, float64x2_t c);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_fmaf4_advsimd(float32x4_t a, float32x4_t b, float32x4_t c);
+
+Link with -lsleef.
+
+
+
+
+float64x2_t Sleef_fmodd2_advsimd(float64x2_t a, float64x2_t b);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_fmodf4_advsimd(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_ldexpd2_advsimd(float64x2_t a, int32x2_t b);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_frfrexpd2_advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_frfrexpf4_advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+int32x2_t Sleef_expfrexpd2_advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+
+int32x2_t Sleef_ilogbd2_advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+Sleef_float64x2_t_2 Sleef_modfd2_advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+Sleef_float32x4_t_2 Sleef_modff4_advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_fabsd2_advsimd(float64x2_t a);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_fabsf4_advsimd(float32x4_t a);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_copysignd2_advsimd(float64x2_t a, float64x2_t b);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_copysignf4_advsimd(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_fmaxd2_advsimd(float64x2_t a, float64x2_t b);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_fmaxf4_advsimd(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_fmind2_advsimd(float64x2_t a, float64x2_t b);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_fminf4_advsimd(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_fdimd2_advsimd(float64x2_t a, float64x2_t b);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_fdimf4_advsimd(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+
+
+
+float64x2_t Sleef_nextafterd2_advsimd(float64x2_t a, float64x2_t b);
+
+Link with -lsleef.
+
+
+
+float32x4_t Sleef_nextafterf4_advsimd(float32x4_t a, float32x4_t b);
+
+Link with -lsleef.
+SLEEF Documentation - Additional Notes
+
+Table of contents
+
+
+
+
+Additional Notes
+
+How the dispatcher works
+
+
+
+static double (*funcPtr)(double arg);
+
+
static double dispatcherMain(double arg) {
+ double (*p)(double arg) = funcSSE2;
+
+
#if the compiler supports SSE4.1
+ if (SSE4.1 is available on the CPU) p = funcSSE4;
+#endif
+
+
funcPtr = p;
+ return (*funcPtr)(arg);
+}
+
+
static double (*funcPtr)(double arg) = dispatcherMain;
+
+
double mainFunc(double arg) {
+ return (*funcPtr)(arg);
+}
+ULP, gradual underflow and flush-to-zero mode
+
+About sincospi
+
+About the logo
+
+
+
+
+ Fig. 7.2: SLEEF logo
+SLEEF Documentation - Benchmark Results
+
+Table of contents
+
+
+
+
+ Benchmark results
+
+
+
+
+
+
+
+
+
+ Fig. 6.1: Execution time of double precision trigonometric functions
+
+ Fig. 6.2: Execution time of single precision trigonometric functions
+
+ Fig. 6.3: Execution time of double precision log, exp, pow and inverse trigonometric functions
+
+ Fig. 6.4: Execution time of single precision log, exp, pow and inverse trigonometric functions
+ SLEEF Documentation - Compiling and installing the library
+
+Table of contents
+
+
+
+
+Compiling and installing the library
+
+Compiling and installing library on Linux
+
+
+
+$ sudo apt-get install libmpfr-dev libgomp1-dev gcc
+$ cd sleef-3.X
+$ make
+$ sudo make install
+Compiling library with Microsoft Visual C++
+
+
+
+
+C:\Program Files (x86)\MSVCCommunity2015\VC\bin\amd64
if "%SHELL%"=="/bin/bash" c:\cygwin64\bin\bash.exe
Compiling and running "Hello SLEEF!"
+
+
+
+#include <stdio.h>
+#include <x86intrin.h>
+#include <sleef.h>
+
+
int main(int argc, char **argv) {
+ double a[] = {2, 10};
+ double b[] = {3, 20};
+
+
__m128d va, vb, vc;
+
+ va = _mm_loadu_pd(a);
+ vb = _mm_loadu_pd(b);
+
+
vc = Sleef_powd2_u10(va, vb);
+
+
double c[2];
+
+
_mm_storeu_pd(c, vc);
+
+
printf("pow(%g, %g) = %g\n", a[0], b[0], c[0]);
+ printf("pow(%g, %g) = %g\n", a[1], b[1], c[1]);
+}
+
+
+$ gcc hellox86.c -o hellox86 -lsleef
+$ ./hellox86
+pow(2, 3) = 8
+pow(10, 20) = 1e+20
+SLEEF API Reference - DFT library reference
+SLEEF Documentation - DFT library reference
Table of contents
-
+
Tutorial
- exit(success);
}
+gcc tutorial.c -lsleef -lsleefdft -lm
-Compatibility with other libraries
+
Function reference
@@ -184,15 +194,15 @@
#include <stdlib.h>
#include <sleef.h>
-void * Sleef_malloc(size_t z);
+void * Sleef_malloc(size_t z);
Link with -lsleef.
Description
-- Sleef_malloc allocates z bytes of aligned +
+ Sleef_malloc allocates z bytes of aligned
memory region and returns a pointer to that region. The returned
pointer points to an address that can be accessed by all SIMD load and
store instructions available on that computer. Memory regions
@@ -210,15 +220,15 @@ Link with -lsleef.
#include <stdlib.h>
#include <sleef.h>
-void Sleef_free(void *ptr);
+void Sleef_free(void *ptr);
Link with -lsleef.
Description
-- A memory region pointed by ptr that is allocated +
+ A memory region pointed to by ptr that was allocated by Sleef_malloc can be freed with Sleef_free.
@@ -233,24 +243,29 @@ with Sleef_free. #include <stdint.h>Description
-+
File name for storing execution plan can be specified by this -function. If NULL is specified as path, the file name is read +function. If NULL is specified as path, the file name is read from SLEEFDFTPLAN environment variable. A string for identifying system micro architecture can be also given. The library will automatically detect the microarchitecture if NULL is given -as arch. Management options for the plan file can be specified -by the mode parameter, as shown below. +as arch. Management options for the plan file can be specified +by the mode parameter, as shown below.
-Table 4.2: Mode flags for SleefFT_setPlanFilePath | +|||||
|