-
Notifications
You must be signed in to change notification settings - Fork 131
Examples of auto vectorizable codes
Naoki Shibata edited this page May 8, 2020
·
37 revisions
These codes are provided for checking how compilers vecotorize a source code. These codes are not meant for practical use. Try compiling them at Compiler Explorer. All source codes in this page are in public domain unless otherwise stated.
Suggested compiler options :
gcc: -mavx2 -O3 -ffast-math -mveclibabi=svml
gcc: -mavx2 -O3 -fno-math-errno -fno-trapping-math
clang: -mavx2 -O3 -fno-honor-nans -fno-math-errno -fno-trapping-math
icc: -xCORE-AVX2 -ffast-math -O3
Just a simple example.
#include <math.h>
#define N 256
__attribute__ ((__aligned__(64))) float in[N][N], out[N][N];
static float srgb2linear_pix(float c) {
float r = pow((c + 0.055) / (1 + 0.055), 2.4);
return c < 0.04045 ? (c * (1.0 / 12.92)) : r;
}
void srgb2linear(void) {
for (int y = 0; y < N; y++) {
for (int x = 0; x < N; x++) {
out[y][x] = srgb2linear_pix(in[y][x]);
}
}
}
More complicated than the previous one. Conditional selection from two values.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define N 256
__attribute__ ((__aligned__(64))) double in[4][N], out[3][N];
typedef struct { double x, y, z; } double3;
static double3 execute(double a, double b, double c, double d) {
double s1 = b*(1/a), s2 = c*(1/a), s3 = d*(1/a);
double p = s2 - 1/3.0*s1*s1;
double q = s3 - 1/3.0*s1*s2 + 2/27.0*s1*s1*s1;
double z = q*q + 4/27.0*p*p*p;
double w = cbrt((-q + sqrt(z)) * 0.5) + cbrt((-q - sqrt(z)) * 0.5) - 1/3.0*s1;
double th = acos(0.5*(3.0/p)*q*sqrt(-(3.0/p)));
double w0 = 2 * sqrt(-1.0/3*p) * cos(1.0/3.0*th - 2*M_PI*0/3)-1/3.0*s1;
double w1 = 2 * sqrt(-1.0/3*p) * cos(1.0/3.0*th - 2*M_PI*1/3)-1/3.0*s1;
double w2 = 2 * sqrt(-1.0/3*p) * cos(1.0/3.0*th - 2*M_PI*2/3)-1/3.0*s1;
double3 ret = { NAN, NAN, NAN };
if (z >= 0) {
ret.x = w;
} else {
ret.x = w0; ret.y = w1; ret.z = w2;
}
return ret;
}
void cardanoN(void) {
for (int i = 0; i < N; i++) {
double3 r = execute(in[3][i], in[2][i], in[1][i], in[0][i]);
out[0][i] = r.x; out[1][i] = r.y; out[2][i] = r.z;
}
}
int main(int argc, char **argv) {
double r[N][3];
for(int i=0;i<N;i++) {
for(int j=0;j<3;j++)
r[i][j] = (2.0 * rand() / RAND_MAX - 1) * 10;
in[3][i] = 1;
in[2][i] = - r[i][0] - r[i][1] - r[i][2];
in[1][i] = + r[i][0] * r[i][1] + r[i][1] * r[i][2] + r[i][0] * r[i][2];
in[0][i] = - r[i][0] * r[i][1] * r[i][2];
}
cardanoN();
for(int i=0;i<N;i++)
printf("%g, %g, %g : %g, %g, %g\n", out[0][i], out[1][i], out[2][i], r[i][0], r[i][1], r[i][2]);
}
The compiled code may call sincos.
#include <math.h>
typedef struct { double x, y, z; } double3;
#define N 256
__attribute__ ((__aligned__(64))) double3 out[N][N];
static double3 dini(double a, double b, double u, double v) {
double3 ret;
ret.x = a * cos(u) * sin(v);
ret.y = a * sin(u) * sin(v);
ret.z = a * (cos(v) + log(tan(v * 0.5))) + b * u;
return ret;
}
void diniSurface(double a, double b) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
double u = 4.0 * M_PI * i / N;
double v = 2.0 * j / N;
out[i][j] = dini(a, b, u, v);
}
}
}
Calls to pow may be removed.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define N 256
__attribute__ ((__aligned__(64))) double in[N], out[N];
// Factorial approximation formula by Peter Luschny
#define c0 (1.0 / 24.0)
#define c1 (3.0 / 80.0)
#define c2 (18029.0 / 45360.0)
#define c3 (6272051.0 / 14869008.0)
static double lus(double x) {
x += 0.5;
double p = (pow(x, 5)+(c3+c2+c1)*pow(x, 3)+c1*c3*x) /
(pow(x,4)+(c3+c2+c1+c0)*pow(x,2)+(c1+c0)*c3+c0*c2);
return 0.5*log(2*M_PI) + x * (log(p)-1);
}
void factorialN() {
for (int i = 0; i < N; i++) {
out[i] = lus(in[i]);
}
}
int main(int argc, char **argv) {
for(int i=0;i<N;i++)
in[i] = (rand() / (double)RAND_MAX) * 10;
factorialN();
for(int i=0;i<N;i++)
printf("%.20g, %.20g\n", out[i], gamma(in[i]+1));
}
I couldn't make gcc or clang vectorize this code, while Intel Compiler does.
// The original code is taken from Haruhiko Okumura's book.
// https://oku.edu.mie-u.ac.jp/~okumura/algo/
// The code is distributed under the Creative Commons Attribution 4.0 International License.
#include <math.h>
static double F(double x, double y) { return sin(x)/x; }
#define M 1024
/* Runge-Kutta method */
static double runge4(double x0, double y0, double xn) {
double x, y, h, h2, f1, f2, f3, f4;
x = x0; y = y0; h = (xn - x0) / M; h2 = h / 2;
for (int i = 0; i < M; i++) {
f1 = h * F(x, y);
f2 = h * F(x + h2, y + f1 / 2);
f3 = h * F(x + h2, y + f2 / 2);
f4 = h * F(x + h, y + f3);
x = x0 + i * h;
y += (f1 + 2 * f2 + 2 * f3 + f4) / 6;
}
return y;
}
#define N 256
__attribute__ ((__aligned__(64))) double in[N], out[N];
void runge4N() {
for (int i = 0; i < N; i++)
out[i] = runge4(1, 0.9460830703671830149413, in[i]);
}
It seems still hard without a mathlib call.
// The original code is taken from Haruhiko Okumura's book.
// https://oku.edu.mie-u.ac.jp/~okumura/algo/
// The code is distributed under the Creative Commons Attribution 4.0 International License.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define N 256
#define M 1024
static double F(double x, double y) { return 1 - y*y; }
static double runge4(double x0, double y0, double xn) {
double x, y, h, h2, f1, f2, f3, f4;
x = x0; y = y0; h = (xn - x0) / M; h2 = h / 2;
for (int i = 0; i < M; i++) {
f1 = h * F(x, y);
f2 = h * F(x + h2, y + f1 / 2);
f3 = h * F(x + h2, y + f2 / 2);
f4 = h * F(x + h, y + f3);
x = x0 + i * h;
y += (f1 + 2 * f2 + 2 * f3 + f4) / 6;
}
return y;
}
__attribute__ ((__aligned__(64))) double in[N], out[N];
void runge4N() {
for (int i = 0; i < N; i++) out[i] = runge4(0, 0, in[i]);
}
int main(int argc, char **argv)
{
for(int i=0;i<N;i++) in[i] = (rand() / (double)RAND_MAX) * 10;
runge4N();
for(int i = 0; i < N; i++) printf("%.20g %.20g\n", tanh(in[i]), out[i]);
}
Surprisingly large variation in the generated codes can be seen. Do not forget to specify -ffast-math.
double inv4(double * __restrict__ r, double * __restrict__ m) {
const int N = 4;
const double d12_01 = +m[1*N+0]*m[2*N+1]-m[1*N+1]*m[2*N+0];
const double d13_01 = +m[1*N+0]*m[3*N+1]-m[1*N+1]*m[3*N+0];
const double d23_01 = +m[2*N+0]*m[3*N+1]-m[2*N+1]*m[3*N+0];
const double d12_02 = +m[1*N+0]*m[2*N+2]-m[1*N+2]*m[2*N+0];
const double d12_03 = +m[1*N+0]*m[2*N+3]-m[1*N+3]*m[2*N+0];
const double d13_02 = +m[1*N+0]*m[3*N+2]-m[1*N+2]*m[3*N+0];
const double d13_03 = +m[1*N+0]*m[3*N+3]-m[1*N+3]*m[3*N+0];
const double d23_02 = +m[2*N+0]*m[3*N+2]-m[2*N+2]*m[3*N+0];
const double d23_03 = +m[2*N+0]*m[3*N+3]-m[2*N+3]*m[3*N+0];
const double d12_12 = +m[1*N+1]*m[2*N+2]-m[1*N+2]*m[2*N+1];
const double d12_13 = +m[1*N+1]*m[2*N+3]-m[1*N+3]*m[2*N+1];
const double d12_23 = +m[1*N+2]*m[2*N+3]-m[1*N+3]*m[2*N+2];
const double d13_12 = +m[1*N+1]*m[3*N+2]-m[1*N+2]*m[3*N+1];
const double d13_13 = +m[1*N+1]*m[3*N+3]-m[1*N+3]*m[3*N+1];
const double d13_23 = +m[1*N+2]*m[3*N+3]-m[1*N+3]*m[3*N+2];
const double d23_12 = +m[2*N+1]*m[3*N+2]-m[2*N+2]*m[3*N+1];
const double d23_13 = +m[2*N+1]*m[3*N+3]-m[2*N+3]*m[3*N+1];
const double d23_23 = +m[2*N+2]*m[3*N+3]-m[2*N+3]*m[3*N+2];
const double d012_012 = +m[0*N+0]*d12_12-m[0*N+1]*d12_02+m[0*N+2]*d12_01;
const double d013_012 = +m[0*N+0]*d13_12-m[0*N+1]*d13_02+m[0*N+2]*d13_01;
const double d023_012 = +m[0*N+0]*d23_12-m[0*N+1]*d23_02+m[0*N+2]*d23_01;
const double d123_012 = +m[1*N+0]*d23_12-m[1*N+1]*d23_02+m[1*N+2]*d23_01;
const double d012_013 = +m[0*N+0]*d12_13-m[0*N+1]*d12_03+m[0*N+3]*d12_01;
const double d013_013 = +m[0*N+0]*d13_13-m[0*N+1]*d13_03+m[0*N+3]*d13_01;
const double d023_013 = +m[0*N+0]*d23_13-m[0*N+1]*d23_03+m[0*N+3]*d23_01;
const double d123_013 = +m[1*N+0]*d23_13-m[1*N+1]*d23_03+m[1*N+3]*d23_01;
const double d012_023 = +m[0*N+0]*d12_23-m[0*N+2]*d12_03+m[0*N+3]*d12_02;
const double d013_023 = +m[0*N+0]*d13_23-m[0*N+2]*d13_03+m[0*N+3]*d13_02;
const double d023_023 = +m[0*N+0]*d23_23-m[0*N+2]*d23_03+m[0*N+3]*d23_02;
const double d123_023 = +m[1*N+0]*d23_23-m[1*N+2]*d23_03+m[1*N+3]*d23_02;
const double d012_123 = +m[0*N+1]*d12_23-m[0*N+2]*d12_13+m[0*N+3]*d12_12;
const double d013_123 = +m[0*N+1]*d13_23-m[0*N+2]*d13_13+m[0*N+3]*d13_12;
const double d023_123 = +m[0*N+1]*d23_23-m[0*N+2]*d23_13+m[0*N+3]*d23_12;
const double d123_123 = +m[1*N+1]*d23_23-m[1*N+2]*d23_13+m[1*N+3]*d23_12;
const double d0123_0123 = +m[0*N+0]*d123_123-m[0*N+1]*d123_023+m[0*N+2]*d123_013-m[0*N+3]*d123_012;
r[0*N+0] = +d123_123 * (1.0/d0123_0123);
r[0*N+1] = -d023_123 * (1.0/d0123_0123);
r[0*N+2] = +d013_123 * (1.0/d0123_0123);
r[0*N+3] = -d012_123 * (1.0/d0123_0123);
r[1*N+0] = -d123_023 * (1.0/d0123_0123);
r[1*N+1] = +d023_023 * (1.0/d0123_0123);
r[1*N+2] = -d013_023 * (1.0/d0123_0123);
r[1*N+3] = +d012_023 * (1.0/d0123_0123);
r[2*N+0] = +d123_013 * (1.0/d0123_0123);
r[2*N+1] = -d023_013 * (1.0/d0123_0123);
r[2*N+2] = +d013_013 * (1.0/d0123_0123);
r[2*N+3] = -d012_013 * (1.0/d0123_0123);
r[3*N+0] = -d123_012 * (1.0/d0123_0123);
r[3*N+1] = +d023_012 * (1.0/d0123_0123);
r[3*N+2] = -d013_012 * (1.0/d0123_0123);
r[3*N+3] = +d012_012 * (1.0/d0123_0123);
return d0123_0123;
}