-
Notifications
You must be signed in to change notification settings - Fork 131
Examples of auto vectorizable codes
Naoki Shibata edited this page Feb 9, 2018
·
37 revisions
These codes are provided for checking how compilers vecotorize codes with various compiler options. Try compiling them at Compiler Explorer. On gcc, try -fno-math-errno and -fno-trapping-math options. For clang, try -fno-honor-nans -fno-math-errno -fno-trapping-math options. All source codes in this page are in public domain unless otherwise stated.
Just a simple example.
#include <math.h>
#define N 256
__attribute__ ((__aligned__(64))) float in[N][N], out[N][N];
static float srgb2linear_pix(float c) {
float r = pow((c + 0.055) / (1 + 0.055), 2.4);
return c < 0.04045 ? (c * (1.0 / 12.92)) : r;
}
void srgb2linear(void) {
for (int y = 0; y < N; y++) {
for (int x = 0; x < N; x++) {
out[y][x] = srgb2linear_pix(in[y][x]);
}
}
}
More complicated than the previous one. Conditional selection from two values.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define N 256
__attribute__ ((__aligned__(64))) double in[4][N], out[3][N];
typedef struct { double x, y, z; } double3;
static double3 execute(double a, double b, double c, double d) {
double s1 = b*(1/a), s2 = c*(1/a), s3 = d*(1/a);
double p = s2 - 1/3.0*s1*s1;
double q = s3 - 1/3.0*s1*s2 + 2/27.0*s1*s1*s1;
double z = q*q + 4/27.0*p*p*p;
double w = cbrt((-q + sqrt(z)) * 0.5) + cbrt((-q - sqrt(z)) * 0.5) - 1/3.0*s1;
double th = acos(0.5*(3.0/p)*q*sqrt(-(3.0/p)));
double w0 = 2 * sqrt(-1.0/3*p) * cos(1.0/3.0*th - 2*M_PI*0/3)-1/3.0*s1;
double w1 = 2 * sqrt(-1.0/3*p) * cos(1.0/3.0*th - 2*M_PI*1/3)-1/3.0*s1;
double w2 = 2 * sqrt(-1.0/3*p) * cos(1.0/3.0*th - 2*M_PI*2/3)-1/3.0*s1;
double3 ret = { NAN, NAN, NAN };
if (z >= 0) {
ret.x = w;
} else {
ret.x = w0; ret.y = w1; ret.z = w2;
}
return ret;
}
void cardanoN(void) {
for (int i = 0; i < N; i++) {
double3 r = execute(in[3][i], in[2][i], in[1][i], in[0][i]);
out[0][i] = r.x; out[1][i] = r.y; out[2][i] = r.z;
}
}
int main(int argc, char **argv) {
double r[N][3];
for(int i=0;i<N;i++) {
for(int j=0;j<3;j++)
r[i][j] = (2.0 * rand() / RAND_MAX - 1) * 10;
in[3][i] = 1;
in[2][i] = - r[i][0] - r[i][1] - r[i][2];
in[1][i] = + r[i][0] * r[i][1] + r[i][1] * r[i][2] + r[i][0] * r[i][2];
in[0][i] = - r[i][0] * r[i][1] * r[i][2];
}
cardanoN();
for(int i=0;i<N;i++)
printf("%g, %g, %g : %g, %g, %g\n", out[0][i], out[1][i], out[2][i], r[i][0], r[i][1], r[i][2]);
}
The compiled code may call sincos.
#include <math.h>
typedef struct { double x, y, z; } double3;
#define N 256
__attribute__ ((__aligned__(64))) double3 out[N][N];
static double3 dini(double a, double b, double u, double v) {
double3 ret;
ret.x = a * cos(u) * sin(v);
ret.y = a * sin(u) * sin(v);
ret.z = a * (cos(v) + log(tan(v * 0.5))) + b * u;
return ret;
}
void diniSurface(double a, double b) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
double u = 4.0 * M_PI * i / N;
double v = 2.0 * j / N;
out[i][j] = dini(a, b, u, v);
}
}
}
Calls to pow may be removed.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define N 256
__attribute__ ((__aligned__(64))) double in[N], out[N];
// Factorial approximation formula by Peter Luschny
#define c0 (1.0 / 24.0)
#define c1 (3.0 / 80.0)
#define c2 (18029.0 / 45360.0)
#define c3 (6272051.0 / 14869008.0)
static double lus(double x) {
x += 0.5;
double p = (pow(x, 5)+(c3+c2+c1)*pow(x, 3)+c1*c3*x) /
(pow(x,4)+(c3+c2+c1+c0)*pow(x,2)+(c1+c0)*c3+c0*c2);
return 0.5*log(2*M_PI) + x * (log(p)-1);
}
void factorialN() {
for (int i = 0; i < N; i++) {
out[i] = lus(in[i]);
}
}
int main(int argc, char **argv) {
for(int i=0;i<N;i++)
in[i] = (rand() / (double)RAND_MAX) * 10;
factorialN();
for(int i=0;i<N;i++)
printf("%.20g, %.20g\n", out[i], gamma(in[i]+1));
}
I couldn't make gcc or clang vectorize this source code. Intel Compiler does, however.
// The original code is taken from Haruhiko Okumura's book.
// https://oku.edu.mie-u.ac.jp/~okumura/algo/
// The code is distributed under the Creative Commons Attribution 4.0 International License.
// https://creativecommons.org/licenses/by/4.0/
#include <math.h>
/* $F(x, y)$ */
static double F(double x, double y) { return sin(x)/x; }
#define M 1024
/* Runge-Kutta method */
static double runge4(double x0, double y0, double xn) {
double x, y, h, h2, f1, f2, f3, f4;
x = x0; y = y0; h = (xn - x0) / M; h2 = h / 2;
for (int i = 0; i < M; i++) {
f1 = h * F(x, y);
f2 = h * F(x + h2, y + f1 / 2);
f3 = h * F(x + h2, y + f2 / 2);
f4 = h * F(x + h, y + f3);
x = x0 + i * h;
y += (f1 + 2 * f2 + 2 * f3 + f4) / 6;
}
return y;
}
#define N 256
__attribute__ ((__aligned__(64))) double in[N], out[N];
void runge4N() {
for (int i = 0; i < N; i++)
out[i] = runge4(1, 0.9460830703671830149413, in[i]);
}