Skip to content

Examples of auto vectorizable codes

Naoki Shibata edited this page Feb 9, 2018 · 37 revisions

These codes are provided for checking how compilers vecotorize codes with various compiler options. Try compiling them at Compiler Explorer. On gcc, try -fno-math-errno and -fno-trapping-math options. For clang, try -fno-honor-nans -fno-math-errno -fno-trapping-math options. All source codes in this page are in public domain unless otherwise stated.

SRGB to linear image conversion

Just a simple example.

#include <math.h>

#define N 256
__attribute__ ((__aligned__(64))) float in[N][N], out[N][N];

static float srgb2linear_pix(float c) {
  float r = pow((c + 0.055) / (1 + 0.055), 2.4);
  return c < 0.04045 ? (c * (1.0 / 12.92)) : r;
}

void srgb2linear(void) {
  for (int y = 0; y < N; y++) {
    for (int x = 0; x < N; x++) {
      out[y][x] = srgb2linear_pix(in[y][x]);
    }
  }
}

Finding roots of cubic equations

More complicated than the previous one. Conditional selection from two values.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define N 256
__attribute__ ((__aligned__(64))) double in[4][N], out[3][N];

typedef struct { double x, y, z; } double3;

static double3 execute(double a, double b, double c, double d) {
  double s1 = b*(1/a), s2 = c*(1/a), s3 = d*(1/a);
  double p = s2 - 1/3.0*s1*s1;
  double q = s3 - 1/3.0*s1*s2 + 2/27.0*s1*s1*s1;
  double z = q*q + 4/27.0*p*p*p;

  double w = cbrt((-q + sqrt(z)) * 0.5) + cbrt((-q - sqrt(z)) * 0.5) - 1/3.0*s1;

  double th = acos(0.5*(3.0/p)*q*sqrt(-(3.0/p)));
  double w0 = 2 * sqrt(-1.0/3*p) * cos(1.0/3.0*th - 2*M_PI*0/3)-1/3.0*s1;
  double w1 = 2 * sqrt(-1.0/3*p) * cos(1.0/3.0*th - 2*M_PI*1/3)-1/3.0*s1;
  double w2 = 2 * sqrt(-1.0/3*p) * cos(1.0/3.0*th - 2*M_PI*2/3)-1/3.0*s1;

  double3 ret = { NAN, NAN, NAN };
  if (z >= 0) {
    ret.x = w;
  } else {
    ret.x = w0; ret.y = w1; ret.z = w2;
  }

  return ret;
}

void cardanoN(void) {
  for (int i = 0; i < N; i++) {
    double3 r = execute(in[3][i], in[2][i], in[1][i], in[0][i]);
    out[0][i] = r.x; out[1][i] = r.y; out[2][i] = r.z;
  }
}

int main(int argc, char **argv) {
  double r[N][3];

  for(int i=0;i<N;i++) {
    for(int j=0;j<3;j++)
      r[i][j] = (2.0 * rand() / RAND_MAX - 1) * 10;

    in[3][i] = 1;
    in[2][i] = - r[i][0] - r[i][1] - r[i][2];
    in[1][i] = + r[i][0] * r[i][1] + r[i][1] * r[i][2] + r[i][0] * r[i][2];
    in[0][i] = - r[i][0] * r[i][1] * r[i][2];
  }

  cardanoN();

  for(int i=0;i<N;i++)
    printf("%g, %g, %g : %g, %g, %g\n", out[0][i], out[1][i], out[2][i], r[i][0], r[i][1], r[i][2]);
}

Generating Dini's surface

The compiled code may call sincos.

#include <math.h>

typedef struct { double x, y, z; } double3;

#define N 256
__attribute__ ((__aligned__(64))) double3 out[N][N];

static double3 dini(double a, double b, double u, double v) {
  double3 ret;
  ret.x = a * cos(u) * sin(v);
  ret.y = a * sin(u) * sin(v);
  ret.z = a * (cos(v) + log(tan(v * 0.5))) + b * u;
  return ret;
}

void diniSurface(double a, double b) {
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      double u = 4.0 * M_PI * i / N;
      double v = 2.0 * j / N;
      out[i][j] = dini(a, b, u, v);
    }
  }
}

Factorial approximation formula by Peter Luschny

Calls to pow may be removed.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define N 256
__attribute__ ((__aligned__(64))) double in[N], out[N];

// Factorial approximation formula by Peter Luschny
#define c0 (1.0 / 24.0)
#define c1 (3.0 / 80.0)
#define c2 (18029.0 / 45360.0)
#define c3 (6272051.0 / 14869008.0)

static double lus(double x) {
  x += 0.5;
  double p = (pow(x, 5)+(c3+c2+c1)*pow(x, 3)+c1*c3*x) /
    (pow(x,4)+(c3+c2+c1+c0)*pow(x,2)+(c1+c0)*c3+c0*c2);
  return 0.5*log(2*M_PI) + x * (log(p)-1);
}

void factorialN() {
  for (int i = 0; i < N; i++) {
    out[i] = lus(in[i]);
  }
}

int main(int argc, char **argv) {
  for(int i=0;i<N;i++)
    in[i] = (rand() / (double)RAND_MAX) * 10;

  factorialN();

  for(int i=0;i<N;i++)
    printf("%.20g, %.20g\n", out[i], gamma(in[i]+1));
}

Evaluate Si(x) with Runge-Kutta method

I couldn't make gcc or clang vectorize this source code, while Intel Compiler does it.

// The original code is taken from Haruhiko Okumura's book.
// https://oku.edu.mie-u.ac.jp/~okumura/algo/
// The code is distributed under the Creative Commons Attribution 4.0 International License.
// https://creativecommons.org/licenses/by/4.0/

#include <math.h>

/* $F(x, y)$ */
static double F(double x, double y) { return sin(x)/x; }

#define M 1024

/* Runge-Kutta method */
static double runge4(double x0, double y0, double xn) {
    double x, y, h, h2, f1, f2, f3, f4;
    x = x0;  y = y0;  h = (xn - x0) / M;  h2 = h / 2;
    for (int i = 0; i < M; i++) {
        f1 = h * F(x, y);
        f2 = h * F(x + h2, y + f1 / 2);
        f3 = h * F(x + h2, y + f2 / 2);
        f4 = h * F(x + h, y + f3);
        x = x0 + i * h;
        y += (f1 + 2 * f2 + 2 * f3 + f4) / 6;
    }
    return y;
}

#define N 256
__attribute__ ((__aligned__(64))) double in[N], out[N];

void runge4N() {
  for (int i = 0; i < N; i++)
    out[i] = runge4(1, 0.9460830703671830149413, in[i]);
}
Clone this wiki locally