Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: More Optimizations and SIMD fixes for MSVC & ARM #413

Draft
wants to merge 21 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/opt.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ SSE and SSE2 Shuffle Option
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
**_mm_shuffle_ps** generates the **shufps** instruction even if both source registers are the same.
You can force it to generate the **pshufd** instruction instead by defining
**CGLM_USE_INT_DOMAIN** macro. As default it is not defined.
the **CGLM_NO_INT_DOMAIN** macro. By default, it is not defined.

SSE3 and SSE4 Dot Product Options
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
2 changes: 1 addition & 1 deletion include/cglm/mat2.h
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ glm_mat2_scale(mat2 m, float s) {
glmm_store(m[0], wasm_f32x4_mul(wasm_v128_load(m[0]),
wasm_f32x4_splat(s)));
#elif defined( __SSE__ ) || defined( __SSE2__ )
glmm_store(m[0], _mm_mul_ps(_mm_loadu_ps(m[0]), _mm_set1_ps(s)));
glmm_store(m[0], _mm_mul_ps(_mm_loadu_ps(m[0]), glmm_set1(s)));
#elif defined(CGLM_NEON_FP)
vst1q_f32(m[0], vmulq_f32(vld1q_f32(m[0]), vdupq_n_f32(s)));
#else
Expand Down
32 changes: 15 additions & 17 deletions include/cglm/mat3.h
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ glm_mat3_det(mat3 mat) {
d = mat[1][0], e = mat[1][1], f = mat[1][2],
g = mat[2][0], h = mat[2][1], i = mat[2][2];

return a * (e * i - h * f) - d * (b * i - c * h) + g * (b * f - c * e);
return a * (e * i - h * f) - d * (b * i - h * c) + g * (b * f - e * c);
}

/*!
Expand All @@ -346,24 +346,22 @@ glm_mat3_det(mat3 mat) {
CGLM_INLINE
void
glm_mat3_inv(mat3 mat, mat3 dest) {
  /* Inverse via the adjugate: each destination element is a cofactor of
     `mat` multiplied by 1/det. The reciprocal determinant (idt) and its
     negation (ndt) are folded directly into every element, so no separate
     scaling pass over `dest` is needed. */
  float a = mat[0][0], b = mat[0][1], c = mat[0][2],
        d = mat[1][0], e = mat[1][1], f = mat[1][2],
        g = mat[2][0], h = mat[2][1], i = mat[2][2],

        /* three cofactors shared between the determinant and dest */
        c1  = e * i - f * h, c2 = d * i - g * f, c3 = d * h - g * e,

        /* det(mat) = a*c1 - b*c2 + c*c3; caller must ensure det != 0 */
        idt = 1.0f / (a * c1 - b * c2 + c * c3), ndt = -idt;

  dest[0][0] = idt * c1;
  dest[0][1] = ndt * (b * i - h * c);
  dest[0][2] = idt * (b * f - e * c);
  dest[1][0] = ndt * c2;
  dest[1][1] = idt * (a * i - g * c);
  dest[1][2] = ndt * (a * f - d * c);
  dest[2][0] = idt * c3;
  dest[2][1] = ndt * (a * h - g * b);
  dest[2][2] = idt * (a * e - d * b);
}

/*!
Expand Down
69 changes: 32 additions & 37 deletions include/cglm/mat4.h
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,8 @@ void
glm_mat4_transpose_to(mat4 m, mat4 dest) {
#if defined(__wasm__) && defined(__wasm_simd128__)
glm_mat4_transp_wasm(m, dest);
#elif defined(__AVX__)
glm_mat4_transp_avx(m, dest);
#elif defined( __SSE__ ) || defined( __SSE2__ )
glm_mat4_transp_sse2(m, dest);
#elif defined(CGLM_NEON_FP)
Expand All @@ -546,6 +548,8 @@ void
glm_mat4_transpose(mat4 m) {
#if defined(__wasm__) && defined(__wasm_simd128__)
glm_mat4_transp_wasm(m, m);
#elif defined(__AVX__)
glm_mat4_transp_avx(m, m);
#elif defined( __SSE__ ) || defined( __SSE2__ )
glm_mat4_transp_sse2(m, m);
#elif defined(CGLM_NEON_FP)
Expand Down Expand Up @@ -650,46 +654,37 @@ glm_mat4_inv(mat4 mat, mat4 dest) {
#elif defined(CGLM_NEON_FP)
glm_mat4_inv_neon(mat, dest);
#else
float t[6];
float det;
float a = mat[0][0], b = mat[0][1], c = mat[0][2], d = mat[0][3],
e = mat[1][0], f = mat[1][1], g = mat[1][2], h = mat[1][3],
i = mat[2][0], j = mat[2][1], k = mat[2][2], l = mat[2][3],
m = mat[3][0], n = mat[3][1], o = mat[3][2], p = mat[3][3];

t[0] = k * p - o * l; t[1] = j * p - n * l; t[2] = j * o - n * k;
t[3] = i * p - m * l; t[4] = i * o - m * k; t[5] = i * n - m * j;

dest[0][0] = f * t[0] - g * t[1] + h * t[2];
dest[1][0] =-(e * t[0] - g * t[3] + h * t[4]);
dest[2][0] = e * t[1] - f * t[3] + h * t[5];
dest[3][0] =-(e * t[2] - f * t[4] + g * t[5]);

dest[0][1] =-(b * t[0] - c * t[1] + d * t[2]);
dest[1][1] = a * t[0] - c * t[3] + d * t[4];
dest[2][1] =-(a * t[1] - b * t[3] + d * t[5]);
dest[3][1] = a * t[2] - b * t[4] + c * t[5];

t[0] = g * p - o * h; t[1] = f * p - n * h; t[2] = f * o - n * g;
t[3] = e * p - m * h; t[4] = e * o - m * g; t[5] = e * n - m * f;

dest[0][2] = b * t[0] - c * t[1] + d * t[2];
dest[1][2] =-(a * t[0] - c * t[3] + d * t[4]);
dest[2][2] = a * t[1] - b * t[3] + d * t[5];
dest[3][2] =-(a * t[2] - b * t[4] + c * t[5]);

t[0] = g * l - k * h; t[1] = f * l - j * h; t[2] = f * k - j * g;
t[3] = e * l - i * h; t[4] = e * k - i * g; t[5] = e * j - i * f;

dest[0][3] =-(b * t[0] - c * t[1] + d * t[2]);
dest[1][3] = a * t[0] - c * t[3] + d * t[4];
dest[2][3] =-(a * t[1] - b * t[3] + d * t[5]);
dest[3][3] = a * t[2] - b * t[4] + c * t[5];

det = 1.0f / (a * dest[0][0] + b * dest[1][0]
+ c * dest[2][0] + d * dest[3][0]);

glm_mat4_scale_p(dest, det);
m = mat[3][0], n = mat[3][1], o = mat[3][2], p = mat[3][3],

c1 = k * p - l * o, c2 = c * h - d * g, c3 = i * p - l * m,
c4 = a * h - d * e, c5 = j * p - l * n, c6 = b * h - d * f,
c7 = i * n - j * m, c8 = a * f - b * e, c9 = j * o - k * n,
c10 = b * g - c * f, c11 = i * o - k * m, c12 = a * g - c * e,

idt = 1.0f/(c8*c1+c4*c9+c10*c3+c2*c7-c12*c5-c6*c11), ndt = -idt;

dest[0][0] = (f * c1 - g * c5 + h * c9) * idt;
dest[0][1] = (b * c1 - c * c5 + d * c9) * ndt;
dest[0][2] = (n * c2 - o * c6 + p * c10) * idt;
dest[0][3] = (j * c2 - k * c6 + l * c10) * ndt;

dest[1][0] = (e * c1 - g * c3 + h * c11) * ndt;
dest[1][1] = (a * c1 - c * c3 + d * c11) * idt;
dest[1][2] = (m * c2 - o * c4 + p * c12) * ndt;
dest[1][3] = (i * c2 - k * c4 + l * c12) * idt;

dest[2][0] = (e * c5 - f * c3 + h * c7) * idt;
dest[2][1] = (a * c5 - b * c3 + d * c7) * ndt;
dest[2][2] = (m * c6 - n * c4 + p * c8) * idt;
dest[2][3] = (i * c6 - j * c4 + l * c8) * ndt;

dest[3][0] = (e * c9 - f * c11 + g * c7) * ndt;
dest[3][1] = (a * c9 - b * c11 + c * c7) * idt;
dest[3][2] = (m * c10 - n * c12 + o * c8) * ndt;
dest[3][3] = (i * c10 - j * c12 + k * c8) * idt;
#endif
}

Expand Down
19 changes: 19 additions & 0 deletions include/cglm/simd/arm.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,21 @@ static inline float32x4_t glmm_max(float32x4_t a, float32x4_t b) { return vmaxq_
static inline
float32x4_t
glmm_vhadd(float32x4_t v) {
#if CGLM_ARM64
float32x4_t p;
p = vpaddq_f32(v, v); /* [a+b, c+d, a+b, c+d] */
return vpaddq_f32(p, p); /* [t, t, t, t] */;
#else
return vaddq_f32(vaddq_f32(glmm_splat_x(v), glmm_splat_y(v)),
vaddq_f32(glmm_splat_z(v), glmm_splat_w(v)));
#endif
/* TODO: measure speed of this compare to above */
/* return vdupq_n_f32(vaddvq_f32(v)); */

/*
return vaddq_f32(vaddq_f32(glmm_splat_x(v), glmm_splat_y(v)),
vaddq_f32(glmm_splat_z(v), glmm_splat_w(v)));
*/
/*
this seems slower:
v = vaddq_f32(v, vrev64q_f32(v));
Expand Down Expand Up @@ -108,6 +121,12 @@ glmm_dot(float32x4_t a, float32x4_t b) {
return glmm_hadd(vmulq_f32(a, b));
}

static inline
float32x4_t
glmm_vdot(float32x4_t a, float32x4_t b) {
  /* dot(a, b) replicated into all four lanes:
     element-wise product followed by a horizontal add */
  float32x4_t prod;
  prod = vmulq_f32(a, b);
  return glmm_vhadd(prod);
}

static inline
float
glmm_norm(float32x4_t a) {
Expand Down
93 changes: 66 additions & 27 deletions include/cglm/simd/avx/mat4.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,55 @@
#include "../../common.h"
#include "../intrin.h"

#include <immintrin.h>

CGLM_INLINE
void
glm_mat4_scale_avx(mat4 m, float s) {
  /* Scale all 16 elements of `m` in place using two 256-bit lanes:
     m[0]..m[1] in one register, m[2]..m[3] in the other. */
  __m256 y0, y1, y2;

  y0 = glmm_load256(m[0]); /* columns 0-1: h g f e d c b a */
  y1 = glmm_load256(m[2]); /* columns 2-3: p o n m l k j i */

  y2 = _mm256_broadcast_ss(&s); /* s replicated into all 8 lanes */

  glmm_store256(m[0], _mm256_mul_ps(y0, y2));
  glmm_store256(m[2], _mm256_mul_ps(y1, y2));
}

/* TODO: this must be tested and compared to SSE version, may be slower!!! */
CGLM_INLINE
void
glm_mat4_transp_avx(mat4 m, mat4 dest) {
  /* 4x4 transpose with two 256-bit registers. Comments list lane contents
     lowest element first (a = m[0][0] ... p = m[3][3]).
     NOTE(review): the pair of glmm_store256(m[...], _mm256_mul_ps(...))
     lines that previously sat here were leftovers of the old scale
     routine; they clobbered `m` mid-transpose and are removed. */
  __m256 y0, y1, y2, y3;

  y0 = glmm_load256(m[0]);                   /* a b c d  e f g h */
  y1 = glmm_load256(m[2]);                   /* i j k l  m n o p */

  y2 = _mm256_unpacklo_ps(y0, y1);           /* a i b j  e m f n */
  y3 = _mm256_unpackhi_ps(y0, y1);           /* c k d l  g o h p */

  y0 = _mm256_permute2f128_ps(y2, y3, 0x20); /* a i b j  c k d l */
  y1 = _mm256_permute2f128_ps(y2, y3, 0x31); /* e m f n  g o h p */

  y2 = _mm256_unpacklo_ps(y0, y1);           /* a e i m  c g k o */
  y3 = _mm256_unpackhi_ps(y0, y1);           /* b f j n  d h l p */

  y0 = _mm256_permute2f128_ps(y2, y3, 0x20); /* a e i m  b f j n */
  y1 = _mm256_permute2f128_ps(y2, y3, 0x31); /* c g k o  d h l p */

  glmm_store256(dest[0], y0);
  glmm_store256(dest[2], y1);
}

CGLM_INLINE
void
glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
/* D = R * L (Column-Major) */

__m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
__m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13;
__m256i yi0, yi1, yi2, yi3;

y0 = glmm_load256(m2[0]); /* h g f e d c b a */
y1 = glmm_load256(m2[2]); /* p o n m l k j i */
Expand All @@ -41,35 +72,43 @@ glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
y4 = _mm256_permute2f128_ps(y2, y2, 0x03); /* d c b a h g f e */
y5 = _mm256_permute2f128_ps(y3, y3, 0x03); /* l k j i p o n m */

yi0 = _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0);
yi1 = _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2);
yi2 = _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1);
yi3 = _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3);

/* f f f f a a a a */
/* h h h h c c c c */
/* e e e e b b b b */
/* g g g g d d d d */
y6 = _mm256_permutevar_ps(y0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0));
y7 = _mm256_permutevar_ps(y0, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2));
y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
y9 = _mm256_permutevar_ps(y0, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));

glmm_store256(dest[0],
_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
_mm256_mul_ps(y3, y7)),
_mm256_add_ps(_mm256_mul_ps(y4, y8),
_mm256_mul_ps(y5, y9))));
y6 = _mm256_permutevar_ps(y0, yi0);
y7 = _mm256_permutevar_ps(y0, yi1);
y8 = _mm256_permutevar_ps(y0, yi2);
y9 = _mm256_permutevar_ps(y0, yi3);

/* n n n n i i i i */
/* p p p p k k k k */
/* m m m m j j j j */
/* o o o o l l l l */
y6 = _mm256_permutevar_ps(y1, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0));
y7 = _mm256_permutevar_ps(y1, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2));
y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));

glmm_store256(dest[2],
_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
_mm256_mul_ps(y3, y7)),
_mm256_add_ps(_mm256_mul_ps(y4, y8),
_mm256_mul_ps(y5, y9))));
y10 = _mm256_permutevar_ps(y1, yi0);
y11 = _mm256_permutevar_ps(y1, yi1);
y12 = _mm256_permutevar_ps(y1, yi2);
y13 = _mm256_permutevar_ps(y1, yi3);

y0 = _mm256_mul_ps(y2, y6);
y1 = _mm256_mul_ps(y2, y10);

y0 = glmm256_fmadd(y3, y7, y0);
y1 = glmm256_fmadd(y3, y11, y1);

y0 = glmm256_fmadd(y4, y8, y0);
y1 = glmm256_fmadd(y4, y12, y1);

y0 = glmm256_fmadd(y5, y9, y0);
y1 = glmm256_fmadd(y5, y13, y1);

glmm_store256(dest[0], y0);
glmm_store256(dest[2], y1);
}

#endif
Expand Down
40 changes: 28 additions & 12 deletions include/cglm/simd/intrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#ifndef cglm_intrin_h
#define cglm_intrin_h

#if defined( _MSC_VER )
#if defined(_MSC_VER) && !defined(_M_ARM64EC)
# if (defined(_M_AMD64) || defined(_M_X64)) || _M_IX86_FP == 2
# ifndef __SSE__
# define __SSE__
Expand All @@ -20,13 +20,37 @@
# ifndef __SSE__
# define __SSE__
# endif
#endif
# endif
/* do not use alignment for older visual studio versions */
# if _MSC_VER < 1913 /* Visual Studio 2017 version 15.6 */
/* also ARM32 also causes similar error, disable it for now on ARM32 too */
# if _MSC_VER < 1913 || _M_ARM /* Visual Studio 2017 version 15.6 */
# define CGLM_ALL_UNALIGNED
# endif
#endif

#ifdef __AVX__
# include <immintrin.h>
# define CGLM_AVX_FP 1
# ifndef __SSE2__
# define __SSE2__
# endif
# ifndef __SSE3__
# define __SSE3__
# endif
# ifndef __SSE4__
# define __SSE4__
# endif
# ifndef __SSE4_1__
# define __SSE4_1__
# endif
# ifndef __SSE4_2__
# define __SSE4_2__
# endif
# ifndef CGLM_SIMD_x86
# define CGLM_SIMD_x86
# endif
#endif

#if defined(__SSE__)
# include <xmmintrin.h>
# define CGLM_SSE_FP 1
Expand Down Expand Up @@ -64,14 +88,6 @@
# endif
#endif

#ifdef __AVX__
# include <immintrin.h>
# define CGLM_AVX_FP 1
# ifndef CGLM_SIMD_x86
# define CGLM_SIMD_x86
# endif
#endif

/* ARM Neon */
#if defined(_WIN32) && defined(_MSC_VER)
/* TODO: non-ARM stuff already inported, will this be better option */
Expand Down Expand Up @@ -100,7 +116,7 @@
#else /* non-windows */
# if defined(__ARM_NEON) || defined(__ARM_NEON__)
# include <arm_neon.h>
# if defined(__ARM_NEON_FP)
# if defined(__ARM_NEON_FP) || defined(__ARM_FP)
# define CGLM_NEON_FP 1
# endif
# ifndef CGLM_SIMD_ARM
Expand Down
Loading
Loading