Skip to content

Commit

Permalink
fix(simd.h): Make all-architecture matrix44::inverse() (#4076)
Browse files Browse the repository at this point in the history
We had an awkward situation of the simd matrix44::inverse() only being
defined for Intel SSE, and in all other cases falling back on
Imath::M44f::inverse. The problem with that is that it made simd.h
depend either directly on ImathMatrix.h, or assume that it would be
included prior to inclusion of simd.h. This was not good.

This patch rewrites inverse() to eliminate all of the direct SSE
intrinsics in terms of other simd wrapper functions, which means that
they will be defined one way or another on all architectures.

Along the way I added a new vfloat4 shuffle that combines two vectors
(corresponding to `_mm_shuffle_ps` in SSE land), and a vfloat4
constructor from 2 float*', where the first two elements come from the
first ptr and the second two elements come from the second ptr.
Ultimately, the final implementation I settled on for the new inverse
didn't use these (an earlier version did), but I want to keep them
around anyway in case they are useful in the future. I did add a test
for them.

Signed-off-by: Larry Gritz <[email protected]>
  • Loading branch information
lgritz committed Jan 2, 2024
1 parent 8f9b21f commit 4cb956b
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 26 deletions.
95 changes: 70 additions & 25 deletions src/include/OpenImageIO/simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -2027,6 +2027,10 @@ class vfloat4 {
void load (const half *values);
#endif /* _HALF_H_ or _IMATH_H_ */

/// Load the first 2 elements from lo[0..1] and the second two elements
/// from hi[0..1].
void load_pairs(const float* lo, const float* hi);

void store (float *values) const;

/// Store the first n values into memory
Expand Down Expand Up @@ -2123,6 +2127,12 @@ OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
template<int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);

/// Return { a[i0], a[i1], b[i2], b[i3] }, where i0..i3 are the extracted
/// 2-bit indices packed into the template parameter i (going from the low
/// 2-bit pair to the high 2-bit pair).
template<int i> OIIO_FORCEINLINE vfloat4
shuffle(const vfloat4& a, const vfloat4& b);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
template<int i> OIIO_FORCEINLINE float extract (const vfloat4& a);
Expand Down Expand Up @@ -6895,6 +6905,19 @@ OIIO_FORCEINLINE void vfloat4::load (const half *values) {
}
#endif /* _HALF_H_ or _IMATH_H_ */

OIIO_FORCEINLINE void
vfloat4::load_pairs(const float* lo, const float* hi)
{
#if OIIO_SIMD_SSE
m_simd = _mm_loadh_pi(_mm_loadl_pi(Zero(), (__m64*)lo), (__m64*)hi);
#else
m_val[0] = lo[0];
m_val[1] = lo[1];
m_val[2] = hi[0];
m_val[3] = hi[1];
#endif
}

OIIO_FORCEINLINE void vfloat4::store (float *values) const {
#if OIIO_SIMD_SSE
// Use an unaligned store -- it's just as fast when the memory turns
Expand Down Expand Up @@ -7336,6 +7359,18 @@ template<> OIIO_FORCEINLINE vfloat4 shuffle<3> (const vfloat4& a) {
#endif


template<int i>
OIIO_FORCEINLINE vfloat4
shuffle(const vfloat4& a, const vfloat4& b)
{
#if OIIO_SIMD_SSE
return vfloat4(_mm_shuffle_ps(a, b, i));
#else
return vfloat4(a[i & 0x03], a[(i >> 2) & (0x03)],
b[(i >> 4) & 0x03], b[(i >> 6) & (0x03)]);
#endif
}


/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
Expand Down Expand Up @@ -8462,23 +8497,40 @@ OIIO_FORCEINLINE bool operator!= (M44fParam a, const matrix44 &b) {
}


#if OIIO_SIMD_SSE
OIIO_FORCEINLINE matrix44 matrix44::inverse() const {

inline matrix44 matrix44::inverse() const
{
// Adapted from this code from Intel:
// ftp://download.intel.com/design/pentiumiii/sml/24504301.pdf
vfloat4 minor0, minor1, minor2, minor3;
vfloat4 row0, row1, row2, row3;
vfloat4 det, tmp1;
const float *src = (const float *)this;
vfloat4 zero = vfloat4::Zero();
tmp1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src)), (__m64*)(src+ 4)));
row1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src+8)), (__m64*)(src+12)));
row0 = vfloat4(_mm_shuffle_ps(tmp1, row1, 0x88));
row1 = vfloat4(_mm_shuffle_ps(row1, tmp1, 0xDD));
tmp1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6)));
row3 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src+10)), (__m64*)(src+14)));
row2 = vfloat4(_mm_shuffle_ps(tmp1, row3, 0x88));
row3 = vfloat4(_mm_shuffle_ps(row3, tmp1, 0xDD));
#if 0
// Original code looked like this:
vfloat4 row0, row1, row2, row3;
const float *src = (const float *)&msrc;
tmp1.load_pairs(src, src+ 4);
row1.load_pairs(src+8, src+12);
row0 = shuffle<0x88>(tmp1, row1);
row1 = shuffle<0xDD>(row1, tmp1);
tmp1.load_pairs(src+ 2, src+ 6);
row3.load_pairs(src+10, src+14);
row2 = shuffle<0x88>(tmp1, row3);
row3 = shuffle<0xDD>(row3, tmp1);
#else
// But this is simpler and easier to understand:
matrix44 Mt = this->transposed();
vfloat4 row0 = Mt[0];
vfloat4 row1 = shuffle<2,3,0,1>(Mt[1]);
vfloat4 row2 = Mt[2];
vfloat4 row3 = shuffle<2,3,0,1>(Mt[3]);
#endif
// At this point, the row variables should contain the following indices
// of the original input matrix:
// row0 = 0 4 8 12
// row1 = 9 13 1 5
// row2 = 2 6 10 14
// row3 = 11 15 3 7

// -----------------------------------------------
tmp1 = row2 * row3;
tmp1 = shuffle<1,0,3,2>(tmp1);
Expand Down Expand Up @@ -8533,20 +8585,13 @@ OIIO_FORCEINLINE matrix44 matrix44::inverse() const {
minor3 = (row1 * tmp1) + minor3;
// -----------------------------------------------
det = row0 * minor0;
det = shuffle<2,3,0,1>(det) + det;
det = vfloat4(_mm_add_ss(shuffle<1,0,3,2>(det), det));
tmp1 = vfloat4(_mm_rcp_ss(det));
det = vfloat4(_mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))));
det = shuffle<0>(det);
float det0 = reduce_add(det);
float tmp1_0 = 1.0f / det0;
det0 = (tmp1_0 + tmp1_0) - (det0 * tmp1_0 * tmp1_0);
det = vfloat4(det0);
return matrix44 (det*minor0, det*minor1, det*minor2, det*minor3);
}
#elif defined(INCLUDED_IMATHMATRIX_H)
OIIO_FORCEINLINE matrix44 matrix44::inverse() const {
return matrix44 (((Imath::M44f*)this)->inverse());
}
#else
#error "Don't know how to compute matrix44::inverse()"
#endif



inline std::ostream& operator<< (std::ostream& cout, const matrix44 &M) {
Expand Down
38 changes: 37 additions & 1 deletion src/libutil/simd_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1559,6 +1559,22 @@ test_special()
OIIO_CHECK_SIMD_EQUAL (vfloat4(c127)/vfloat4(127.0), vfloat4(1.0f));
OIIO_CHECK_SIMD_EQUAL (vfloat4(c127)*vfloat4(1.0f/127.0), vfloat4(1.0f));
}

// Test the 2-vfloat4 shuffle
{
#define PERMUTE(a,b,c,d) ((d<<6)|(c<<4)|(b<<2)|(a<<0))
vfloat4 a(10, 11, 12, 13);
vfloat4 b(20, 21, 22, 23);
OIIO_CHECK_SIMD_EQUAL(shuffle<PERMUTE(2,0,1,3)>(a,b),
vfloat4(12, 10, 21, 23));
}
// Test vfloat4::load_pairs
{
vfloat4 x;
static const float vals[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
x.load_pairs(vals+2, vals+5);
OIIO_CHECK_SIMD_EQUAL(x, vfloat4(2, 3, 5, 6));
}
}


Expand Down Expand Up @@ -1825,7 +1841,7 @@ test_matrix()
{
Imath::V3f P(1.0f, 0.0f, 0.0f);
Imath::M44f Mtrans(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 10, 11, 12, 1);
Imath::M44f Mrot = Imath::M44f().rotate(Imath::V3f(0.0f, M_PI_2, 0.0f));
Imath::M44f Mrot = Imath::M44f().rotate(Imath::V3f(0.0f, M_PI/4.0f, 0.0f));

test_heading("Testing matrix ops:");
std::cout << " P = " << P << "\n";
Expand Down Expand Up @@ -1878,6 +1894,26 @@ test_matrix()
OIIO_CHECK_ASSERT(
mx_equal_thresh(matrix44(Mrot.inverse()), matrix44(Mrot).inverse(),
1.0e-6f));

// Test that matrix44::inverse always matches Imath::M44f::inverse
Imath::M44f rts = (Mtrans * Mrot) * Imath::M44f(2.0f, 0.0f, 0.0f, 0.0f,
0.0f, 1.0f, 0.0f, 0.0f,
0.0f, 0.0f, 1.0f, 0.0f,
0.0f, 0.0f, 0.0f, 1.0f);
OIIO_CHECK_ASSERT(
mx_equal_thresh(matrix44(rts.inverse()), matrix44(rts).inverse(),
1.0e-5f));
OIIO_CHECK_ASSERT(
mx_equal_thresh(matrix44(Mtrans.inverse()), matrix44(Mtrans).inverse(),
1.0e-6f));
OIIO_CHECK_ASSERT(
mx_equal_thresh(matrix44(Mrot.inverse()), matrix44(Mrot).inverse(),
1.0e-6f));
Imath::M44f m123(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 1.0f);
OIIO_CHECK_ASSERT(
mx_equal_thresh(matrix44(m123.inverse()), matrix44(m123).inverse(),
1.0e-6f));

OIIO_CHECK_EQUAL(
matrix44(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
Imath::M44f(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
Expand Down

0 comments on commit 4cb956b

Please sign in to comment.