From 4cb956b0a0a123376e468a5b5e6e188178ec077e Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Mon, 1 Jan 2024 17:16:13 -0800
Subject: [PATCH] fix(simd.h): Make all-architecture matrix44::inverse()
 (#4076)

We had an awkward situation of the simd matrix44::inverse() only being
defined for Intel SSE, and in all other cases falling back on
Imath::M44f::inverse. The problem with that is that it made simd.h
depend either directly on ImathMatrix.h, or assume that it would be
included prior to inclusion of simd.h. This was not good.

This patch rewrites inverse() to eliminate all of the direct SSE
intrinsics in terms of other simd wrapper functions, which means that
they will be defined one way or another on all architectures.

Along the way I added a new vfloat4 shuffle that combines two vectors
(corresponding to `_mm_shuffle_ps` in SSE land), and a vfloat4
constructor from 2 float*', where the first two elements come from the
first ptr and the second two elements come from the second ptr.
Ultimately, the final implementation I settled on for the new inverse
didn't use these (an earlier version did), but I want to keep them
around anyway in case they are useful in the future. I did add a test
for them.

Signed-off-by: Larry Gritz <lg@larrygritz.com>
---
 src/include/OpenImageIO/simd.h | 95 +++++++++++++++++++++++++---------
 src/libutil/simd_test.cpp      | 38 +++++++++++++-
 2 files changed, 107 insertions(+), 26 deletions(-)
diff --git a/src/include/OpenImageIO/simd.h b/src/include/OpenImageIO/simd.h
index f6b4b8faa7..0f8e0f5430 100644
--- a/src/include/OpenImageIO/simd.h
+++ b/src/include/OpenImageIO/simd.h
@@ -2027,6 +2027,10 @@ class vfloat4 {
     void load (const half *values);
 #endif /* _HALF_H_ or _IMATH_H_ */
 
+    /// Load the first 2 elements from lo[0..1] and the second two elements
+    /// from hi[0..1].
+    void load_pairs(const float* lo, const float* hi);
+
     void store (float *values) const;
 
     /// Store the first n values into memory
@@ -2123,6 +2127,12 @@ OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
 template<int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
 
+/// Return { a[i0], a[i1], b[i2], b[i3] }, where i0..i3 are the extracted
+/// 2-bit indices packed into the template parameter i (going from the low
+/// 2-bit pair to the high 2-bit pair).
+template<int i> OIIO_FORCEINLINE vfloat4
+shuffle(const vfloat4& a, const vfloat4& b);
+
 /// Helper: as rapid as possible extraction of one component, when the
 /// index is fixed.
 template<int i> OIIO_FORCEINLINE float extract (const vfloat4& a);
@@ -6895,6 +6905,19 @@ OIIO_FORCEINLINE void vfloat4::load (const half *values) {
 }
 #endif /* _HALF_H_ or _IMATH_H_ */
 
+OIIO_FORCEINLINE void
+vfloat4::load_pairs(const float* lo, const float* hi)
+{
+#if OIIO_SIMD_SSE
+    m_simd = _mm_loadh_pi(_mm_loadl_pi(Zero(), (__m64*)lo), (__m64*)hi);
+#else
+    m_val[0] = lo[0];
+    m_val[1] = lo[1];
+    m_val[2] = hi[0];
+    m_val[3] = hi[1];
+#endif
+}
+
 OIIO_FORCEINLINE void vfloat4::store (float *values) const {
 #if OIIO_SIMD_SSE
     // Use an unaligned store -- it's just as fast when the memory turns
@@ -7336,6 +7359,18 @@ template<> OIIO_FORCEINLINE vfloat4 shuffle<3> (const vfloat4& a) {
 #endif
 
 
+template<int i>
+OIIO_FORCEINLINE vfloat4
+shuffle(const vfloat4& a, const vfloat4& b)
+{
+#if OIIO_SIMD_SSE
+    return vfloat4(_mm_shuffle_ps(a, b, i));
+#else
+    return vfloat4(a[i & 0x03], a[(i >> 2) & (0x03)],
+                   b[(i >> 4) & 0x03], b[(i >> 6) & (0x03)]);
+#endif
+}
+
 
 /// Helper: as rapid as possible extraction of one component, when the
 /// index is fixed.
@@ -8462,23 +8497,40 @@ OIIO_FORCEINLINE bool operator!= (M44fParam a, const matrix44 &b) {
 }
 
 
-#if OIIO_SIMD_SSE
-OIIO_FORCEINLINE matrix44 matrix44::inverse() const {
+
+inline matrix44 matrix44::inverse() const
+{
     // Adapted from this code from Intel:
     // ftp://download.intel.com/design/pentiumiii/sml/24504301.pdf
     vfloat4 minor0, minor1, minor2, minor3;
-    vfloat4 row0, row1, row2, row3;
     vfloat4 det, tmp1;
-    const float *src = (const float *)this;
-    vfloat4 zero = vfloat4::Zero();
-    tmp1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src)), (__m64*)(src+ 4)));
-    row1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src+8)), (__m64*)(src+12)));
-    row0 = vfloat4(_mm_shuffle_ps(tmp1, row1, 0x88));
-    row1 = vfloat4(_mm_shuffle_ps(row1, tmp1, 0xDD));
-    tmp1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6)));
-    row3 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src+10)), (__m64*)(src+14)));
-    row2 = vfloat4(_mm_shuffle_ps(tmp1, row3, 0x88));
-    row3 = vfloat4(_mm_shuffle_ps(row3, tmp1, 0xDD));
+#if 0
+    // Original code looked like this:
+    vfloat4 row0, row1, row2, row3;
+    const float *src = (const float *)&msrc;
+    tmp1.load_pairs(src, src+ 4);
+    row1.load_pairs(src+8, src+12);
+    row0 = shuffle<0x88>(tmp1, row1);
+    row1 = shuffle<0xDD>(row1, tmp1);
+    tmp1.load_pairs(src+ 2, src+ 6);
+    row3.load_pairs(src+10, src+14);
+    row2 = shuffle<0x88>(tmp1, row3);
+    row3 = shuffle<0xDD>(row3, tmp1);
+#else
+    // But this is simpler and easier to understand:
+    matrix44 Mt = this->transposed();
+    vfloat4 row0 = Mt[0];
+    vfloat4 row1 = shuffle<2,3,0,1>(Mt[1]);
+    vfloat4 row2 = Mt[2];
+    vfloat4 row3 = shuffle<2,3,0,1>(Mt[3]);
+#endif
+    // At this point, the row variables should contain the following indices
+    // of the original input matrix:
+    // row0 = 0 4 8 12
+    // row1 = 9 13 1 5
+    // row2 = 2 6 10 14
+    // row3 = 11 15 3 7
+
     // -----------------------------------------------
     tmp1 = row2 * row3;
     tmp1 = shuffle<1,0,3,2>(tmp1);
@@ -8533,20 +8585,13 @@ OIIO_FORCEINLINE matrix44 matrix44::inverse() const {
     minor3 = (row1 * tmp1) + minor3;
     // -----------------------------------------------
     det = row0 * minor0;
-    det = shuffle<2,3,0,1>(det) + det;
-    det = vfloat4(_mm_add_ss(shuffle<1,0,3,2>(det), det));
-    tmp1 = vfloat4(_mm_rcp_ss(det));
-    det = vfloat4(_mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))));
-    det = shuffle<0>(det);
+    float det0 = reduce_add(det);
+    float tmp1_0 = 1.0f / det0;
+    det0 = (tmp1_0 + tmp1_0) - (det0 * tmp1_0 * tmp1_0);
+    det = vfloat4(det0);
     return matrix44 (det*minor0, det*minor1, det*minor2, det*minor3);
 }
-#elif defined(INCLUDED_IMATHMATRIX_H)
-OIIO_FORCEINLINE matrix44 matrix44::inverse() const {
-    return matrix44 (((Imath::M44f*)this)->inverse());
-}
-#else
-#error "Don't know how to compute matrix44::inverse()"
-#endif
+
 
 
 inline std::ostream& operator<< (std::ostream& cout, const matrix44 &M) {
diff --git a/src/libutil/simd_test.cpp b/src/libutil/simd_test.cpp
index e94f3a1305..4ecee7054e 100644
--- a/src/libutil/simd_test.cpp
+++ b/src/libutil/simd_test.cpp
@@ -1559,6 +1559,22 @@ test_special()
         OIIO_CHECK_SIMD_EQUAL (vfloat4(c127)/vfloat4(127.0), vfloat4(1.0f));
         OIIO_CHECK_SIMD_EQUAL (vfloat4(c127)*vfloat4(1.0f/127.0), vfloat4(1.0f));
     }
+
+    // Test the 2-vfloat4 shuffle
+    {
+        #define PERMUTE(a,b,c,d) ((d<<6)|(c<<4)|(b<<2)|(a<<0))
+        vfloat4 a(10, 11, 12, 13);
+        vfloat4 b(20, 21, 22, 23);
+        OIIO_CHECK_SIMD_EQUAL(shuffle<PERMUTE(2,0,1,3)>(a,b),
+                              vfloat4(12, 10, 21, 23));
+    }
+    // Test vfloat4::load_pairs
+    {
+        vfloat4 x;
+        static const float vals[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+        x.load_pairs(vals+2, vals+5);
+        OIIO_CHECK_SIMD_EQUAL(x, vfloat4(2, 3, 5, 6));
+    }
 }
 
 
@@ -1825,7 +1841,7 @@ test_matrix()
 {
     Imath::V3f P(1.0f, 0.0f, 0.0f);
     Imath::M44f Mtrans(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 10, 11, 12, 1);
-    Imath::M44f Mrot = Imath::M44f().rotate(Imath::V3f(0.0f, M_PI_2, 0.0f));
+    Imath::M44f Mrot = Imath::M44f().rotate(Imath::V3f(0.0f, M_PI/4.0f, 0.0f));
 
     test_heading("Testing matrix ops:");
     std::cout << "  P = " << P << "\n";
@@ -1878,6 +1894,26 @@ test_matrix()
     OIIO_CHECK_ASSERT(
         mx_equal_thresh(matrix44(Mrot.inverse()), matrix44(Mrot).inverse(),
                         1.0e-6f));
+
+    // Test that matrix44::inverse always matches Imath::M44f::inverse
+    Imath::M44f rts = (Mtrans * Mrot) * Imath::M44f(2.0f, 0.0f, 0.0f, 0.0f,
+        0.0f, 1.0f, 0.0f, 0.0f,
+        0.0f, 0.0f, 1.0f, 0.0f,
+        0.0f, 0.0f, 0.0f, 1.0f);
+    OIIO_CHECK_ASSERT(
+        mx_equal_thresh(matrix44(rts.inverse()), matrix44(rts).inverse(),
+                        1.0e-5f));
+    OIIO_CHECK_ASSERT(
+        mx_equal_thresh(matrix44(Mtrans.inverse()), matrix44(Mtrans).inverse(),
+                        1.0e-6f));
+    OIIO_CHECK_ASSERT(
+        mx_equal_thresh(matrix44(Mrot.inverse()), matrix44(Mrot).inverse(),
+                        1.0e-6f));
+    Imath::M44f m123(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 1.0f);
+    OIIO_CHECK_ASSERT(
+        mx_equal_thresh(matrix44(m123.inverse()), matrix44(m123).inverse(),
+                        1.0e-6f));    
+
     OIIO_CHECK_EQUAL(
         matrix44(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
         Imath::M44f(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));