Merge pull request #737 from valassi/icx

Several fixes for icx2023.2 (including fixes for sqrt FPEs in ixx/oxx/vxx)
madgraph5 · Jul 26, 2023 · a4b9d6b · a4b9d6b
2 parents 39de7b3 + b38bea0
commit a4b9d6b
Show file tree

Hide file tree

Showing 116 changed files with 1,815 additions and 730 deletions.
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h
@@ -168,6 +168,7 @@
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -178,7 +179,12 @@
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) );
+#else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+#endif
       // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx:
       // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1
       // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used
@@ -227,9 +233,10 @@
       omega[1] = fmass / omega[0];
       const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0 );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );             // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );          // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ),     // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 cxternary( ( pp3 == 0. ),
                                            cxmake( -nh, 0 ),
                                            cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -247,16 +254,20 @@
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
-                                          fptype_sv{ 0 },
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                            cxternary( sqp0p3 == 0,
                                       cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ),
                                       cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                          fptype_sv{ 0 },
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
 #endif
@@ -413,6 +424,7 @@
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -425,11 +437,11 @@
     if( vmass != 0. )
     {
       const int nsvahl = nsv * std::abs( hel );
+      const fptype hel0 = 1. - std::abs( hel );
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) );
       const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
-      const fptype hel0 = 1. - std::abs( hel );
-#ifndef MGONGPU_CPPSIMD
       if( pp == 0. )
       {
         vc[2] = cxmake( 0., 0. );
@@ -457,6 +469,10 @@
         }
       }
 #else
+      volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
+      volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+      const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
       // Branch A: pp == 0.
       const cxtype vcA_2 = cxmake( 0, 0 );
       const cxtype vcA_3 = cxmake( -hel * sqh, 0 );
@@ -487,7 +503,12 @@
     else
     {
       const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) );
+#else
+      volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736
+      const fptype_sv pt = fpsqrt( pt2 );
+#endif
       vc[2] = cxzero_sv();
       vc[5] = cxmake( hel * pt / pp * sqh, 0. );
 #ifndef MGONGPU_CPPSIMD
@@ -564,6 +585,7 @@
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -574,8 +596,8 @@
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
-      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
 #ifndef MGONGPU_CPPSIMD
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
       if( pp == 0. )
       {
         // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
@@ -608,6 +630,8 @@
         fo[5] = sfomeg[0] * chi[ip];
       }
 #else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
       // Branch A: pp == 0.
       // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
       fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses
@@ -627,9 +651,10 @@
       const int imB = ( 1 - nh ) / 2;
       const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0. );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );              // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );           // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ),    // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 ( cxternary( ( pp3 == 0. ),
                                              cxmake( -nh, 0. ),
                                              cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -647,16 +672,20 @@
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
-                                          0,
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                                 cxternary( ( sqp0p3 == 0. ),
                                            cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
                                            cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
+                                          0,
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
 #endif

diff --git a/...udacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h b/...udacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h
@@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam<TestDriverBase*>
 /// and compares momenta and matrix elements with a reference file.
 TEST_P( MadgraphTest, CompareMomentaAndME )
 {
-  // Set to true to dump events:
-  constexpr bool dumpEvents = false;
-  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
-  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 3.E-2;
+  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 4.E-2; // see #735
 #ifdef __APPLE__
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 3.E-2; // see #583
 #else
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 2.E-3;
 #endif
+  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
+  // Dump events to a new reference file?
+  constexpr bool dumpEvents = false;
   std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt";
   while( dumpFileName.find( '/' ) != std::string::npos )
   {

diff --git a/...hX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/...hX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
@@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common
 LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 INCFLAGS += -I../../src
 
+# Compiler-specific googletest build directory (#125 and #738)
+ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),)
+override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5)
+else ifneq ($(shell $(CXX) --version | egrep '^clang'),)
+override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),)
+override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else
+override CXXNAME = unknown
+endif
+###$(info CXXNAME=$(CXXNAME))
+override CXXNAMESUFFIX = _$(CXXNAME)
+export CXXNAMESUFFIX
+
 # Dependency on test directory
 # Within the madgraph4gpu git repo: by default use a common gtest installation in <topdir>/test (optionally use an external or local gtest)
 # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest)
@@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),)
 TESTDIR =
 else ifneq ($(LOCALGTEST),)
 TESTDIR=$(TESTDIRLOCAL)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),)
 TESTDIR = $(TESTDIRCOMMON)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else
 TESTDIR =
 endif

diff --git a/...dacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk b/...dacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk
@@ -5,28 +5,36 @@
 
 THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 
+# Compiler-specific googletest build directory (#125 and #738)
+# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk
+# In epoch1/epoch2, CXXNAMESUFFIX is undefined
+$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX))
+BUILDDIR = build$(CXXNAMESUFFIX)
+###$(info BUILDDIR=$(BUILDDIR))
+INSTALLDIR = install$(CXXNAMESUFFIX)
+###$(info INSTALLDIR=$(INSTALLDIR))
+
 CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11
 
-all: googletest/install/lib64/libgtest.a
+all: googletest/$(INSTALLDIR)/lib64/libgtest.a
 
 googletest/CMakeLists.txt:
 	git clone https://github.com/google/googletest.git -b release-1.11.0 googletest
 
-googletest/build/Makefile: googletest/CMakeLists.txt
-	mkdir -p googletest/build
-	cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../
+googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt
+	mkdir -p googletest/$(BUILDDIR)
+	cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../
 
-googletest/build/lib/libgtest.a: googletest/build/Makefile
-	$(MAKE) -C googletest/build
+googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile
+	$(MAKE) -C googletest/$(BUILDDIR)
 
 # NB 'make install' is no longer supported in googletest (issue 328)
 # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations
-googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a
-	mkdir -p googletest/install/lib64
-	cp googletest/build/lib/lib*.a googletest/install/lib64/
-	mkdir -p googletest/install/include
-	cp -r googletest/googletest/include/gtest googletest/install/include/
+googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a
+	mkdir -p googletest/$(INSTALLDIR)/lib64
+	cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/
+	mkdir -p googletest/$(INSTALLDIR)/include
+	cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/
 
 clean:
 	rm -rf googletest
-
diff --git a/...acpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/...acpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h
@@ -236,6 +236,20 @@ namespace mg5amcCpu
   // Functions and operators for fptype_v
 
 #ifdef MGONGPU_CPPSIMD
+  inline fptype_v
+  fpsqrt( const volatile fptype_v& v ) // volatile fixes #736
+  {
+    // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt
+    fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594
+    for( int i = 0; i < neppV; i++ )
+    {
+      volatile fptype outi = 0; // volatile fixes #736
+      if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] );
+      out[i] = outi;
+    }
+    return out;
+  }
+
   inline fptype_v
   fpsqrt( const fptype_v& v )
   {

diff --git a/...hX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc b/...hX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc
@@ -47,7 +47,7 @@ namespace mg5amcCpu
 #else
     std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl;
 #endif
-    exit( 0 );
+    exit( 1 );
   }
 }
 
@@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx )
     const fptype p1 = par0[ievt * np4 + 1];
     const fptype p2 = par0[ievt * np4 + 2];
     const fptype p3 = par0[ievt * np4 + 3];
-    mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 );
+    volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736
+    if( m2 > 0 )
+      mass0[ievt] = fpsqrt( (fptype)m2 );
+    else
+      mass0[ievt] = 0;
     ispzgt0[ievt] = ( p3 > 0 );
     ispzlt0[ievt] = ( p3 < 0 );
     isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 );