diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h
index 931b0c5e25..e9d2887c04 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h
@@ -168,6 +168,7 @@
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -178,7 +179,12 @@
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) );
+#else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+#endif
       // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx:
       // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1
       // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used
@@ -227,9 +233,10 @@
       omega[1] = fmass / omega[0];
       const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0 );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );             // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );          // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ),     // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 cxternary( ( pp3 == 0. ),
                                            cxmake( -nh, 0 ),
                                            cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -247,16 +254,20 @@
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
-                                          fptype_sv{ 0 },
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                            cxternary( sqp0p3 == 0,
                                       cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ),
                                       cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                          fptype_sv{ 0 },
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
 #endif
@@ -413,6 +424,7 @@
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -425,11 +437,11 @@
     if( vmass != 0. )
     {
       const int nsvahl = nsv * std::abs( hel );
+      const fptype hel0 = 1. - std::abs( hel );
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) );
       const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
-      const fptype hel0 = 1. - std::abs( hel );
-#ifndef MGONGPU_CPPSIMD
       if( pp == 0. )
       {
         vc[2] = cxmake( 0., 0. );
@@ -457,6 +469,10 @@
         }
       }
 #else
+      volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
+      volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+      const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
       // Branch A: pp == 0.
       const cxtype vcA_2 = cxmake( 0, 0 );
       const cxtype vcA_3 = cxmake( -hel * sqh, 0 );
@@ -487,7 +503,12 @@
     else
     {
       const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) );
+#else
+      volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736
+      const fptype_sv pt = fpsqrt( pt2 );
+#endif
       vc[2] = cxzero_sv();
       vc[5] = cxmake( hel * pt / pp * sqh, 0. );
 #ifndef MGONGPU_CPPSIMD
@@ -564,6 +585,7 @@
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -574,8 +596,8 @@
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
-      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
 #ifndef MGONGPU_CPPSIMD
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
       if( pp == 0. )
       {
         // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
@@ -608,6 +630,8 @@
         fo[5] = sfomeg[0] * chi[ip];
       }
 #else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
       // Branch A: pp == 0.
       // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
       fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses
@@ -627,9 +651,10 @@
       const int imB = ( 1 - nh ) / 2;
       const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0. );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );              // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );           // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ),    // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 ( cxternary( ( pp3 == 0. ),
                                              cxmake( -nh, 0. ),
                                              cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -647,16 +672,20 @@
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
-                                          0,
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                                 cxternary( ( sqp0p3 == 0. ),
                                            cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
                                            cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
+                                          0,
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
 #endif
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h
index fd7734ce42..ffe3b84d53 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h
@@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam<TestDriverBase*>
 /// and compares momenta and matrix elements with a reference file.
 TEST_P( MadgraphTest, CompareMomentaAndME )
 {
-  // Set to true to dump events:
-  constexpr bool dumpEvents = false;
-  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
-  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 3.E-2;
+  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 4.E-2; // see #735
 #ifdef __APPLE__
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 3.E-2; // see #583
 #else
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 2.E-3;
 #endif
+  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
+  // Dump events to a new reference file?
+  constexpr bool dumpEvents = false;
   std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt";
   while( dumpFileName.find( '/' ) != std::string::npos )
   {
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
index 8162dee42b..222d75f846 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
@@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common
 LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 INCFLAGS += -I../../src
 
+# Compiler-specific googletest build directory (#125 and #738): CXXNAME is icpx/clang/gcc followed by the version field of the first line of 'CXX --version', or 'unknown'
+ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),)
+override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5)
+else ifneq ($(shell $(CXX) --version | grep '^clang'),)
+override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),)
+override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else
+override CXXNAME = unknown
+endif
+###$(info CXXNAME=$(CXXNAME))
+override CXXNAMESUFFIX = _$(CXXNAME)
+export CXXNAMESUFFIX
+
 # Dependency on test directory
 # Within the madgraph4gpu git repo: by default use a common gtest installation in <topdir>/test (optionally use an external or local gtest)
 # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest)
@@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),)
 TESTDIR =
 else ifneq ($(LOCALGTEST),)
 TESTDIR=$(TESTDIRLOCAL)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),)
 TESTDIR = $(TESTDIRCOMMON)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else
 TESTDIR =
 endif
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk
index 477a37e27c..39ed957600 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk
@@ -5,28 +5,36 @@
 
 THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 
+# Compiler-specific googletest build directory (#125 and #738)
+# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk
+# In epoch1/epoch2, CXXNAMESUFFIX is undefined
+$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX))
+BUILDDIR = build$(CXXNAMESUFFIX)
+###$(info BUILDDIR=$(BUILDDIR))
+INSTALLDIR = install$(CXXNAMESUFFIX)
+###$(info INSTALLDIR=$(INSTALLDIR))
+
 CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11
 
-all: googletest/install/lib64/libgtest.a
+all: googletest/$(INSTALLDIR)/lib64/libgtest.a
 
 googletest/CMakeLists.txt:
 	git clone https://github.com/google/googletest.git -b release-1.11.0 googletest
 
-googletest/build/Makefile: googletest/CMakeLists.txt
-	mkdir -p googletest/build
-	cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../
+googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt # configure googletest with cmake in the compiler-specific build directory
+	mkdir -p googletest/$(BUILDDIR)
+	cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/$(INSTALLDIR) -DBUILD_GMOCK=OFF ../
 
-googletest/build/lib/libgtest.a: googletest/build/Makefile
-	$(MAKE) -C googletest/build
+googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile
+	$(MAKE) -C googletest/$(BUILDDIR)
 
 # NB 'make install' is no longer supported in googletest (issue 328)
 # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations
-googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a
-	mkdir -p googletest/install/lib64
-	cp googletest/build/lib/lib*.a googletest/install/lib64/
-	mkdir -p googletest/install/include
-	cp -r googletest/googletest/include/gtest googletest/install/include/
+googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a
+	mkdir -p googletest/$(INSTALLDIR)/lib64
+	cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/
+	mkdir -p googletest/$(INSTALLDIR)/include
+	cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/
 
 clean:
 	rm -rf googletest
-
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h
index 0d3892d026..e1299ba81e 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h
@@ -236,6 +236,20 @@ namespace mg5amcCpu
   // Functions and operators for fptype_v
 
 #ifdef MGONGPU_CPPSIMD
+  inline fptype_v
+  fpsqrt( const volatile fptype_v& v ) // overload for volatile inputs: element-by-element sqrt that only calls the scalar fpsqrt where v[i] > 0 (volatile fixes #736)
+  {
+    // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt
+    fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594
+    for( int i = 0; i < neppV; i++ )
+    {
+      volatile fptype outi = 0; // result stays 0 for elements that fail the v[i] > 0 test below (volatile fixes #736)
+      if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] );
+      out[i] = outi;
+    }
+    return out;
+  }
+
   inline fptype_v
   fpsqrt( const fptype_v& v )
   {
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc
index 5a5e1dffbf..3e6569b553 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc
@@ -47,7 +47,7 @@ namespace mg5amcCpu
 #else
     std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl;
 #endif
-    exit( 0 );
+    exit( 1 );
   }
 }
 
@@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx )
     const fptype p1 = par0[ievt * np4 + 1];
     const fptype p2 = par0[ievt * np4 + 2];
     const fptype p3 = par0[ievt * np4 + 3];
-    mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 );
+    volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736
+    if( m2 > 0 )
+      mass0[ievt] = fpsqrt( (fptype)m2 );
+    else
+      mass0[ievt] = 0;
     ispzgt0[ievt] = ( p3 > 0 );
     ispzlt0[ievt] = ( p3 < 0 );
     isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 );
diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
index d838ca8455..759bbd80d8 100644
--- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
+++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
@@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
  MG5_aMC> set lhapdf /PATH/TO/lhapdf-config
 
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
-No valid eps viewer found. Please set in ./input/mg5_configuration.txt
-Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
+Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
+No valid web browser found. Please set in ./input/mg5_configuration.txt
 import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
@@ -62,7 +62,7 @@ generate e+ e- > mu+ mu-
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.004617929458618164 [0m
+[1;32mDEBUG: model prefixing  takes 0.005298137664794922 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -176,7 +176,7 @@ INFO: Creating files in directory P1_epem_mupmum
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.__init__ [1;30m[model_handling.py at line 1039][0m [0m
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1040][0m [0m
 [1;32mDEBUG:  proc_id = [0m 1 [1;30m[model_handling.py at line 1045][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f031e304b80> [1;30m[export_v4.py at line 6174][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f0559c1aa00> [1;30m[export_v4.py at line 6174][0m [0m
 INFO: Creating files in directory . 
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.generate_process_files [1;30m[model_handling.py at line 1297][0m [0m
 [1;32mDEBUG:  self.include_multi_channel is already defined: this is madevent+second_exporter mode [1;30m[model_handling.py at line 1299][0m [0m
@@ -210,12 +210,12 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 
 INFO: Finding symmetric diagrams for subprocess group epem_mupmum 
 Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s
-Wrote files for 8 helas calls in 0.093 s
+Wrote files for 8 helas calls in 0.092 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates FFV2 routines[0m
 ALOHA: aloha creates FFV4 routines[0m
-ALOHA: aloha creates 3 routines in  0.175 s
+ALOHA: aloha creates 3 routines in  0.167 s
 [1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 194][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
@@ -254,6 +254,6 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m1.749s
-user	0m1.524s
-sys	0m0.212s
+real	0m2.155s
+user	0m1.726s
+sys	0m0.362s
diff --git a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc
index 4f385d6435..738db319fd 100644
--- a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc
+++ b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc
@@ -1 +1 @@
-ALOHARoutine = FFV2_3.o FFV2_0.o FFV4_0.o FFV4_3.o FFV1_0.o FFV1P0_3.o
+ALOHARoutine = FFV1_0.o FFV4_3.o FFV1P0_3.o FFV2_0.o FFV4_0.o FFV2_3.o
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h
index fd7734ce42..ffe3b84d53 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h
@@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam<TestDriverBase*>
 /// and compares momenta and matrix elements with a reference file.
 TEST_P( MadgraphTest, CompareMomentaAndME )
 {
-  // Set to true to dump events:
-  constexpr bool dumpEvents = false;
-  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
-  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 3.E-2;
+  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 4.E-2; // see #735
 #ifdef __APPLE__
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 3.E-2; // see #583
 #else
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 2.E-3;
 #endif
+  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
+  // Dump events to a new reference file?
+  constexpr bool dumpEvents = false;
   std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt";
   while( dumpFileName.find( '/' ) != std::string::npos )
   {
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk
index c2f7b8e0f6..43cee0977e 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk
@@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common
 LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 INCFLAGS += -I../../src
 
+# Compiler-specific googletest build directory (#125 and #738): CXXNAME is icpx/clang/gcc followed by the version field of the first line of 'CXX --version', or 'unknown'
+ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),)
+override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5)
+else ifneq ($(shell $(CXX) --version | grep '^clang'),)
+override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),)
+override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else
+override CXXNAME = unknown
+endif
+###$(info CXXNAME=$(CXXNAME))
+override CXXNAMESUFFIX = _$(CXXNAME)
+export CXXNAMESUFFIX
+
 # Dependency on test directory
 # Within the madgraph4gpu git repo: by default use a common gtest installation in <topdir>/test (optionally use an external or local gtest)
 # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest)
@@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),)
 TESTDIR =
 else ifneq ($(LOCALGTEST),)
 TESTDIR=$(TESTDIRLOCAL)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),)
 TESTDIR = $(TESTDIRCOMMON)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else
 TESTDIR =
 endif
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc
index cde18056a9..e40f635e46 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc
@@ -47,7 +47,7 @@ namespace mg5amcCpu
 #else
     std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl;
 #endif
-    exit( 0 );
+    exit( 1 );
   }
 }
 
@@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx )
     const fptype p1 = par0[ievt * np4 + 1];
     const fptype p2 = par0[ievt * np4 + 2];
     const fptype p3 = par0[ievt * np4 + 3];
-    mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 );
+    volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736
+    if( m2 > 0 )
+      mass0[ievt] = fpsqrt( (fptype)m2 );
+    else
+      mass0[ievt] = 0;
     ispzgt0[ievt] = ( p3 > 0 );
     ispzlt0[ievt] = ( p3 < 0 );
     isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 );
diff --git a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h
index 72bdda81f8..d6827356eb 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h
+++ b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h
@@ -195,6 +195,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -205,7 +206,12 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) );
+#else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+#endif
       // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx:
       // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1
       // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used
@@ -254,9 +260,10 @@ namespace mg5amcCpu
       omega[1] = fmass / omega[0];
       const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0 );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );             // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );          // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ),     // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 cxternary( ( pp3 == 0. ),
                                            cxmake( -nh, 0 ),
                                            cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -274,16 +281,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
-                                          fptype_sv{ 0 },
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                            cxternary( sqp0p3 == 0,
                                       cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ),
                                       cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                          fptype_sv{ 0 },
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
 #endif
@@ -440,6 +451,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -452,11 +464,11 @@ namespace mg5amcCpu
     if( vmass != 0. )
     {
       const int nsvahl = nsv * std::abs( hel );
+      const fptype hel0 = 1. - std::abs( hel );
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) );
       const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
-      const fptype hel0 = 1. - std::abs( hel );
-#ifndef MGONGPU_CPPSIMD
       if( pp == 0. )
       {
         vc[2] = cxmake( 0., 0. );
@@ -484,6 +496,10 @@ namespace mg5amcCpu
         }
       }
 #else
+      volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
+      volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+      const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
       // Branch A: pp == 0.
       const cxtype vcA_2 = cxmake( 0, 0 );
       const cxtype vcA_3 = cxmake( -hel * sqh, 0 );
@@ -514,7 +530,12 @@ namespace mg5amcCpu
     else
     {
       const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) );
+#else
+      volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736
+      const fptype_sv pt = fpsqrt( pt2 );
+#endif
       vc[2] = cxzero_sv();
       vc[5] = cxmake( hel * pt / pp * sqh, 0. );
 #ifndef MGONGPU_CPPSIMD
@@ -591,6 +612,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -601,8 +623,8 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
-      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
 #ifndef MGONGPU_CPPSIMD
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
       if( pp == 0. )
       {
         // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
@@ -635,6 +657,8 @@ namespace mg5amcCpu
         fo[5] = sfomeg[0] * chi[ip];
       }
 #else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
       // Branch A: pp == 0.
       // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
       fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses
@@ -654,9 +678,10 @@ namespace mg5amcCpu
       const int imB = ( 1 - nh ) / 2;
       const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0. );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );              // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );           // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ),    // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 ( cxternary( ( pp3 == 0. ),
                                              cxmake( -nh, 0. ),
                                              cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -674,16 +699,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
-                                          0,
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                                 cxternary( ( sqp0p3 == 0. ),
                                            cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
                                            cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
+                                          0,
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
 #endif
diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h
index 0d3892d026..e1299ba81e 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h
@@ -236,6 +236,20 @@ namespace mg5amcCpu
   // Functions and operators for fptype_v
 
 #ifdef MGONGPU_CPPSIMD
+  inline fptype_v // element-wise square root of a vector; elements that are not > 0 give 0
+  fpsqrt( const volatile fptype_v& v ) // volatile fixes #736 (sqrt-of-negative-number FPEs); accepts 'volatile' vectors, which cannot bind to the 'const fptype_v&' overload below
+  {
+    // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt
+    fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594
+    for( int i = 0; i < neppV; i++ ) // scalar loop over indices 0..neppV-1 of the vector
+    {
+      volatile fptype outi = 0; // volatile fixes #736 (0 is the result kept when the scalar sqrt below is skipped)
+      if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); // call the scalar fpsqrt only on strictly positive values
+      out[i] = outi;
+    }
+    return out;
+  }
+
   inline fptype_v
   fpsqrt( const fptype_v& v )
   {
diff --git a/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk b/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk
index 477a37e27c..39ed957600 100644
--- a/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk
+++ b/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk
@@ -5,28 +5,36 @@
 
 THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 
+# Compiler-specific googletest build and install directories (#125 and #738)
+# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk
+# In epoch1/epoch2, CXXNAMESUFFIX is undefined (BUILDDIR and INSTALLDIR are then plain 'build' and 'install')
+$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX))
+BUILDDIR = build$(CXXNAMESUFFIX)
+###$(info BUILDDIR=$(BUILDDIR))
+INSTALLDIR = install$(CXXNAMESUFFIX)
+###$(info INSTALLDIR=$(INSTALLDIR))
+
 CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11
 
-all: googletest/install/lib64/libgtest.a
+all: googletest/$(INSTALLDIR)/lib64/libgtest.a
 
 googletest/CMakeLists.txt:
 	git clone https://github.com/google/googletest.git -b release-1.11.0 googletest
 
-googletest/build/Makefile: googletest/CMakeLists.txt
-	mkdir -p googletest/build
-	cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../
+googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt # configure googletest in the compiler-specific build directory
+	mkdir -p googletest/$(BUILDDIR)
+	cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/$(INSTALLDIR) -DBUILD_GMOCK=OFF ../
 
-googletest/build/lib/libgtest.a: googletest/build/Makefile
-	$(MAKE) -C googletest/build
+googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile
+	$(MAKE) -C googletest/$(BUILDDIR)
 
 # NB 'make install' is no longer supported in googletest (issue 328)
 # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations
-googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a
-	mkdir -p googletest/install/lib64
-	cp googletest/build/lib/lib*.a googletest/install/lib64/
-	mkdir -p googletest/install/include
-	cp -r googletest/googletest/include/gtest googletest/install/include/
+googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a
+	mkdir -p googletest/$(INSTALLDIR)/lib64
+	cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/
+	mkdir -p googletest/$(INSTALLDIR)/include
+	cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/
 
 clean:
 	rm -rf googletest
-
diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
index bea370fede..275576a43d 100644
--- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
+++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
@@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
  MG5_aMC> set lhapdf /PATH/TO/lhapdf-config
 
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
-No valid eps viewer found. Please set in ./input/mg5_configuration.txt
-Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
+Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
+No valid web browser found. Please set in ./input/mg5_configuration.txt
 import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
@@ -62,7 +62,7 @@ generate e+ e- > mu+ mu-
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.0048067569732666016 [0m
+[1;32mDEBUG: model prefixing  takes 0.0053577423095703125 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -202,7 +202,7 @@ ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates FFV2 routines[0m
 ALOHA: aloha creates FFV4 routines[0m
 ALOHA: aloha creates FFV2_4 routines[0m
-ALOHA: aloha creates 4 routines in  0.232 s
+ALOHA: aloha creates 4 routines in  0.216 s
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV2
@@ -225,6 +225,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/.
 [1;32mDEBUG:  Entering PLUGIN_ProcessExporter.finalize [1;30m[output.py at line 203][0m [0m
 quit
 
-real	0m0.660s
-user	0m0.584s
-sys	0m0.063s
+real	0m0.960s
+user	0m0.762s
+sys	0m0.119s
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h
index fd7734ce42..ffe3b84d53 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h
@@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam<TestDriverBase*>
 /// and compares momenta and matrix elements with a reference file.
 TEST_P( MadgraphTest, CompareMomentaAndME )
 {
-  // Set to true to dump events:
-  constexpr bool dumpEvents = false;
-  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
-  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 3.E-2;
+  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 4.E-2; // see #735
 #ifdef __APPLE__
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 3.E-2; // see #583
 #else
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 2.E-3;
 #endif
+  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
+  // Dump events to a new reference file?
+  constexpr bool dumpEvents = false;
   std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt";
   while( dumpFileName.find( '/' ) != std::string::npos )
   {
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk
index c2f7b8e0f6..43cee0977e 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk
@@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common
 LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 INCFLAGS += -I../../src
 
+# Compiler name and version (CXXNAME), exported as the suffix of the compiler-specific googletest build directory (#125 and #738)
+ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),)
+override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5)
+else ifneq ($(shell $(CXX) --version | grep '^clang'),)
+override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),)
+override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else
+override CXXNAME = unknown
+endif
+###$(info CXXNAME=$(CXXNAME))
+override CXXNAMESUFFIX = _$(CXXNAME)
+export CXXNAMESUFFIX
+
 # Dependency on test directory
 # Within the madgraph4gpu git repo: by default use a common gtest installation in <topdir>/test (optionally use an external or local gtest)
 # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest)
@@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),)
 TESTDIR =
 else ifneq ($(LOCALGTEST),)
 TESTDIR=$(TESTDIRLOCAL)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),)
 TESTDIR = $(TESTDIRCOMMON)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else
 TESTDIR =
 endif
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc
index cde18056a9..e40f635e46 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc
@@ -47,7 +47,7 @@ namespace mg5amcCpu
 #else
     std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl;
 #endif
-    exit( 0 );
+    exit( 1 );
   }
 }
 
@@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx )
     const fptype p1 = par0[ievt * np4 + 1];
     const fptype p2 = par0[ievt * np4 + 2];
     const fptype p3 = par0[ievt * np4 + 3];
-    mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 );
+    volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736
+    if( m2 > 0 )
+      mass0[ievt] = fpsqrt( (fptype)m2 );
+    else
+      mass0[ievt] = 0;
     ispzgt0[ievt] = ( p3 > 0 );
     ispzlt0[ievt] = ( p3 < 0 );
     isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 );
diff --git a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h
index 72bdda81f8..d6827356eb 100644
--- a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h
+++ b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h
@@ -195,6 +195,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -205,7 +206,12 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) );
+#else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+#endif
       // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx:
       // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1
       // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used
@@ -254,9 +260,10 @@ namespace mg5amcCpu
       omega[1] = fmass / omega[0];
       const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0 );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );             // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );          // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ),     // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 cxternary( ( pp3 == 0. ),
                                            cxmake( -nh, 0 ),
                                            cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -274,16 +281,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
-                                          fptype_sv{ 0 },
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                            cxternary( sqp0p3 == 0,
                                       cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ),
                                       cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                          fptype_sv{ 0 },
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
 #endif
@@ -440,6 +451,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -452,11 +464,11 @@ namespace mg5amcCpu
     if( vmass != 0. )
     {
       const int nsvahl = nsv * std::abs( hel );
+      const fptype hel0 = 1. - std::abs( hel );
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) );
       const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
-      const fptype hel0 = 1. - std::abs( hel );
-#ifndef MGONGPU_CPPSIMD
       if( pp == 0. )
       {
         vc[2] = cxmake( 0., 0. );
@@ -484,6 +496,10 @@ namespace mg5amcCpu
         }
       }
 #else
+      volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
+      volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+      const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
       // Branch A: pp == 0.
       const cxtype vcA_2 = cxmake( 0, 0 );
       const cxtype vcA_3 = cxmake( -hel * sqh, 0 );
@@ -514,7 +530,12 @@ namespace mg5amcCpu
     else
     {
       const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) );
+#else
+      volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736
+      const fptype_sv pt = fpsqrt( pt2 );
+#endif
       vc[2] = cxzero_sv();
       vc[5] = cxmake( hel * pt / pp * sqh, 0. );
 #ifndef MGONGPU_CPPSIMD
@@ -591,6 +612,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -601,8 +623,8 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
-      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
 #ifndef MGONGPU_CPPSIMD
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
       if( pp == 0. )
       {
         // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
@@ -635,6 +657,8 @@ namespace mg5amcCpu
         fo[5] = sfomeg[0] * chi[ip];
       }
 #else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
       // Branch A: pp == 0.
       // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
       fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses
@@ -654,9 +678,10 @@ namespace mg5amcCpu
       const int imB = ( 1 - nh ) / 2;
       const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0. );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );              // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );           // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ),    // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 ( cxternary( ( pp3 == 0. ),
                                              cxmake( -nh, 0. ),
                                              cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -674,16 +699,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
-                                          0,
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                                 cxternary( ( sqp0p3 == 0. ),
                                            cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
                                            cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
+                                          0,
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
 #endif
diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h
index 0d3892d026..e1299ba81e 100644
--- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h
@@ -236,6 +236,20 @@ namespace mg5amcCpu
   // Functions and operators for fptype_v
 
 #ifdef MGONGPU_CPPSIMD
+  inline fptype_v // element-wise square root of a vector; elements that are not > 0 give 0
+  fpsqrt( const volatile fptype_v& v ) // volatile fixes #736 (sqrt-of-negative-number FPEs); accepts 'volatile' vectors, which cannot bind to the 'const fptype_v&' overload below
+  {
+    // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt
+    fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594
+    for( int i = 0; i < neppV; i++ ) // scalar loop over indices 0..neppV-1 of the vector
+    {
+      volatile fptype outi = 0; // volatile fixes #736 (0 is the result kept when the scalar sqrt below is skipped)
+      if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); // call the scalar fpsqrt only on strictly positive values
+      out[i] = outi;
+    }
+    return out;
+  }
+
   inline fptype_v
   fpsqrt( const fptype_v& v )
   {
diff --git a/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk b/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk
index 477a37e27c..39ed957600 100644
--- a/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk
+++ b/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk
@@ -5,28 +5,36 @@
 
 THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 
+# Compiler-specific googletest build directory (#125 and #738)
+# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk
+# In epoch1/epoch2, CXXNAMESUFFIX is undefined
+$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX))
+BUILDDIR = build$(CXXNAMESUFFIX)
+###$(info BUILDDIR=$(BUILDDIR))
+INSTALLDIR = install$(CXXNAMESUFFIX)
+###$(info INSTALLDIR=$(INSTALLDIR))
+
 CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11
 
-all: googletest/install/lib64/libgtest.a
+all: googletest/$(INSTALLDIR)/lib64/libgtest.a
 
 googletest/CMakeLists.txt:
 	git clone https://github.com/google/googletest.git -b release-1.11.0 googletest
 
-googletest/build/Makefile: googletest/CMakeLists.txt
-	mkdir -p googletest/build
-	cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../
+googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt
+	mkdir -p googletest/$(BUILDDIR)
+	cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/$(INSTALLDIR) -DBUILD_GMOCK=OFF ../
 
-googletest/build/lib/libgtest.a: googletest/build/Makefile
-	$(MAKE) -C googletest/build
+googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile
+	$(MAKE) -C googletest/$(BUILDDIR)
 
 # NB 'make install' is no longer supported in googletest (issue 328)
 # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations
-googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a
-	mkdir -p googletest/install/lib64
-	cp googletest/build/lib/lib*.a googletest/install/lib64/
-	mkdir -p googletest/install/include
-	cp -r googletest/googletest/include/gtest googletest/install/include/
+googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a
+	mkdir -p googletest/$(INSTALLDIR)/lib64
+	cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/
+	mkdir -p googletest/$(INSTALLDIR)/include
+	cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/
 
 clean:
 	rm -rf googletest
-
diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
index fe51359c33..dd4e2c0fba 100644
--- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
+++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
@@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
  MG5_aMC> set lhapdf /PATH/TO/lhapdf-config
 
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
-No valid eps viewer found. Please set in ./input/mg5_configuration.txt
-Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
+Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
+No valid web browser found. Please set in ./input/mg5_configuration.txt
 import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
@@ -62,7 +62,7 @@ generate g g > t t~
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.0047855377197265625 [0m
+[1;32mDEBUG: model prefixing  takes 0.0052073001861572266 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ WEIGHTED<=2 @1  
 INFO: Process has 3 diagrams 
-1 processes with 3 diagrams generated in 0.008 s
+1 processes with 3 diagrams generated in 0.007 s
 Total: 1 processes with 3 diagrams
 output madevent CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
 Load PLUGIN.CUDACPP_SA_OUTPUT
@@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttx
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.__init__ [1;30m[model_handling.py at line 1039][0m [0m
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1040][0m [0m
 [1;32mDEBUG:  proc_id = [0m 1 [1;30m[model_handling.py at line 1045][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f46510d5760> [1;30m[export_v4.py at line 6174][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f95c351daf0> [1;30m[export_v4.py at line 6174][0m [0m
 INFO: Creating files in directory . 
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.generate_process_files [1;30m[model_handling.py at line 1297][0m [0m
 [1;32mDEBUG:  self.include_multi_channel is already defined: this is madevent+second_exporter mode [1;30m[model_handling.py at line 1299][0m [0m
@@ -214,16 +214,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 
 INFO: Finding symmetric diagrams for subprocess group gg_ttx 
 Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s
-Wrote files for 10 helas calls in 0.104 s
+Wrote files for 10 helas calls in 0.102 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates FFV1 routines[0m
-ALOHA: aloha creates 2 routines in  0.259 s
+ALOHA: aloha creates 2 routines in  0.114 s
 [1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 194][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates FFV1 routines[0m
-ALOHA: aloha creates 4 routines in  0.114 s
+ALOHA: aloha creates 4 routines in  0.111 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -254,6 +254,6 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m1.770s
-user	0m1.390s
-sys	0m0.213s
+real	0m2.501s
+user	0m1.613s
+sys	0m0.365s
diff --git a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc
index 59e590217d..5597c614b0 100644
--- a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc
+++ b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc
@@ -1 +1 @@
-ALOHARoutine = FFV1_1.o FFV1_2.o VVV1P0_1.o FFV1_0.o
+ALOHARoutine = FFV1_1.o FFV1_0.o FFV1_2.o VVV1P0_1.o
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h
index fd7734ce42..ffe3b84d53 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h
@@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam<TestDriverBase*>
 /// and compares momenta and matrix elements with a reference file.
 TEST_P( MadgraphTest, CompareMomentaAndME )
 {
-  // Set to true to dump events:
-  constexpr bool dumpEvents = false;
-  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
-  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 3.E-2;
+  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 4.E-2; // see #735
 #ifdef __APPLE__
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 3.E-2; // see #583
 #else
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 2.E-3;
 #endif
+  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
+  // Dump events to a new reference file?
+  constexpr bool dumpEvents = false;
   std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt";
   while( dumpFileName.find( '/' ) != std::string::npos )
   {
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk
index c2f7b8e0f6..43cee0977e 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk
@@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common
 LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 INCFLAGS += -I../../src
 
+# Compiler-specific googletest build directory (#125 and #738): CXXNAME is the compiler name plus the version field cut from the first line of the CXX --version output
+ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),)
+override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5)
+else ifneq ($(shell $(CXX) --version | grep '^clang'),)
+override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),)
+override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else
+override CXXNAME = unknown
+endif
+###$(info CXXNAME=$(CXXNAME))
+override CXXNAMESUFFIX = _$(CXXNAME)
+export CXXNAMESUFFIX
+
 # Dependency on test directory
 # Within the madgraph4gpu git repo: by default use a common gtest installation in <topdir>/test (optionally use an external or local gtest)
 # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest)
@@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),)
 TESTDIR =
 else ifneq ($(LOCALGTEST),)
 TESTDIR=$(TESTDIRLOCAL)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),)
 TESTDIR = $(TESTDIRCOMMON)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else
 TESTDIR =
 endif
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc
index cde18056a9..e40f635e46 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc
@@ -47,7 +47,7 @@ namespace mg5amcCpu
 #else
     std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl;
 #endif
-    exit( 0 );
+    exit( 1 );
   }
 }
 
@@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx )
     const fptype p1 = par0[ievt * np4 + 1];
     const fptype p2 = par0[ievt * np4 + 2];
     const fptype p3 = par0[ievt * np4 + 3];
-    mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 );
+    volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736
+    if( m2 > 0 )
+      mass0[ievt] = fpsqrt( (fptype)m2 );
+    else
+      mass0[ievt] = 0;
     ispzgt0[ievt] = ( p3 > 0 );
     ispzlt0[ievt] = ( p3 < 0 );
     isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 );
diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h
index a6f1b62e82..f6d1694588 100644
--- a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h
+++ b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h
@@ -195,6 +195,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -205,7 +206,12 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) );
+#else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+#endif
       // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx:
       // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1
       // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used
@@ -254,9 +260,10 @@ namespace mg5amcCpu
       omega[1] = fmass / omega[0];
       const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0 );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );             // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );          // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ),     // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 cxternary( ( pp3 == 0. ),
                                            cxmake( -nh, 0 ),
                                            cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -274,16 +281,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
-                                          fptype_sv{ 0 },
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                            cxternary( sqp0p3 == 0,
                                       cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ),
                                       cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                          fptype_sv{ 0 },
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
 #endif
@@ -440,6 +451,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -452,11 +464,11 @@ namespace mg5amcCpu
     if( vmass != 0. )
     {
       const int nsvahl = nsv * std::abs( hel );
+      const fptype hel0 = 1. - std::abs( hel );
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) );
       const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
-      const fptype hel0 = 1. - std::abs( hel );
-#ifndef MGONGPU_CPPSIMD
       if( pp == 0. )
       {
         vc[2] = cxmake( 0., 0. );
@@ -484,6 +496,10 @@ namespace mg5amcCpu
         }
       }
 #else
+      volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
+      volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+      const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
       // Branch A: pp == 0.
       const cxtype vcA_2 = cxmake( 0, 0 );
       const cxtype vcA_3 = cxmake( -hel * sqh, 0 );
@@ -514,7 +530,12 @@ namespace mg5amcCpu
     else
     {
       const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) );
+#else
+      volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736
+      const fptype_sv pt = fpsqrt( pt2 );
+#endif
       vc[2] = cxzero_sv();
       vc[5] = cxmake( hel * pt / pp * sqh, 0. );
 #ifndef MGONGPU_CPPSIMD
@@ -591,6 +612,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -601,8 +623,8 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
-      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
 #ifndef MGONGPU_CPPSIMD
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
       if( pp == 0. )
       {
         // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
@@ -635,6 +657,8 @@ namespace mg5amcCpu
         fo[5] = sfomeg[0] * chi[ip];
       }
 #else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
       // Branch A: pp == 0.
       // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
       fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses
@@ -654,9 +678,10 @@ namespace mg5amcCpu
       const int imB = ( 1 - nh ) / 2;
       const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0. );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );              // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );           // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ),    // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 ( cxternary( ( pp3 == 0. ),
                                              cxmake( -nh, 0. ),
                                              cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -674,16 +699,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
-                                          0,
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                                 cxternary( ( sqp0p3 == 0. ),
                                            cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
                                            cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
+                                          0,
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
 #endif
diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h
index 0d3892d026..e1299ba81e 100644
--- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h
@@ -236,6 +236,20 @@ namespace mg5amcCpu
   // Functions and operators for fptype_v
 
 #ifdef MGONGPU_CPPSIMD
+  inline fptype_v
+  fpsqrt( const volatile fptype_v& v ) // overload for 'volatile' vector arguments: volatile fixes #736
+  {
+    // Element-wise sqrt over the neppV elements (out[i] stays 0 unless v[i] > 0): see https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt
+    fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594
+    for( int i = 0; i < neppV; i++ )
+    {
+      volatile fptype outi = 0; // volatile fixes #736 (this default of 0 is kept if v[i] is not > 0)
+      if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); // fpsqrt on a single fptype element, only called if the element is strictly positive
+      out[i] = outi;
+    }
+    return out;
+  }
+
   inline fptype_v
   fpsqrt( const fptype_v& v )
   {
diff --git a/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk
index 477a37e27c..39ed957600 100644
--- a/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk
+++ b/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk
@@ -5,28 +5,36 @@
 
 THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 
+# Compiler-specific googletest build directory (#125 and #738)
+# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk
+# In epoch1/epoch2, CXXNAMESUFFIX is undefined
+$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX))
+BUILDDIR = build$(CXXNAMESUFFIX)
+###$(info BUILDDIR=$(BUILDDIR))
+INSTALLDIR = install$(CXXNAMESUFFIX)
+###$(info INSTALLDIR=$(INSTALLDIR))
+
 CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11
 
-all: googletest/install/lib64/libgtest.a
+all: googletest/$(INSTALLDIR)/lib64/libgtest.a
 
 googletest/CMakeLists.txt:
 	git clone https://github.com/google/googletest.git -b release-1.11.0 googletest
 
-googletest/build/Makefile: googletest/CMakeLists.txt
-	mkdir -p googletest/build
-	cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../
+googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt
+	mkdir -p googletest/$(BUILDDIR)
+	cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/$(INSTALLDIR) -DBUILD_GMOCK=OFF ../
 
-googletest/build/lib/libgtest.a: googletest/build/Makefile
-	$(MAKE) -C googletest/build
+googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile
+	$(MAKE) -C googletest/$(BUILDDIR)
 
 # NB 'make install' is no longer supported in googletest (issue 328)
 # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations
-googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a
-	mkdir -p googletest/install/lib64
-	cp googletest/build/lib/lib*.a googletest/install/lib64/
-	mkdir -p googletest/install/include
-	cp -r googletest/googletest/include/gtest googletest/install/include/
+googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a
+	mkdir -p googletest/$(INSTALLDIR)/lib64
+	cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/
+	mkdir -p googletest/$(INSTALLDIR)/include
+	cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/
 
 clean:
 	rm -rf googletest
-
diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt
index aa517f68e9..d85f35fcaa 100644
--- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt
+++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt
@@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
  MG5_aMC> set lhapdf /PATH/TO/lhapdf-config
 
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
-No valid eps viewer found. Please set in ./input/mg5_configuration.txt
-Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
+Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
+No valid web browser found. Please set in ./input/mg5_configuration.txt
 import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
@@ -62,7 +62,7 @@ generate g g > t t~
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.0047032833099365234 [0m
+[1;32mDEBUG: model prefixing  takes 0.005174160003662109 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -205,7 +205,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates FFV1 routines[0m
-ALOHA: aloha creates 2 routines in  0.123 s
+ALOHA: aloha creates 2 routines in  0.114 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -227,6 +227,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. a
 [1;32mDEBUG:  Entering PLUGIN_ProcessExporter.finalize [1;30m[output.py at line 203][0m [0m
 quit
 
-real	0m1.339s
-user	0m0.494s
-sys	0m0.060s
+real	0m0.866s
+user	0m0.677s
+sys	0m0.118s
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h
index fd7734ce42..ffe3b84d53 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h
@@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam<TestDriverBase*>
 /// and compares momenta and matrix elements with a reference file.
 TEST_P( MadgraphTest, CompareMomentaAndME )
 {
-  // Set to true to dump events:
-  constexpr bool dumpEvents = false;
-  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
-  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 3.E-2;
+  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 4.E-2; // see #735
 #ifdef __APPLE__
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 3.E-2; // see #583
 #else
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 2.E-3;
 #endif
+  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
+  // Dump events to a new reference file?
+  constexpr bool dumpEvents = false;
   std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt";
   while( dumpFileName.find( '/' ) != std::string::npos )
   {
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk
index c2f7b8e0f6..43cee0977e 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk
@@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common
 LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 INCFLAGS += -I../../src
 
+# Compiler-specific googletest build directory (#125 and #738): CXXNAME is the compiler name plus the version field cut from the first line of the CXX --version output
+ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),)
+override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5)
+else ifneq ($(shell $(CXX) --version | grep '^clang'),)
+override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),)
+override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else
+override CXXNAME = unknown
+endif
+###$(info CXXNAME=$(CXXNAME))
+override CXXNAMESUFFIX = _$(CXXNAME)
+export CXXNAMESUFFIX
+
 # Dependency on test directory
 # Within the madgraph4gpu git repo: by default use a common gtest installation in <topdir>/test (optionally use an external or local gtest)
 # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest)
@@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),)
 TESTDIR =
 else ifneq ($(LOCALGTEST),)
 TESTDIR=$(TESTDIRLOCAL)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),)
 TESTDIR = $(TESTDIRCOMMON)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else
 TESTDIR =
 endif
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc
index cde18056a9..e40f635e46 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc
@@ -47,7 +47,7 @@ namespace mg5amcCpu
 #else
     std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl;
 #endif
-    exit( 0 );
+    exit( 1 );
   }
 }
 
@@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx )
     const fptype p1 = par0[ievt * np4 + 1];
     const fptype p2 = par0[ievt * np4 + 2];
     const fptype p3 = par0[ievt * np4 + 3];
-    mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 );
+    volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736
+    if( m2 > 0 )
+      mass0[ievt] = fpsqrt( (fptype)m2 );
+    else
+      mass0[ievt] = 0;
     ispzgt0[ievt] = ( p3 > 0 );
     ispzlt0[ievt] = ( p3 < 0 );
     isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 );
diff --git a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h
index a6f1b62e82..f6d1694588 100644
--- a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h
+++ b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h
@@ -195,6 +195,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -205,7 +206,12 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) );
+#else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+#endif
       // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx:
       // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1
       // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used
@@ -254,9 +260,10 @@ namespace mg5amcCpu
       omega[1] = fmass / omega[0];
       const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0 );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );             // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );          // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ),     // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 cxternary( ( pp3 == 0. ),
                                            cxmake( -nh, 0 ),
                                            cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -274,16 +281,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
-                                          fptype_sv{ 0 },
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                            cxternary( sqp0p3 == 0,
                                       cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ),
                                       cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                          fptype_sv{ 0 },
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
 #endif
@@ -440,6 +451,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -452,11 +464,11 @@ namespace mg5amcCpu
     if( vmass != 0. )
     {
       const int nsvahl = nsv * std::abs( hel );
+      const fptype hel0 = 1. - std::abs( hel );
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) );
       const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
-      const fptype hel0 = 1. - std::abs( hel );
-#ifndef MGONGPU_CPPSIMD
       if( pp == 0. )
       {
         vc[2] = cxmake( 0., 0. );
@@ -484,6 +496,10 @@ namespace mg5amcCpu
         }
       }
 #else
+      volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
+      volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+      const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
       // Branch A: pp == 0.
       const cxtype vcA_2 = cxmake( 0, 0 );
       const cxtype vcA_3 = cxmake( -hel * sqh, 0 );
@@ -514,7 +530,12 @@ namespace mg5amcCpu
     else
     {
       const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) );
+#else
+      volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736
+      const fptype_sv pt = fpsqrt( pt2 );
+#endif
       vc[2] = cxzero_sv();
       vc[5] = cxmake( hel * pt / pp * sqh, 0. );
 #ifndef MGONGPU_CPPSIMD
@@ -591,6 +612,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -601,8 +623,8 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
-      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
 #ifndef MGONGPU_CPPSIMD
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
       if( pp == 0. )
       {
         // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
@@ -635,6 +657,8 @@ namespace mg5amcCpu
         fo[5] = sfomeg[0] * chi[ip];
       }
 #else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
       // Branch A: pp == 0.
       // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
       fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses
@@ -654,9 +678,10 @@ namespace mg5amcCpu
       const int imB = ( 1 - nh ) / 2;
       const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0. );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );              // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );           // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ),    // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 ( cxternary( ( pp3 == 0. ),
                                              cxmake( -nh, 0. ),
                                              cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -674,16 +699,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
-                                          0,
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                                 cxternary( ( sqp0p3 == 0. ),
                                            cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
                                            cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
+                                          0,
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
 #endif
diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h
index 0d3892d026..e1299ba81e 100644
--- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h
@@ -236,6 +236,20 @@ namespace mg5amcCpu
   // Functions and operators for fptype_v
 
 #ifdef MGONGPU_CPPSIMD
+  inline fptype_v
+  fpsqrt( const volatile fptype_v& v ) // overload for 'volatile' vectors: element-wise sqrt that yields 0 wherever v[i]>0 is false (volatile fixes #736)
+  {
+    // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt
+    fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594
+    for( int i = 0; i < neppV; i++ )
+    {
+      volatile fptype outi = 0; // volatile fixes #736 (outi stays 0 unless v[i]>0, so fpsqrt is only called on strictly positive elements)
+      if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] );
+      out[i] = outi;
+    }
+    return out;
+  }
+
   inline fptype_v
   fpsqrt( const fptype_v& v )
   {
diff --git a/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk
index 477a37e27c..39ed957600 100644
--- a/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk
+++ b/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk
@@ -5,28 +5,36 @@
 
 THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 
+# Compiler-specific googletest build directory (#125 and #738)
+# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk
+# In epoch1/epoch2, CXXNAMESUFFIX is undefined
+$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX))
+BUILDDIR = build$(CXXNAMESUFFIX)
+###$(info BUILDDIR=$(BUILDDIR))
+INSTALLDIR = install$(CXXNAMESUFFIX)
+###$(info INSTALLDIR=$(INSTALLDIR))
+
 CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11
 
-all: googletest/install/lib64/libgtest.a
+all: googletest/$(INSTALLDIR)/lib64/libgtest.a
 
 googletest/CMakeLists.txt:
 	git clone https://github.com/google/googletest.git -b release-1.11.0 googletest
 
-googletest/build/Makefile: googletest/CMakeLists.txt
-	mkdir -p googletest/build
-	cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../
+googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt
+	mkdir -p googletest/$(BUILDDIR)
+	cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/$(INSTALLDIR) -DBUILD_GMOCK=OFF ../
 
-googletest/build/lib/libgtest.a: googletest/build/Makefile
-	$(MAKE) -C googletest/build
+googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile
+	$(MAKE) -C googletest/$(BUILDDIR)
 
 # NB 'make install' is no longer supported in googletest (issue 328)
 # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations
-googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a
-	mkdir -p googletest/install/lib64
-	cp googletest/build/lib/lib*.a googletest/install/lib64/
-	mkdir -p googletest/install/include
-	cp -r googletest/googletest/include/gtest googletest/install/include/
+googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a
+	mkdir -p googletest/$(INSTALLDIR)/lib64
+	cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/
+	mkdir -p googletest/$(INSTALLDIR)/include
+	cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/
 
 clean:
 	rm -rf googletest
-
diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt
index e051b99bce..c4862ef786 100644
--- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt
+++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt
@@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
  MG5_aMC> set lhapdf /PATH/TO/lhapdf-config
 
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
-No valid eps viewer found. Please set in ./input/mg5_configuration.txt
-Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
+Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
+No valid web browser found. Please set in ./input/mg5_configuration.txt
 import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt01g.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
@@ -62,7 +62,7 @@ generate g g > t t~; add process g g > t t~ g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.004668235778808594 [0m
+[1;32mDEBUG: model prefixing  takes 0.0051081180572509766 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -186,7 +186,7 @@ INFO: Creating files in directory P2_gg_ttxg
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.__init__ [1;30m[model_handling.py at line 1039][0m [0m
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1040][0m [0m
 [1;32mDEBUG:  proc_id = [0m 1 [1;30m[model_handling.py at line 1045][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7ff334462af0> [1;30m[export_v4.py at line 6174][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f8237742310> [1;30m[export_v4.py at line 6174][0m [0m
 INFO: Creating files in directory . 
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.generate_process_files [1;30m[model_handling.py at line 1297][0m [0m
 [1;32mDEBUG:  self.include_multi_channel is already defined: this is madevent+second_exporter mode [1;30m[model_handling.py at line 1299][0m [0m
@@ -228,7 +228,7 @@ INFO: Creating files in directory P1_gg_ttx
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.__init__ [1;30m[model_handling.py at line 1039][0m [0m
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1040][0m [0m
 [1;32mDEBUG:  proc_id = [0m 1 [1;30m[model_handling.py at line 1045][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7ff334460c70> [1;30m[export_v4.py at line 6174][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f82378e4cd0> [1;30m[export_v4.py at line 6174][0m [0m
 INFO: Creating files in directory . 
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.generate_process_files [1;30m[model_handling.py at line 1297][0m [0m
 [1;32mDEBUG:  self.include_multi_channel is already defined: this is madevent+second_exporter mode [1;30m[model_handling.py at line 1299][0m [0m
@@ -264,15 +264,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 [1;32mDEBUG:  Done [1;30m[export_cpp.py at line 713][0m [0m
 INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 
 INFO: Finding symmetric diagrams for subprocess group gg_ttx 
-Generated helas calls for 2 subprocesses (19 diagrams) in 0.040 s
-Wrote files for 46 helas calls in 0.249 s
+Generated helas calls for 2 subprocesses (19 diagrams) in 0.039 s
+Wrote files for 46 helas calls in 0.247 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV3 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV4 set of routines with options: P0[0m
-ALOHA: aloha creates 5 routines in  0.285 s
+ALOHA: aloha creates 5 routines in  0.265 s
 [1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 194][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
@@ -280,7 +280,7 @@ ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV3 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV4 set of routines with options: P0[0m
-ALOHA: aloha creates 10 routines in  0.267 s
+ALOHA: aloha creates 10 routines in  0.281 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -316,6 +316,6 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m2.128s
-user	0m1.898s
-sys	0m0.222s
+real	0m2.560s
+user	0m2.136s
+sys	0m0.349s
diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_file.inc
index 4f2ef3d0d8..50c12b0804 100644
--- a/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_file.inc
+++ b/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_file.inc
@@ -1 +1 @@
-ALOHARoutine = VVVV4P0_1.o VVVV3P0_1.o VVVV1P0_1.o FFV1_1.o FFV1_2.o VVV1P0_1.o VVV1_0.o FFV1_0.o FFV1P0_3.o
+ALOHARoutine = FFV1_1.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h
index fd7734ce42..ffe3b84d53 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h
@@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam<TestDriverBase*>
 /// and compares momenta and matrix elements with a reference file.
 TEST_P( MadgraphTest, CompareMomentaAndME )
 {
-  // Set to true to dump events:
-  constexpr bool dumpEvents = false;
-  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
-  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 3.E-2;
+  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 4.E-2; // see #735
 #ifdef __APPLE__
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 3.E-2; // see #583
 #else
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 2.E-3;
 #endif
+  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
+  // Dump events to a new reference file?
+  constexpr bool dumpEvents = false;
   std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt";
   while( dumpFileName.find( '/' ) != std::string::npos )
   {
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk
index c2f7b8e0f6..43cee0977e 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk
@@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common
 LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 INCFLAGS += -I../../src
 
+# Compiler-specific googletest build directory (#125 and #738): CXXNAME is the compiler family followed by the version field cut from '$(CXX) --version'
+ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),)
+override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5)
+else ifneq ($(shell $(CXX) --version | grep '^clang'),)
+override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),)
+override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else
+override CXXNAME = unknown
+endif
+###$(info CXXNAME=$(CXXNAME))
+override CXXNAMESUFFIX = _$(CXXNAME)
+export CXXNAMESUFFIX
+
 # Dependency on test directory
 # Within the madgraph4gpu git repo: by default use a common gtest installation in <topdir>/test (optionally use an external or local gtest)
 # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest)
@@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),)
 TESTDIR =
 else ifneq ($(LOCALGTEST),)
 TESTDIR=$(TESTDIRLOCAL)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),)
 TESTDIR = $(TESTDIRCOMMON)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else
 TESTDIR =
 endif
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc
index cde18056a9..e40f635e46 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc
@@ -47,7 +47,7 @@ namespace mg5amcCpu
 #else
     std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl;
 #endif
-    exit( 0 );
+    exit( 1 );
   }
 }
 
@@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx )
     const fptype p1 = par0[ievt * np4 + 1];
     const fptype p2 = par0[ievt * np4 + 2];
     const fptype p3 = par0[ievt * np4 + 3];
-    mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 );
+    volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736
+    if( m2 > 0 )
+      mass0[ievt] = fpsqrt( (fptype)m2 );
+    else
+      mass0[ievt] = 0;
     ispzgt0[ievt] = ( p3 > 0 );
     ispzlt0[ievt] = ( p3 < 0 );
     isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 );
diff --git a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h
index dd17387b4e..03afcd6a5f 100644
--- a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h
+++ b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h
@@ -195,6 +195,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -205,7 +206,12 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) );
+#else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+#endif
       // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx:
       // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1
       // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used
@@ -254,9 +260,10 @@ namespace mg5amcCpu
       omega[1] = fmass / omega[0];
       const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0 );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );             // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );          // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ),     // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 cxternary( ( pp3 == 0. ),
                                            cxmake( -nh, 0 ),
                                            cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -274,16 +281,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
-                                          fptype_sv{ 0 },
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                            cxternary( sqp0p3 == 0,
                                       cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ),
                                       cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                          fptype_sv{ 0 },
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
 #endif
@@ -440,6 +451,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -452,11 +464,11 @@ namespace mg5amcCpu
     if( vmass != 0. )
     {
       const int nsvahl = nsv * std::abs( hel );
+      const fptype hel0 = 1. - std::abs( hel );
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) );
       const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
-      const fptype hel0 = 1. - std::abs( hel );
-#ifndef MGONGPU_CPPSIMD
       if( pp == 0. )
       {
         vc[2] = cxmake( 0., 0. );
@@ -484,6 +496,10 @@ namespace mg5amcCpu
         }
       }
 #else
+      volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
+      volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+      const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
       // Branch A: pp == 0.
       const cxtype vcA_2 = cxmake( 0, 0 );
       const cxtype vcA_3 = cxmake( -hel * sqh, 0 );
@@ -514,7 +530,12 @@ namespace mg5amcCpu
     else
     {
       const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) );
+#else
+      volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736
+      const fptype_sv pt = fpsqrt( pt2 );
+#endif
       vc[2] = cxzero_sv();
       vc[5] = cxmake( hel * pt / pp * sqh, 0. );
 #ifndef MGONGPU_CPPSIMD
@@ -591,6 +612,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -601,8 +623,8 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
-      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
 #ifndef MGONGPU_CPPSIMD
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
       if( pp == 0. )
       {
         // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
@@ -635,6 +657,8 @@ namespace mg5amcCpu
         fo[5] = sfomeg[0] * chi[ip];
       }
 #else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
       // Branch A: pp == 0.
       // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
       fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses
@@ -654,9 +678,10 @@ namespace mg5amcCpu
       const int imB = ( 1 - nh ) / 2;
       const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0. );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );              // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );           // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ),    // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 ( cxternary( ( pp3 == 0. ),
                                              cxmake( -nh, 0. ),
                                              cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -674,16 +699,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
-                                          0,
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                                 cxternary( ( sqp0p3 == 0. ),
                                            cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
                                            cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
+                                          0,
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
 #endif
diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h
index 0d3892d026..e1299ba81e 100644
--- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h
@@ -236,6 +236,20 @@ namespace mg5amcCpu
   // Functions and operators for fptype_v
 
 #ifdef MGONGPU_CPPSIMD
+  inline fptype_v
+  fpsqrt( const volatile fptype_v& v ) // overload for volatile vectors (fixes #736): per-element sqrt that is only evaluated where v[i] > 0
+  {
+    // Loop over the neppV elements one at a time; see https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt
+    fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594
+    for( int i = 0; i < neppV; i++ )
+    {
+      volatile fptype outi = 0; // volatile fixes #736; outi stays 0 whenever 'v[i] > 0' is false (zero or negative input; the test is also false for NaN)
+      if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] );
+      out[i] = outi;
+    }
+    return out;
+  }
+
   inline fptype_v
   fpsqrt( const fptype_v& v )
   {
diff --git a/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk
index 477a37e27c..39ed957600 100644
--- a/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk
+++ b/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk
@@ -5,28 +5,36 @@
 
 THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 
+# Compiler-specific googletest build directory (#125 and #738)
+# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk
+# In epoch1/epoch2, CXXNAMESUFFIX is undefined
+$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX))
+BUILDDIR = build$(CXXNAMESUFFIX)
+###$(info BUILDDIR=$(BUILDDIR))
+INSTALLDIR = install$(CXXNAMESUFFIX)
+###$(info INSTALLDIR=$(INSTALLDIR))
+
 CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11
 
-all: googletest/install/lib64/libgtest.a
+all: googletest/$(INSTALLDIR)/lib64/libgtest.a
 
 googletest/CMakeLists.txt:
 	git clone https://github.com/google/googletest.git -b release-1.11.0 googletest
 
-googletest/build/Makefile: googletest/CMakeLists.txt
-	mkdir -p googletest/build
-	cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../
+googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt
+	mkdir -p googletest/$(BUILDDIR)
+	cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/$(INSTALLDIR) -DBUILD_GMOCK=OFF ../
 
-googletest/build/lib/libgtest.a: googletest/build/Makefile
-	$(MAKE) -C googletest/build
+googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile
+	$(MAKE) -C googletest/$(BUILDDIR)
 
 # NB 'make install' is no longer supported in googletest (issue 328)
 # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations
-googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a
-	mkdir -p googletest/install/lib64
-	cp googletest/build/lib/lib*.a googletest/install/lib64/
-	mkdir -p googletest/install/include
-	cp -r googletest/googletest/include/gtest googletest/install/include/
+googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a
+	mkdir -p googletest/$(INSTALLDIR)/lib64
+	cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/
+	mkdir -p googletest/$(INSTALLDIR)/include
+	cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/
 
 clean:
 	rm -rf googletest
-
diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt
index 5b2ceae09b..f3823147cb 100644
--- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt
+++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt
@@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
  MG5_aMC> set lhapdf /PATH/TO/lhapdf-config
 
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
-No valid eps viewer found. Please set in ./input/mg5_configuration.txt
-Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
+Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
+No valid web browser found. Please set in ./input/mg5_configuration.txt
 import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
@@ -62,7 +62,7 @@ generate g g > t t~ g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.004907846450805664 [0m
+[1;32mDEBUG: model prefixing  takes 0.005174398422241211 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxg
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.__init__ [1;30m[model_handling.py at line 1039][0m [0m
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1040][0m [0m
 [1;32mDEBUG:  proc_id = [0m 1 [1;30m[model_handling.py at line 1045][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f169b890190> [1;30m[export_v4.py at line 6174][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f92d6f52e20> [1;30m[export_v4.py at line 6174][0m [0m
 INFO: Creating files in directory . 
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.generate_process_files [1;30m[model_handling.py at line 1297][0m [0m
 [1;32mDEBUG:  self.include_multi_channel is already defined: this is madevent+second_exporter mode [1;30m[model_handling.py at line 1299][0m [0m
@@ -215,15 +215,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 [1;32mDEBUG:  Done [1;30m[export_cpp.py at line 713][0m [0m
 INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 
 INFO: Finding symmetric diagrams for subprocess group gg_ttxg 
-Generated helas calls for 1 subprocesses (16 diagrams) in 0.034 s
-Wrote files for 36 helas calls in 0.153 s
+Generated helas calls for 1 subprocesses (16 diagrams) in 0.035 s
+Wrote files for 36 helas calls in 0.151 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV3 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV4 set of routines with options: P0[0m
-ALOHA: aloha creates 5 routines in  0.277 s
+ALOHA: aloha creates 5 routines in  0.265 s
 [1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 194][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
@@ -231,7 +231,7 @@ ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV3 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV4 set of routines with options: P0[0m
-ALOHA: aloha creates 10 routines in  0.261 s
+ALOHA: aloha creates 10 routines in  0.284 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -267,6 +267,6 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m2.058s
-user	0m1.793s
-sys	0m0.205s
+real	0m2.462s
+user	0m2.051s
+sys	0m0.329s
diff --git a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc
index 4f2ef3d0d8..50c12b0804 100644
--- a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc
+++ b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc
@@ -1 +1 @@
-ALOHARoutine = VVVV4P0_1.o VVVV3P0_1.o VVVV1P0_1.o FFV1_1.o FFV1_2.o VVV1P0_1.o VVV1_0.o FFV1_0.o FFV1P0_3.o
+ALOHARoutine = FFV1_1.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h
index fd7734ce42..ffe3b84d53 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h
@@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam<TestDriverBase*>
 /// and compares momenta and matrix elements with a reference file.
 TEST_P( MadgraphTest, CompareMomentaAndME )
 {
-  // Set to true to dump events:
-  constexpr bool dumpEvents = false;
-  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
-  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 3.E-2;
+  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 4.E-2; // see #735
 #ifdef __APPLE__
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 3.E-2; // see #583
 #else
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 2.E-3;
 #endif
+  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
+  // Dump events to a new reference file?
+  constexpr bool dumpEvents = false;
   std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt";
   while( dumpFileName.find( '/' ) != std::string::npos )
   {
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk
index c2f7b8e0f6..43cee0977e 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk
@@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common
 LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 INCFLAGS += -I../../src
 
+# Compiler-specific googletest build directory (#125 and #738): CXXNAME is a compiler tag plus a version field cut from the first line of '$(CXX) --version', or 'unknown'
+ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),)
+override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5)
+else ifneq ($(shell $(CXX) --version | grep '^clang'),)
+override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),)
+override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else
+override CXXNAME = unknown
+endif
+###$(info CXXNAME=$(CXXNAME))
+override CXXNAMESUFFIX = _$(CXXNAME)
+export CXXNAMESUFFIX
+
 # Dependency on test directory
 # Within the madgraph4gpu git repo: by default use a common gtest installation in <topdir>/test (optionally use an external or local gtest)
 # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest)
@@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),)
 TESTDIR =
 else ifneq ($(LOCALGTEST),)
 TESTDIR=$(TESTDIRLOCAL)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),)
 TESTDIR = $(TESTDIRCOMMON)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else
 TESTDIR =
 endif
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc
index cde18056a9..e40f635e46 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc
@@ -47,7 +47,7 @@ namespace mg5amcCpu
 #else
     std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl;
 #endif
-    exit( 0 );
+    exit( 1 );
   }
 }
 
@@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx )
     const fptype p1 = par0[ievt * np4 + 1];
     const fptype p2 = par0[ievt * np4 + 2];
     const fptype p3 = par0[ievt * np4 + 3];
-    mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 );
+    volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736
+    if( m2 > 0 )
+      mass0[ievt] = fpsqrt( (fptype)m2 );
+    else
+      mass0[ievt] = 0;
     ispzgt0[ievt] = ( p3 > 0 );
     ispzlt0[ievt] = ( p3 < 0 );
     isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 );
diff --git a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h
index dd17387b4e..03afcd6a5f 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h
+++ b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h
@@ -195,6 +195,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -205,7 +206,12 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) );
+#else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+#endif
       // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx:
       // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1
       // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used
@@ -254,9 +260,10 @@ namespace mg5amcCpu
       omega[1] = fmass / omega[0];
       const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0 );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );             // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );          // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ),     // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 cxternary( ( pp3 == 0. ),
                                            cxmake( -nh, 0 ),
                                            cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -274,16 +281,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
-                                          fptype_sv{ 0 },
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                            cxternary( sqp0p3 == 0,
                                       cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ),
                                       cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                          fptype_sv{ 0 },
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
 #endif
@@ -440,6 +451,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -452,11 +464,11 @@ namespace mg5amcCpu
     if( vmass != 0. )
     {
       const int nsvahl = nsv * std::abs( hel );
+      const fptype hel0 = 1. - std::abs( hel );
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) );
       const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
-      const fptype hel0 = 1. - std::abs( hel );
-#ifndef MGONGPU_CPPSIMD
       if( pp == 0. )
       {
         vc[2] = cxmake( 0., 0. );
@@ -484,6 +496,10 @@ namespace mg5amcCpu
         }
       }
 #else
+      volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
+      volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+      const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
       // Branch A: pp == 0.
       const cxtype vcA_2 = cxmake( 0, 0 );
       const cxtype vcA_3 = cxmake( -hel * sqh, 0 );
@@ -514,7 +530,12 @@ namespace mg5amcCpu
     else
     {
       const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) );
+#else
+      volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736
+      const fptype_sv pt = fpsqrt( pt2 );
+#endif
       vc[2] = cxzero_sv();
       vc[5] = cxmake( hel * pt / pp * sqh, 0. );
 #ifndef MGONGPU_CPPSIMD
@@ -591,6 +612,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -601,8 +623,8 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
-      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
 #ifndef MGONGPU_CPPSIMD
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
       if( pp == 0. )
       {
         // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
@@ -635,6 +657,8 @@ namespace mg5amcCpu
         fo[5] = sfomeg[0] * chi[ip];
       }
 #else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
       // Branch A: pp == 0.
       // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
       fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses
@@ -654,9 +678,10 @@ namespace mg5amcCpu
       const int imB = ( 1 - nh ) / 2;
       const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0. );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );              // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );           // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ),    // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 ( cxternary( ( pp3 == 0. ),
                                              cxmake( -nh, 0. ),
                                              cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -674,16 +699,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
-                                          0,
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                                 cxternary( ( sqp0p3 == 0. ),
                                            cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
                                            cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
+                                          0,
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
 #endif
diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h
index 0d3892d026..e1299ba81e 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h
@@ -236,6 +236,20 @@ namespace mg5amcCpu
   // Functions and operators for fptype_v
 
 #ifdef MGONGPU_CPPSIMD
+  inline fptype_v
+  fpsqrt( const volatile fptype_v& v ) // overload for volatile vectors (fixes #736): per-element sqrt that is only evaluated where v[i] > 0
+  {
+    // Loop over the neppV elements one at a time; see https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt
+    fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594
+    for( int i = 0; i < neppV; i++ )
+    {
+      volatile fptype outi = 0; // volatile fixes #736; outi stays 0 whenever 'v[i] > 0' is false (zero or negative input; the test is also false for NaN)
+      if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] );
+      out[i] = outi;
+    }
+    return out;
+  }
+
   inline fptype_v
   fpsqrt( const fptype_v& v )
   {
diff --git a/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk
index 477a37e27c..39ed957600 100644
--- a/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk
+++ b/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk
@@ -5,28 +5,36 @@
 
 THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 
+# Compiler-specific googletest build and install directories (#125 and #738)
+# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk
+# In epoch1/epoch2, CXXNAMESUFFIX is undefined, so BUILDDIR and INSTALLDIR below reduce to plain 'build' and 'install'
+$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX))
+BUILDDIR = build$(CXXNAMESUFFIX)
+###$(info BUILDDIR=$(BUILDDIR))
+INSTALLDIR = install$(CXXNAMESUFFIX)
+###$(info INSTALLDIR=$(INSTALLDIR))
+
 CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11
 
-all: googletest/install/lib64/libgtest.a
+all: googletest/$(INSTALLDIR)/lib64/libgtest.a
 
 googletest/CMakeLists.txt:
 	git clone https://github.com/google/googletest.git -b release-1.11.0 googletest
 
-googletest/build/Makefile: googletest/CMakeLists.txt
-	mkdir -p googletest/build
-	cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../
+googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt # configure googletest with cmake in the compiler-specific build directory
+	mkdir -p googletest/$(BUILDDIR)
+	cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/$(INSTALLDIR) -DBUILD_GMOCK=OFF ../
 
-googletest/build/lib/libgtest.a: googletest/build/Makefile
-	$(MAKE) -C googletest/build
+googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile
+	$(MAKE) -C googletest/$(BUILDDIR)
 
 # NB 'make install' is no longer supported in googletest (issue 328)
 # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations
-googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a
-	mkdir -p googletest/install/lib64
-	cp googletest/build/lib/lib*.a googletest/install/lib64/
-	mkdir -p googletest/install/include
-	cp -r googletest/googletest/include/gtest googletest/install/include/
+googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a
+	mkdir -p googletest/$(INSTALLDIR)/lib64
+	cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/
+	mkdir -p googletest/$(INSTALLDIR)/include
+	cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/
 
 clean:
 	rm -rf googletest
-
diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
index 66f01e21f5..7ec640308e 100644
--- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
+++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
@@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
  MG5_aMC> set lhapdf /PATH/TO/lhapdf-config
 
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
-No valid eps viewer found. Please set in ./input/mg5_configuration.txt
-Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
+Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
+No valid web browser found. Please set in ./input/mg5_configuration.txt
 import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
@@ -62,7 +62,7 @@ generate g g > t t~ g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.004968166351318359 [0m
+[1;32mDEBUG: model prefixing  takes 0.0051500797271728516 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1  
 INFO: Process has 16 diagrams 
-1 processes with 16 diagrams generated in 0.020 s
+1 processes with 16 diagrams generated in 0.019 s
 Total: 1 processes with 16 diagrams
 output standalone_cudacpp CODEGEN_cudacpp_gg_ttg
 Load PLUGIN.CUDACPP_SA_OUTPUT
@@ -210,7 +210,7 @@ ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV3 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV4 set of routines with options: P0[0m
-ALOHA: aloha creates 5 routines in  0.292 s
+ALOHA: aloha creates 5 routines in  0.269 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -237,6 +237,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/.
 [1;32mDEBUG:  Entering PLUGIN_ProcessExporter.finalize [1;30m[output.py at line 203][0m [0m
 quit
 
-real	0m0.907s
-user	0m0.720s
-sys	0m0.069s
+real	0m1.092s
+user	0m0.905s
+sys	0m0.108s
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h
index fd7734ce42..ffe3b84d53 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h
@@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam<TestDriverBase*>
 /// and compares momenta and matrix elements with a reference file.
 TEST_P( MadgraphTest, CompareMomentaAndME )
 {
-  // Set to true to dump events:
-  constexpr bool dumpEvents = false;
-  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
-  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 3.E-2;
+  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 4.E-2; // see #735
 #ifdef __APPLE__
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 3.E-2; // see #583
 #else
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 2.E-3;
 #endif
+  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
+  // Dump events to a new reference file?
+  constexpr bool dumpEvents = false;
   std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt";
   while( dumpFileName.find( '/' ) != std::string::npos )
   {
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
index c2f7b8e0f6..43cee0977e 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
@@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common
 LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 INCFLAGS += -I../../src
 
+# Compiler-specific googletest build directory (#125 and #738): CXXNAME is icpx/clang/gcc followed by a field cut from the first line of '$(CXX) --version', or 'unknown' if no pattern matches
+ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),)
+override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5)
+else ifneq ($(shell $(CXX) --version | grep '^clang'),)
+override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),)
+override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else
+override CXXNAME = unknown
+endif
+###$(info CXXNAME=$(CXXNAME))
+override CXXNAMESUFFIX = _$(CXXNAME)
+export CXXNAMESUFFIX
+
 # Dependency on test directory
 # Within the madgraph4gpu git repo: by default use a common gtest installation in <topdir>/test (optionally use an external or local gtest)
 # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest)
@@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),)
 TESTDIR =
 else ifneq ($(LOCALGTEST),)
 TESTDIR=$(TESTDIRLOCAL)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),)
 TESTDIR = $(TESTDIRCOMMON)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else
 TESTDIR =
 endif
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc
index cde18056a9..e40f635e46 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc
@@ -47,7 +47,7 @@ namespace mg5amcCpu
 #else
     std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl;
 #endif
-    exit( 0 );
+    exit( 1 );
   }
 }
 
@@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx )
     const fptype p1 = par0[ievt * np4 + 1];
     const fptype p2 = par0[ievt * np4 + 2];
     const fptype p3 = par0[ievt * np4 + 3];
-    mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 );
+    volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736
+    if( m2 > 0 )
+      mass0[ievt] = fpsqrt( (fptype)m2 );
+    else
+      mass0[ievt] = 0;
     ispzgt0[ievt] = ( p3 > 0 );
     ispzlt0[ievt] = ( p3 < 0 );
     isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 );
diff --git a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h
index dd17387b4e..03afcd6a5f 100644
--- a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h
+++ b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h
@@ -195,6 +195,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -205,7 +206,12 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) );
+#else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+#endif
       // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx:
       // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1
       // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used
@@ -254,9 +260,10 @@ namespace mg5amcCpu
       omega[1] = fmass / omega[0];
       const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0 );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );             // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );          // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ),     // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 cxternary( ( pp3 == 0. ),
                                            cxmake( -nh, 0 ),
                                            cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -274,16 +281,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
-                                          fptype_sv{ 0 },
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                            cxternary( sqp0p3 == 0,
                                       cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ),
                                       cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                          fptype_sv{ 0 },
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
 #endif
@@ -440,6 +451,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -452,11 +464,11 @@ namespace mg5amcCpu
     if( vmass != 0. )
     {
       const int nsvahl = nsv * std::abs( hel );
+      const fptype hel0 = 1. - std::abs( hel );
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) );
       const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
-      const fptype hel0 = 1. - std::abs( hel );
-#ifndef MGONGPU_CPPSIMD
       if( pp == 0. )
       {
         vc[2] = cxmake( 0., 0. );
@@ -484,6 +496,10 @@ namespace mg5amcCpu
         }
       }
 #else
+      volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
+      volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+      const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
       // Branch A: pp == 0.
       const cxtype vcA_2 = cxmake( 0, 0 );
       const cxtype vcA_3 = cxmake( -hel * sqh, 0 );
@@ -514,7 +530,12 @@ namespace mg5amcCpu
     else
     {
       const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) );
+#else
+      volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736
+      const fptype_sv pt = fpsqrt( pt2 );
+#endif
       vc[2] = cxzero_sv();
       vc[5] = cxmake( hel * pt / pp * sqh, 0. );
 #ifndef MGONGPU_CPPSIMD
@@ -591,6 +612,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -601,8 +623,8 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
-      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
 #ifndef MGONGPU_CPPSIMD
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
       if( pp == 0. )
       {
         // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
@@ -635,6 +657,8 @@ namespace mg5amcCpu
         fo[5] = sfomeg[0] * chi[ip];
       }
 #else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
       // Branch A: pp == 0.
       // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
       fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses
@@ -654,9 +678,10 @@ namespace mg5amcCpu
       const int imB = ( 1 - nh ) / 2;
       const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0. );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );              // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );           // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ),    // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 ( cxternary( ( pp3 == 0. ),
                                              cxmake( -nh, 0. ),
                                              cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -674,16 +699,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
-                                          0,
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                                 cxternary( ( sqp0p3 == 0. ),
                                            cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
                                            cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
+                                          0,
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
 #endif
diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h
index 0d3892d026..e1299ba81e 100644
--- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h
@@ -236,6 +236,20 @@ namespace mg5amcCpu
   // Functions and operators for fptype_v
 
 #ifdef MGONGPU_CPPSIMD
+  inline fptype_v
+  fpsqrt( const volatile fptype_v& v ) // volatile fixes #736: overload for volatile vectors, returns 0 for every element that is not strictly positive
+  {
+    // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt
+    fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594
+    for( int i = 0; i < neppV; i++ ) // element-by-element loop over the neppV entries of the vector
+    {
+      volatile fptype outi = 0; // volatile fixes #736 (outi stays 0 unless v[i] > 0)
+      if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); // fpsqrt is only called on elements that passed the v[i] > 0 check
+      out[i] = outi;
+    }
+    return out;
+  }
+
   inline fptype_v
   fpsqrt( const fptype_v& v )
   {
diff --git a/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk
index 477a37e27c..39ed957600 100644
--- a/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk
+++ b/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk
@@ -5,28 +5,36 @@
 
 THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 
+# Compiler-specific googletest build and install directories (#125 and #738)
+# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk
+# In epoch1/epoch2, CXXNAMESUFFIX is undefined, so BUILDDIR and INSTALLDIR below reduce to plain 'build' and 'install'
+$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX))
+BUILDDIR = build$(CXXNAMESUFFIX)
+###$(info BUILDDIR=$(BUILDDIR))
+INSTALLDIR = install$(CXXNAMESUFFIX)
+###$(info INSTALLDIR=$(INSTALLDIR))
+
 CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11
 
-all: googletest/install/lib64/libgtest.a
+all: googletest/$(INSTALLDIR)/lib64/libgtest.a
 
 googletest/CMakeLists.txt:
 	git clone https://github.com/google/googletest.git -b release-1.11.0 googletest
 
-googletest/build/Makefile: googletest/CMakeLists.txt
-	mkdir -p googletest/build
-	cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../
+googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt # configure googletest with cmake in the compiler-specific build directory
+	mkdir -p googletest/$(BUILDDIR)
+	cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/$(INSTALLDIR) -DBUILD_GMOCK=OFF ../
 
-googletest/build/lib/libgtest.a: googletest/build/Makefile
-	$(MAKE) -C googletest/build
+googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile
+	$(MAKE) -C googletest/$(BUILDDIR)
 
 # NB 'make install' is no longer supported in googletest (issue 328)
 # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations
-googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a
-	mkdir -p googletest/install/lib64
-	cp googletest/build/lib/lib*.a googletest/install/lib64/
-	mkdir -p googletest/install/include
-	cp -r googletest/googletest/include/gtest googletest/install/include/
+googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a
+	mkdir -p googletest/$(INSTALLDIR)/lib64
+	cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/
+	mkdir -p googletest/$(INSTALLDIR)/include
+	cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/
 
 clean:
 	rm -rf googletest
-
diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt
index c8f29ccb7d..8c2a3bf79e 100644
--- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt
+++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt
@@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
  MG5_aMC> set lhapdf /PATH/TO/lhapdf-config
 
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
-No valid eps viewer found. Please set in ./input/mg5_configuration.txt
-Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
+Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
+No valid web browser found. Please set in ./input/mg5_configuration.txt
 import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
@@ -62,7 +62,7 @@ generate g g > t t~ g g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.005085468292236328 [0m
+[1;32mDEBUG: model prefixing  takes 0.0053708553314208984 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1  
 INFO: Process has 123 diagrams 
-1 processes with 123 diagrams generated in 0.143 s
+1 processes with 123 diagrams generated in 0.144 s
 Total: 1 processes with 123 diagrams
 output madevent CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
 Load PLUGIN.CUDACPP_SA_OUTPUT
@@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxgg
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.__init__ [1;30m[model_handling.py at line 1039][0m [0m
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1040][0m [0m
 [1;32mDEBUG:  proc_id = [0m 1 [1;30m[model_handling.py at line 1045][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fad092fa0d0> [1;30m[export_v4.py at line 6174][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f0ffa4de340> [1;30m[export_v4.py at line 6174][0m [0m
 INFO: Creating files in directory . 
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.generate_process_files [1;30m[model_handling.py at line 1297][0m [0m
 [1;32mDEBUG:  self.include_multi_channel is already defined: this is madevent+second_exporter mode [1;30m[model_handling.py at line 1299][0m [0m
@@ -217,15 +217,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 [1;32mDEBUG:  Done [1;30m[export_cpp.py at line 713][0m [0m
 INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 
 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg 
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.384 s
-Wrote files for 222 helas calls in 0.659 s
+Generated helas calls for 1 subprocesses (123 diagrams) in 0.385 s
+Wrote files for 222 helas calls in 0.669 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 5 routines in  0.282 s
+ALOHA: aloha creates 5 routines in  0.277 s
 [1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 194][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
@@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 10 routines in  0.271 s
+ALOHA: aloha creates 10 routines in  0.268 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -272,6 +272,6 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m3.022s
-user	0m2.797s
-sys	0m0.209s
+real	0m3.653s
+user	0m3.071s
+sys	0m0.363s
diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc
index cf4ec946f8..ec923afd6d 100644
--- a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc
+++ b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc
@@ -1 +1 @@
-ALOHARoutine = VVVV3_0.o VVVV4P0_1.o VVVV3P0_1.o VVVV1P0_1.o FFV1_1.o FFV1_2.o VVV1P0_1.o VVV1_0.o FFV1_0.o FFV1P0_3.o VVVV1_0.o VVVV4_0.o
+ALOHARoutine = FFV1_1.o VVVV4_0.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3_0.o VVVV1_0.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h
index fd7734ce42..ffe3b84d53 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h
@@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam<TestDriverBase*>
 /// and compares momenta and matrix elements with a reference file.
 TEST_P( MadgraphTest, CompareMomentaAndME )
 {
-  // Set to true to dump events:
-  constexpr bool dumpEvents = false;
-  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
-  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 3.E-2;
+  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 4.E-2; // see #735
 #ifdef __APPLE__
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 3.E-2; // see #583
 #else
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 2.E-3;
 #endif
+  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
+  // Dump events to a new reference file?
+  constexpr bool dumpEvents = false;
   std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt";
   while( dumpFileName.find( '/' ) != std::string::npos )
   {
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
index c2f7b8e0f6..43cee0977e 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
@@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common
 LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 INCFLAGS += -I../../src
 
+# Compiler-specific googletest build directory (#125 and #738): CXXNAME is icpx/clang/gcc followed by a field cut from the first line of '$(CXX) --version', or 'unknown' if no pattern matches
+ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),)
+override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5)
+else ifneq ($(shell $(CXX) --version | grep '^clang'),)
+override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),)
+override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else
+override CXXNAME = unknown
+endif
+###$(info CXXNAME=$(CXXNAME))
+override CXXNAMESUFFIX = _$(CXXNAME)
+export CXXNAMESUFFIX
+
 # Dependency on test directory
 # Within the madgraph4gpu git repo: by default use a common gtest installation in <topdir>/test (optionally use an external or local gtest)
 # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest)
@@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),)
 TESTDIR =
 else ifneq ($(LOCALGTEST),)
 TESTDIR=$(TESTDIRLOCAL)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),)
 TESTDIR = $(TESTDIRCOMMON)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else
 TESTDIR =
 endif
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc
index cde18056a9..e40f635e46 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc
@@ -47,7 +47,7 @@ namespace mg5amcCpu
 #else
     std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl;
 #endif
-    exit( 0 );
+    exit( 1 );
   }
 }
 
@@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx )
     const fptype p1 = par0[ievt * np4 + 1];
     const fptype p2 = par0[ievt * np4 + 2];
     const fptype p3 = par0[ievt * np4 + 3];
-    mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 );
+    volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736
+    if( m2 > 0 )
+      mass0[ievt] = fpsqrt( (fptype)m2 );
+    else
+      mass0[ievt] = 0;
     ispzgt0[ievt] = ( p3 > 0 );
     ispzlt0[ievt] = ( p3 < 0 );
     isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 );
diff --git a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h
index b68cdea0db..6db5ca82f3 100644
--- a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h
+++ b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h
@@ -195,6 +195,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -205,7 +206,12 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) );
+#else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+#endif
       // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx:
       // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1
       // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used
@@ -254,9 +260,10 @@ namespace mg5amcCpu
       omega[1] = fmass / omega[0];
       const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0 );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );             // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );          // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ),     // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 cxternary( ( pp3 == 0. ),
                                            cxmake( -nh, 0 ),
                                            cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -274,16 +281,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
-                                          fptype_sv{ 0 },
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                            cxternary( sqp0p3 == 0,
                                       cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ),
                                       cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                          fptype_sv{ 0 },
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
 #endif
@@ -440,6 +451,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -452,11 +464,11 @@ namespace mg5amcCpu
     if( vmass != 0. )
     {
       const int nsvahl = nsv * std::abs( hel );
+      const fptype hel0 = 1. - std::abs( hel );
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) );
       const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
-      const fptype hel0 = 1. - std::abs( hel );
-#ifndef MGONGPU_CPPSIMD
       if( pp == 0. )
       {
         vc[2] = cxmake( 0., 0. );
@@ -484,6 +496,10 @@ namespace mg5amcCpu
         }
       }
 #else
+      volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
+      volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+      const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
       // Branch A: pp == 0.
       const cxtype vcA_2 = cxmake( 0, 0 );
       const cxtype vcA_3 = cxmake( -hel * sqh, 0 );
@@ -514,7 +530,12 @@ namespace mg5amcCpu
     else
     {
       const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) );
+#else
+      volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736
+      const fptype_sv pt = fpsqrt( pt2 );
+#endif
       vc[2] = cxzero_sv();
       vc[5] = cxmake( hel * pt / pp * sqh, 0. );
 #ifndef MGONGPU_CPPSIMD
@@ -591,6 +612,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -601,8 +623,8 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
-      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
 #ifndef MGONGPU_CPPSIMD
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
       if( pp == 0. )
       {
         // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
@@ -635,6 +657,8 @@ namespace mg5amcCpu
         fo[5] = sfomeg[0] * chi[ip];
       }
 #else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
       // Branch A: pp == 0.
       // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
       fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses
@@ -654,9 +678,10 @@ namespace mg5amcCpu
       const int imB = ( 1 - nh ) / 2;
       const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0. );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );              // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );           // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ),    // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 ( cxternary( ( pp3 == 0. ),
                                              cxmake( -nh, 0. ),
                                              cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -674,16 +699,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
-                                          0,
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                                 cxternary( ( sqp0p3 == 0. ),
                                            cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
                                            cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
+                                          0,
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
 #endif
diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h
index 0d3892d026..e1299ba81e 100644
--- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h
@@ -236,6 +236,20 @@ namespace mg5amcCpu
   // Functions and operators for fptype_v
 
 #ifdef MGONGPU_CPPSIMD
+  inline fptype_v
+  fpsqrt( const volatile fptype_v& v ) // overload for volatile SIMD vectors: element-wise sqrt, yielding 0 for any element that is not > 0 (volatile fixes #736)
+  {
+    // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt (the sqrt is applied in a loop over the neppV vector elements)
+    fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594
+    for( int i = 0; i < neppV; i++ )
+    {
+      volatile fptype outi = 0; // stays 0 unless v[i] > 0; volatile fixes #736
+      if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); // the scalar sqrt is only evaluated for strictly positive elements
+      out[i] = outi;
+    }
+    return out;
+  }
+
   inline fptype_v
   fpsqrt( const fptype_v& v )
   {
diff --git a/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk
index 477a37e27c..39ed957600 100644
--- a/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk
+++ b/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk
@@ -5,28 +5,36 @@
 
 THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 
+# Compiler-specific googletest build directory (#125 and #738)
+# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk
+# In epoch1/epoch2, CXXNAMESUFFIX is undefined
+$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX))
+BUILDDIR = build$(CXXNAMESUFFIX)
+###$(info BUILDDIR=$(BUILDDIR))
+INSTALLDIR = install$(CXXNAMESUFFIX)
+###$(info INSTALLDIR=$(INSTALLDIR))
+
 CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11
 
-all: googletest/install/lib64/libgtest.a
+all: googletest/$(INSTALLDIR)/lib64/libgtest.a
 
 googletest/CMakeLists.txt:
 	git clone https://github.com/google/googletest.git -b release-1.11.0 googletest
 
-googletest/build/Makefile: googletest/CMakeLists.txt
-	mkdir -p googletest/build
-	cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../
+googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt # configure googletest in the compiler-specific build directory, with an install prefix matching $(INSTALLDIR)
+	mkdir -p googletest/$(BUILDDIR)
+	cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/$(INSTALLDIR) -DBUILD_GMOCK=OFF ../
 
-googletest/build/lib/libgtest.a: googletest/build/Makefile
-	$(MAKE) -C googletest/build
+googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile
+	$(MAKE) -C googletest/$(BUILDDIR)
 
 # NB 'make install' is no longer supported in googletest (issue 328)
 # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations
-googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a
-	mkdir -p googletest/install/lib64
-	cp googletest/build/lib/lib*.a googletest/install/lib64/
-	mkdir -p googletest/install/include
-	cp -r googletest/googletest/include/gtest googletest/install/include/
+googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a
+	mkdir -p googletest/$(INSTALLDIR)/lib64
+	cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/
+	mkdir -p googletest/$(INSTALLDIR)/include
+	cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/
 
 clean:
 	rm -rf googletest
-
diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
index d778364a78..7204c0dd4c 100644
--- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
+++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
@@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
  MG5_aMC> set lhapdf /PATH/TO/lhapdf-config
 
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
-No valid eps viewer found. Please set in ./input/mg5_configuration.txt
-Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
+Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
+No valid web browser found. Please set in ./input/mg5_configuration.txt
 import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
@@ -62,7 +62,7 @@ generate g g > t t~ g g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.004860401153564453 [0m
+[1;32mDEBUG: model prefixing  takes 0.005178928375244141 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1  
 INFO: Process has 123 diagrams 
-1 processes with 123 diagrams generated in 0.143 s
+1 processes with 123 diagrams generated in 0.145 s
 Total: 1 processes with 123 diagrams
 output standalone_cudacpp CODEGEN_cudacpp_gg_ttgg
 Load PLUGIN.CUDACPP_SA_OUTPUT
@@ -204,7 +204,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.edit_memorybuffers [1;30m[model_handling.py at line 1430][0m [0m
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [1;30m[model_handling.py at line 1441][0m [0m
 [1;32mDEBUG:  'Copying test reference file: ', template_ref  = [0m Copying test reference file:  /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxgg.txt [1;30m[model_handling.py at line 1335][0m [0m
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.381 s
+Generated helas calls for 1 subprocesses (123 diagrams) in 0.378 s
 [1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 194][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
@@ -212,7 +212,7 @@ ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 5 routines in  0.276 s
+ALOHA: aloha creates 5 routines in  0.271 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -242,6 +242,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/.
 [1;32mDEBUG:  Entering PLUGIN_ProcessExporter.finalize [1;30m[output.py at line 203][0m [0m
 quit
 
-real	0m1.371s
-user	0m1.299s
-sys	0m0.056s
+real	0m1.686s
+user	0m1.484s
+sys	0m0.120s
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h
index fd7734ce42..ffe3b84d53 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h
@@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam<TestDriverBase*>
 /// and compares momenta and matrix elements with a reference file.
 TEST_P( MadgraphTest, CompareMomentaAndME )
 {
-  // Set to true to dump events:
-  constexpr bool dumpEvents = false;
-  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
-  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 3.E-2;
+  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 4.E-2; // see #735
 #ifdef __APPLE__
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 3.E-2; // see #583
 #else
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 2.E-3;
 #endif
+  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
+  // Dump events to a new reference file?
+  constexpr bool dumpEvents = false;
   std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt";
   while( dumpFileName.find( '/' ) != std::string::npos )
   {
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk
index c2f7b8e0f6..43cee0977e 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk
@@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common
 LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 INCFLAGS += -I../../src
 
+# Compiler-specific googletest build directory (#125 and #738): derive CXXNAME from the output of '$(CXX) --version' and export CXXNAMESUFFIX=_$(CXXNAME) to sub-makes
+ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),)
+override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5)
+else ifneq ($(shell $(CXX) --version | grep '^clang'),)
+override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),)
+override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else
+override CXXNAME = unknown
+endif
+###$(info CXXNAME=$(CXXNAME))
+override CXXNAMESUFFIX = _$(CXXNAME)
+export CXXNAMESUFFIX
+
 # Dependency on test directory
 # Within the madgraph4gpu git repo: by default use a common gtest installation in <topdir>/test (optionally use an external or local gtest)
 # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest)
@@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),)
 TESTDIR =
 else ifneq ($(LOCALGTEST),)
 TESTDIR=$(TESTDIRLOCAL)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),)
 TESTDIR = $(TESTDIRCOMMON)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else
 TESTDIR =
 endif
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc
index cde18056a9..e40f635e46 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc
@@ -47,7 +47,7 @@ namespace mg5amcCpu
 #else
     std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl;
 #endif
-    exit( 0 );
+    exit( 1 );
   }
 }
 
@@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx )
     const fptype p1 = par0[ievt * np4 + 1];
     const fptype p2 = par0[ievt * np4 + 2];
     const fptype p3 = par0[ievt * np4 + 3];
-    mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 );
+    volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736
+    if( m2 > 0 )
+      mass0[ievt] = fpsqrt( (fptype)m2 );
+    else
+      mass0[ievt] = 0;
     ispzgt0[ievt] = ( p3 > 0 );
     ispzlt0[ievt] = ( p3 < 0 );
     isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 );
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h
index b68cdea0db..6db5ca82f3 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h
@@ -195,6 +195,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -205,7 +206,12 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) );
+#else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+#endif
       // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx:
       // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1
       // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used
@@ -254,9 +260,10 @@ namespace mg5amcCpu
       omega[1] = fmass / omega[0];
       const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0 );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );             // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );          // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ),     // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 cxternary( ( pp3 == 0. ),
                                            cxmake( -nh, 0 ),
                                            cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -274,16 +281,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
-                                          fptype_sv{ 0 },
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                            cxternary( sqp0p3 == 0,
                                       cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ),
                                       cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                          fptype_sv{ 0 },
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
 #endif
@@ -440,6 +451,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -452,11 +464,11 @@ namespace mg5amcCpu
     if( vmass != 0. )
     {
       const int nsvahl = nsv * std::abs( hel );
+      const fptype hel0 = 1. - std::abs( hel );
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) );
       const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
-      const fptype hel0 = 1. - std::abs( hel );
-#ifndef MGONGPU_CPPSIMD
       if( pp == 0. )
       {
         vc[2] = cxmake( 0., 0. );
@@ -484,6 +496,10 @@ namespace mg5amcCpu
         }
       }
 #else
+      volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
+      volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+      const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
       // Branch A: pp == 0.
       const cxtype vcA_2 = cxmake( 0, 0 );
       const cxtype vcA_3 = cxmake( -hel * sqh, 0 );
@@ -514,7 +530,12 @@ namespace mg5amcCpu
     else
     {
       const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) );
+#else
+      volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736
+      const fptype_sv pt = fpsqrt( pt2 );
+#endif
       vc[2] = cxzero_sv();
       vc[5] = cxmake( hel * pt / pp * sqh, 0. );
 #ifndef MGONGPU_CPPSIMD
@@ -591,6 +612,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -601,8 +623,8 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
-      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
 #ifndef MGONGPU_CPPSIMD
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
       if( pp == 0. )
       {
         // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
@@ -635,6 +657,8 @@ namespace mg5amcCpu
         fo[5] = sfomeg[0] * chi[ip];
       }
 #else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
       // Branch A: pp == 0.
       // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
       fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses
@@ -654,9 +678,10 @@ namespace mg5amcCpu
       const int imB = ( 1 - nh ) / 2;
       const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0. );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );              // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );           // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ),    // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 ( cxternary( ( pp3 == 0. ),
                                              cxmake( -nh, 0. ),
                                              cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -674,16 +699,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
-                                          0,
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                                 cxternary( ( sqp0p3 == 0. ),
                                            cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
                                            cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
+                                          0,
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
 #endif
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h
index 0d3892d026..e1299ba81e 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h
@@ -236,6 +236,20 @@ namespace mg5amcCpu
   // Functions and operators for fptype_v
 
 #ifdef MGONGPU_CPPSIMD
+  inline fptype_v
+  fpsqrt( const volatile fptype_v& v ) // overload for volatile SIMD vectors: volatile fixes #736 (sqrt-of-negative-number FPEs)
+  {
+    // Element-wise loop over the vector, see https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt
+    fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594
+    for( int i = 0; i < neppV; i++ )
+    {
+      volatile fptype outi = 0; // volatile fixes #736
+      if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); // only strictly positive elements reach the scalar sqrt: all others keep outi == 0
+      out[i] = outi;
+    }
+    return out;
+  }
+
   inline fptype_v
   fpsqrt( const fptype_v& v )
   {
diff --git a/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk
index 477a37e27c..39ed957600 100644
--- a/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk
+++ b/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk
@@ -5,28 +5,36 @@
 
 THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 
+# Compiler-specific googletest build directory (#125 and #738)
+# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk
+# In epoch1/epoch2, CXXNAMESUFFIX is undefined
+$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX))
+BUILDDIR = build$(CXXNAMESUFFIX)
+###$(info BUILDDIR=$(BUILDDIR))
+INSTALLDIR = install$(CXXNAMESUFFIX)
+###$(info INSTALLDIR=$(INSTALLDIR))
+
 CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11
 
-all: googletest/install/lib64/libgtest.a
+all: googletest/$(INSTALLDIR)/lib64/libgtest.a
 
 googletest/CMakeLists.txt:
 	git clone https://github.com/google/googletest.git -b release-1.11.0 googletest
 
-googletest/build/Makefile: googletest/CMakeLists.txt
-	mkdir -p googletest/build
-	cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../
+googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt # configure googletest in the compiler-specific build directory
+	mkdir -p googletest/$(BUILDDIR)
+	cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/$(INSTALLDIR) -DBUILD_GMOCK=OFF ../
 
-googletest/build/lib/libgtest.a: googletest/build/Makefile
-	$(MAKE) -C googletest/build
+googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile
+	$(MAKE) -C googletest/$(BUILDDIR)
 
 # NB 'make install' is no longer supported in googletest (issue 328)
 # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations
-googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a
-	mkdir -p googletest/install/lib64
-	cp googletest/build/lib/lib*.a googletest/install/lib64/
-	mkdir -p googletest/install/include
-	cp -r googletest/googletest/include/gtest googletest/install/include/
+googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a
+	mkdir -p googletest/$(INSTALLDIR)/lib64
+	cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/
+	mkdir -p googletest/$(INSTALLDIR)/include
+	cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/
 
 clean:
 	rm -rf googletest
-
diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
index 466db9acd7..f8ad8149f8 100644
--- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
+++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
@@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
  MG5_aMC> set lhapdf /PATH/TO/lhapdf-config
 
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
-No valid eps viewer found. Please set in ./input/mg5_configuration.txt
-Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
+Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
+No valid web browser found. Please set in ./input/mg5_configuration.txt
 import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
@@ -62,7 +62,7 @@ generate g g > t t~ g g g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.004748106002807617 [0m
+[1;32mDEBUG: model prefixing  takes 0.00522160530090332 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1  
 INFO: Process has 1240 diagrams 
-1 processes with 1240 diagrams generated in 1.697 s
+1 processes with 1240 diagrams generated in 1.741 s
 Total: 1 processes with 1240 diagrams
 output madevent CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
 Load PLUGIN.CUDACPP_SA_OUTPUT
@@ -175,11 +175,11 @@ INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1
 INFO: Processing color information for process: g g > t t~ g g g @1 
 INFO: Creating files in directory P1_gg_ttxggg 
 INFO: Computing Color-Flow optimization [15120 term] 
-INFO: Color-Flow passed to 1592 term in 30s. Introduce 2768 contraction 
+INFO: Color-Flow passed to 1592 term in 32s. Introduce 2768 contraction 
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.__init__ [1;30m[model_handling.py at line 1039][0m [0m
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1040][0m [0m
 [1;32mDEBUG:  proc_id = [0m 1 [1;30m[model_handling.py at line 1045][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1f5cab19a0> [1;30m[export_v4.py at line 6174][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f3649890b80> [1;30m[export_v4.py at line 6174][0m [0m
 INFO: Creating files in directory . 
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.generate_process_files [1;30m[model_handling.py at line 1297][0m [0m
 [1;32mDEBUG:  self.include_multi_channel is already defined: this is madevent+second_exporter mode [1;30m[model_handling.py at line 1299][0m [0m
@@ -221,15 +221,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 [1;32mDEBUG:  Done [1;30m[export_cpp.py at line 713][0m [0m
 INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 
 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 5.832 s
-Wrote files for 2281 helas calls in 39.711 s
+Generated helas calls for 1 subprocesses (1240 diagrams) in 6.012 s
+Wrote files for 2281 helas calls in 41.927 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 5 routines in  0.266 s
+ALOHA: aloha creates 5 routines in  0.328 s
 [1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 194][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
@@ -237,7 +237,7 @@ ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 10 routines in  0.420 s
+ALOHA: aloha creates 10 routines in  0.269 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -276,6 +276,6 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m49.506s
-user	0m48.230s
-sys	0m0.881s
+real	0m52.327s
+user	0m50.885s
+sys	0m1.141s
diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc
index cf4ec946f8..ec923afd6d 100644
--- a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc
+++ b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc
@@ -1 +1 @@
-ALOHARoutine = VVVV3_0.o VVVV4P0_1.o VVVV3P0_1.o VVVV1P0_1.o FFV1_1.o FFV1_2.o VVV1P0_1.o VVV1_0.o FFV1_0.o FFV1P0_3.o VVVV1_0.o VVVV4_0.o
+ALOHARoutine = FFV1_1.o VVVV4_0.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3_0.o VVVV1_0.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h
index fd7734ce42..ffe3b84d53 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h
@@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam<TestDriverBase*>
 /// and compares momenta and matrix elements with a reference file.
 TEST_P( MadgraphTest, CompareMomentaAndME )
 {
-  // Set to true to dump events:
-  constexpr bool dumpEvents = false;
-  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
-  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 3.E-2;
+  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 4.E-2; // see #735
 #ifdef __APPLE__
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 3.E-2; // see #583
 #else
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 2.E-3;
 #endif
+  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
+  // Dump events to a new reference file?
+  constexpr bool dumpEvents = false;
   std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt";
   while( dumpFileName.find( '/' ) != std::string::npos )
   {
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
index c2f7b8e0f6..43cee0977e 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
@@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common
 LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 INCFLAGS += -I../../src
 
+# Compiler-specific googletest build directory (#125 and #738): derive CXXNAME from '$(CXX) --version' and export CXXNAMESUFFIX
+ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),)
+override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5)
+else ifneq ($(shell $(CXX) --version | grep '^clang'),)
+override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),)
+override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else
+override CXXNAME = unknown
+endif
+###$(info CXXNAME=$(CXXNAME))
+override CXXNAMESUFFIX = _$(CXXNAME)
+export CXXNAMESUFFIX
+
 # Dependency on test directory
 # Within the madgraph4gpu git repo: by default use a common gtest installation in <topdir>/test (optionally use an external or local gtest)
 # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest)
@@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),)
 TESTDIR =
 else ifneq ($(LOCALGTEST),)
 TESTDIR=$(TESTDIRLOCAL)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),)
 TESTDIR = $(TESTDIRCOMMON)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else
 TESTDIR =
 endif
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc
index cde18056a9..e40f635e46 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc
@@ -47,7 +47,7 @@ namespace mg5amcCpu
 #else
     std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl;
 #endif
-    exit( 0 );
+    exit( 1 );
   }
 }
 
@@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx )
     const fptype p1 = par0[ievt * np4 + 1];
     const fptype p2 = par0[ievt * np4 + 2];
     const fptype p3 = par0[ievt * np4 + 3];
-    mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 );
+    volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736
+    if( m2 > 0 )
+      mass0[ievt] = fpsqrt( (fptype)m2 );
+    else
+      mass0[ievt] = 0;
     ispzgt0[ievt] = ( p3 > 0 );
     ispzlt0[ievt] = ( p3 < 0 );
     isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 );
diff --git a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h
index b68cdea0db..6db5ca82f3 100644
--- a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h
+++ b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h
@@ -195,6 +195,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -205,7 +206,12 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) );
+#else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+#endif
       // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx:
       // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1
       // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used
@@ -254,9 +260,10 @@ namespace mg5amcCpu
       omega[1] = fmass / omega[0];
       const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0 );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );             // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );          // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ),     // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 cxternary( ( pp3 == 0. ),
                                            cxmake( -nh, 0 ),
                                            cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -274,16 +281,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
-                                          fptype_sv{ 0 },
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                            cxternary( sqp0p3 == 0,
                                       cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ),
                                       cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                          fptype_sv{ 0 },
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
 #endif
@@ -440,6 +451,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -452,11 +464,11 @@ namespace mg5amcCpu
     if( vmass != 0. )
     {
       const int nsvahl = nsv * std::abs( hel );
+      const fptype hel0 = 1. - std::abs( hel );
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) );
       const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
-      const fptype hel0 = 1. - std::abs( hel );
-#ifndef MGONGPU_CPPSIMD
       if( pp == 0. )
       {
         vc[2] = cxmake( 0., 0. );
@@ -484,6 +496,10 @@ namespace mg5amcCpu
         }
       }
 #else
+      volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
+      volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+      const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
       // Branch A: pp == 0.
       const cxtype vcA_2 = cxmake( 0, 0 );
       const cxtype vcA_3 = cxmake( -hel * sqh, 0 );
@@ -514,7 +530,12 @@ namespace mg5amcCpu
     else
     {
       const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) );
+#else
+      volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736
+      const fptype_sv pt = fpsqrt( pt2 );
+#endif
       vc[2] = cxzero_sv();
       vc[5] = cxmake( hel * pt / pp * sqh, 0. );
 #ifndef MGONGPU_CPPSIMD
@@ -591,6 +612,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -601,8 +623,8 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
-      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
 #ifndef MGONGPU_CPPSIMD
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
       if( pp == 0. )
       {
         // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
@@ -635,6 +657,8 @@ namespace mg5amcCpu
         fo[5] = sfomeg[0] * chi[ip];
       }
 #else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
       // Branch A: pp == 0.
       // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
       fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses
@@ -654,9 +678,10 @@ namespace mg5amcCpu
       const int imB = ( 1 - nh ) / 2;
       const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0. );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );              // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );           // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ),    // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 ( cxternary( ( pp3 == 0. ),
                                              cxmake( -nh, 0. ),
                                              cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -674,16 +699,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
-                                          0,
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                                 cxternary( ( sqp0p3 == 0. ),
                                            cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
                                            cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
+                                          0,
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
 #endif
diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h
index 0d3892d026..e1299ba81e 100644
--- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h
@@ -236,6 +236,20 @@ namespace mg5amcCpu
   // Functions and operators for fptype_v
 
 #ifdef MGONGPU_CPPSIMD
+  inline fptype_v
+  fpsqrt( const volatile fptype_v& v ) // overload for volatile SIMD vectors: volatile fixes #736 (sqrt-of-negative-number FPEs)
+  {
+    // Element-wise loop over the vector, see https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt
+    fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594
+    for( int i = 0; i < neppV; i++ )
+    {
+      volatile fptype outi = 0; // volatile fixes #736
+      if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); // only strictly positive elements reach the scalar sqrt: all others keep outi == 0
+      out[i] = outi;
+    }
+    return out;
+  }
+
   inline fptype_v
   fpsqrt( const fptype_v& v )
   {
diff --git a/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk
index 477a37e27c..39ed957600 100644
--- a/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk
+++ b/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk
@@ -5,28 +5,36 @@
 
 THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 
+# Compiler-specific googletest build directory (#125 and #738)
+# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk
+# In epoch1/epoch2, CXXNAMESUFFIX is undefined
+$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX))
+BUILDDIR = build$(CXXNAMESUFFIX)
+###$(info BUILDDIR=$(BUILDDIR))
+INSTALLDIR = install$(CXXNAMESUFFIX)
+###$(info INSTALLDIR=$(INSTALLDIR))
+
 CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11
 
-all: googletest/install/lib64/libgtest.a
+all: googletest/$(INSTALLDIR)/lib64/libgtest.a
 
 googletest/CMakeLists.txt:
 	git clone https://github.com/google/googletest.git -b release-1.11.0 googletest
 
-googletest/build/Makefile: googletest/CMakeLists.txt
-	mkdir -p googletest/build
-	cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../
+googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt # configure googletest in the compiler-specific build directory
+	mkdir -p googletest/$(BUILDDIR)
+	cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/$(INSTALLDIR) -DBUILD_GMOCK=OFF ../
 
-googletest/build/lib/libgtest.a: googletest/build/Makefile
-	$(MAKE) -C googletest/build
+googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile
+	$(MAKE) -C googletest/$(BUILDDIR)
 
 # NB 'make install' is no longer supported in googletest (issue 328)
 # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations
-googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a
-	mkdir -p googletest/install/lib64
-	cp googletest/build/lib/lib*.a googletest/install/lib64/
-	mkdir -p googletest/install/include
-	cp -r googletest/googletest/include/gtest googletest/install/include/
+googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a
+	mkdir -p googletest/$(INSTALLDIR)/lib64
+	cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/
+	mkdir -p googletest/$(INSTALLDIR)/include
+	cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/
 
 clean:
 	rm -rf googletest
-
diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
index eb04f4adfc..1c454bccf7 100644
--- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
+++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
@@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
  MG5_aMC> set lhapdf /PATH/TO/lhapdf-config
 
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
-No valid eps viewer found. Please set in ./input/mg5_configuration.txt
-Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
+Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
+No valid web browser found. Please set in ./input/mg5_configuration.txt
 import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
@@ -62,7 +62,7 @@ generate g g > t t~ g g g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.004965782165527344 [0m
+[1;32mDEBUG: model prefixing  takes 0.005292177200317383 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1  
 INFO: Process has 1240 diagrams 
-1 processes with 1240 diagrams generated in 1.750 s
+1 processes with 1240 diagrams generated in 1.707 s
 Total: 1 processes with 1240 diagrams
 output standalone_cudacpp CODEGEN_cudacpp_gg_ttggg
 Load PLUGIN.CUDACPP_SA_OUTPUT
@@ -206,7 +206,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.edit_memorybuffers [1;30m[model_handling.py at line 1430][0m [0m
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [1;30m[model_handling.py at line 1441][0m [0m
 [1;32mDEBUG:  'Copying test reference file: ', template_ref  = [0m Copying test reference file:  /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxggg.txt [1;30m[model_handling.py at line 1335][0m [0m
-Generated helas calls for 1 subprocesses (1240 diagrams) in 5.770 s
+Generated helas calls for 1 subprocesses (1240 diagrams) in 6.005 s
 [1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 194][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
@@ -214,7 +214,7 @@ ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 5 routines in  0.298 s
+ALOHA: aloha creates 5 routines in  0.266 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -244,6 +244,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/
 [1;32mDEBUG:  Entering PLUGIN_ProcessExporter.finalize [1;30m[output.py at line 203][0m [0m
 quit
 
-real	0m11.557s
-user	0m11.387s
-sys	0m0.119s
+real	0m12.140s
+user	0m11.838s
+sys	0m0.186s
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h
index fd7734ce42..ffe3b84d53 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h
@@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam<TestDriverBase*>
 /// and compares momenta and matrix elements with a reference file.
 TEST_P( MadgraphTest, CompareMomentaAndME )
 {
-  // Set to true to dump events:
-  constexpr bool dumpEvents = false;
-  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
-  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 3.E-2;
+  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 4.E-2; // see #735
 #ifdef __APPLE__
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 3.E-2; // see #583
 #else
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 2.E-3;
 #endif
+  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
+  // Dump events to a new reference file?
+  constexpr bool dumpEvents = false;
   std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt";
   while( dumpFileName.find( '/' ) != std::string::npos )
   {
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
index c2f7b8e0f6..43cee0977e 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
@@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common
 LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 INCFLAGS += -I../../src
 
+# Compiler-specific googletest build directory (#125 and #738)
+ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),)
+override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5)
+else ifneq ($(shell $(CXX) --version | grep '^clang'),)
+override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),)
+override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else
+override CXXNAME = unknown
+endif
+###$(info CXXNAME=$(CXXNAME))
+override CXXNAMESUFFIX = _$(CXXNAME)
+export CXXNAMESUFFIX
+
 # Dependency on test directory
 # Within the madgraph4gpu git repo: by default use a common gtest installation in <topdir>/test (optionally use an external or local gtest)
 # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest)
@@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),)
 TESTDIR =
 else ifneq ($(LOCALGTEST),)
 TESTDIR=$(TESTDIRLOCAL)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),)
 TESTDIR = $(TESTDIRCOMMON)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else
 TESTDIR =
 endif
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc
index cde18056a9..e40f635e46 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc
@@ -47,7 +47,7 @@ namespace mg5amcCpu
 #else
     std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl;
 #endif
-    exit( 0 );
+    exit( 1 );
   }
 }
 
@@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx )
     const fptype p1 = par0[ievt * np4 + 1];
     const fptype p2 = par0[ievt * np4 + 2];
     const fptype p3 = par0[ievt * np4 + 3];
-    mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 );
+    volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736
+    if( m2 > 0 )
+      mass0[ievt] = fpsqrt( (fptype)m2 );
+    else
+      mass0[ievt] = 0;
     ispzgt0[ievt] = ( p3 > 0 );
     ispzlt0[ievt] = ( p3 < 0 );
     isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 );
diff --git a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h
index b68cdea0db..6db5ca82f3 100644
--- a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h
+++ b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h
@@ -195,6 +195,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -205,7 +206,12 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) );
+#else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+#endif
       // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx:
       // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1
       // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used
@@ -254,9 +260,10 @@ namespace mg5amcCpu
       omega[1] = fmass / omega[0];
       const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0 );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );             // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );          // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ),     // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 cxternary( ( pp3 == 0. ),
                                            cxmake( -nh, 0 ),
                                            cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -274,16 +281,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
-                                          fptype_sv{ 0 },
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                            cxternary( sqp0p3 == 0,
                                       cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ),
                                       cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                          fptype_sv{ 0 },
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
 #endif
@@ -440,6 +451,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -452,11 +464,11 @@ namespace mg5amcCpu
     if( vmass != 0. )
     {
       const int nsvahl = nsv * std::abs( hel );
+      const fptype hel0 = 1. - std::abs( hel );
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) );
       const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
-      const fptype hel0 = 1. - std::abs( hel );
-#ifndef MGONGPU_CPPSIMD
       if( pp == 0. )
       {
         vc[2] = cxmake( 0., 0. );
@@ -484,6 +496,10 @@ namespace mg5amcCpu
         }
       }
 #else
+      volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
+      volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+      const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
       // Branch A: pp == 0.
       const cxtype vcA_2 = cxmake( 0, 0 );
       const cxtype vcA_3 = cxmake( -hel * sqh, 0 );
@@ -514,7 +530,12 @@ namespace mg5amcCpu
     else
     {
       const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) );
+#else
+      volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736
+      const fptype_sv pt = fpsqrt( pt2 );
+#endif
       vc[2] = cxzero_sv();
       vc[5] = cxmake( hel * pt / pp * sqh, 0. );
 #ifndef MGONGPU_CPPSIMD
@@ -591,6 +612,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -601,8 +623,8 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
-      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
 #ifndef MGONGPU_CPPSIMD
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
       if( pp == 0. )
       {
         // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
@@ -635,6 +657,8 @@ namespace mg5amcCpu
         fo[5] = sfomeg[0] * chi[ip];
       }
 #else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
       // Branch A: pp == 0.
       // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
       fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses
@@ -654,9 +678,10 @@ namespace mg5amcCpu
       const int imB = ( 1 - nh ) / 2;
       const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0. );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );              // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );           // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ),    // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 ( cxternary( ( pp3 == 0. ),
                                              cxmake( -nh, 0. ),
                                              cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -674,16 +699,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
-                                          0,
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                                 cxternary( ( sqp0p3 == 0. ),
                                            cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
                                            cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
+                                          0,
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
 #endif
diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h
index 0d3892d026..e1299ba81e 100644
--- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h
@@ -236,6 +236,20 @@ namespace mg5amcCpu
   // Functions and operators for fptype_v
 
 #ifdef MGONGPU_CPPSIMD
+  inline fptype_v
+  fpsqrt( const volatile fptype_v& v ) // element-wise sqrt of a volatile vector; volatile fixes #736 (sqrt-of-negative-number FPEs)
+  {
+    // Loop over the neppV vector elements one by one: see https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt
+    fptype_v out = {}; // zero-initialize to avoid warning 'out' may be used uninitialized: see #594
+    for( int i = 0; i < neppV; i++ )
+    {
+      volatile fptype outi = 0; // result for element i stays 0 unless v[i] > 0; volatile fixes #736
+      if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); // fpsqrt is only called on elements that pass the v[i] > 0 test
+      out[i] = outi;
+    }
+    return out;
+  }
+
   inline fptype_v
   fpsqrt( const fptype_v& v )
   {
diff --git a/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk
index 477a37e27c..39ed957600 100644
--- a/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk
+++ b/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk
@@ -5,28 +5,36 @@
 
 THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 
+# Compiler-specific googletest build directory (#125 and #738)
+# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk
+# In epoch1/epoch2, CXXNAMESUFFIX is undefined
+$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX))
+BUILDDIR = build$(CXXNAMESUFFIX)
+###$(info BUILDDIR=$(BUILDDIR))
+INSTALLDIR = install$(CXXNAMESUFFIX)
+###$(info INSTALLDIR=$(INSTALLDIR))
+
 CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11
 
-all: googletest/install/lib64/libgtest.a
+all: googletest/$(INSTALLDIR)/lib64/libgtest.a
 
 googletest/CMakeLists.txt:
 	git clone https://github.com/google/googletest.git -b release-1.11.0 googletest
 
-googletest/build/Makefile: googletest/CMakeLists.txt
-	mkdir -p googletest/build
-	cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../
+googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt
+	mkdir -p googletest/$(BUILDDIR)
+	cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/$(INSTALLDIR) -DBUILD_GMOCK=OFF ../
 
-googletest/build/lib/libgtest.a: googletest/build/Makefile
-	$(MAKE) -C googletest/build
+googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile
+	$(MAKE) -C googletest/$(BUILDDIR)
 
 # NB 'make install' is no longer supported in googletest (issue 328)
 # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations
-googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a
-	mkdir -p googletest/install/lib64
-	cp googletest/build/lib/lib*.a googletest/install/lib64/
-	mkdir -p googletest/install/include
-	cp -r googletest/googletest/include/gtest googletest/install/include/
+googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a
+	mkdir -p googletest/$(INSTALLDIR)/lib64
+	cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/
+	mkdir -p googletest/$(INSTALLDIR)/include
+	cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/
 
 clean:
 	rm -rf googletest
-
diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt
index 5566a87b0a..6dec1d926a 100644
--- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt
+++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt
@@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
  MG5_aMC> set lhapdf /PATH/TO/lhapdf-config
 
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
-No valid eps viewer found. Please set in ./input/mg5_configuration.txt
-Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
+Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
+No valid web browser found. Please set in ./input/mg5_configuration.txt
 import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gq_ttq.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
 define q = u c d s u~ c~ d~ s~; generate g q > t t~ q
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.005097150802612305 [0m
+[1;32mDEBUG: model prefixing  takes 0.0051157474517822266 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -169,7 +169,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams.
 INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. 
 INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. 
 INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.070 s
+8 processes with 40 diagrams generated in 0.072 s
 Total: 8 processes with 40 diagrams
 output madevent CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
 Load PLUGIN.CUDACPP_SA_OUTPUT
@@ -199,7 +199,7 @@ INFO: Creating files in directory P1_gu_ttxu
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.__init__ [1;30m[model_handling.py at line 1039][0m [0m
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1040][0m [0m
 [1;32mDEBUG:  proc_id = [0m 1 [1;30m[model_handling.py at line 1045][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fae51e99e20> [1;30m[export_v4.py at line 6174][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f3b343daeb0> [1;30m[export_v4.py at line 6174][0m [0m
 INFO: Creating files in directory . 
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.generate_process_files [1;30m[model_handling.py at line 1297][0m [0m
 [1;32mDEBUG:  self.include_multi_channel is already defined: this is madevent+second_exporter mode [1;30m[model_handling.py at line 1299][0m [0m
@@ -237,7 +237,7 @@ INFO: Creating files in directory P1_gux_ttxux
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.__init__ [1;30m[model_handling.py at line 1039][0m [0m
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1040][0m [0m
 [1;32mDEBUG:  proc_id = [0m 1 [1;30m[model_handling.py at line 1045][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fae51ed3fd0> [1;30m[export_v4.py at line 6174][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f3b343d4520> [1;30m[export_v4.py at line 6174][0m [0m
 INFO: Creating files in directory . 
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.generate_process_files [1;30m[model_handling.py at line 1297][0m [0m
 [1;32mDEBUG:  self.include_multi_channel is already defined: this is madevent+second_exporter mode [1;30m[model_handling.py at line 1299][0m [0m
@@ -271,17 +271,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 [1;32mDEBUG:  Done [1;30m[export_cpp.py at line 713][0m [0m
 INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 
 INFO: Finding symmetric diagrams for subprocess group gux_ttxux 
-Generated helas calls for 2 subprocesses (10 diagrams) in 0.028 s
-Wrote files for 32 helas calls in 0.214 s
+Generated helas calls for 2 subprocesses (10 diagrams) in 0.029 s
+Wrote files for 32 helas calls in 0.212 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVV1 routines[0m
-ALOHA: aloha creates 2 routines in  0.125 s
+ALOHA: aloha creates 2 routines in  0.114 s
 [1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 194][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVV1 routines[0m
-ALOHA: aloha creates 4 routines in  0.115 s
+ALOHA: aloha creates 4 routines in  0.112 s
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -313,6 +313,6 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m1.810s
-user	0m1.575s
-sys	0m0.222s
+real	0m2.217s
+user	0m1.790s
+sys	0m0.357s
diff --git a/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_file.inc
index 0c895f2b2c..4457933199 100644
--- a/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_file.inc
+++ b/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_file.inc
@@ -1 +1 @@
-ALOHARoutine = FFV1_1.o FFV1_2.o VVV1_0.o FFV1_0.o FFV1P0_3.o
+ALOHARoutine = FFV1_1.o FFV1_0.o VVV1_0.o FFV1_2.o FFV1P0_3.o
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h
index fd7734ce42..ffe3b84d53 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h
@@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam<TestDriverBase*>
 /// and compares momenta and matrix elements with a reference file.
 TEST_P( MadgraphTest, CompareMomentaAndME )
 {
-  // Set to true to dump events:
-  constexpr bool dumpEvents = false;
-  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
-  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 3.E-2;
+  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 4.E-2; // see #735
 #ifdef __APPLE__
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 3.E-2; // see #583
 #else
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 2.E-3;
 #endif
+  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
+  // Dump events to a new reference file?
+  constexpr bool dumpEvents = false;
   std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt";
   while( dumpFileName.find( '/' ) != std::string::npos )
   {
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
index c2f7b8e0f6..43cee0977e 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
@@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common
 LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 INCFLAGS += -I../../src
 
+# Compiler-specific googletest build directory (#125 and #738)
+ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),)
+override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5)
+else ifneq ($(shell $(CXX) --version | grep '^clang'),)
+override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),)
+override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else
+override CXXNAME = unknown
+endif
+###$(info CXXNAME=$(CXXNAME))
+override CXXNAMESUFFIX = _$(CXXNAME)
+export CXXNAMESUFFIX
+
 # Dependency on test directory
 # Within the madgraph4gpu git repo: by default use a common gtest installation in <topdir>/test (optionally use an external or local gtest)
 # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest)
@@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),)
 TESTDIR =
 else ifneq ($(LOCALGTEST),)
 TESTDIR=$(TESTDIRLOCAL)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),)
 TESTDIR = $(TESTDIRCOMMON)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else
 TESTDIR =
 endif
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc
index cde18056a9..e40f635e46 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc
@@ -47,7 +47,7 @@ namespace mg5amcCpu
 #else
     std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl;
 #endif
-    exit( 0 );
+    exit( 1 );
   }
 }
 
@@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx )
     const fptype p1 = par0[ievt * np4 + 1];
     const fptype p2 = par0[ievt * np4 + 2];
     const fptype p3 = par0[ievt * np4 + 3];
-    mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 );
+    volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736
+    if( m2 > 0 )
+      mass0[ievt] = fpsqrt( (fptype)m2 );
+    else
+      mass0[ievt] = 0;
     ispzgt0[ievt] = ( p3 > 0 );
     ispzlt0[ievt] = ( p3 < 0 );
     isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 );
diff --git a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h
index 2e914de002..0413379e94 100644
--- a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h
+++ b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h
@@ -195,6 +195,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -205,7 +206,12 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) );
+#else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+#endif
       // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx:
       // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1
       // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used
@@ -254,9 +260,10 @@ namespace mg5amcCpu
       omega[1] = fmass / omega[0];
       const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0 );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );             // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );          // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ),     // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 cxternary( ( pp3 == 0. ),
                                            cxmake( -nh, 0 ),
                                            cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -274,16 +281,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
-                                          fptype_sv{ 0 },
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                            cxternary( sqp0p3 == 0,
                                       cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ),
                                       cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                          fptype_sv{ 0 },
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
 #endif
@@ -440,6 +451,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -452,11 +464,11 @@ namespace mg5amcCpu
     if( vmass != 0. )
     {
       const int nsvahl = nsv * std::abs( hel );
+      const fptype hel0 = 1. - std::abs( hel );
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) );
       const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
-      const fptype hel0 = 1. - std::abs( hel );
-#ifndef MGONGPU_CPPSIMD
       if( pp == 0. )
       {
         vc[2] = cxmake( 0., 0. );
@@ -484,6 +496,10 @@ namespace mg5amcCpu
         }
       }
 #else
+      volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
+      volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+      const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
       // Branch A: pp == 0.
       const cxtype vcA_2 = cxmake( 0, 0 );
       const cxtype vcA_3 = cxmake( -hel * sqh, 0 );
@@ -514,7 +530,12 @@ namespace mg5amcCpu
     else
     {
       const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) );
+#else
+      volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736
+      const fptype_sv pt = fpsqrt( pt2 );
+#endif
       vc[2] = cxzero_sv();
       vc[5] = cxmake( hel * pt / pp * sqh, 0. );
 #ifndef MGONGPU_CPPSIMD
@@ -591,6 +612,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -601,8 +623,8 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
-      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
 #ifndef MGONGPU_CPPSIMD
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
       if( pp == 0. )
       {
         // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
@@ -635,6 +657,8 @@ namespace mg5amcCpu
         fo[5] = sfomeg[0] * chi[ip];
       }
 #else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
       // Branch A: pp == 0.
       // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
       fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses
@@ -654,9 +678,10 @@ namespace mg5amcCpu
       const int imB = ( 1 - nh ) / 2;
       const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0. );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );              // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );           // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ),    // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 ( cxternary( ( pp3 == 0. ),
                                              cxmake( -nh, 0. ),
                                              cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -674,16 +699,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
-                                          0,
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                                 cxternary( ( sqp0p3 == 0. ),
                                            cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
                                            cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
+                                          0,
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
 #endif
diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h
index 0d3892d026..e1299ba81e 100644
--- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h
@@ -236,6 +236,20 @@ namespace mg5amcCpu
   // Functions and operators for fptype_v
 
 #ifdef MGONGPU_CPPSIMD
+  inline fptype_v
+  fpsqrt( const volatile fptype_v& v ) // volatile fixes #736
+  {
+    // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt
+    fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594
+    for( int i = 0; i < neppV; i++ )
+    {
+      volatile fptype outi = 0; // volatile fixes #736
+      if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] );
+      out[i] = outi;
+    }
+    return out;
+  }
+
   inline fptype_v
   fpsqrt( const fptype_v& v )
   {
diff --git a/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk b/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk
index 477a37e27c..39ed957600 100644
--- a/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk
+++ b/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk
@@ -5,28 +5,36 @@
 
 THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 
+# Compiler-specific googletest build directory (#125 and #738)
+# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk
+# In epoch1/epoch2, CXXNAMESUFFIX is undefined
+$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX))
+BUILDDIR = build$(CXXNAMESUFFIX)
+###$(info BUILDDIR=$(BUILDDIR))
+INSTALLDIR = install$(CXXNAMESUFFIX)
+###$(info INSTALLDIR=$(INSTALLDIR))
+
 CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11
 
-all: googletest/install/lib64/libgtest.a
+all: googletest/$(INSTALLDIR)/lib64/libgtest.a
 
 googletest/CMakeLists.txt:
 	git clone https://github.com/google/googletest.git -b release-1.11.0 googletest
 
-googletest/build/Makefile: googletest/CMakeLists.txt
-	mkdir -p googletest/build
-	cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../
+googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt
+	mkdir -p googletest/$(BUILDDIR)
+	cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/$(INSTALLDIR) -DBUILD_GMOCK=OFF ../
 
-googletest/build/lib/libgtest.a: googletest/build/Makefile
-	$(MAKE) -C googletest/build
+googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile
+	$(MAKE) -C googletest/$(BUILDDIR)
 
 # NB 'make install' is no longer supported in googletest (issue 328)
 # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations
-googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a
-	mkdir -p googletest/install/lib64
-	cp googletest/build/lib/lib*.a googletest/install/lib64/
-	mkdir -p googletest/install/include
-	cp -r googletest/googletest/include/gtest googletest/install/include/
+googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a
+	mkdir -p googletest/$(INSTALLDIR)/lib64
+	cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/
+	mkdir -p googletest/$(INSTALLDIR)/include
+	cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/
 
 clean:
 	rm -rf googletest
-
diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
index 985a96a86e..fb27a52201 100644
--- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
+++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
@@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
  MG5_aMC> set lhapdf /PATH/TO/lhapdf-config
 
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
-No valid eps viewer found. Please set in ./input/mg5_configuration.txt
-Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
+Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
+No valid web browser found. Please set in ./input/mg5_configuration.txt
 import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gq_ttq.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
 define q = u c d s u~ c~ d~ s~; generate g q > t t~ q
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.004664897918701172 [0m
+[1;32mDEBUG: model prefixing  takes 0.005094289779663086 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -169,7 +169,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams.
 INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. 
 INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. 
 INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.072 s
+8 processes with 40 diagrams generated in 0.071 s
 Total: 8 processes with 40 diagrams
 output standalone_cudacpp CODEGEN_cudacpp_gq_ttq
 Load PLUGIN.CUDACPP_SA_OUTPUT
@@ -251,12 +251,12 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.edit_memorybuffers [1;30m[model_handling.py at line 1430][0m [0m
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [1;30m[model_handling.py at line 1441][0m [0m
 [1;32mDEBUG:  'Copying test reference file: ', template_ref  = [0m Copying test reference file:  /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gux_ttxux.txt [1;30m[model_handling.py at line 1335][0m [0m
-Generated helas calls for 2 subprocesses (10 diagrams) in 0.027 s
+Generated helas calls for 2 subprocesses (10 diagrams) in 0.028 s
 [1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 194][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVV1 routines[0m
-ALOHA: aloha creates 2 routines in  0.123 s
+ALOHA: aloha creates 2 routines in  0.117 s
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -279,6 +279,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gq_ttq/src/.
 [1;32mDEBUG:  Entering PLUGIN_ProcessExporter.finalize [1;30m[output.py at line 203][0m [0m
 quit
 
-real	0m0.678s
-user	0m0.612s
-sys	0m0.060s
+real	0m0.994s
+user	0m0.805s
+sys	0m0.121s
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h
index fd7734ce42..ffe3b84d53 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h
@@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam<TestDriverBase*>
 /// and compares momenta and matrix elements with a reference file.
 TEST_P( MadgraphTest, CompareMomentaAndME )
 {
-  // Set to true to dump events:
-  constexpr bool dumpEvents = false;
-  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
-  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 3.E-2;
+  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 4.E-2; // see #735
 #ifdef __APPLE__
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 3.E-2; // see #583
 #else
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 2.E-3;
 #endif
+  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
+  // Dump events to a new reference file?
+  constexpr bool dumpEvents = false;
   std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt";
   while( dumpFileName.find( '/' ) != std::string::npos )
   {
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk
index c2f7b8e0f6..43cee0977e 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk
@@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common
 LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 INCFLAGS += -I../../src
 
+# Compiler-specific googletest build directory (#125 and #738)
+ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),)
+override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5)
+else ifneq ($(shell $(CXX) --version | grep '^clang'),)
+override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),)
+override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else
+override CXXNAME = unknown
+endif
+###$(info CXXNAME=$(CXXNAME))
+override CXXNAMESUFFIX = _$(CXXNAME)
+export CXXNAMESUFFIX
+
 # Dependency on test directory
 # Within the madgraph4gpu git repo: by default use a common gtest installation in <topdir>/test (optionally use an external or local gtest)
 # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest)
@@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),)
 TESTDIR =
 else ifneq ($(LOCALGTEST),)
 TESTDIR=$(TESTDIRLOCAL)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),)
 TESTDIR = $(TESTDIRCOMMON)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else
 TESTDIR =
 endif
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc
index cde18056a9..e40f635e46 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc
@@ -47,7 +47,7 @@ namespace mg5amcCpu
 #else
     std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl;
 #endif
-    exit( 0 );
+    exit( 1 );
   }
 }
 
@@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx )
     const fptype p1 = par0[ievt * np4 + 1];
     const fptype p2 = par0[ievt * np4 + 2];
     const fptype p3 = par0[ievt * np4 + 3];
-    mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 );
+    volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736
+    if( m2 > 0 )
+      mass0[ievt] = fpsqrt( (fptype)m2 );
+    else
+      mass0[ievt] = 0;
     ispzgt0[ievt] = ( p3 > 0 );
     ispzlt0[ievt] = ( p3 < 0 );
     isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 );
diff --git a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h
index 2e914de002..0413379e94 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h
@@ -195,6 +195,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -205,7 +206,12 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) );
+#else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+#endif
       // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx:
       // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1
       // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used
@@ -254,9 +260,10 @@ namespace mg5amcCpu
       omega[1] = fmass / omega[0];
       const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0 );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );             // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );          // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ),     // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 cxternary( ( pp3 == 0. ),
                                            cxmake( -nh, 0 ),
                                            cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -274,16 +281,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
-                                          fptype_sv{ 0 },
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                            cxternary( sqp0p3 == 0,
                                       cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ),
                                       cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                          fptype_sv{ 0 },
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
 #endif
@@ -440,6 +451,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -452,11 +464,11 @@ namespace mg5amcCpu
     if( vmass != 0. )
     {
       const int nsvahl = nsv * std::abs( hel );
+      const fptype hel0 = 1. - std::abs( hel );
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) );
       const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
-      const fptype hel0 = 1. - std::abs( hel );
-#ifndef MGONGPU_CPPSIMD
       if( pp == 0. )
       {
         vc[2] = cxmake( 0., 0. );
@@ -484,6 +496,10 @@ namespace mg5amcCpu
         }
       }
 #else
+      volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
+      volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+      const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
       // Branch A: pp == 0.
       const cxtype vcA_2 = cxmake( 0, 0 );
       const cxtype vcA_3 = cxmake( -hel * sqh, 0 );
@@ -514,7 +530,12 @@ namespace mg5amcCpu
     else
     {
       const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) );
+#else
+      volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736
+      const fptype_sv pt = fpsqrt( pt2 );
+#endif
       vc[2] = cxzero_sv();
       vc[5] = cxmake( hel * pt / pp * sqh, 0. );
 #ifndef MGONGPU_CPPSIMD
@@ -591,6 +612,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -601,8 +623,8 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
-      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
 #ifndef MGONGPU_CPPSIMD
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
       if( pp == 0. )
       {
         // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
@@ -635,6 +657,8 @@ namespace mg5amcCpu
         fo[5] = sfomeg[0] * chi[ip];
       }
 #else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
       // Branch A: pp == 0.
       // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
       fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses
@@ -654,9 +678,10 @@ namespace mg5amcCpu
       const int imB = ( 1 - nh ) / 2;
       const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0. );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );              // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );           // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ),    // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 ( cxternary( ( pp3 == 0. ),
                                              cxmake( -nh, 0. ),
                                              cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -674,16 +699,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
-                                          0,
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                                 cxternary( ( sqp0p3 == 0. ),
                                            cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
                                            cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
+                                          0,
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
 #endif
diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h
index 0d3892d026..e1299ba81e 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h
@@ -236,6 +236,20 @@ namespace mg5amcCpu
   // Functions and operators for fptype_v
 
 #ifdef MGONGPU_CPPSIMD
+  inline fptype_v
+  fpsqrt( const volatile fptype_v& v ) // overload taking a volatile vector: volatile fixes #736
+  {
+    // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt
+    fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594
+    for( int i = 0; i < neppV; i++ ) // element-wise loop over the neppV entries of the vector
+    {
+      volatile fptype outi = 0; // volatile fixes #736
+      if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); // scalar sqrt only for strictly positive entries (all other entries stay 0)
+      out[i] = outi;
+    }
+    return out;
+  }
+
   inline fptype_v
   fpsqrt( const fptype_v& v )
   {
diff --git a/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk b/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk
index 477a37e27c..39ed957600 100644
--- a/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk
+++ b/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk
@@ -5,28 +5,36 @@
 
 THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 
+# Compiler-specific googletest build directory (#125 and #738)
+# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk
+# In epoch1/epoch2, CXXNAMESUFFIX is undefined
+$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX))
+BUILDDIR = build$(CXXNAMESUFFIX)
+###$(info BUILDDIR=$(BUILDDIR))
+INSTALLDIR = install$(CXXNAMESUFFIX)
+###$(info INSTALLDIR=$(INSTALLDIR))
+
 CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11
 
-all: googletest/install/lib64/libgtest.a
+all: googletest/$(INSTALLDIR)/lib64/libgtest.a
 
 googletest/CMakeLists.txt:
 	git clone https://github.com/google/googletest.git -b release-1.11.0 googletest
 
-googletest/build/Makefile: googletest/CMakeLists.txt
-	mkdir -p googletest/build
-	cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../
+googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt # configure googletest in the compiler-specific build directory (install prefix matches INSTALLDIR)
+	mkdir -p googletest/$(BUILDDIR)
+	cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/$(INSTALLDIR) -DBUILD_GMOCK=OFF ../
 
-googletest/build/lib/libgtest.a: googletest/build/Makefile
-	$(MAKE) -C googletest/build
+googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile
+	$(MAKE) -C googletest/$(BUILDDIR)
 
 # NB 'make install' is no longer supported in googletest (issue 328)
 # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations
-googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a
-	mkdir -p googletest/install/lib64
-	cp googletest/build/lib/lib*.a googletest/install/lib64/
-	mkdir -p googletest/install/include
-	cp -r googletest/googletest/include/gtest googletest/install/include/
+googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a
+	mkdir -p googletest/$(INSTALLDIR)/lib64
+	cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/
+	mkdir -p googletest/$(INSTALLDIR)/include
+	cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/
 
 clean:
 	rm -rf googletest
-
diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt
index 97e2a77fac..419ccb30d8 100644
--- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt
+++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt
@@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
  MG5_aMC> set lhapdf /PATH/TO/lhapdf-config
 
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
-No valid eps viewer found. Please set in ./input/mg5_configuration.txt
-Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
+Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
+No valid web browser found. Please set in ./input/mg5_configuration.txt
 import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
@@ -127,7 +127,7 @@ INFO: Checking for minimal orders which gives processes.
 INFO: Please specify coupling orders to bypass this step. 
 INFO: Trying process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1  
 INFO: Process has 1 diagrams 
-1 processes with 1 diagrams generated in 0.004 s
+1 processes with 1 diagrams generated in 0.003 s
 Total: 1 processes with 1 diagrams
 output standalone_cudacpp CODEGEN_cudacpp_heft_gg_h
 Load PLUGIN.CUDACPP_SA_OUTPUT
@@ -176,7 +176,7 @@ Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s
 [1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 194][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVS3 routines[0m
-ALOHA: aloha creates 1 routines in  0.053 s
+ALOHA: aloha creates 1 routines in  0.052 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVS3
 FileWriter <class 'PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h
 INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h/src/. 
@@ -195,6 +195,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h/src
 [1;32mDEBUG:  Entering PLUGIN_ProcessExporter.finalize [1;30m[output.py at line 203][0m [0m
 quit
 
-real	0m0.872s
-user	0m0.396s
-sys	0m0.057s
+real	0m0.777s
+user	0m0.574s
+sys	0m0.129s
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h
index fd7734ce42..ffe3b84d53 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h
@@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam<TestDriverBase*>
 /// and compares momenta and matrix elements with a reference file.
 TEST_P( MadgraphTest, CompareMomentaAndME )
 {
-  // Set to true to dump events:
-  constexpr bool dumpEvents = false;
-  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
-  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 3.E-2;
+  const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 4.E-2; // see #735
 #ifdef __APPLE__
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 3.E-2; // see #583
 #else
   const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 2.E-3;
 #endif
+  constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
+  // Dump events to a new reference file?
+  constexpr bool dumpEvents = false;
   std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt";
   while( dumpFileName.find( '/' ) != std::string::npos )
   {
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk
index c2f7b8e0f6..43cee0977e 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk
@@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common
 LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 INCFLAGS += -I../../src
 
+# Compiler-specific googletest build directory (#125 and #738): CXXNAME is <compiler><version> parsed from '$(CXX) --version', or 'unknown'
+ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),)
+override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5)
+else ifneq ($(shell $(CXX) --version | grep '^clang'),)
+override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),)
+override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
+else
+override CXXNAME = unknown
+endif
+###$(info CXXNAME=$(CXXNAME))
+override CXXNAMESUFFIX = _$(CXXNAME)
+export CXXNAMESUFFIX
+
 # Dependency on test directory
 # Within the madgraph4gpu git repo: by default use a common gtest installation in <topdir>/test (optionally use an external or local gtest)
 # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest)
@@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),)
 TESTDIR =
 else ifneq ($(LOCALGTEST),)
 TESTDIR=$(TESTDIRLOCAL)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),)
 TESTDIR = $(TESTDIRCOMMON)
-GTEST_ROOT = $(TESTDIR)/googletest/install
+GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
 else
 TESTDIR =
 endif
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc
index 5ea4dc68ad..f1f281f78a 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc
@@ -47,7 +47,7 @@ namespace mg5amcCpu
 #else
     std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl;
 #endif
-    exit( 0 );
+    exit( 1 );
   }
 }
 
@@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx )
     const fptype p1 = par0[ievt * np4 + 1];
     const fptype p2 = par0[ievt * np4 + 2];
     const fptype p3 = par0[ievt * np4 + 3];
-    mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 );
+    volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736
+    if( m2 > 0 )
+      mass0[ievt] = fpsqrt( (fptype)m2 );
+    else
+      mass0[ievt] = 0;
     ispzgt0[ievt] = ( p3 > 0 );
     ispzlt0[ievt] = ( p3 < 0 );
     isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 );
diff --git a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h
index 46b9942456..3f64745b75 100644
--- a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h
+++ b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h
@@ -195,6 +195,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -205,7 +206,12 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) );
+#else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+#endif
       // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx:
       // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1
       // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used
@@ -254,9 +260,10 @@ namespace mg5amcCpu
       omega[1] = fmass / omega[0];
       const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0 );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );             // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );          // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ),     // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 cxternary( ( pp3 == 0. ),
                                            cxmake( -nh, 0 ),
                                            cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -274,16 +281,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
-                                          fptype_sv{ 0 },
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                            cxternary( sqp0p3 == 0,
                                       cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ),
                                       cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                          fptype_sv{ 0 },
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
 #endif
@@ -440,6 +451,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -452,11 +464,11 @@ namespace mg5amcCpu
     if( vmass != 0. )
     {
       const int nsvahl = nsv * std::abs( hel );
+      const fptype hel0 = 1. - std::abs( hel );
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) );
       const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
-      const fptype hel0 = 1. - std::abs( hel );
-#ifndef MGONGPU_CPPSIMD
       if( pp == 0. )
       {
         vc[2] = cxmake( 0., 0. );
@@ -484,6 +496,10 @@ namespace mg5amcCpu
         }
       }
 #else
+      volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
+      volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
+      const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
       // Branch A: pp == 0.
       const cxtype vcA_2 = cxmake( 0, 0 );
       const cxtype vcA_3 = cxmake( -hel * sqh, 0 );
@@ -514,7 +530,12 @@ namespace mg5amcCpu
     else
     {
       const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0
+#ifndef MGONGPU_CPPSIMD
       const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) );
+#else
+      volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736
+      const fptype_sv pt = fpsqrt( pt2 );
+#endif
       vc[2] = cxzero_sv();
       vc[5] = cxmake( hel * pt / pp * sqh, 0. );
 #ifndef MGONGPU_CPPSIMD
@@ -591,6 +612,7 @@ namespace mg5amcCpu
     // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
     // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
     // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
+    // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -601,8 +623,8 @@ namespace mg5amcCpu
     const int nh = nhel * nsf;
     if( fmass != 0. )
     {
-      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
 #ifndef MGONGPU_CPPSIMD
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
       if( pp == 0. )
       {
         // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
@@ -635,6 +657,8 @@ namespace mg5amcCpu
         fo[5] = sfomeg[0] * chi[ip];
       }
 #else
+      volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
+      const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
       // Branch A: pp == 0.
       // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
       fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses
@@ -654,9 +678,10 @@ namespace mg5amcCpu
       const int imB = ( 1 - nh ) / 2;
       const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0. );
-      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );              // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
-      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );           // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );    // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM;              // volatile fixes #736
+      const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ),    // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 ( cxternary( ( pp3 == 0. ),
                                              cxmake( -nh, 0. ),
                                              cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
@@ -674,16 +699,20 @@ namespace mg5amcCpu
     }
     else
     {
-      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
-                                          0,
-                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
 #ifdef MGONGPU_CPPSIMD
-      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
-      const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ),
+      volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
+      volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
+                                             fptype_sv{ 0 },
+                                             fpsqrt( p0p3 ) * (fptype)nsf );
+      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
                                 cxternary( ( sqp0p3 == 0. ),
                                            cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
                                            cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
 #else
+      const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
+                                          0,
+                                          fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
                                  ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
 #endif
diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h
index 0d3892d026..e1299ba81e 100644
--- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h
@@ -236,6 +236,20 @@ namespace mg5amcCpu
   // Functions and operators for fptype_v
 
 #ifdef MGONGPU_CPPSIMD
+  inline fptype_v
+  fpsqrt( const volatile fptype_v& v ) // overload taking a volatile vector: volatile fixes #736
+  {
+    // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt
+    fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594
+    for( int i = 0; i < neppV; i++ ) // element-wise loop over the neppV entries of the vector
+    {
+      volatile fptype outi = 0; // volatile fixes #736
+      if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); // scalar sqrt only for strictly positive entries (all other entries stay 0)
+      out[i] = outi;
+    }
+    return out;
+  }
+
   inline fptype_v
   fpsqrt( const fptype_v& v )
   {
diff --git a/epochX/cudacpp/heft_gg_h.sa/test/cudacpp_test.mk b/epochX/cudacpp/heft_gg_h.sa/test/cudacpp_test.mk
index 477a37e27c..39ed957600 100644
--- a/epochX/cudacpp/heft_gg_h.sa/test/cudacpp_test.mk
+++ b/epochX/cudacpp/heft_gg_h.sa/test/cudacpp_test.mk
@@ -5,28 +5,36 @@
 
 THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 
+# Compiler-specific googletest build directory (#125 and #738)
+# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk
+# In epoch1/epoch2, CXXNAMESUFFIX is undefined
+$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX))
+BUILDDIR = build$(CXXNAMESUFFIX)
+###$(info BUILDDIR=$(BUILDDIR))
+INSTALLDIR = install$(CXXNAMESUFFIX)
+###$(info INSTALLDIR=$(INSTALLDIR))
+
 CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11
 
-all: googletest/install/lib64/libgtest.a
+all: googletest/$(INSTALLDIR)/lib64/libgtest.a
 
 googletest/CMakeLists.txt:
 	git clone https://github.com/google/googletest.git -b release-1.11.0 googletest
 
-googletest/build/Makefile: googletest/CMakeLists.txt
-	mkdir -p googletest/build
-	cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../
+googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt # configure googletest in the compiler-specific build directory (install prefix matches INSTALLDIR)
+	mkdir -p googletest/$(BUILDDIR)
+	cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/$(INSTALLDIR) -DBUILD_GMOCK=OFF ../
 
-googletest/build/lib/libgtest.a: googletest/build/Makefile
-	$(MAKE) -C googletest/build
+googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile
+	$(MAKE) -C googletest/$(BUILDDIR)
 
 # NB 'make install' is no longer supported in googletest (issue 328)
 # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations
-googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a
-	mkdir -p googletest/install/lib64
-	cp googletest/build/lib/lib*.a googletest/install/lib64/
-	mkdir -p googletest/install/include
-	cp -r googletest/googletest/include/gtest googletest/install/include/
+googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a
+	mkdir -p googletest/$(INSTALLDIR)/lib64
+	cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/
+	mkdir -p googletest/$(INSTALLDIR)/include
+	cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/
 
 clean:
 	rm -rf googletest
-
diff --git a/epochX/cudacpp/tput/teeThroughputX.sh b/epochX/cudacpp/tput/teeThroughputX.sh
index fbd9bbf3f3..07a14e6b61 100755
--- a/epochX/cudacpp/tput/teeThroughputX.sh
+++ b/epochX/cudacpp/tput/teeThroughputX.sh
@@ -176,7 +176,8 @@ for step in $steps; do
               printf "*** ./throughputX.sh $args | tee $logfile"
               printf "\n%80s\n" |tr " " "*"
               mkdir -p $(dirname $logfile)
-              if ! ./throughputX.sh $args -gtest | tee $logfile; then status=2; fi
+              ./throughputX.sh $args -gtest | tee $logfile 
+              if [ ${PIPESTATUS[0]} -ne "0" ]; then status=2; fi
             fi
           done
         done
diff --git a/epochX/cudacpp/tput/throughputX.sh b/epochX/cudacpp/tput/throughputX.sh
index 7ca5837d5f..bd656c5b93 100755
--- a/epochX/cudacpp/tput/throughputX.sh
+++ b/epochX/cudacpp/tput/throughputX.sh
@@ -664,6 +664,7 @@ for exe in $exes; do
       exe2=${exe/check/runTest}
       echo "runExe $exe2"
       $exe2 2>&1 | tail -1
+      if [ ${PIPESTATUS[0]} -ne "0" ]; then exit 1; fi 
     fi
   elif [ "${exe%%/gcheck*}" != "${exe}" ] ||  [ "${exe%%/alpcheck*}" != "${exe}" ]; then 
     runNcu $exe "$ncuArgs"
diff --git a/tools/compilers/mg-lcg-clang++-16.0.3 b/tools/compilers/mg-lcg-clang++-16.0.3
new file mode 100755
index 0000000000..3ade60c2d7
--- /dev/null
+++ b/tools/compilers/mg-lcg-clang++-16.0.3
@@ -0,0 +1,16 @@
+#!/bin/sh
+redrel=$(cat /etc/redhat-release 2> /dev/null) # empty if /etc/redhat-release cannot be read
+if [ "${redrel##*release 7}" != "${redrel}" ]; then # the release string contains "release 7"
+  export PATH=/cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-centos7/bin:/cvmfs/sft.cern.ch/lcg/releases/gcc/13.1.0-b3d18/x86_64-centos7/bin:/cvmfs/sft.cern.ch/lcg/releases/binutils/2.40-acaab/x86_64-centos7/bin${PATH:+:$PATH}
+  LD_LIBRARY_PATH=$(printenv LD_LIBRARY_PATH | sed 's-[^:]*/\(gcc\|llvm\|clang\|binutils\)/[^:]*:\?--g') # drop existing gcc/llvm/clang/binutils entries
+  export LD_LIBRARY_PATH=/cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-centos7/lib:/cvmfs/sft.cern.ch/lcg/releases/gcc/13.1.0-b3d18/x86_64-centos7/lib64:/cvmfs/sft.cern.ch/lcg/releases/binutils/2.40-acaab/x86_64-centos7/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+  exec /cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-centos7/bin/clang++ --gcc-toolchain=/cvmfs/sft.cern.ch/lcg/releases/gcc/13.1.0-b3d18/x86_64-centos7 "$@"
+elif [ "${redrel##*release 9}" != "${redrel}" ]; then # the release string contains "release 9"
+  export PATH=/cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-el9/bin:/cvmfs/sft.cern.ch/lcg/releases/gcc/13.1.0-b3d18/x86_64-el9/bin:/cvmfs/sft.cern.ch/lcg/releases/binutils/2.40-acaab/x86_64-el9/bin${PATH:+:$PATH}
+  LD_LIBRARY_PATH=$(printenv LD_LIBRARY_PATH | sed 's-[^:]*/\(gcc\|llvm\|clang\|binutils\)/[^:]*:\?--g') # drop existing gcc/llvm/clang/binutils entries
+  export LD_LIBRARY_PATH=/cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-el9/lib:/cvmfs/sft.cern.ch/lcg/releases/gcc/13.1.0-b3d18/x86_64-el9/lib64:/cvmfs/sft.cern.ch/lcg/releases/binutils/2.40-acaab/x86_64-el9/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+  exec /cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-el9/bin/clang++ --gcc-toolchain=/cvmfs/sft.cern.ch/lcg/releases/gcc/13.1.0-b3d18/x86_64-el9 "$@"
+else
+  echo "ERROR! RedHat release ${redrel} is not supported by $0" >&2 # report on stderr so that callers parsing stdout do not read the error text
+  exit 1
+fi
diff --git a/tools/compilers/mg-lcg-clang-16.0.3 b/tools/compilers/mg-lcg-clang-16.0.3
new file mode 100755
index 0000000000..aad1f9af16
--- /dev/null
+++ b/tools/compilers/mg-lcg-clang-16.0.3
@@ -0,0 +1,24 @@
+#!/bin/sh
+# Wrapper around the LCG clang 16.0.3 installation on cvmfs: it puts the matching
+# clang, gcc 13.1.0 and binutils 2.40 directories first in PATH and LD_LIBRARY_PATH,
+# then replaces itself with clang (using the gcc 13.1.0 toolchain), forwarding all arguments.
+# Select the LCG platform tag from the RedHat release string (empty if the file cannot be read)
+redrel=$(cat /etc/redhat-release 2> /dev/null)
+case "${redrel}" in
+  *"release 7"*) lcgarch=x86_64-centos7 ;;
+  *"release 9"*) lcgarch=x86_64-el9 ;;
+  *)
+    # Report on stderr so that the message is not mistaken for compiler output
+    echo "ERROR! RedHat release ${redrel} is not supported by $0" >&2
+    exit 1
+    ;;
+esac
+lcgroot=/cvmfs/sft.cern.ch/lcg/releases
+clangdir=${lcgroot}/clang/16.0.3-9dda8/${lcgarch}
+gccdir=${lcgroot}/gcc/13.1.0-b3d18/${lcgarch}
+binutilsdir=${lcgroot}/binutils/2.40-acaab/${lcgarch}
+export PATH="${clangdir}/bin:${gccdir}/bin:${binutilsdir}/bin${PATH:+:$PATH}"
+# Drop LD_LIBRARY_PATH entries that contain a gcc, llvm, clang or binutils directory before prepending the new ones
+LD_LIBRARY_PATH=$(printenv LD_LIBRARY_PATH | sed 's-[^:]*/\(gcc\|llvm\|clang\|binutils\)/[^:]*:\?--g')
+export LD_LIBRARY_PATH="${clangdir}/lib:${gccdir}/lib64:${binutilsdir}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+exec "${clangdir}/bin/clang" "--gcc-toolchain=${gccdir}" "$@"
diff --git a/tools/compilers/setup-clang-16.0.3.sh b/tools/compilers/setup-clang-16.0.3.sh
new file mode 100644
index 0000000000..78ce015ba4
--- /dev/null
+++ b/tools/compilers/setup-clang-16.0.3.sh
@@ -0,0 +1,21 @@
+# Set up the LCG clang 16.0.3 environment from cvmfs and point CC/CXX to the mg-lcg-clang
+# wrappers located in the same directory as this script. This script must be sourced from bash.
+if [ "$BASH_SOURCE" = "$0" ]; then echo "ERROR! This script ($0) was not sourced" >&2; exit 1; fi
+if [ "$BASH_SOURCE" = "" ]; then echo "ERROR! This script was not sourced from bash" >&2; return 1; fi
+# Absolute directory of this script (quoted so that paths containing blanks are handled;
+# '&&' so that the caller's current directory is not silently used if 'cd' fails)
+scrdir=$(cd "$(dirname "${BASH_SOURCE}")" && pwd)
+# Pick the LCG setup script matching the RedHat release (redrel is empty if the file cannot be read)
+redrel=$(cat /etc/redhat-release 2> /dev/null)
+case "${redrel}" in
+  *"release 7"*) source /cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-centos7/setup.sh ;;
+  *"release 9"*) source /cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-el9/setup.sh ;;
+  *)
+    echo "ERROR! RedHat release ${redrel} is not supported by ${BASH_SOURCE}" >&2
+    return 1
+    ;;
+esac
+###export ALLOW_UNSUPPORTED_COMPILER_IN_CUDA=1
+export CC="$scrdir/mg-lcg-clang-16.0.3"
+export CXX="$scrdir/mg-lcg-clang++-16.0.3"
+