diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h index 931b0c5e25..e9d2887c04 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h @@ -168,6 +168,7 @@ // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -178,7 +179,12 @@ const int nh = nhel * nsf; if( fmass != 0. ) { +#ifndef MGONGPU_CPPSIMD const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) ); +#else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); +#endif // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx: // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1 // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used @@ -227,9 +233,10 @@ omega[1] = fmass / omega[0]; const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; const fptype_v pp3 = fpmax( pp + pvec3, 0 ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 cxternary( ( pp3 == 0. ), cxmake( -nh, 0 ), cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -247,16 +254,20 @@ } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), - fptype_sv{ 0 }, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( sqp0p3 == 0, cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ), cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) }; #endif @@ -413,6 +424,7 @@ // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -425,11 +437,11 @@ if( vmass != 0. ) { const int nsvahl = nsv * std::abs( hel ); + const fptype hel0 = 1. - std::abs( hel ); +#ifndef MGONGPU_CPPSIMD const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) ); const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); - const fptype hel0 = 1. - std::abs( hel ); -#ifndef MGONGPU_CPPSIMD if( pp == 0. ) { vc[2] = cxmake( 0., 0. ); @@ -457,6 +469,10 @@ } } #else + volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); + volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); + const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); // Branch A: pp == 0. const cxtype vcA_2 = cxmake( 0, 0 ); const cxtype vcA_3 = cxmake( -hel * sqh, 0 ); @@ -487,7 +503,12 @@ else { const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0 +#ifndef MGONGPU_CPPSIMD const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) ); +#else + volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736 + const fptype_sv pt = fpsqrt( pt2 ); +#endif vc[2] = cxzero_sv(); vc[5] = cxmake( hel * pt / pp * sqh, 0. ); #ifndef MGONGPU_CPPSIMD @@ -564,6 +585,7 @@ // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -574,8 +596,8 @@ const int nh = nhel * nsf; if( fmass != 0. ) { - const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); #ifndef MGONGPU_CPPSIMD + const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); if( pp == 0. ) { // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! @@ -608,6 +630,8 @@ fo[5] = sfomeg[0] * chi[ip]; } #else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); // Branch A: pp == 0. // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses @@ -627,9 +651,10 @@ const int imB = ( 1 - nh ) / 2; const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] }; const fptype_v pp3 = fpmax( pp + pvec3, 0. ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 ( cxternary( ( pp3 == 0. ), cxmake( -nh, 0. ), cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -647,16 +672,20 @@ } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), - 0, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( ( sqp0p3 == 0. ), cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ), cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), + 0, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) }; #endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h index fd7734ce42..ffe3b84d53 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h @@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam /// and compares momenta and matrix elements with a reference file. TEST_P( MadgraphTest, CompareMomentaAndME ) { - // Set to true to dump events: - constexpr bool dumpEvents = false; - constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) - const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 3.E-2; + const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 4.E-2; // see #735 #ifdef __APPLE__ const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 3.E-2; // see #583 #else const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 2.E-3; #endif + constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + // Dump events to a new reference file? + constexpr bool dumpEvents = false; std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; while( dumpFileName.find( '/' ) != std::string::npos ) { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 8162dee42b..222d75f846 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) INCFLAGS += -I../../src +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) +override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) +override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else +override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) +export CXXNAMESUFFIX + # Dependency on test directory # Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) @@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),) TESTDIR = else ifneq ($(LOCALGTEST),) TESTDIR=$(TESTDIRLOCAL) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) TESTDIR = $(TESTDIRCOMMON) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else TESTDIR = endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk index 477a37e27c..39ed957600 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk @@ -5,28 +5,36 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Compiler-specific googletest build directory (#125 and #738) +# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk +# In epoch1/epoch2, CXXNAMESUFFIX is undefined +$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX)) +BUILDDIR = build$(CXXNAMESUFFIX) +###$(info BUILDDIR=$(BUILDDIR)) +INSTALLDIR = install$(CXXNAMESUFFIX) +###$(info INSTALLDIR=$(INSTALLDIR)) + CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 -all: googletest/install/lib64/libgtest.a +all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: git clone https://github.com/google/googletest.git -b release-1.11.0 googletest -googletest/build/Makefile: googletest/CMakeLists.txt - mkdir -p googletest/build - cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ +googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt + mkdir -p googletest/$(BUILDDIR) + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ -googletest/build/lib/libgtest.a: googletest/build/Makefile - $(MAKE) -C googletest/build +googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile + $(MAKE) -C googletest/$(BUILDDIR) # NB 'make install' is no longer supported in googletest (issue 328) # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations -googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a - mkdir -p googletest/install/lib64 - cp googletest/build/lib/lib*.a googletest/install/lib64/ - mkdir -p googletest/install/include - cp -r googletest/googletest/include/gtest googletest/install/include/ +googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a + mkdir -p googletest/$(INSTALLDIR)/lib64 + cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/ + mkdir -p googletest/$(INSTALLDIR)/include + cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/ clean: rm -rf googletest - diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h index 0d3892d026..e1299ba81e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h @@ -236,6 +236,20 @@ namespace mg5amcCpu // Functions and operators for fptype_v #ifdef MGONGPU_CPPSIMD + inline fptype_v + fpsqrt( const volatile fptype_v& v ) // volatile fixes #736 + { + // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt + fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594 + for( int i = 0; i < neppV; i++ ) + { + volatile fptype outi = 0; // volatile fixes #736 + if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); + out[i] = outi; + } + return out; + } + inline fptype_v fpsqrt( const fptype_v& v ) { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc index 5a5e1dffbf..3e6569b553 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc @@ -47,7 +47,7 @@ namespace mg5amcCpu #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #endif - exit( 0 ); + exit( 1 ); } } @@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) const fptype p1 = par0[ievt * np4 + 1]; const fptype p2 = par0[ievt * np4 + 2]; const fptype p3 = par0[ievt * np4 + 3]; - mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 ); + volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736 + if( m2 > 0 ) + mass0[ievt] = fpsqrt( (fptype)m2 ); + else + mass0[ievt] = 0; ispzgt0[ievt] = ( p3 > 0 ); ispzlt0[ievt] = ( p3 < 0 ); isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 ); diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index d838ca8455..759bbd80d8 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004617929458618164  +DEBUG: model prefixing takes 0.005298137664794922  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -176,7 +176,7 @@ INFO: Creating files in directory P1_epem_mupmum DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -210,12 +210,12 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.093 s +Wrote files for 8 helas calls in 0.092 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.175 s +ALOHA: aloha creates 3 routines in 0.167 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines @@ -254,6 +254,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.749s -user 0m1.524s -sys 0m0.212s +real 0m2.155s +user 0m1.726s +sys 0m0.362s diff --git a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc index 4f385d6435..738db319fd 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/ee_mumu.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = FFV2_3.o FFV2_0.o FFV4_0.o FFV4_3.o FFV1_0.o FFV1P0_3.o +ALOHARoutine = FFV1_0.o FFV4_3.o FFV1P0_3.o FFV2_0.o FFV4_0.o FFV2_3.o diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h index fd7734ce42..ffe3b84d53 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h @@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam /// and compares momenta and matrix elements with a reference file. TEST_P( MadgraphTest, CompareMomentaAndME ) { - // Set to true to dump events: - constexpr bool dumpEvents = false; - constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) - const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 3.E-2; + const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 4.E-2; // see #735 #ifdef __APPLE__ const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 3.E-2; // see #583 #else const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 2.E-3; #endif + constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + // Dump events to a new reference file? + constexpr bool dumpEvents = false; std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; while( dumpFileName.find( '/' ) != std::string::npos ) { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index c2f7b8e0f6..43cee0977e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) INCFLAGS += -I../../src +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) +override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) +override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else +override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) +export CXXNAMESUFFIX + # Dependency on test directory # Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) @@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),) TESTDIR = else ifneq ($(LOCALGTEST),) TESTDIR=$(TESTDIRLOCAL) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) TESTDIR = $(TESTDIRCOMMON) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else TESTDIR = endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc index cde18056a9..e40f635e46 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc @@ -47,7 +47,7 @@ namespace mg5amcCpu #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #endif - exit( 0 ); + exit( 1 ); } } @@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) const fptype p1 = par0[ievt * np4 + 1]; const fptype p2 = par0[ievt * np4 + 2]; const fptype p3 = par0[ievt * np4 + 3]; - mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 ); + volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736 + if( m2 > 0 ) + mass0[ievt] = fpsqrt( (fptype)m2 ); + else + mass0[ievt] = 0; ispzgt0[ievt] = ( p3 > 0 ); ispzlt0[ievt] = ( p3 < 0 ); isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 ); diff --git a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h index 72bdda81f8..d6827356eb 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h @@ -195,6 +195,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -205,7 +206,12 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { +#ifndef MGONGPU_CPPSIMD const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) ); +#else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); +#endif // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx: // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1 // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used @@ -254,9 +260,10 @@ namespace mg5amcCpu omega[1] = fmass / omega[0]; const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; const fptype_v pp3 = fpmax( pp + pvec3, 0 ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 cxternary( ( pp3 == 0. ), cxmake( -nh, 0 ), cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -274,16 +281,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), - fptype_sv{ 0 }, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( sqp0p3 == 0, cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ), cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) }; #endif @@ -440,6 +451,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -452,11 +464,11 @@ namespace mg5amcCpu if( vmass != 0. ) { const int nsvahl = nsv * std::abs( hel ); + const fptype hel0 = 1. - std::abs( hel ); +#ifndef MGONGPU_CPPSIMD const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) ); const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); - const fptype hel0 = 1. - std::abs( hel ); -#ifndef MGONGPU_CPPSIMD if( pp == 0. ) { vc[2] = cxmake( 0., 0. ); @@ -484,6 +496,10 @@ namespace mg5amcCpu } } #else + volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); + volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); + const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); // Branch A: pp == 0. const cxtype vcA_2 = cxmake( 0, 0 ); const cxtype vcA_3 = cxmake( -hel * sqh, 0 ); @@ -514,7 +530,12 @@ namespace mg5amcCpu else { const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0 +#ifndef MGONGPU_CPPSIMD const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) ); +#else + volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736 + const fptype_sv pt = fpsqrt( pt2 ); +#endif vc[2] = cxzero_sv(); vc[5] = cxmake( hel * pt / pp * sqh, 0. ); #ifndef MGONGPU_CPPSIMD @@ -591,6 +612,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -601,8 +623,8 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { - const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); #ifndef MGONGPU_CPPSIMD + const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); if( pp == 0. ) { // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! @@ -635,6 +657,8 @@ namespace mg5amcCpu fo[5] = sfomeg[0] * chi[ip]; } #else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); // Branch A: pp == 0. // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses @@ -654,9 +678,10 @@ namespace mg5amcCpu const int imB = ( 1 - nh ) / 2; const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] }; const fptype_v pp3 = fpmax( pp + pvec3, 0. ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 ( cxternary( ( pp3 == 0. ), cxmake( -nh, 0. ), cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -674,16 +699,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), - 0, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( ( sqp0p3 == 0. ), cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ), cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), + 0, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) }; #endif diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h index 0d3892d026..e1299ba81e 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h @@ -236,6 +236,20 @@ namespace mg5amcCpu // Functions and operators for fptype_v #ifdef MGONGPU_CPPSIMD + inline fptype_v + fpsqrt( const volatile fptype_v& v ) // volatile fixes #736 + { + // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt + fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594 + for( int i = 0; i < neppV; i++ ) + { + volatile fptype outi = 0; // volatile fixes #736 + if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); + out[i] = outi; + } + return out; + } + inline fptype_v fpsqrt( const fptype_v& v ) { diff --git a/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk b/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk index 477a37e27c..39ed957600 100644 --- a/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk @@ -5,28 +5,36 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Compiler-specific googletest build directory (#125 and #738) +# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk +# In epoch1/epoch2, CXXNAMESUFFIX is undefined +$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX)) +BUILDDIR = build$(CXXNAMESUFFIX) +###$(info BUILDDIR=$(BUILDDIR)) +INSTALLDIR = install$(CXXNAMESUFFIX) +###$(info INSTALLDIR=$(INSTALLDIR)) + CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 -all: googletest/install/lib64/libgtest.a +all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: git clone https://github.com/google/googletest.git -b release-1.11.0 googletest -googletest/build/Makefile: googletest/CMakeLists.txt - mkdir -p googletest/build - cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ +googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt + mkdir -p googletest/$(BUILDDIR) + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ -googletest/build/lib/libgtest.a: googletest/build/Makefile - $(MAKE) -C googletest/build +googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile + $(MAKE) -C googletest/$(BUILDDIR) # NB 'make install' is no longer supported in googletest (issue 328) # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations -googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a - mkdir -p googletest/install/lib64 - cp googletest/build/lib/lib*.a googletest/install/lib64/ - mkdir -p googletest/install/include - cp -r googletest/googletest/include/gtest googletest/install/include/ +googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a + mkdir -p googletest/$(INSTALLDIR)/lib64 + cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/ + mkdir -p googletest/$(INSTALLDIR)/include + cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/ clean: rm -rf googletest - diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index bea370fede..275576a43d 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0048067569732666016  +DEBUG: model prefixing takes 0.0053577423095703125  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -202,7 +202,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.232 s +ALOHA: aloha creates 4 routines in 0.216 s FFV1 FFV1 FFV2 @@ -225,6 +225,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.660s -user 0m0.584s -sys 0m0.063s +real 0m0.960s +user 0m0.762s +sys 0m0.119s diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h index fd7734ce42..ffe3b84d53 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h @@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam /// and compares momenta and matrix elements with a reference file. TEST_P( MadgraphTest, CompareMomentaAndME ) { - // Set to true to dump events: - constexpr bool dumpEvents = false; - constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) - const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 3.E-2; + const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 4.E-2; // see #735 #ifdef __APPLE__ const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 3.E-2; // see #583 #else const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 2.E-3; #endif + constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + // Dump events to a new reference file? + constexpr bool dumpEvents = false; std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; while( dumpFileName.find( '/' ) != std::string::npos ) { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index c2f7b8e0f6..43cee0977e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) INCFLAGS += -I../../src +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) +override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) +override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else +override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) +export CXXNAMESUFFIX + # Dependency on test directory # Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) @@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),) TESTDIR = else ifneq ($(LOCALGTEST),) TESTDIR=$(TESTDIRLOCAL) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) TESTDIR = $(TESTDIRCOMMON) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else TESTDIR = endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc index cde18056a9..e40f635e46 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc @@ -47,7 +47,7 @@ namespace mg5amcCpu #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #endif - exit( 0 ); + exit( 1 ); } } @@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) const fptype p1 = par0[ievt * np4 + 1]; const fptype p2 = par0[ievt * np4 + 2]; const fptype p3 = par0[ievt * np4 + 3]; - mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 ); + volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736 + if( m2 > 0 ) + mass0[ievt] = fpsqrt( (fptype)m2 ); + else + mass0[ievt] = 0; ispzgt0[ievt] = ( p3 > 0 ); ispzlt0[ievt] = ( p3 < 0 ); isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 ); diff --git a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h index 72bdda81f8..d6827356eb 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h @@ -195,6 +195,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -205,7 +206,12 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { +#ifndef MGONGPU_CPPSIMD const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) ); +#else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); +#endif // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx: // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1 // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used @@ -254,9 +260,10 @@ namespace mg5amcCpu omega[1] = fmass / omega[0]; const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; const fptype_v pp3 = fpmax( pp + pvec3, 0 ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 cxternary( ( pp3 == 0. ), cxmake( -nh, 0 ), cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -274,16 +281,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), - fptype_sv{ 0 }, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( sqp0p3 == 0, cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ), cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) }; #endif @@ -440,6 +451,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -452,11 +464,11 @@ namespace mg5amcCpu if( vmass != 0. ) { const int nsvahl = nsv * std::abs( hel ); + const fptype hel0 = 1. - std::abs( hel ); +#ifndef MGONGPU_CPPSIMD const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) ); const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); - const fptype hel0 = 1. - std::abs( hel ); -#ifndef MGONGPU_CPPSIMD if( pp == 0. ) { vc[2] = cxmake( 0., 0. ); @@ -484,6 +496,10 @@ namespace mg5amcCpu } } #else + volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); + volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); + const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); // Branch A: pp == 0. const cxtype vcA_2 = cxmake( 0, 0 ); const cxtype vcA_3 = cxmake( -hel * sqh, 0 ); @@ -514,7 +530,12 @@ namespace mg5amcCpu else { const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0 +#ifndef MGONGPU_CPPSIMD const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) ); +#else + volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736 + const fptype_sv pt = fpsqrt( pt2 ); +#endif vc[2] = cxzero_sv(); vc[5] = cxmake( hel * pt / pp * sqh, 0. ); #ifndef MGONGPU_CPPSIMD @@ -591,6 +612,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -601,8 +623,8 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { - const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); #ifndef MGONGPU_CPPSIMD + const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); if( pp == 0. ) { // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! @@ -635,6 +657,8 @@ namespace mg5amcCpu fo[5] = sfomeg[0] * chi[ip]; } #else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); // Branch A: pp == 0. // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses @@ -654,9 +678,10 @@ namespace mg5amcCpu const int imB = ( 1 - nh ) / 2; const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] }; const fptype_v pp3 = fpmax( pp + pvec3, 0. ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 ( cxternary( ( pp3 == 0. ), cxmake( -nh, 0. ), cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -674,16 +699,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), - 0, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( ( sqp0p3 == 0. ), cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ), cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), + 0, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) }; #endif diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h index 0d3892d026..e1299ba81e 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h @@ -236,6 +236,20 @@ namespace mg5amcCpu // Functions and operators for fptype_v #ifdef MGONGPU_CPPSIMD + inline fptype_v + fpsqrt( const volatile fptype_v& v ) // volatile fixes #736 + { + // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt + fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594 + for( int i = 0; i < neppV; i++ ) + { + volatile fptype outi = 0; // volatile fixes #736 + if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); + out[i] = outi; + } + return out; + } + inline fptype_v fpsqrt( const fptype_v& v ) { diff --git a/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk b/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk index 477a37e27c..39ed957600 100644 --- a/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk @@ -5,28 +5,36 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Compiler-specific googletest build directory (#125 and #738) +# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk +# In epoch1/epoch2, CXXNAMESUFFIX is undefined +$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX)) +BUILDDIR = build$(CXXNAMESUFFIX) +###$(info BUILDDIR=$(BUILDDIR)) +INSTALLDIR = install$(CXXNAMESUFFIX) +###$(info INSTALLDIR=$(INSTALLDIR)) + CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 -all: googletest/install/lib64/libgtest.a +all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: git clone https://github.com/google/googletest.git -b release-1.11.0 googletest -googletest/build/Makefile: googletest/CMakeLists.txt - mkdir -p googletest/build - cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ +googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt + mkdir -p googletest/$(BUILDDIR) + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ -googletest/build/lib/libgtest.a: googletest/build/Makefile - $(MAKE) -C googletest/build +googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile + $(MAKE) -C googletest/$(BUILDDIR) # NB 'make install' is no longer supported in googletest (issue 328) # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations -googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a - mkdir -p googletest/install/lib64 - cp googletest/build/lib/lib*.a googletest/install/lib64/ - mkdir -p googletest/install/include - cp -r googletest/googletest/include/gtest googletest/install/include/ +googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a + mkdir -p googletest/$(INSTALLDIR)/lib64 + cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/ + mkdir -p googletest/$(INSTALLDIR)/include + cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/ clean: rm -rf googletest - diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index fe51359c33..dd4e2c0fba 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0047855377197265625  +DEBUG: model prefixing takes 0.0052073001861572266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.008 s +1 processes with 3 diagrams generated in 0.007 s Total: 1 processes with 3 diagrams output madevent CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttx DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -214,16 +214,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.104 s +Wrote files for 10 helas calls in 0.102 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.259 s +ALOHA: aloha creates 2 routines in 0.114 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.114 s +ALOHA: aloha creates 4 routines in 0.111 s VVV1 FFV1 FFV1 @@ -254,6 +254,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.770s -user 0m1.390s -sys 0m0.213s +real 0m2.501s +user 0m1.613s +sys 0m0.365s diff --git a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc index 59e590217d..5597c614b0 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = FFV1_1.o FFV1_2.o VVV1P0_1.o FFV1_0.o +ALOHARoutine = FFV1_1.o FFV1_0.o FFV1_2.o VVV1P0_1.o diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h index fd7734ce42..ffe3b84d53 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h @@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam /// and compares momenta and matrix elements with a reference file. TEST_P( MadgraphTest, CompareMomentaAndME ) { - // Set to true to dump events: - constexpr bool dumpEvents = false; - constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) - const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 3.E-2; + const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 4.E-2; // see #735 #ifdef __APPLE__ const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 3.E-2; // see #583 #else const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 2.E-3; #endif + constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + // Dump events to a new reference file? + constexpr bool dumpEvents = false; std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; while( dumpFileName.find( '/' ) != std::string::npos ) { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index c2f7b8e0f6..43cee0977e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) INCFLAGS += -I../../src +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) +override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) +override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else +override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) +export CXXNAMESUFFIX + # Dependency on test directory # Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) @@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),) TESTDIR = else ifneq ($(LOCALGTEST),) TESTDIR=$(TESTDIRLOCAL) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) TESTDIR = $(TESTDIRCOMMON) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else TESTDIR = endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc index cde18056a9..e40f635e46 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc @@ -47,7 +47,7 @@ namespace mg5amcCpu #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #endif - exit( 0 ); + exit( 1 ); } } @@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) const fptype p1 = par0[ievt * np4 + 1]; const fptype p2 = par0[ievt * np4 + 2]; const fptype p3 = par0[ievt * np4 + 3]; - mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 ); + volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736 + if( m2 > 0 ) + mass0[ievt] = fpsqrt( (fptype)m2 ); + else + mass0[ievt] = 0; ispzgt0[ievt] = ( p3 > 0 ); ispzlt0[ievt] = ( p3 < 0 ); isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 ); diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h index a6f1b62e82..f6d1694588 100644 --- a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h @@ -195,6 +195,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -205,7 +206,12 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { +#ifndef MGONGPU_CPPSIMD const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) ); +#else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); +#endif // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx: // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1 // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used @@ -254,9 +260,10 @@ namespace mg5amcCpu omega[1] = fmass / omega[0]; const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; const fptype_v pp3 = fpmax( pp + pvec3, 0 ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 cxternary( ( pp3 == 0. ), cxmake( -nh, 0 ), cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -274,16 +281,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), - fptype_sv{ 0 }, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( sqp0p3 == 0, cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ), cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) }; #endif @@ -440,6 +451,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -452,11 +464,11 @@ namespace mg5amcCpu if( vmass != 0. ) { const int nsvahl = nsv * std::abs( hel ); + const fptype hel0 = 1. - std::abs( hel ); +#ifndef MGONGPU_CPPSIMD const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) ); const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); - const fptype hel0 = 1. - std::abs( hel ); -#ifndef MGONGPU_CPPSIMD if( pp == 0. ) { vc[2] = cxmake( 0., 0. ); @@ -484,6 +496,10 @@ namespace mg5amcCpu } } #else + volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); + volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); + const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); // Branch A: pp == 0. const cxtype vcA_2 = cxmake( 0, 0 ); const cxtype vcA_3 = cxmake( -hel * sqh, 0 ); @@ -514,7 +530,12 @@ namespace mg5amcCpu else { const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0 +#ifndef MGONGPU_CPPSIMD const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) ); +#else + volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736 + const fptype_sv pt = fpsqrt( pt2 ); +#endif vc[2] = cxzero_sv(); vc[5] = cxmake( hel * pt / pp * sqh, 0. ); #ifndef MGONGPU_CPPSIMD @@ -591,6 +612,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -601,8 +623,8 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { - const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); #ifndef MGONGPU_CPPSIMD + const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); if( pp == 0. ) { // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! @@ -635,6 +657,8 @@ namespace mg5amcCpu fo[5] = sfomeg[0] * chi[ip]; } #else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); // Branch A: pp == 0. // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses @@ -654,9 +678,10 @@ namespace mg5amcCpu const int imB = ( 1 - nh ) / 2; const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] }; const fptype_v pp3 = fpmax( pp + pvec3, 0. ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 ( cxternary( ( pp3 == 0. ), cxmake( -nh, 0. ), cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -674,16 +699,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), - 0, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( ( sqp0p3 == 0. ), cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ), cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), + 0, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) }; #endif diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h index 0d3892d026..e1299ba81e 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h @@ -236,6 +236,20 @@ namespace mg5amcCpu // Functions and operators for fptype_v #ifdef MGONGPU_CPPSIMD + inline fptype_v + fpsqrt( const volatile fptype_v& v ) // volatile fixes #736 + { + // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt + fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594 + for( int i = 0; i < neppV; i++ ) + { + volatile fptype outi = 0; // volatile fixes #736 + if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); + out[i] = outi; + } + return out; + } + inline fptype_v fpsqrt( const fptype_v& v ) { diff --git a/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk index 477a37e27c..39ed957600 100644 --- a/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk @@ -5,28 +5,36 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Compiler-specific googletest build directory (#125 and #738) +# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk +# In epoch1/epoch2, CXXNAMESUFFIX is undefined +$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX)) +BUILDDIR = build$(CXXNAMESUFFIX) +###$(info BUILDDIR=$(BUILDDIR)) +INSTALLDIR = install$(CXXNAMESUFFIX) +###$(info INSTALLDIR=$(INSTALLDIR)) + CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 -all: googletest/install/lib64/libgtest.a +all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: git clone https://github.com/google/googletest.git -b release-1.11.0 googletest -googletest/build/Makefile: googletest/CMakeLists.txt - mkdir -p googletest/build - cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ +googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt + mkdir -p googletest/$(BUILDDIR) + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ -googletest/build/lib/libgtest.a: googletest/build/Makefile - $(MAKE) -C googletest/build +googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile + $(MAKE) -C googletest/$(BUILDDIR) # NB 'make install' is no longer supported in googletest (issue 328) # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations -googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a - mkdir -p googletest/install/lib64 - cp googletest/build/lib/lib*.a googletest/install/lib64/ - mkdir -p googletest/install/include - cp -r googletest/googletest/include/gtest googletest/install/include/ +googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a + mkdir -p googletest/$(INSTALLDIR)/lib64 + cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/ + mkdir -p googletest/$(INSTALLDIR)/include + cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/ clean: rm -rf googletest - diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index aa517f68e9..d85f35fcaa 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0047032833099365234  +DEBUG: model prefixing takes 0.005174160003662109  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -205,7 +205,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.123 s +ALOHA: aloha creates 2 routines in 0.114 s VVV1 FFV1 FFV1 @@ -227,6 +227,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. a DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m1.339s -user 0m0.494s -sys 0m0.060s +real 0m0.866s +user 0m0.677s +sys 0m0.118s diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h index fd7734ce42..ffe3b84d53 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h @@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam /// and compares momenta and matrix elements with a reference file. TEST_P( MadgraphTest, CompareMomentaAndME ) { - // Set to true to dump events: - constexpr bool dumpEvents = false; - constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) - const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 3.E-2; + const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 4.E-2; // see #735 #ifdef __APPLE__ const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 3.E-2; // see #583 #else const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 2.E-3; #endif + constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + // Dump events to a new reference file? + constexpr bool dumpEvents = false; std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; while( dumpFileName.find( '/' ) != std::string::npos ) { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index c2f7b8e0f6..43cee0977e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) INCFLAGS += -I../../src +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) +override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) +override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else +override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) +export CXXNAMESUFFIX + # Dependency on test directory # Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) @@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),) TESTDIR = else ifneq ($(LOCALGTEST),) TESTDIR=$(TESTDIRLOCAL) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) TESTDIR = $(TESTDIRCOMMON) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else TESTDIR = endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc index cde18056a9..e40f635e46 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc @@ -47,7 +47,7 @@ namespace mg5amcCpu #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #endif - exit( 0 ); + exit( 1 ); } } @@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) const fptype p1 = par0[ievt * np4 + 1]; const fptype p2 = par0[ievt * np4 + 2]; const fptype p3 = par0[ievt * np4 + 3]; - mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 ); + volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736 + if( m2 > 0 ) + mass0[ievt] = fpsqrt( (fptype)m2 ); + else + mass0[ievt] = 0; ispzgt0[ievt] = ( p3 > 0 ); ispzlt0[ievt] = ( p3 < 0 ); isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 ); diff --git a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h index a6f1b62e82..f6d1694588 100644 --- a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h @@ -195,6 +195,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -205,7 +206,12 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { +#ifndef MGONGPU_CPPSIMD const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) ); +#else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); +#endif // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx: // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1 // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used @@ -254,9 +260,10 @@ namespace mg5amcCpu omega[1] = fmass / omega[0]; const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; const fptype_v pp3 = fpmax( pp + pvec3, 0 ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 cxternary( ( pp3 == 0. ), cxmake( -nh, 0 ), cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -274,16 +281,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), - fptype_sv{ 0 }, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( sqp0p3 == 0, cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ), cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) }; #endif @@ -440,6 +451,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -452,11 +464,11 @@ namespace mg5amcCpu if( vmass != 0. ) { const int nsvahl = nsv * std::abs( hel ); + const fptype hel0 = 1. - std::abs( hel ); +#ifndef MGONGPU_CPPSIMD const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) ); const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); - const fptype hel0 = 1. - std::abs( hel ); -#ifndef MGONGPU_CPPSIMD if( pp == 0. ) { vc[2] = cxmake( 0., 0. ); @@ -484,6 +496,10 @@ namespace mg5amcCpu } } #else + volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); + volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); + const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); // Branch A: pp == 0. const cxtype vcA_2 = cxmake( 0, 0 ); const cxtype vcA_3 = cxmake( -hel * sqh, 0 ); @@ -514,7 +530,12 @@ namespace mg5amcCpu else { const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0 +#ifndef MGONGPU_CPPSIMD const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) ); +#else + volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736 + const fptype_sv pt = fpsqrt( pt2 ); +#endif vc[2] = cxzero_sv(); vc[5] = cxmake( hel * pt / pp * sqh, 0. ); #ifndef MGONGPU_CPPSIMD @@ -591,6 +612,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -601,8 +623,8 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { - const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); #ifndef MGONGPU_CPPSIMD + const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); if( pp == 0. ) { // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! @@ -635,6 +657,8 @@ namespace mg5amcCpu fo[5] = sfomeg[0] * chi[ip]; } #else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); // Branch A: pp == 0. // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses @@ -654,9 +678,10 @@ namespace mg5amcCpu const int imB = ( 1 - nh ) / 2; const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] }; const fptype_v pp3 = fpmax( pp + pvec3, 0. ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 ( cxternary( ( pp3 == 0. ), cxmake( -nh, 0. ), cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -674,16 +699,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), - 0, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( ( sqp0p3 == 0. ), cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ), cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), + 0, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) }; #endif diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h index 0d3892d026..e1299ba81e 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h @@ -236,6 +236,20 @@ namespace mg5amcCpu // Functions and operators for fptype_v #ifdef MGONGPU_CPPSIMD + inline fptype_v + fpsqrt( const volatile fptype_v& v ) // volatile fixes #736 + { + // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt + fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594 + for( int i = 0; i < neppV; i++ ) + { + volatile fptype outi = 0; // volatile fixes #736 + if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); + out[i] = outi; + } + return out; + } + inline fptype_v fpsqrt( const fptype_v& v ) { diff --git a/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk index 477a37e27c..39ed957600 100644 --- a/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk @@ -5,28 +5,36 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Compiler-specific googletest build directory (#125 and #738) +# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk +# In epoch1/epoch2, CXXNAMESUFFIX is undefined +$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX)) +BUILDDIR = build$(CXXNAMESUFFIX) +###$(info BUILDDIR=$(BUILDDIR)) +INSTALLDIR = install$(CXXNAMESUFFIX) +###$(info INSTALLDIR=$(INSTALLDIR)) + CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 -all: googletest/install/lib64/libgtest.a +all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: git clone https://github.com/google/googletest.git -b release-1.11.0 googletest -googletest/build/Makefile: googletest/CMakeLists.txt - mkdir -p googletest/build - cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ +googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt + mkdir -p googletest/$(BUILDDIR) + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ -googletest/build/lib/libgtest.a: googletest/build/Makefile - $(MAKE) -C googletest/build +googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile + $(MAKE) -C googletest/$(BUILDDIR) # NB 'make install' is no longer supported in googletest (issue 328) # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations -googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a - mkdir -p googletest/install/lib64 - cp googletest/build/lib/lib*.a googletest/install/lib64/ - mkdir -p googletest/install/include - cp -r googletest/googletest/include/gtest googletest/install/include/ +googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a + mkdir -p googletest/$(INSTALLDIR)/lib64 + cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/ + mkdir -p googletest/$(INSTALLDIR)/include + cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/ clean: rm -rf googletest - diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index e051b99bce..c4862ef786 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~; add process g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004668235778808594  +DEBUG: model prefixing takes 0.0051081180572509766  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -186,7 +186,7 @@ INFO: Creating files in directory P2_gg_ttxg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -228,7 +228,7 @@ INFO: Creating files in directory P1_gg_ttx DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -264,15 +264,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 2 subprocesses (19 diagrams) in 0.040 s -Wrote files for 46 helas calls in 0.249 s +Generated helas calls for 2 subprocesses (19 diagrams) in 0.039 s +Wrote files for 46 helas calls in 0.247 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.285 s +ALOHA: aloha creates 5 routines in 0.265 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -280,7 +280,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.267 s +ALOHA: aloha creates 10 routines in 0.281 s VVV1 VVV1 FFV1 @@ -316,6 +316,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.128s -user 0m1.898s -sys 0m0.222s +real 0m2.560s +user 0m2.136s +sys 0m0.349s diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_file.inc index 4f2ef3d0d8..50c12b0804 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_tt01g.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = VVVV4P0_1.o VVVV3P0_1.o VVVV1P0_1.o FFV1_1.o FFV1_2.o VVV1P0_1.o VVV1_0.o FFV1_0.o FFV1P0_3.o +ALOHARoutine = FFV1_1.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h index fd7734ce42..ffe3b84d53 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h @@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam /// and compares momenta and matrix elements with a reference file. TEST_P( MadgraphTest, CompareMomentaAndME ) { - // Set to true to dump events: - constexpr bool dumpEvents = false; - constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) - const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 3.E-2; + const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 4.E-2; // see #735 #ifdef __APPLE__ const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 3.E-2; // see #583 #else const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 2.E-3; #endif + constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + // Dump events to a new reference file? + constexpr bool dumpEvents = false; std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; while( dumpFileName.find( '/' ) != std::string::npos ) { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index c2f7b8e0f6..43cee0977e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) INCFLAGS += -I../../src +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) +override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) +override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else +override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) +export CXXNAMESUFFIX + # Dependency on test directory # Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) @@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),) TESTDIR = else ifneq ($(LOCALGTEST),) TESTDIR=$(TESTDIRLOCAL) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) TESTDIR = $(TESTDIRCOMMON) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else TESTDIR = endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc index cde18056a9..e40f635e46 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc @@ -47,7 +47,7 @@ namespace mg5amcCpu #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #endif - exit( 0 ); + exit( 1 ); } } @@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) const fptype p1 = par0[ievt * np4 + 1]; const fptype p2 = par0[ievt * np4 + 2]; const fptype p3 = par0[ievt * np4 + 3]; - mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 ); + volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736 + if( m2 > 0 ) + mass0[ievt] = fpsqrt( (fptype)m2 ); + else + mass0[ievt] = 0; ispzgt0[ievt] = ( p3 > 0 ); ispzlt0[ievt] = ( p3 < 0 ); isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 ); diff --git a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h index dd17387b4e..03afcd6a5f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h @@ -195,6 +195,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -205,7 +206,12 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { +#ifndef MGONGPU_CPPSIMD const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) ); +#else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); +#endif // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx: // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1 // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used @@ -254,9 +260,10 @@ namespace mg5amcCpu omega[1] = fmass / omega[0]; const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; const fptype_v pp3 = fpmax( pp + pvec3, 0 ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 cxternary( ( pp3 == 0. ), cxmake( -nh, 0 ), cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -274,16 +281,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), - fptype_sv{ 0 }, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( sqp0p3 == 0, cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ), cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) }; #endif @@ -440,6 +451,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -452,11 +464,11 @@ namespace mg5amcCpu if( vmass != 0. ) { const int nsvahl = nsv * std::abs( hel ); + const fptype hel0 = 1. - std::abs( hel ); +#ifndef MGONGPU_CPPSIMD const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) ); const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); - const fptype hel0 = 1. - std::abs( hel ); -#ifndef MGONGPU_CPPSIMD if( pp == 0. ) { vc[2] = cxmake( 0., 0. ); @@ -484,6 +496,10 @@ namespace mg5amcCpu } } #else + volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); + volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); + const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); // Branch A: pp == 0. const cxtype vcA_2 = cxmake( 0, 0 ); const cxtype vcA_3 = cxmake( -hel * sqh, 0 ); @@ -514,7 +530,12 @@ namespace mg5amcCpu else { const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0 +#ifndef MGONGPU_CPPSIMD const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) ); +#else + volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736 + const fptype_sv pt = fpsqrt( pt2 ); +#endif vc[2] = cxzero_sv(); vc[5] = cxmake( hel * pt / pp * sqh, 0. ); #ifndef MGONGPU_CPPSIMD @@ -591,6 +612,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -601,8 +623,8 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { - const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); #ifndef MGONGPU_CPPSIMD + const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); if( pp == 0. ) { // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! @@ -635,6 +657,8 @@ namespace mg5amcCpu fo[5] = sfomeg[0] * chi[ip]; } #else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); // Branch A: pp == 0. // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses @@ -654,9 +678,10 @@ namespace mg5amcCpu const int imB = ( 1 - nh ) / 2; const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] }; const fptype_v pp3 = fpmax( pp + pvec3, 0. ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 ( cxternary( ( pp3 == 0. ), cxmake( -nh, 0. ), cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -674,16 +699,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), - 0, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( ( sqp0p3 == 0. ), cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ), cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), + 0, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) }; #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h index 0d3892d026..e1299ba81e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h @@ -236,6 +236,20 @@ namespace mg5amcCpu // Functions and operators for fptype_v #ifdef MGONGPU_CPPSIMD + inline fptype_v + fpsqrt( const volatile fptype_v& v ) // volatile fixes #736 + { + // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt + fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594 + for( int i = 0; i < neppV; i++ ) + { + volatile fptype outi = 0; // volatile fixes #736 + if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); + out[i] = outi; + } + return out; + } + inline fptype_v fpsqrt( const fptype_v& v ) { diff --git a/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk index 477a37e27c..39ed957600 100644 --- a/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk @@ -5,28 +5,36 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Compiler-specific googletest build directory (#125 and #738) +# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk +# In epoch1/epoch2, CXXNAMESUFFIX is undefined +$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX)) +BUILDDIR = build$(CXXNAMESUFFIX) +###$(info BUILDDIR=$(BUILDDIR)) +INSTALLDIR = install$(CXXNAMESUFFIX) +###$(info INSTALLDIR=$(INSTALLDIR)) + CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 -all: googletest/install/lib64/libgtest.a +all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: git clone https://github.com/google/googletest.git -b release-1.11.0 googletest -googletest/build/Makefile: googletest/CMakeLists.txt - mkdir -p googletest/build - cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ +googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt + mkdir -p googletest/$(BUILDDIR) + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ -googletest/build/lib/libgtest.a: googletest/build/Makefile - $(MAKE) -C googletest/build +googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile + $(MAKE) -C googletest/$(BUILDDIR) # NB 'make install' is no longer supported in googletest (issue 328) # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations -googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a - mkdir -p googletest/install/lib64 - cp googletest/build/lib/lib*.a googletest/install/lib64/ - mkdir -p googletest/install/include - cp -r googletest/googletest/include/gtest googletest/install/include/ +googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a + mkdir -p googletest/$(INSTALLDIR)/lib64 + cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/ + mkdir -p googletest/$(INSTALLDIR)/include + cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/ clean: rm -rf googletest - diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 5b2ceae09b..f3823147cb 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004907846450805664  +DEBUG: model prefixing takes 0.005174398422241211  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -215,15 +215,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.034 s -Wrote files for 36 helas calls in 0.153 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.035 s +Wrote files for 36 helas calls in 0.151 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.277 s +ALOHA: aloha creates 5 routines in 0.265 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -231,7 +231,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.261 s +ALOHA: aloha creates 10 routines in 0.284 s VVV1 VVV1 FFV1 @@ -267,6 +267,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.058s -user 0m1.793s -sys 0m0.205s +real 0m2.462s +user 0m2.051s +sys 0m0.329s diff --git a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc index 4f2ef3d0d8..50c12b0804 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_ttg.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = VVVV4P0_1.o VVVV3P0_1.o VVVV1P0_1.o FFV1_1.o FFV1_2.o VVV1P0_1.o VVV1_0.o FFV1_0.o FFV1P0_3.o +ALOHARoutine = FFV1_1.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h index fd7734ce42..ffe3b84d53 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h @@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam /// and compares momenta and matrix elements with a reference file. TEST_P( MadgraphTest, CompareMomentaAndME ) { - // Set to true to dump events: - constexpr bool dumpEvents = false; - constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) - const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 3.E-2; + const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 4.E-2; // see #735 #ifdef __APPLE__ const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 3.E-2; // see #583 #else const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 2.E-3; #endif + constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + // Dump events to a new reference file? + constexpr bool dumpEvents = false; std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; while( dumpFileName.find( '/' ) != std::string::npos ) { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index c2f7b8e0f6..43cee0977e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) INCFLAGS += -I../../src +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) +override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) +override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else +override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) +export CXXNAMESUFFIX + # Dependency on test directory # Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) @@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),) TESTDIR = else ifneq ($(LOCALGTEST),) TESTDIR=$(TESTDIRLOCAL) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) TESTDIR = $(TESTDIRCOMMON) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else TESTDIR = endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc index cde18056a9..e40f635e46 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc @@ -47,7 +47,7 @@ namespace mg5amcCpu #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #endif - exit( 0 ); + exit( 1 ); } } @@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) const fptype p1 = par0[ievt * np4 + 1]; const fptype p2 = par0[ievt * np4 + 2]; const fptype p3 = par0[ievt * np4 + 3]; - mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 ); + volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736 + if( m2 > 0 ) + mass0[ievt] = fpsqrt( (fptype)m2 ); + else + mass0[ievt] = 0; ispzgt0[ievt] = ( p3 > 0 ); ispzlt0[ievt] = ( p3 < 0 ); isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 ); diff --git a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h index dd17387b4e..03afcd6a5f 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h @@ -195,6 +195,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -205,7 +206,12 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { +#ifndef MGONGPU_CPPSIMD const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) ); +#else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); +#endif // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx: // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1 // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used @@ -254,9 +260,10 @@ namespace mg5amcCpu omega[1] = fmass / omega[0]; const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; const fptype_v pp3 = fpmax( pp + pvec3, 0 ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 cxternary( ( pp3 == 0. ), cxmake( -nh, 0 ), cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -274,16 +281,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), - fptype_sv{ 0 }, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( sqp0p3 == 0, cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ), cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) }; #endif @@ -440,6 +451,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -452,11 +464,11 @@ namespace mg5amcCpu if( vmass != 0. ) { const int nsvahl = nsv * std::abs( hel ); + const fptype hel0 = 1. - std::abs( hel ); +#ifndef MGONGPU_CPPSIMD const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) ); const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); - const fptype hel0 = 1. - std::abs( hel ); -#ifndef MGONGPU_CPPSIMD if( pp == 0. ) { vc[2] = cxmake( 0., 0. ); @@ -484,6 +496,10 @@ namespace mg5amcCpu } } #else + volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); + volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); + const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); // Branch A: pp == 0. const cxtype vcA_2 = cxmake( 0, 0 ); const cxtype vcA_3 = cxmake( -hel * sqh, 0 ); @@ -514,7 +530,12 @@ namespace mg5amcCpu else { const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0 +#ifndef MGONGPU_CPPSIMD const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) ); +#else + volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736 + const fptype_sv pt = fpsqrt( pt2 ); +#endif vc[2] = cxzero_sv(); vc[5] = cxmake( hel * pt / pp * sqh, 0. ); #ifndef MGONGPU_CPPSIMD @@ -591,6 +612,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -601,8 +623,8 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { - const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); #ifndef MGONGPU_CPPSIMD + const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); if( pp == 0. ) { // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! @@ -635,6 +657,8 @@ namespace mg5amcCpu fo[5] = sfomeg[0] * chi[ip]; } #else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); // Branch A: pp == 0. // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses @@ -654,9 +678,10 @@ namespace mg5amcCpu const int imB = ( 1 - nh ) / 2; const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] }; const fptype_v pp3 = fpmax( pp + pvec3, 0. ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 ( cxternary( ( pp3 == 0. ), cxmake( -nh, 0. ), cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -674,16 +699,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), - 0, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( ( sqp0p3 == 0. ), cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ), cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), + 0, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) }; #endif diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h index 0d3892d026..e1299ba81e 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h @@ -236,6 +236,20 @@ namespace mg5amcCpu // Functions and operators for fptype_v #ifdef MGONGPU_CPPSIMD + inline fptype_v + fpsqrt( const volatile fptype_v& v ) // volatile fixes #736 + { + // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt + fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594 + for( int i = 0; i < neppV; i++ ) + { + volatile fptype outi = 0; // volatile fixes #736 + if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); + out[i] = outi; + } + return out; + } + inline fptype_v fpsqrt( const fptype_v& v ) { diff --git a/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk index 477a37e27c..39ed957600 100644 --- a/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk @@ -5,28 +5,36 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Compiler-specific googletest build directory (#125 and #738) +# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk +# In epoch1/epoch2, CXXNAMESUFFIX is undefined +$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX)) +BUILDDIR = build$(CXXNAMESUFFIX) +###$(info BUILDDIR=$(BUILDDIR)) +INSTALLDIR = install$(CXXNAMESUFFIX) +###$(info INSTALLDIR=$(INSTALLDIR)) + CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 -all: googletest/install/lib64/libgtest.a +all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: git clone https://github.com/google/googletest.git -b release-1.11.0 googletest -googletest/build/Makefile: googletest/CMakeLists.txt - mkdir -p googletest/build - cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ +googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt + mkdir -p googletest/$(BUILDDIR) + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ -googletest/build/lib/libgtest.a: googletest/build/Makefile - $(MAKE) -C googletest/build +googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile + $(MAKE) -C googletest/$(BUILDDIR) # NB 'make install' is no longer supported in googletest (issue 328) # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations -googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a - mkdir -p googletest/install/lib64 - cp googletest/build/lib/lib*.a googletest/install/lib64/ - mkdir -p googletest/install/include - cp -r googletest/googletest/include/gtest googletest/install/include/ +googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a + mkdir -p googletest/$(INSTALLDIR)/lib64 + cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/ + mkdir -p googletest/$(INSTALLDIR)/include + cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/ clean: rm -rf googletest - diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 66f01e21f5..7ec640308e 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004968166351318359  +DEBUG: model prefixing takes 0.0051500797271728516  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.020 s +1 processes with 16 diagrams generated in 0.019 s Total: 1 processes with 16 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -210,7 +210,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.292 s +ALOHA: aloha creates 5 routines in 0.269 s VVV1 VVV1 FFV1 @@ -237,6 +237,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.907s -user 0m0.720s -sys 0m0.069s +real 0m1.092s +user 0m0.905s +sys 0m0.108s diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h index fd7734ce42..ffe3b84d53 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h @@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam /// and compares momenta and matrix elements with a reference file. TEST_P( MadgraphTest, CompareMomentaAndME ) { - // Set to true to dump events: - constexpr bool dumpEvents = false; - constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) - const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 3.E-2; + const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 4.E-2; // see #735 #ifdef __APPLE__ const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 3.E-2; // see #583 #else const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 2.E-3; #endif + constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + // Dump events to a new reference file? + constexpr bool dumpEvents = false; std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; while( dumpFileName.find( '/' ) != std::string::npos ) { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index c2f7b8e0f6..43cee0977e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) INCFLAGS += -I../../src +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) +override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) +override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else +override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) +export CXXNAMESUFFIX + # Dependency on test directory # Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) @@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),) TESTDIR = else ifneq ($(LOCALGTEST),) TESTDIR=$(TESTDIRLOCAL) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) TESTDIR = $(TESTDIRCOMMON) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else TESTDIR = endif diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc index cde18056a9..e40f635e46 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc @@ -47,7 +47,7 @@ namespace mg5amcCpu #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #endif - exit( 0 ); + exit( 1 ); } } @@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) const fptype p1 = par0[ievt * np4 + 1]; const fptype p2 = par0[ievt * np4 + 2]; const fptype p3 = par0[ievt * np4 + 3]; - mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 ); + volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736 + if( m2 > 0 ) + mass0[ievt] = fpsqrt( (fptype)m2 ); + else + mass0[ievt] = 0; ispzgt0[ievt] = ( p3 > 0 ); ispzlt0[ievt] = ( p3 < 0 ); isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 ); diff --git a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h index dd17387b4e..03afcd6a5f 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h @@ -195,6 +195,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -205,7 +206,12 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { +#ifndef MGONGPU_CPPSIMD const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) ); +#else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); +#endif // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx: // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1 // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used @@ -254,9 +260,10 @@ namespace mg5amcCpu omega[1] = fmass / omega[0]; const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; const fptype_v pp3 = fpmax( pp + pvec3, 0 ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 cxternary( ( pp3 == 0. ), cxmake( -nh, 0 ), cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -274,16 +281,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), - fptype_sv{ 0 }, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( sqp0p3 == 0, cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ), cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) }; #endif @@ -440,6 +451,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -452,11 +464,11 @@ namespace mg5amcCpu if( vmass != 0. ) { const int nsvahl = nsv * std::abs( hel ); + const fptype hel0 = 1. - std::abs( hel ); +#ifndef MGONGPU_CPPSIMD const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) ); const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); - const fptype hel0 = 1. - std::abs( hel ); -#ifndef MGONGPU_CPPSIMD if( pp == 0. ) { vc[2] = cxmake( 0., 0. ); @@ -484,6 +496,10 @@ namespace mg5amcCpu } } #else + volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); + volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); + const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); // Branch A: pp == 0. const cxtype vcA_2 = cxmake( 0, 0 ); const cxtype vcA_3 = cxmake( -hel * sqh, 0 ); @@ -514,7 +530,12 @@ namespace mg5amcCpu else { const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0 +#ifndef MGONGPU_CPPSIMD const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) ); +#else + volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736 + const fptype_sv pt = fpsqrt( pt2 ); +#endif vc[2] = cxzero_sv(); vc[5] = cxmake( hel * pt / pp * sqh, 0. ); #ifndef MGONGPU_CPPSIMD @@ -591,6 +612,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -601,8 +623,8 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { - const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); #ifndef MGONGPU_CPPSIMD + const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); if( pp == 0. ) { // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! @@ -635,6 +657,8 @@ namespace mg5amcCpu fo[5] = sfomeg[0] * chi[ip]; } #else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); // Branch A: pp == 0. // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses @@ -654,9 +678,10 @@ namespace mg5amcCpu const int imB = ( 1 - nh ) / 2; const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] }; const fptype_v pp3 = fpmax( pp + pvec3, 0. ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 ( cxternary( ( pp3 == 0. ), cxmake( -nh, 0. ), cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -674,16 +699,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), - 0, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( ( sqp0p3 == 0. ), cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ), cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), + 0, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) }; #endif diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h index 0d3892d026..e1299ba81e 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h @@ -236,6 +236,20 @@ namespace mg5amcCpu // Functions and operators for fptype_v #ifdef MGONGPU_CPPSIMD + inline fptype_v + fpsqrt( const volatile fptype_v& v ) // volatile fixes #736 + { + // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt + fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594 + for( int i = 0; i < neppV; i++ ) + { + volatile fptype outi = 0; // volatile fixes #736 + if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); + out[i] = outi; + } + return out; + } + inline fptype_v fpsqrt( const fptype_v& v ) { diff --git a/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk index 477a37e27c..39ed957600 100644 --- a/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk @@ -5,28 +5,36 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Compiler-specific googletest build directory (#125 and #738) +# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk +# In epoch1/epoch2, CXXNAMESUFFIX is undefined +$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX)) +BUILDDIR = build$(CXXNAMESUFFIX) +###$(info BUILDDIR=$(BUILDDIR)) +INSTALLDIR = install$(CXXNAMESUFFIX) +###$(info INSTALLDIR=$(INSTALLDIR)) + CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 -all: googletest/install/lib64/libgtest.a +all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: git clone https://github.com/google/googletest.git -b release-1.11.0 googletest -googletest/build/Makefile: googletest/CMakeLists.txt - mkdir -p googletest/build - cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ +googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt + mkdir -p googletest/$(BUILDDIR) + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ -googletest/build/lib/libgtest.a: googletest/build/Makefile - $(MAKE) -C googletest/build +googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile + $(MAKE) -C googletest/$(BUILDDIR) # NB 'make install' is no longer supported in googletest (issue 328) # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations -googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a - mkdir -p googletest/install/lib64 - cp googletest/build/lib/lib*.a googletest/install/lib64/ - mkdir -p googletest/install/include - cp -r googletest/googletest/include/gtest googletest/install/include/ +googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a + mkdir -p googletest/$(INSTALLDIR)/lib64 + cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/ + mkdir -p googletest/$(INSTALLDIR)/include + cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/ clean: rm -rf googletest - diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index c8f29ccb7d..8c2a3bf79e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005085468292236328  +DEBUG: model prefixing takes 0.0053708553314208984  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.143 s +1 processes with 123 diagrams generated in 0.144 s Total: 1 processes with 123 diagrams output madevent CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxgg DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -217,15 +217,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.384 s -Wrote files for 222 helas calls in 0.659 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.385 s +Wrote files for 222 helas calls in 0.669 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.282 s +ALOHA: aloha creates 5 routines in 0.277 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.271 s +ALOHA: aloha creates 10 routines in 0.268 s VVV1 VVV1 FFV1 @@ -272,6 +272,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.022s -user 0m2.797s -sys 0m0.209s +real 0m3.653s +user 0m3.071s +sys 0m0.363s diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc index cf4ec946f8..ec923afd6d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_ttgg.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = VVVV3_0.o VVVV4P0_1.o VVVV3P0_1.o VVVV1P0_1.o FFV1_1.o FFV1_2.o VVV1P0_1.o VVV1_0.o FFV1_0.o FFV1P0_3.o VVVV1_0.o VVVV4_0.o +ALOHARoutine = FFV1_1.o VVVV4_0.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3_0.o VVVV1_0.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h index fd7734ce42..ffe3b84d53 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h @@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam /// and compares momenta and matrix elements with a reference file. TEST_P( MadgraphTest, CompareMomentaAndME ) { - // Set to true to dump events: - constexpr bool dumpEvents = false; - constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) - const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 3.E-2; + const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 4.E-2; // see #735 #ifdef __APPLE__ const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 3.E-2; // see #583 #else const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 2.E-3; #endif + constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + // Dump events to a new reference file? + constexpr bool dumpEvents = false; std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; while( dumpFileName.find( '/' ) != std::string::npos ) { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index c2f7b8e0f6..43cee0977e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) INCFLAGS += -I../../src +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) +override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) +override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else +override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) +export CXXNAMESUFFIX + # Dependency on test directory # Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) @@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),) TESTDIR = else ifneq ($(LOCALGTEST),) TESTDIR=$(TESTDIRLOCAL) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) TESTDIR = $(TESTDIRCOMMON) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else TESTDIR = endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc index cde18056a9..e40f635e46 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc @@ -47,7 +47,7 @@ namespace mg5amcCpu #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #endif - exit( 0 ); + exit( 1 ); } } @@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) const fptype p1 = par0[ievt * np4 + 1]; const fptype p2 = par0[ievt * np4 + 2]; const fptype p3 = par0[ievt * np4 + 3]; - mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 ); + volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736 + if( m2 > 0 ) + mass0[ievt] = fpsqrt( (fptype)m2 ); + else + mass0[ievt] = 0; ispzgt0[ievt] = ( p3 > 0 ); ispzlt0[ievt] = ( p3 < 0 ); isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 ); diff --git a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h index b68cdea0db..6db5ca82f3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h @@ -195,6 +195,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -205,7 +206,12 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { +#ifndef MGONGPU_CPPSIMD const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) ); +#else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); +#endif // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx: // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1 // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used @@ -254,9 +260,10 @@ namespace mg5amcCpu omega[1] = fmass / omega[0]; const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; const fptype_v pp3 = fpmax( pp + pvec3, 0 ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 cxternary( ( pp3 == 0. ), cxmake( -nh, 0 ), cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -274,16 +281,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), - fptype_sv{ 0 }, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( sqp0p3 == 0, cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ), cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) }; #endif @@ -440,6 +451,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -452,11 +464,11 @@ namespace mg5amcCpu if( vmass != 0. ) { const int nsvahl = nsv * std::abs( hel ); + const fptype hel0 = 1. - std::abs( hel ); +#ifndef MGONGPU_CPPSIMD const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) ); const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); - const fptype hel0 = 1. - std::abs( hel ); -#ifndef MGONGPU_CPPSIMD if( pp == 0. ) { vc[2] = cxmake( 0., 0. ); @@ -484,6 +496,10 @@ namespace mg5amcCpu } } #else + volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); + volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); + const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); // Branch A: pp == 0. const cxtype vcA_2 = cxmake( 0, 0 ); const cxtype vcA_3 = cxmake( -hel * sqh, 0 ); @@ -514,7 +530,12 @@ namespace mg5amcCpu else { const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0 +#ifndef MGONGPU_CPPSIMD const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) ); +#else + volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736 + const fptype_sv pt = fpsqrt( pt2 ); +#endif vc[2] = cxzero_sv(); vc[5] = cxmake( hel * pt / pp * sqh, 0. ); #ifndef MGONGPU_CPPSIMD @@ -591,6 +612,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -601,8 +623,8 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { - const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); #ifndef MGONGPU_CPPSIMD + const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); if( pp == 0. ) { // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! @@ -635,6 +657,8 @@ namespace mg5amcCpu fo[5] = sfomeg[0] * chi[ip]; } #else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); // Branch A: pp == 0. // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses @@ -654,9 +678,10 @@ namespace mg5amcCpu const int imB = ( 1 - nh ) / 2; const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] }; const fptype_v pp3 = fpmax( pp + pvec3, 0. ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 ( cxternary( ( pp3 == 0. ), cxmake( -nh, 0. ), cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -674,16 +699,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), - 0, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( ( sqp0p3 == 0. ), cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ), cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), + 0, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) }; #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h index 0d3892d026..e1299ba81e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h @@ -236,6 +236,20 @@ namespace mg5amcCpu // Functions and operators for fptype_v #ifdef MGONGPU_CPPSIMD + inline fptype_v + fpsqrt( const volatile fptype_v& v ) // volatile fixes #736 + { + // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt + fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594 + for( int i = 0; i < neppV; i++ ) + { + volatile fptype outi = 0; // volatile fixes #736 + if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); + out[i] = outi; + } + return out; + } + inline fptype_v fpsqrt( const fptype_v& v ) { diff --git a/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk index 477a37e27c..39ed957600 100644 --- a/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk @@ -5,28 +5,36 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Compiler-specific googletest build directory (#125 and #738) +# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk +# In epoch1/epoch2, CXXNAMESUFFIX is undefined +$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX)) +BUILDDIR = build$(CXXNAMESUFFIX) +###$(info BUILDDIR=$(BUILDDIR)) +INSTALLDIR = install$(CXXNAMESUFFIX) +###$(info INSTALLDIR=$(INSTALLDIR)) + CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 -all: googletest/install/lib64/libgtest.a +all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: git clone https://github.com/google/googletest.git -b release-1.11.0 googletest -googletest/build/Makefile: googletest/CMakeLists.txt - mkdir -p googletest/build - cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ +googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt + mkdir -p googletest/$(BUILDDIR) + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ -googletest/build/lib/libgtest.a: googletest/build/Makefile - $(MAKE) -C googletest/build +googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile + $(MAKE) -C googletest/$(BUILDDIR) # NB 'make install' is no longer supported in googletest (issue 328) # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations -googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a - mkdir -p googletest/install/lib64 - cp googletest/build/lib/lib*.a googletest/install/lib64/ - mkdir -p googletest/install/include - cp -r googletest/googletest/include/gtest googletest/install/include/ +googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a + mkdir -p googletest/$(INSTALLDIR)/lib64 + cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/ + mkdir -p googletest/$(INSTALLDIR)/include + cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/ clean: rm -rf googletest - diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index d778364a78..7204c0dd4c 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004860401153564453  +DEBUG: model prefixing takes 0.005178928375244141  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.143 s +1 processes with 123 diagrams generated in 0.145 s Total: 1 processes with 123 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -204,7 +204,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxgg.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.381 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.378 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -212,7 +212,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.276 s +ALOHA: aloha creates 5 routines in 0.271 s VVV1 VVV1 FFV1 @@ -242,6 +242,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m1.371s -user 0m1.299s -sys 0m0.056s +real 0m1.686s +user 0m1.484s +sys 0m0.120s diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h index fd7734ce42..ffe3b84d53 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h @@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam /// and compares momenta and matrix elements with a reference file. TEST_P( MadgraphTest, CompareMomentaAndME ) { - // Set to true to dump events: - constexpr bool dumpEvents = false; - constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) - const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 3.E-2; + const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 4.E-2; // see #735 #ifdef __APPLE__ const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 3.E-2; // see #583 #else const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 2.E-3; #endif + constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + // Dump events to a new reference file? + constexpr bool dumpEvents = false; std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; while( dumpFileName.find( '/' ) != std::string::npos ) { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index c2f7b8e0f6..43cee0977e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) INCFLAGS += -I../../src +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) +override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) +override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else +override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) +export CXXNAMESUFFIX + # Dependency on test directory # Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) @@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),) TESTDIR = else ifneq ($(LOCALGTEST),) TESTDIR=$(TESTDIRLOCAL) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) TESTDIR = $(TESTDIRCOMMON) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else TESTDIR = endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc index cde18056a9..e40f635e46 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc @@ -47,7 +47,7 @@ namespace mg5amcCpu #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #endif - exit( 0 ); + exit( 1 ); } } @@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) const fptype p1 = par0[ievt * np4 + 1]; const fptype p2 = par0[ievt * np4 + 2]; const fptype p3 = par0[ievt * np4 + 3]; - mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 ); + volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736 + if( m2 > 0 ) + mass0[ievt] = fpsqrt( (fptype)m2 ); + else + mass0[ievt] = 0; ispzgt0[ievt] = ( p3 > 0 ); ispzlt0[ievt] = ( p3 < 0 ); isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 ); diff --git a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h index b68cdea0db..6db5ca82f3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h @@ -195,6 +195,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -205,7 +206,12 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { +#ifndef MGONGPU_CPPSIMD const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) ); +#else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); +#endif // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx: // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1 // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used @@ -254,9 +260,10 @@ namespace mg5amcCpu omega[1] = fmass / omega[0]; const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; const fptype_v pp3 = fpmax( pp + pvec3, 0 ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 cxternary( ( pp3 == 0. ), cxmake( -nh, 0 ), cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -274,16 +281,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), - fptype_sv{ 0 }, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( sqp0p3 == 0, cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ), cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) }; #endif @@ -440,6 +451,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -452,11 +464,11 @@ namespace mg5amcCpu if( vmass != 0. ) { const int nsvahl = nsv * std::abs( hel ); + const fptype hel0 = 1. - std::abs( hel ); +#ifndef MGONGPU_CPPSIMD const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) ); const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); - const fptype hel0 = 1. - std::abs( hel ); -#ifndef MGONGPU_CPPSIMD if( pp == 0. ) { vc[2] = cxmake( 0., 0. ); @@ -484,6 +496,10 @@ namespace mg5amcCpu } } #else + volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); + volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); + const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); // Branch A: pp == 0. const cxtype vcA_2 = cxmake( 0, 0 ); const cxtype vcA_3 = cxmake( -hel * sqh, 0 ); @@ -514,7 +530,12 @@ namespace mg5amcCpu else { const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0 +#ifndef MGONGPU_CPPSIMD const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) ); +#else + volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736 + const fptype_sv pt = fpsqrt( pt2 ); +#endif vc[2] = cxzero_sv(); vc[5] = cxmake( hel * pt / pp * sqh, 0. ); #ifndef MGONGPU_CPPSIMD @@ -591,6 +612,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -601,8 +623,8 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { - const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); #ifndef MGONGPU_CPPSIMD + const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); if( pp == 0. ) { // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! @@ -635,6 +657,8 @@ namespace mg5amcCpu fo[5] = sfomeg[0] * chi[ip]; } #else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); // Branch A: pp == 0. // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses @@ -654,9 +678,10 @@ namespace mg5amcCpu const int imB = ( 1 - nh ) / 2; const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] }; const fptype_v pp3 = fpmax( pp + pvec3, 0. ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 ( cxternary( ( pp3 == 0. ), cxmake( -nh, 0. ), cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -674,16 +699,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), - 0, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( ( sqp0p3 == 0. ), cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ), cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), + 0, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) }; #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h index 0d3892d026..e1299ba81e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h @@ -236,6 +236,20 @@ namespace mg5amcCpu // Functions and operators for fptype_v #ifdef MGONGPU_CPPSIMD + inline fptype_v + fpsqrt( const volatile fptype_v& v ) // volatile fixes #736 + { + // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt + fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594 + for( int i = 0; i < neppV; i++ ) + { + volatile fptype outi = 0; // volatile fixes #736 + if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); + out[i] = outi; + } + return out; + } + inline fptype_v fpsqrt( const fptype_v& v ) { diff --git a/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk index 477a37e27c..39ed957600 100644 --- a/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk @@ -5,28 +5,36 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Compiler-specific googletest build directory (#125 and #738) +# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk +# In epoch1/epoch2, CXXNAMESUFFIX is undefined +$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX)) +BUILDDIR = build$(CXXNAMESUFFIX) +###$(info BUILDDIR=$(BUILDDIR)) +INSTALLDIR = install$(CXXNAMESUFFIX) +###$(info INSTALLDIR=$(INSTALLDIR)) + CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 -all: googletest/install/lib64/libgtest.a +all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: git clone https://github.com/google/googletest.git -b release-1.11.0 googletest -googletest/build/Makefile: googletest/CMakeLists.txt - mkdir -p googletest/build - cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ +googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt + mkdir -p googletest/$(BUILDDIR) + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ -googletest/build/lib/libgtest.a: googletest/build/Makefile - $(MAKE) -C googletest/build +googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile + $(MAKE) -C googletest/$(BUILDDIR) # NB 'make install' is no longer supported in googletest (issue 328) # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations -googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a - mkdir -p googletest/install/lib64 - cp googletest/build/lib/lib*.a googletest/install/lib64/ - mkdir -p googletest/install/include - cp -r googletest/googletest/include/gtest googletest/install/include/ +googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a + mkdir -p googletest/$(INSTALLDIR)/lib64 + cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/ + mkdir -p googletest/$(INSTALLDIR)/include + cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/ clean: rm -rf googletest - diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 466db9acd7..f8ad8149f8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004748106002807617  +DEBUG: model prefixing takes 0.00522160530090332  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.697 s +1 processes with 1240 diagrams generated in 1.741 s Total: 1 processes with 1240 diagrams output madevent CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -175,11 +175,11 @@ INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1592 term in 30s. Introduce 2768 contraction +INFO: Color-Flow passed to 1592 term in 32s. Introduce 2768 contraction DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -221,15 +221,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 5.832 s -Wrote files for 2281 helas calls in 39.711 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.012 s +Wrote files for 2281 helas calls in 41.927 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.266 s +ALOHA: aloha creates 5 routines in 0.328 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -237,7 +237,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.420 s +ALOHA: aloha creates 10 routines in 0.269 s VVV1 VVV1 FFV1 @@ -276,6 +276,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m49.506s -user 0m48.230s -sys 0m0.881s +real 0m52.327s +user 0m50.885s +sys 0m1.141s diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc index cf4ec946f8..ec923afd6d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gg_ttggg.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = VVVV3_0.o VVVV4P0_1.o VVVV3P0_1.o VVVV1P0_1.o FFV1_1.o FFV1_2.o VVV1P0_1.o VVV1_0.o FFV1_0.o FFV1P0_3.o VVVV1_0.o VVVV4_0.o +ALOHARoutine = FFV1_1.o VVVV4_0.o VVVV4P0_1.o FFV1_0.o VVV1_0.o FFV1_2.o VVVV3_0.o VVVV1_0.o VVVV3P0_1.o VVVV1P0_1.o VVV1P0_1.o FFV1P0_3.o diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h index fd7734ce42..ffe3b84d53 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h @@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam /// and compares momenta and matrix elements with a reference file. TEST_P( MadgraphTest, CompareMomentaAndME ) { - // Set to true to dump events: - constexpr bool dumpEvents = false; - constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) - const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 3.E-2; + const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 4.E-2; // see #735 #ifdef __APPLE__ const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 3.E-2; // see #583 #else const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 2.E-3; #endif + constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + // Dump events to a new reference file? + constexpr bool dumpEvents = false; std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; while( dumpFileName.find( '/' ) != std::string::npos ) { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index c2f7b8e0f6..43cee0977e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) INCFLAGS += -I../../src +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) +override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) +override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else +override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) +export CXXNAMESUFFIX + # Dependency on test directory # Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) @@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),) TESTDIR = else ifneq ($(LOCALGTEST),) TESTDIR=$(TESTDIRLOCAL) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) TESTDIR = $(TESTDIRCOMMON) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else TESTDIR = endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc index cde18056a9..e40f635e46 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc @@ -47,7 +47,7 @@ namespace mg5amcCpu #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #endif - exit( 0 ); + exit( 1 ); } } @@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) const fptype p1 = par0[ievt * np4 + 1]; const fptype p2 = par0[ievt * np4 + 2]; const fptype p3 = par0[ievt * np4 + 3]; - mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 ); + volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736 + if( m2 > 0 ) + mass0[ievt] = fpsqrt( (fptype)m2 ); + else + mass0[ievt] = 0; ispzgt0[ievt] = ( p3 > 0 ); ispzlt0[ievt] = ( p3 < 0 ); isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 ); diff --git a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h index b68cdea0db..6db5ca82f3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h @@ -195,6 +195,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -205,7 +206,12 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { +#ifndef MGONGPU_CPPSIMD const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) ); +#else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); +#endif // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx: // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1 // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used @@ -254,9 +260,10 @@ namespace mg5amcCpu omega[1] = fmass / omega[0]; const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; const fptype_v pp3 = fpmax( pp + pvec3, 0 ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 cxternary( ( pp3 == 0. ), cxmake( -nh, 0 ), cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -274,16 +281,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), - fptype_sv{ 0 }, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( sqp0p3 == 0, cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ), cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) }; #endif @@ -440,6 +451,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -452,11 +464,11 @@ namespace mg5amcCpu if( vmass != 0. ) { const int nsvahl = nsv * std::abs( hel ); + const fptype hel0 = 1. - std::abs( hel ); +#ifndef MGONGPU_CPPSIMD const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) ); const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); - const fptype hel0 = 1. - std::abs( hel ); -#ifndef MGONGPU_CPPSIMD if( pp == 0. ) { vc[2] = cxmake( 0., 0. ); @@ -484,6 +496,10 @@ namespace mg5amcCpu } } #else + volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); + volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); + const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); // Branch A: pp == 0. const cxtype vcA_2 = cxmake( 0, 0 ); const cxtype vcA_3 = cxmake( -hel * sqh, 0 ); @@ -514,7 +530,12 @@ namespace mg5amcCpu else { const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0 +#ifndef MGONGPU_CPPSIMD const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) ); +#else + volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736 + const fptype_sv pt = fpsqrt( pt2 ); +#endif vc[2] = cxzero_sv(); vc[5] = cxmake( hel * pt / pp * sqh, 0. ); #ifndef MGONGPU_CPPSIMD @@ -591,6 +612,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -601,8 +623,8 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { - const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); #ifndef MGONGPU_CPPSIMD + const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); if( pp == 0. ) { // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! @@ -635,6 +657,8 @@ namespace mg5amcCpu fo[5] = sfomeg[0] * chi[ip]; } #else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); // Branch A: pp == 0. // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses @@ -654,9 +678,10 @@ namespace mg5amcCpu const int imB = ( 1 - nh ) / 2; const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] }; const fptype_v pp3 = fpmax( pp + pvec3, 0. ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 ( cxternary( ( pp3 == 0. ), cxmake( -nh, 0. ), cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -674,16 +699,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), - 0, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( ( sqp0p3 == 0. ), cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ), cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), + 0, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) }; #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h index 0d3892d026..e1299ba81e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h @@ -236,6 +236,20 @@ namespace mg5amcCpu // Functions and operators for fptype_v #ifdef MGONGPU_CPPSIMD + inline fptype_v + fpsqrt( const volatile fptype_v& v ) // volatile fixes #736 + { + // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt + fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594 + for( int i = 0; i < neppV; i++ ) + { + volatile fptype outi = 0; // volatile fixes #736 + if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); + out[i] = outi; + } + return out; + } + inline fptype_v fpsqrt( const fptype_v& v ) { diff --git a/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk index 477a37e27c..39ed957600 100644 --- a/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk @@ -5,28 +5,36 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Compiler-specific googletest build directory (#125 and #738) +# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk +# In epoch1/epoch2, CXXNAMESUFFIX is undefined +$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX)) +BUILDDIR = build$(CXXNAMESUFFIX) +###$(info BUILDDIR=$(BUILDDIR)) +INSTALLDIR = install$(CXXNAMESUFFIX) +###$(info INSTALLDIR=$(INSTALLDIR)) + CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 -all: googletest/install/lib64/libgtest.a +all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: git clone https://github.com/google/googletest.git -b release-1.11.0 googletest -googletest/build/Makefile: googletest/CMakeLists.txt - mkdir -p googletest/build - cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ +googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt + mkdir -p googletest/$(BUILDDIR) + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ -googletest/build/lib/libgtest.a: googletest/build/Makefile - $(MAKE) -C googletest/build +googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile + $(MAKE) -C googletest/$(BUILDDIR) # NB 'make install' is no longer supported in googletest (issue 328) # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations -googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a - mkdir -p googletest/install/lib64 - cp googletest/build/lib/lib*.a googletest/install/lib64/ - mkdir -p googletest/install/include - cp -r googletest/googletest/include/gtest googletest/install/include/ +googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a + mkdir -p googletest/$(INSTALLDIR)/lib64 + cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/ + mkdir -p googletest/$(INSTALLDIR)/include + cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/ clean: rm -rf googletest - diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index eb04f4adfc..1c454bccf7 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004965782165527344  +DEBUG: model prefixing takes 0.005292177200317383  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.750 s +1 processes with 1240 diagrams generated in 1.707 s Total: 1 processes with 1240 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -206,7 +206,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gg_ttxggg.txt [model_handling.py at line 1335]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 5.770 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.005 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -214,7 +214,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.298 s +ALOHA: aloha creates 5 routines in 0.266 s VVV1 VVV1 FFV1 @@ -244,6 +244,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/ DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m11.557s -user 0m11.387s -sys 0m0.119s +real 0m12.140s +user 0m11.838s +sys 0m0.186s diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h index fd7734ce42..ffe3b84d53 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h @@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam /// and compares momenta and matrix elements with a reference file. TEST_P( MadgraphTest, CompareMomentaAndME ) { - // Set to true to dump events: - constexpr bool dumpEvents = false; - constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) - const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 3.E-2; + const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 4.E-2; // see #735 #ifdef __APPLE__ const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 3.E-2; // see #583 #else const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 2.E-3; #endif + constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + // Dump events to a new reference file? + constexpr bool dumpEvents = false; std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; while( dumpFileName.find( '/' ) != std::string::npos ) { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index c2f7b8e0f6..43cee0977e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) INCFLAGS += -I../../src +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) +override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) +override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else +override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) +export CXXNAMESUFFIX + # Dependency on test directory # Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) @@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),) TESTDIR = else ifneq ($(LOCALGTEST),) TESTDIR=$(TESTDIRLOCAL) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) TESTDIR = $(TESTDIRCOMMON) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else TESTDIR = endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc index cde18056a9..e40f635e46 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc @@ -47,7 +47,7 @@ namespace mg5amcCpu #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #endif - exit( 0 ); + exit( 1 ); } } @@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) const fptype p1 = par0[ievt * np4 + 1]; const fptype p2 = par0[ievt * np4 + 2]; const fptype p3 = par0[ievt * np4 + 3]; - mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 ); + volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736 + if( m2 > 0 ) + mass0[ievt] = fpsqrt( (fptype)m2 ); + else + mass0[ievt] = 0; ispzgt0[ievt] = ( p3 > 0 ); ispzlt0[ievt] = ( p3 < 0 ); isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 ); diff --git a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h index b68cdea0db..6db5ca82f3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h @@ -195,6 +195,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -205,7 +206,12 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { +#ifndef MGONGPU_CPPSIMD const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) ); +#else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); +#endif // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx: // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1 // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used @@ -254,9 +260,10 @@ namespace mg5amcCpu omega[1] = fmass / omega[0]; const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; const fptype_v pp3 = fpmax( pp + pvec3, 0 ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 cxternary( ( pp3 == 0. ), cxmake( -nh, 0 ), cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -274,16 +281,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), - fptype_sv{ 0 }, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( sqp0p3 == 0, cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ), cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) }; #endif @@ -440,6 +451,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -452,11 +464,11 @@ namespace mg5amcCpu if( vmass != 0. ) { const int nsvahl = nsv * std::abs( hel ); + const fptype hel0 = 1. - std::abs( hel ); +#ifndef MGONGPU_CPPSIMD const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) ); const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); - const fptype hel0 = 1. - std::abs( hel ); -#ifndef MGONGPU_CPPSIMD if( pp == 0. ) { vc[2] = cxmake( 0., 0. ); @@ -484,6 +496,10 @@ namespace mg5amcCpu } } #else + volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); + volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); + const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); // Branch A: pp == 0. const cxtype vcA_2 = cxmake( 0, 0 ); const cxtype vcA_3 = cxmake( -hel * sqh, 0 ); @@ -514,7 +530,12 @@ namespace mg5amcCpu else { const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0 +#ifndef MGONGPU_CPPSIMD const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) ); +#else + volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736 + const fptype_sv pt = fpsqrt( pt2 ); +#endif vc[2] = cxzero_sv(); vc[5] = cxmake( hel * pt / pp * sqh, 0. ); #ifndef MGONGPU_CPPSIMD @@ -591,6 +612,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -601,8 +623,8 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { - const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); #ifndef MGONGPU_CPPSIMD + const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); if( pp == 0. ) { // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! @@ -635,6 +657,8 @@ namespace mg5amcCpu fo[5] = sfomeg[0] * chi[ip]; } #else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); // Branch A: pp == 0. // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses @@ -654,9 +678,10 @@ namespace mg5amcCpu const int imB = ( 1 - nh ) / 2; const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] }; const fptype_v pp3 = fpmax( pp + pvec3, 0. ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 ( cxternary( ( pp3 == 0. ), cxmake( -nh, 0. ), cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -674,16 +699,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), - 0, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( ( sqp0p3 == 0. ), cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ), cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), + 0, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) }; #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h index 0d3892d026..e1299ba81e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h @@ -236,6 +236,20 @@ namespace mg5amcCpu // Functions and operators for fptype_v #ifdef MGONGPU_CPPSIMD + inline fptype_v + fpsqrt( const volatile fptype_v& v ) // volatile fixes #736 + { + // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt + fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594 + for( int i = 0; i < neppV; i++ ) + { + volatile fptype outi = 0; // volatile fixes #736 + if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); + out[i] = outi; + } + return out; + } + inline fptype_v fpsqrt( const fptype_v& v ) { diff --git a/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk index 477a37e27c..39ed957600 100644 --- a/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk @@ -5,28 +5,36 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Compiler-specific googletest build directory (#125 and #738) +# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk +# In epoch1/epoch2, CXXNAMESUFFIX is undefined +$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX)) +BUILDDIR = build$(CXXNAMESUFFIX) +###$(info BUILDDIR=$(BUILDDIR)) +INSTALLDIR = install$(CXXNAMESUFFIX) +###$(info INSTALLDIR=$(INSTALLDIR)) + CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 -all: googletest/install/lib64/libgtest.a +all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: git clone https://github.com/google/googletest.git -b release-1.11.0 googletest -googletest/build/Makefile: googletest/CMakeLists.txt - mkdir -p googletest/build - cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ +googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt + mkdir -p googletest/$(BUILDDIR) + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ -googletest/build/lib/libgtest.a: googletest/build/Makefile - $(MAKE) -C googletest/build +googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile + $(MAKE) -C googletest/$(BUILDDIR) # NB 'make install' is no longer supported in googletest (issue 328) # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations -googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a - mkdir -p googletest/install/lib64 - cp googletest/build/lib/lib*.a googletest/install/lib64/ - mkdir -p googletest/install/include - cp -r googletest/googletest/include/gtest googletest/install/include/ +googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a + mkdir -p googletest/$(INSTALLDIR)/lib64 + cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/ + mkdir -p googletest/$(INSTALLDIR)/include + cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/ clean: rm -rf googletest - diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 5566a87b0a..6dec1d926a 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~; generate g q > t t~ q INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005097150802612305  +DEBUG: model prefixing takes 0.0051157474517822266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -169,7 +169,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.070 s +8 processes with 40 diagrams generated in 0.072 s Total: 8 processes with 40 diagrams output madevent CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -199,7 +199,7 @@ INFO: Creating files in directory P1_gu_ttxu DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -237,7 +237,7 @@ INFO: Creating files in directory P1_gux_ttxux DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1039]  DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1040]  DEBUG: proc_id =  1 [model_handling.py at line 1045]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6174]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1297]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1299]  @@ -271,17 +271,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -Generated helas calls for 2 subprocesses (10 diagrams) in 0.028 s -Wrote files for 32 helas calls in 0.214 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.029 s +Wrote files for 32 helas calls in 0.212 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.125 s +ALOHA: aloha creates 2 routines in 0.114 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.115 s +ALOHA: aloha creates 4 routines in 0.112 s FFV1 FFV1 FFV1 @@ -313,6 +313,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.810s -user 0m1.575s -sys 0m0.222s +real 0m2.217s +user 0m1.790s +sys 0m0.357s diff --git a/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_file.inc b/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_file.inc index 0c895f2b2c..4457933199 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_file.inc +++ b/epochX/cudacpp/gq_ttq.mad/Source/DHELAS/aloha_file.inc @@ -1 +1 @@ -ALOHARoutine = FFV1_1.o FFV1_2.o VVV1_0.o FFV1_0.o FFV1P0_3.o +ALOHARoutine = FFV1_1.o FFV1_0.o VVV1_0.o FFV1_2.o FFV1P0_3.o diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h index fd7734ce42..ffe3b84d53 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h @@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam /// and compares momenta and matrix elements with a reference file. TEST_P( MadgraphTest, CompareMomentaAndME ) { - // Set to true to dump events: - constexpr bool dumpEvents = false; - constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) - const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 3.E-2; + const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 4.E-2; // see #735 #ifdef __APPLE__ const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 3.E-2; // see #583 #else const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 2.E-3; #endif + constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + // Dump events to a new reference file? + constexpr bool dumpEvents = false; std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; while( dumpFileName.find( '/' ) != std::string::npos ) { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index c2f7b8e0f6..43cee0977e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) INCFLAGS += -I../../src +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) +override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) +override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else +override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) +export CXXNAMESUFFIX + # Dependency on test directory # Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) @@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),) TESTDIR = else ifneq ($(LOCALGTEST),) TESTDIR=$(TESTDIRLOCAL) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) TESTDIR = $(TESTDIRCOMMON) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else TESTDIR = endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc index cde18056a9..e40f635e46 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc @@ -47,7 +47,7 @@ namespace mg5amcCpu #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #endif - exit( 0 ); + exit( 1 ); } } @@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) const fptype p1 = par0[ievt * np4 + 1]; const fptype p2 = par0[ievt * np4 + 2]; const fptype p3 = par0[ievt * np4 + 3]; - mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 ); + volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736 + if( m2 > 0 ) + mass0[ievt] = fpsqrt( (fptype)m2 ); + else + mass0[ievt] = 0; ispzgt0[ievt] = ( p3 > 0 ); ispzlt0[ievt] = ( p3 < 0 ); isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 ); diff --git a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h index 2e914de002..0413379e94 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h @@ -195,6 +195,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -205,7 +206,12 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { +#ifndef MGONGPU_CPPSIMD const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) ); +#else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); +#endif // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx: // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1 // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used @@ -254,9 +260,10 @@ namespace mg5amcCpu omega[1] = fmass / omega[0]; const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; const fptype_v pp3 = fpmax( pp + pvec3, 0 ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 cxternary( ( pp3 == 0. ), cxmake( -nh, 0 ), cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -274,16 +281,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), - fptype_sv{ 0 }, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( sqp0p3 == 0, cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ), cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) }; #endif @@ -440,6 +451,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -452,11 +464,11 @@ namespace mg5amcCpu if( vmass != 0. ) { const int nsvahl = nsv * std::abs( hel ); + const fptype hel0 = 1. - std::abs( hel ); +#ifndef MGONGPU_CPPSIMD const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) ); const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); - const fptype hel0 = 1. - std::abs( hel ); -#ifndef MGONGPU_CPPSIMD if( pp == 0. ) { vc[2] = cxmake( 0., 0. ); @@ -484,6 +496,10 @@ namespace mg5amcCpu } } #else + volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); + volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); + const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); // Branch A: pp == 0. const cxtype vcA_2 = cxmake( 0, 0 ); const cxtype vcA_3 = cxmake( -hel * sqh, 0 ); @@ -514,7 +530,12 @@ namespace mg5amcCpu else { const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0 +#ifndef MGONGPU_CPPSIMD const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) ); +#else + volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736 + const fptype_sv pt = fpsqrt( pt2 ); +#endif vc[2] = cxzero_sv(); vc[5] = cxmake( hel * pt / pp * sqh, 0. ); #ifndef MGONGPU_CPPSIMD @@ -591,6 +612,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -601,8 +623,8 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { - const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); #ifndef MGONGPU_CPPSIMD + const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); if( pp == 0. ) { // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! @@ -635,6 +657,8 @@ namespace mg5amcCpu fo[5] = sfomeg[0] * chi[ip]; } #else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); // Branch A: pp == 0. // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses @@ -654,9 +678,10 @@ namespace mg5amcCpu const int imB = ( 1 - nh ) / 2; const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] }; const fptype_v pp3 = fpmax( pp + pvec3, 0. ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 ( cxternary( ( pp3 == 0. ), cxmake( -nh, 0. ), cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -674,16 +699,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), - 0, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( ( sqp0p3 == 0. ), cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ), cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), + 0, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) }; #endif diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h index 0d3892d026..e1299ba81e 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h @@ -236,6 +236,20 @@ namespace mg5amcCpu // Functions and operators for fptype_v #ifdef MGONGPU_CPPSIMD + inline fptype_v + fpsqrt( const volatile fptype_v& v ) // volatile fixes #736 + { + // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt + fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594 + for( int i = 0; i < neppV; i++ ) + { + volatile fptype outi = 0; // volatile fixes #736 + if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); + out[i] = outi; + } + return out; + } + inline fptype_v fpsqrt( const fptype_v& v ) { diff --git a/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk b/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk index 477a37e27c..39ed957600 100644 --- a/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk @@ -5,28 +5,36 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Compiler-specific googletest build directory (#125 and #738) +# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk +# In epoch1/epoch2, CXXNAMESUFFIX is undefined +$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX)) +BUILDDIR = build$(CXXNAMESUFFIX) +###$(info BUILDDIR=$(BUILDDIR)) +INSTALLDIR = install$(CXXNAMESUFFIX) +###$(info INSTALLDIR=$(INSTALLDIR)) + CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 -all: googletest/install/lib64/libgtest.a +all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: git clone https://github.com/google/googletest.git -b release-1.11.0 googletest -googletest/build/Makefile: googletest/CMakeLists.txt - mkdir -p googletest/build - cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ +googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt + mkdir -p googletest/$(BUILDDIR) + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ -googletest/build/lib/libgtest.a: googletest/build/Makefile - $(MAKE) -C googletest/build +googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile + $(MAKE) -C googletest/$(BUILDDIR) # NB 'make install' is no longer supported in googletest (issue 328) # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations -googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a - mkdir -p googletest/install/lib64 - cp googletest/build/lib/lib*.a googletest/install/lib64/ - mkdir -p googletest/install/include - cp -r googletest/googletest/include/gtest googletest/install/include/ +googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a + mkdir -p googletest/$(INSTALLDIR)/lib64 + cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/ + mkdir -p googletest/$(INSTALLDIR)/include + cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/ clean: rm -rf googletest - diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index 985a96a86e..fb27a52201 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~; generate g q > t t~ q INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.004664897918701172  +DEBUG: model prefixing takes 0.005094289779663086  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -169,7 +169,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.072 s +8 processes with 40 diagrams generated in 0.071 s Total: 8 processes with 40 diagrams output standalone_cudacpp CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_SA_OUTPUT @@ -251,12 +251,12 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1430]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1441]  DEBUG: 'Copying test reference file: ', template_ref =  Copying test reference file: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/../../../test/ref/dump_CPUTest.Sigma_sm_gux_ttxux.txt [model_handling.py at line 1335]  -Generated helas calls for 2 subprocesses (10 diagrams) in 0.027 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.028 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.123 s +ALOHA: aloha creates 2 routines in 0.117 s FFV1 FFV1 FFV1 @@ -279,6 +279,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gq_ttq/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.678s -user 0m0.612s -sys 0m0.060s +real 0m0.994s +user 0m0.805s +sys 0m0.121s diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h index fd7734ce42..ffe3b84d53 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h @@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam /// and compares momenta and matrix elements with a reference file. TEST_P( MadgraphTest, CompareMomentaAndME ) { - // Set to true to dump events: - constexpr bool dumpEvents = false; - constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) - const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 3.E-2; + const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 4.E-2; // see #735 #ifdef __APPLE__ const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 3.E-2; // see #583 #else const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 2.E-3; #endif + constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + // Dump events to a new reference file? + constexpr bool dumpEvents = false; std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; while( dumpFileName.find( '/' ) != std::string::npos ) { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index c2f7b8e0f6..43cee0977e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) INCFLAGS += -I../../src +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) +override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) +override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else +override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) +export CXXNAMESUFFIX + # Dependency on test directory # Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) @@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),) TESTDIR = else ifneq ($(LOCALGTEST),) TESTDIR=$(TESTDIRLOCAL) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) TESTDIR = $(TESTDIRCOMMON) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else TESTDIR = endif diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc index cde18056a9..e40f635e46 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc @@ -47,7 +47,7 @@ namespace mg5amcCpu #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #endif - exit( 0 ); + exit( 1 ); } } @@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) const fptype p1 = par0[ievt * np4 + 1]; const fptype p2 = par0[ievt * np4 + 2]; const fptype p3 = par0[ievt * np4 + 3]; - mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 ); + volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736 + if( m2 > 0 ) + mass0[ievt] = fpsqrt( (fptype)m2 ); + else + mass0[ievt] = 0; ispzgt0[ievt] = ( p3 > 0 ); ispzlt0[ievt] = ( p3 < 0 ); isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 ); diff --git a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h index 2e914de002..0413379e94 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h @@ -195,6 +195,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -205,7 +206,12 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { +#ifndef MGONGPU_CPPSIMD const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) ); +#else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); +#endif // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx: // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1 // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used @@ -254,9 +260,10 @@ namespace mg5amcCpu omega[1] = fmass / omega[0]; const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; const fptype_v pp3 = fpmax( pp + pvec3, 0 ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 cxternary( ( pp3 == 0. ), cxmake( -nh, 0 ), cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -274,16 +281,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), - fptype_sv{ 0 }, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( sqp0p3 == 0, cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ), cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) }; #endif @@ -440,6 +451,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -452,11 +464,11 @@ namespace mg5amcCpu if( vmass != 0. ) { const int nsvahl = nsv * std::abs( hel ); + const fptype hel0 = 1. - std::abs( hel ); +#ifndef MGONGPU_CPPSIMD const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) ); const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); - const fptype hel0 = 1. - std::abs( hel ); -#ifndef MGONGPU_CPPSIMD if( pp == 0. ) { vc[2] = cxmake( 0., 0. ); @@ -484,6 +496,10 @@ namespace mg5amcCpu } } #else + volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); + volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); + const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); // Branch A: pp == 0. const cxtype vcA_2 = cxmake( 0, 0 ); const cxtype vcA_3 = cxmake( -hel * sqh, 0 ); @@ -514,7 +530,12 @@ namespace mg5amcCpu else { const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0 +#ifndef MGONGPU_CPPSIMD const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) ); +#else + volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736 + const fptype_sv pt = fpsqrt( pt2 ); +#endif vc[2] = cxzero_sv(); vc[5] = cxmake( hel * pt / pp * sqh, 0. ); #ifndef MGONGPU_CPPSIMD @@ -591,6 +612,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -601,8 +623,8 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { - const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); #ifndef MGONGPU_CPPSIMD + const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); if( pp == 0. ) { // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! @@ -635,6 +657,8 @@ namespace mg5amcCpu fo[5] = sfomeg[0] * chi[ip]; } #else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); // Branch A: pp == 0. // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses @@ -654,9 +678,10 @@ namespace mg5amcCpu const int imB = ( 1 - nh ) / 2; const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] }; const fptype_v pp3 = fpmax( pp + pvec3, 0. ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 ( cxternary( ( pp3 == 0. ), cxmake( -nh, 0. ), cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -674,16 +699,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), - 0, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( ( sqp0p3 == 0. ), cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ), cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), + 0, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) }; #endif diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h index 0d3892d026..e1299ba81e 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h @@ -236,6 +236,20 @@ namespace mg5amcCpu // Functions and operators for fptype_v #ifdef MGONGPU_CPPSIMD + inline fptype_v + fpsqrt( const volatile fptype_v& v ) // volatile fixes #736 + { + // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt + fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594 + for( int i = 0; i < neppV; i++ ) + { + volatile fptype outi = 0; // volatile fixes #736 + if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); + out[i] = outi; + } + return out; + } + inline fptype_v fpsqrt( const fptype_v& v ) { diff --git a/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk b/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk index 477a37e27c..39ed957600 100644 --- a/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk @@ -5,28 +5,36 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Compiler-specific googletest build directory (#125 and #738) +# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk +# In epoch1/epoch2, CXXNAMESUFFIX is undefined +$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX)) +BUILDDIR = build$(CXXNAMESUFFIX) +###$(info BUILDDIR=$(BUILDDIR)) +INSTALLDIR = install$(CXXNAMESUFFIX) +###$(info INSTALLDIR=$(INSTALLDIR)) + CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 -all: googletest/install/lib64/libgtest.a +all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: git clone https://github.com/google/googletest.git -b release-1.11.0 googletest -googletest/build/Makefile: googletest/CMakeLists.txt - mkdir -p googletest/build - cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ +googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt + mkdir -p googletest/$(BUILDDIR) + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ -googletest/build/lib/libgtest.a: googletest/build/Makefile - $(MAKE) -C googletest/build +googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile + $(MAKE) -C googletest/$(BUILDDIR) # NB 'make install' is no longer supported in googletest (issue 328) # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations -googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a - mkdir -p googletest/install/lib64 - cp googletest/build/lib/lib*.a googletest/install/lib64/ - mkdir -p googletest/install/include - cp -r googletest/googletest/include/gtest googletest/install/include/ +googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a + mkdir -p googletest/$(INSTALLDIR)/lib64 + cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/ + mkdir -p googletest/$(INSTALLDIR)/include + cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/ clean: rm -rf googletest - diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 97e2a77fac..419ccb30d8 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -51,8 +51,8 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -127,7 +127,7 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1 INFO: Process has 1 diagrams -1 processes with 1 diagrams generated in 0.004 s +1 processes with 1 diagrams generated in 0.003 s Total: 1 processes with 1 diagrams output standalone_cudacpp CODEGEN_cudacpp_heft_gg_h Load PLUGIN.CUDACPP_SA_OUTPUT @@ -176,7 +176,7 @@ Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 194]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines -ALOHA: aloha creates 1 routines in 0.053 s +ALOHA: aloha creates 1 routines in 0.052 s VVS3 FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h/src/. @@ -195,6 +195,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h/src DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 203]  quit -real 0m0.872s -user 0m0.396s -sys 0m0.057s +real 0m0.777s +user 0m0.574s +sys 0m0.129s diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h index fd7734ce42..ffe3b84d53 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h @@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam /// and compares momenta and matrix elements with a reference file. TEST_P( MadgraphTest, CompareMomentaAndME ) { - // Set to true to dump events: - constexpr bool dumpEvents = false; - constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) - const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 3.E-2; + const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 4.E-2; // see #735 #ifdef __APPLE__ const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 3.E-2; // see #583 #else const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 2.E-3; #endif + constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + // Dump events to a new reference file? + constexpr bool dumpEvents = false; std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; while( dumpFileName.find( '/' ) != std::string::npos ) { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk index c2f7b8e0f6..43cee0977e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk @@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) INCFLAGS += -I../../src +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) +override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) +override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else +override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) +export CXXNAMESUFFIX + # Dependency on test directory # Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) # Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) @@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),) TESTDIR = else ifneq ($(LOCALGTEST),) TESTDIR=$(TESTDIRLOCAL) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) TESTDIR = $(TESTDIRCOMMON) -GTEST_ROOT = $(TESTDIR)/googletest/install +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) else TESTDIR = endif diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc index 5ea4dc68ad..f1f281f78a 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc @@ -47,7 +47,7 @@ namespace mg5amcCpu #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #endif - exit( 0 ); + exit( 1 ); } } @@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) const fptype p1 = par0[ievt * np4 + 1]; const fptype p2 = par0[ievt * np4 + 2]; const fptype p3 = par0[ievt * np4 + 3]; - mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 ); + volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736 + if( m2 > 0 ) + mass0[ievt] = fpsqrt( (fptype)m2 ); + else + mass0[ievt] = 0; ispzgt0[ievt] = ( p3 > 0 ); ispzlt0[ievt] = ( p3 < 0 ); isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 ); diff --git a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h index 46b9942456..3f64745b75 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h @@ -195,6 +195,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -205,7 +206,12 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { +#ifndef MGONGPU_CPPSIMD const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) ); +#else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); +#endif // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx: // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1 // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used @@ -254,9 +260,10 @@ namespace mg5amcCpu omega[1] = fmass / omega[0]; const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; const fptype_v pp3 = fpmax( pp + pvec3, 0 ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 cxternary( ( pp3 == 0. ), cxmake( -nh, 0 ), cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -274,16 +281,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), - fptype_sv{ 0 }, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( sqp0p3 == 0, cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ), cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) }; #endif @@ -440,6 +451,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -452,11 +464,11 @@ namespace mg5amcCpu if( vmass != 0. ) { const int nsvahl = nsv * std::abs( hel ); + const fptype hel0 = 1. - std::abs( hel ); +#ifndef MGONGPU_CPPSIMD const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) ); const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); - const fptype hel0 = 1. - std::abs( hel ); -#ifndef MGONGPU_CPPSIMD if( pp == 0. ) { vc[2] = cxmake( 0., 0. ); @@ -484,6 +496,10 @@ namespace mg5amcCpu } } #else + volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); + volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); + const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); // Branch A: pp == 0. const cxtype vcA_2 = cxmake( 0, 0 ); const cxtype vcA_3 = cxmake( -hel * sqh, 0 ); @@ -514,7 +530,12 @@ namespace mg5amcCpu else { const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0 +#ifndef MGONGPU_CPPSIMD const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) ); +#else + volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736 + const fptype_sv pt = fpsqrt( pt2 ); +#endif vc[2] = cxzero_sv(); vc[5] = cxmake( hel * pt / pp * sqh, 0. ); #ifndef MGONGPU_CPPSIMD @@ -591,6 +612,7 @@ namespace mg5amcCpu // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -601,8 +623,8 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { - const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); #ifndef MGONGPU_CPPSIMD + const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); if( pp == 0. ) { // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! @@ -635,6 +657,8 @@ namespace mg5amcCpu fo[5] = sfomeg[0] * chi[ip]; } #else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); // Branch A: pp == 0. // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses @@ -654,9 +678,10 @@ namespace mg5amcCpu const int imB = ( 1 - nh ) / 2; const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] }; const fptype_v pp3 = fpmax( pp + pvec3, 0. ); - volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 - volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 ( cxternary( ( pp3 == 0. ), cxmake( -nh, 0. ), cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 @@ -674,16 +699,20 @@ namespace mg5amcCpu } else { - const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), - 0, - fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); #ifdef MGONGPU_CPPSIMD - volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 - const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ), + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), cxternary( ( sqp0p3 == 0. ), cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ), cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 #else + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), + 0, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) }; #endif diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h index 0d3892d026..e1299ba81e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h @@ -236,6 +236,20 @@ namespace mg5amcCpu // Functions and operators for fptype_v #ifdef MGONGPU_CPPSIMD + inline fptype_v + fpsqrt( const volatile fptype_v& v ) // volatile fixes #736 + { + // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt + fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594 + for( int i = 0; i < neppV; i++ ) + { + volatile fptype outi = 0; // volatile fixes #736 + if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] ); + out[i] = outi; + } + return out; + } + inline fptype_v fpsqrt( const fptype_v& v ) { diff --git a/epochX/cudacpp/heft_gg_h.sa/test/cudacpp_test.mk b/epochX/cudacpp/heft_gg_h.sa/test/cudacpp_test.mk index 477a37e27c..39ed957600 100644 --- a/epochX/cudacpp/heft_gg_h.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/heft_gg_h.sa/test/cudacpp_test.mk @@ -5,28 +5,36 @@ THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +# Compiler-specific googletest build directory (#125 and #738) +# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk +# In epoch1/epoch2, CXXNAMESUFFIX is undefined +$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX)) +BUILDDIR = build$(CXXNAMESUFFIX) +###$(info BUILDDIR=$(BUILDDIR)) +INSTALLDIR = install$(CXXNAMESUFFIX) +###$(info INSTALLDIR=$(INSTALLDIR)) + CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 -all: googletest/install/lib64/libgtest.a +all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: git clone https://github.com/google/googletest.git -b release-1.11.0 googletest -googletest/build/Makefile: googletest/CMakeLists.txt - mkdir -p googletest/build - cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ +googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt + mkdir -p googletest/$(BUILDDIR) + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ -googletest/build/lib/libgtest.a: googletest/build/Makefile - $(MAKE) -C googletest/build +googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile + $(MAKE) -C googletest/$(BUILDDIR) # NB 'make install' is no longer supported in googletest (issue 328) # NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations -googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a - mkdir -p googletest/install/lib64 - cp googletest/build/lib/lib*.a googletest/install/lib64/ - mkdir -p googletest/install/include - cp -r googletest/googletest/include/gtest googletest/install/include/ +googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a + mkdir -p googletest/$(INSTALLDIR)/lib64 + cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/ + mkdir -p googletest/$(INSTALLDIR)/include + cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/ clean: rm -rf googletest - diff --git a/epochX/cudacpp/tput/teeThroughputX.sh b/epochX/cudacpp/tput/teeThroughputX.sh index fbd9bbf3f3..07a14e6b61 100755 --- a/epochX/cudacpp/tput/teeThroughputX.sh +++ b/epochX/cudacpp/tput/teeThroughputX.sh @@ -176,7 +176,8 @@ for step in $steps; do printf "*** ./throughputX.sh $args | tee $logfile" printf "\n%80s\n" |tr " " "*" mkdir -p $(dirname $logfile) - if ! ./throughputX.sh $args -gtest | tee $logfile; then status=2; fi + ./throughputX.sh $args -gtest | tee $logfile + if [ ${PIPESTATUS[0]} -ne "0" ]; then status=2; fi fi done done diff --git a/epochX/cudacpp/tput/throughputX.sh b/epochX/cudacpp/tput/throughputX.sh index 7ca5837d5f..bd656c5b93 100755 --- a/epochX/cudacpp/tput/throughputX.sh +++ b/epochX/cudacpp/tput/throughputX.sh @@ -664,6 +664,7 @@ for exe in $exes; do exe2=${exe/check/runTest} echo "runExe $exe2" $exe2 2>&1 | tail -1 + if [ ${PIPESTATUS[0]} -ne "0" ]; then exit 1; fi fi elif [ "${exe%%/gcheck*}" != "${exe}" ] || [ "${exe%%/alpcheck*}" != "${exe}" ]; then runNcu $exe "$ncuArgs" diff --git a/tools/compilers/mg-lcg-clang++-16.0.3 b/tools/compilers/mg-lcg-clang++-16.0.3 new file mode 100755 index 0000000000..3ade60c2d7 --- /dev/null +++ b/tools/compilers/mg-lcg-clang++-16.0.3 @@ -0,0 +1,16 @@ +#!/bin/sh +redrel=$(cat /etc/redhat-release 2> /dev/null) +if [ "${redrel##*release 7}" != "${redrel}" ]; then + export PATH=/cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-centos7/bin:/cvmfs/sft.cern.ch/lcg/releases/gcc/13.1.0-b3d18/x86_64-centos7/bin:/cvmfs/sft.cern.ch/lcg/releases/binutils/2.40-acaab/x86_64-centos7/bin${PATH:+:$PATH} + LD_LIBRARY_PATH=$(printenv LD_LIBRARY_PATH | sed 's-[^:]*/\(gcc\|llvm\|clang\|binutils\)/[^:]*:\?--g') + export LD_LIBRARY_PATH=/cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-centos7/lib:/cvmfs/sft.cern.ch/lcg/releases/gcc/13.1.0-b3d18/x86_64-centos7/lib64:/cvmfs/sft.cern.ch/lcg/releases/binutils/2.40-acaab/x86_64-centos7/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} + exec /cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-centos7/bin/clang++ --gcc-toolchain=/cvmfs/sft.cern.ch/lcg/releases/gcc/13.1.0-b3d18/x86_64-centos7 "$@" +elif [ "${redrel##*release 9}" != "${redrel}" ]; then + export PATH=/cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-el9/bin:/cvmfs/sft.cern.ch/lcg/releases/gcc/13.1.0-b3d18/x86_64-el9/bin:/cvmfs/sft.cern.ch/lcg/releases/binutils/2.40-acaab/x86_64-el9/bin${PATH:+:$PATH} + LD_LIBRARY_PATH=$(printenv LD_LIBRARY_PATH | sed 's-[^:]*/\(gcc\|llvm\|clang\|binutils\)/[^:]*:\?--g') + export LD_LIBRARY_PATH=/cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-el9/lib:/cvmfs/sft.cern.ch/lcg/releases/gcc/13.1.0-b3d18/x86_64-el9/lib64:/cvmfs/sft.cern.ch/lcg/releases/binutils/2.40-acaab/x86_64-el9/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} + exec /cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-el9/bin/clang++ --gcc-toolchain=/cvmfs/sft.cern.ch/lcg/releases/gcc/13.1.0-b3d18/x86_64-el9 "$@" +else + echo "ERROR! RedHat release ${redrel} is not supported by $0" + exit 1 +fi diff --git a/tools/compilers/mg-lcg-clang-16.0.3 b/tools/compilers/mg-lcg-clang-16.0.3 new file mode 100755 index 0000000000..aad1f9af16 --- /dev/null +++ b/tools/compilers/mg-lcg-clang-16.0.3 @@ -0,0 +1,16 @@ +#!/bin/sh +redrel=$(cat /etc/redhat-release 2> /dev/null) +if [ "${redrel##*release 7}" != "${redrel}" ]; then + export PATH=/cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-centos7/bin:/cvmfs/sft.cern.ch/lcg/releases/gcc/13.1.0-b3d18/x86_64-centos7/bin:/cvmfs/sft.cern.ch/lcg/releases/binutils/2.40-acaab/x86_64-centos7/bin${PATH:+:$PATH} + LD_LIBRARY_PATH=$(printenv LD_LIBRARY_PATH | sed 's-[^:]*/\(gcc\|llvm\|clang\|binutils\)/[^:]*:\?--g') + export LD_LIBRARY_PATH=/cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-centos7/lib:/cvmfs/sft.cern.ch/lcg/releases/gcc/13.1.0-b3d18/x86_64-centos7/lib64:/cvmfs/sft.cern.ch/lcg/releases/binutils/2.40-acaab/x86_64-centos7/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} + exec /cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-centos7/bin/clang --gcc-toolchain=/cvmfs/sft.cern.ch/lcg/releases/gcc/13.1.0-b3d18/x86_64-centos7 "$@" +elif [ "${redrel##*release 9}" != "${redrel}" ]; then + export PATH=/cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-el9/bin:/cvmfs/sft.cern.ch/lcg/releases/gcc/13.1.0-b3d18/x86_64-el9/bin:/cvmfs/sft.cern.ch/lcg/releases/binutils/2.40-acaab/x86_64-el9/bin${PATH:+:$PATH} + LD_LIBRARY_PATH=$(printenv LD_LIBRARY_PATH | sed 's-[^:]*/\(gcc\|llvm\|clang\|binutils\)/[^:]*:\?--g') + export LD_LIBRARY_PATH=/cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-el9/lib:/cvmfs/sft.cern.ch/lcg/releases/gcc/13.1.0-b3d18/x86_64-el9/lib64:/cvmfs/sft.cern.ch/lcg/releases/binutils/2.40-acaab/x86_64-el9/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} + exec /cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-el9/bin/clang --gcc-toolchain=/cvmfs/sft.cern.ch/lcg/releases/gcc/13.1.0-b3d18/x86_64-el9 "$@" +else + echo "ERROR! RedHat release ${redrel} is not supported by $0" + exit 1 +fi diff --git a/tools/compilers/setup-clang-16.0.3.sh b/tools/compilers/setup-clang-16.0.3.sh new file mode 100644 index 0000000000..78ce015ba4 --- /dev/null +++ b/tools/compilers/setup-clang-16.0.3.sh @@ -0,0 +1,16 @@ +if [ "$BASH_SOURCE" = "$0" ]; then echo "ERROR! This script ($0) was not sourced"; exit 1; fi +if [ "$BASH_SOURCE" = "" ]; then echo "ERROR! This script was not sourced from bash"; return 1; fi +scrdir=$(cd $(dirname ${BASH_SOURCE}); pwd) +redrel=$(cat /etc/redhat-release 2> /dev/null) +if [ "${redrel##*release 7}" != "${redrel}" ]; then + source /cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-centos7/setup.sh +elif [ "${redrel##*release 9}" != "${redrel}" ]; then + source /cvmfs/sft.cern.ch/lcg/releases/clang/16.0.3-9dda8/x86_64-el9/setup.sh +else + echo "ERROR! RedHat release ${redrel} is not supported by ${BASH_SOURCE}" + return 1 +fi +###export ALLOW_UNSUPPORTED_COMPILER_IN_CUDA=1 +export CC=$scrdir/mg-lcg-clang-16.0.3 +export CXX=$scrdir/mg-lcg-clang++-16.0.3 +