Skip to content

Commit

Permalink
Merge pull request #737 from valassi/icx
Browse files Browse the repository at this point in the history
Several fixes for icx2023.2 (including fixes for sqrt FPEs in ixx/oxx/vxx)
  • Loading branch information
valassi committed Jul 26, 2023
2 parents 39de7b3 + b38bea0 commit a4b9d6b
Show file tree
Hide file tree
Showing 116 changed files with 1,815 additions and 730 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@
// NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
// Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
// Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
// A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
Expand All @@ -178,7 +179,12 @@
const int nh = nhel * nsf;
if( fmass != 0. )
{
#ifndef MGONGPU_CPPSIMD
const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) );
#else
volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
#endif
// In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx:
// for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1
// NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used
Expand Down Expand Up @@ -227,9 +233,10 @@
omega[1] = fmass / omega[0];
const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
const fptype_v pp3 = fpmax( pp + pvec3, 0 );
volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736
const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
cxternary( ( pp3 == 0. ),
cxmake( -nh, 0 ),
cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
Expand All @@ -247,16 +254,20 @@
}
else
{
const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
fptype_sv{ 0 },
fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
#ifdef MGONGPU_CPPSIMD
volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
fptype_sv{ 0 },
fpsqrt( p0p3 ) * (fptype)nsf );
volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
cxternary( sqp0p3 == 0,
cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ),
cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
#else
const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
fptype_sv{ 0 },
fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
#endif
Expand Down Expand Up @@ -413,6 +424,7 @@
// NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
// Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
// Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
// A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
Expand All @@ -425,11 +437,11 @@
if( vmass != 0. )
{
const int nsvahl = nsv * std::abs( hel );
const fptype hel0 = 1. - std::abs( hel );
#ifndef MGONGPU_CPPSIMD
const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) );
const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
const fptype hel0 = 1. - std::abs( hel );
#ifndef MGONGPU_CPPSIMD
if( pp == 0. )
{
vc[2] = cxmake( 0., 0. );
Expand Down Expand Up @@ -457,6 +469,10 @@
}
}
#else
volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 );
volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736
const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) );
// Branch A: pp == 0.
const cxtype vcA_2 = cxmake( 0, 0 );
const cxtype vcA_3 = cxmake( -hel * sqh, 0 );
Expand Down Expand Up @@ -487,7 +503,12 @@
else
{
const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0
#ifndef MGONGPU_CPPSIMD
const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) );
#else
volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736
const fptype_sv pt = fpsqrt( pt2 );
#endif
vc[2] = cxzero_sv();
vc[5] = cxmake( hel * pt / pp * sqh, 0. );
#ifndef MGONGPU_CPPSIMD
Expand Down Expand Up @@ -564,6 +585,7 @@
// NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
// Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
// Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
// A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736)
const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
Expand All @@ -574,8 +596,8 @@
const int nh = nhel * nsf;
if( fmass != 0. )
{
const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
#ifndef MGONGPU_CPPSIMD
const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) );
if( pp == 0. )
{
// NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
Expand Down Expand Up @@ -608,6 +630,8 @@
fo[5] = sfomeg[0] * chi[ip];
}
#else
volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736
const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) );
// Branch A: pp == 0.
// NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses
Expand All @@ -627,9 +651,10 @@
const int imB = ( 1 - nh ) / 2;
const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] };
const fptype_v pp3 = fpmax( pp + pvec3, 0. );
volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736
const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
( cxternary( ( pp3 == 0. ),
cxmake( -nh, 0. ),
cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
Expand All @@ -647,16 +672,20 @@
}
else
{
const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
0,
fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
#ifdef MGONGPU_CPPSIMD
volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ),
volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736
volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
fptype_sv{ 0 },
fpsqrt( p0p3 ) * (fptype)nsf );
volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ),
cxternary( ( sqp0p3 == 0. ),
cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
#else
const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
0,
fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,15 +207,15 @@ class MadgraphTest : public testing::TestWithParam<TestDriverBase*>
/// and compares momenta and matrix elements with a reference file.
TEST_P( MadgraphTest, CompareMomentaAndME )
{
// Set to true to dump events:
constexpr bool dumpEvents = false;
constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 3.E-2;
const fptype toleranceMomenta = std::is_same<double, fptype>::value ? 1.E-10 : 4.E-2; // see #735
#ifdef __APPLE__
const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 3.E-2; // see #583
#else
const fptype toleranceMEs = std::is_same<double, fptype>::value ? 1.E-6 : 2.E-3;
#endif
constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
// Dump events to a new reference file?
constexpr bool dumpEvents = false;
std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt";
while( dumpFileName.find( '/' ) != std::string::npos )
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,20 @@ MG5AMC_COMMONLIB = mg5amc_common
LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
INCFLAGS += -I../../src

# Compiler-specific googletest build directory (#125 and #738)
ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),)
override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5)
else ifneq ($(shell $(CXX) --version | egrep '^clang'),)
override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),)
override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3)
else
override CXXNAME = unknown
endif
###$(info CXXNAME=$(CXXNAME))
override CXXNAMESUFFIX = _$(CXXNAME)
export CXXNAMESUFFIX

# Dependency on test directory
# Within the madgraph4gpu git repo: by default use a common gtest installation in <topdir>/test (optionally use an external or local gtest)
# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest)
Expand All @@ -50,10 +64,10 @@ ifneq ($(wildcard $(GTEST_ROOT)),)
TESTDIR =
else ifneq ($(LOCALGTEST),)
TESTDIR=$(TESTDIRLOCAL)
GTEST_ROOT = $(TESTDIR)/googletest/install
GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),)
TESTDIR = $(TESTDIRCOMMON)
GTEST_ROOT = $(TESTDIR)/googletest/install
GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX)
else
TESTDIR =
endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,28 +5,36 @@

THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST))))

# Compiler-specific googletest build directory (#125 and #738)
# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk
# In epoch1/epoch2, CXXNAMESUFFIX is undefined
$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX))
BUILDDIR = build$(CXXNAMESUFFIX)
###$(info BUILDDIR=$(BUILDDIR))
INSTALLDIR = install$(CXXNAMESUFFIX)
###$(info INSTALLDIR=$(INSTALLDIR))

CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11

all: googletest/install/lib64/libgtest.a
all: googletest/$(INSTALLDIR)/lib64/libgtest.a

googletest/CMakeLists.txt:
git clone https://github.com/google/googletest.git -b release-1.11.0 googletest

googletest/build/Makefile: googletest/CMakeLists.txt
mkdir -p googletest/build
cd googletest/build && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../
googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt
mkdir -p googletest/$(BUILDDIR)
cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../

googletest/build/lib/libgtest.a: googletest/build/Makefile
$(MAKE) -C googletest/build
googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile
$(MAKE) -C googletest/$(BUILDDIR)

# NB 'make install' is no longer supported in googletest (issue 328)
# NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations
googletest/install/lib64/libgtest.a: googletest/build/lib/libgtest.a
mkdir -p googletest/install/lib64
cp googletest/build/lib/lib*.a googletest/install/lib64/
mkdir -p googletest/install/include
cp -r googletest/googletest/include/gtest googletest/install/include/
googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a
mkdir -p googletest/$(INSTALLDIR)/lib64
cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/
mkdir -p googletest/$(INSTALLDIR)/include
cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/

clean:
rm -rf googletest

Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,20 @@ namespace mg5amcCpu
// Functions and operators for fptype_v

#ifdef MGONGPU_CPPSIMD
inline fptype_v
fpsqrt( const volatile fptype_v& v ) // volatile fixes #736
{
// See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt
fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594
for( int i = 0; i < neppV; i++ )
{
volatile fptype outi = 0; // volatile fixes #736
if( v[i] > 0 ) outi = fpsqrt( (fptype)v[i] );
out[i] = outi;
}
return out;
}

inline fptype_v
fpsqrt( const fptype_v& v )
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ namespace mg5amcCpu
#else
std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl;
#endif
exit( 0 );
exit( 1 );
}
}

Expand Down Expand Up @@ -129,7 +129,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx )
const fptype p1 = par0[ievt * np4 + 1];
const fptype p2 = par0[ievt * np4 + 2];
const fptype p3 = par0[ievt * np4 + 3];
mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 );
volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736
if( m2 > 0 )
mass0[ievt] = fpsqrt( (fptype)m2 );
else
mass0[ievt] = 0;
ispzgt0[ievt] = ( p3 > 0 );
ispzlt0[ievt] = ( p3 < 0 );
isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 );
Expand Down
Loading

0 comments on commit a4b9d6b

Please sign in to comment.