Skip to content

Commit

Permalink
Merge pull request #723 from valassi/fpe
Browse files Browse the repository at this point in the history
Fixes in xxxxx for IEEE_DIVIDE_BY_ZERO FPE; separate cpu/gpu namespaces and fix runtest segfault
  • Loading branch information
valassi committed Jul 21, 2023
2 parents d1d87b6 + 49f9d3f commit 39de7b3
Show file tree
Hide file tree
Showing 383 changed files with 95,591 additions and 60,273 deletions.
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
! Copyright (C) 2010 The ALOHA Development team and Contributors.
! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
! Created by: J. Alwall (Sep 2010) for the MG5aMC CPP backend.
!==========================================================================
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
! Copyright (C) 2010 The ALOHA Development team and Contributors.
! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
! Created by: J. Alwall (Sep 2010) for the MG5aMC CPP backend.
!==========================================================================
Expand Down Expand Up @@ -164,6 +165,9 @@
const int ipar ) // input: particle# out of npar
{
mgDebug( 0, __FUNCTION__ );
// NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
// Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
// Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
Expand All @@ -175,15 +179,18 @@
if( fmass != 0. )
{
const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) );
// In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx:
// for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1
// NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used
const int ip = ( 1 + nh ) / 2; // NB: same as in Fortran pp==0, differs from Fortran pp>0, which is (3+nh)/2 because omega(2) has indexes 1,2
const int im = ( 1 - nh ) / 2; // NB: same as in Fortran pp==0, differs from Fortran pp>0, which is (3-nh)/2 because omega(2) has indexes 1,2
#ifndef MGONGPU_CPPSIMD
if( pp == 0. )
{
// NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0. }; // possibility of negative fermion masses
//sqm[1] = ( fmass < 0. ? -abs( sqm[0] ) : abs( sqm[0] ) ); // AV: why abs here?
sqm[1] = ( fmass < 0. ? -sqm[0] : sqm[0] ); // AV: removed an abs here
const int ip = ( 1 + nh ) / 2; // NB: Fortran sqm(0:1) also has indexes 0,1 as in C++
const int im = ( 1 - nh ) / 2; // NB: Fortran sqm(0:1) also has indexes 0,1 as in C++
fi[2] = cxmake( ip * sqm[ip], 0 );
fi[3] = cxmake( im * nsf * sqm[ip], 0 );
fi[4] = cxmake( ip * nsf * sqm[im], 0 );
Expand All @@ -195,8 +202,6 @@
fptype( 1 + nsf - ( 1 - nsf ) * nh ) * (fptype)0.5 };
fptype omega[2] = { fpsqrt( pvec0 + pp ), 0. };
omega[1] = fmass / omega[0];
const int ip = ( 1 + nh ) / 2; // NB: Fortran is (3+nh)/2 because omega(2) has indexes 1,2 and not 0,1
const int im = ( 1 - nh ) / 2; // NB: Fortran is (3-nh)/2 because omega(2) has indexes 1,2 and not 0,1
const fptype sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
const fptype pp3 = fpmax( pp + pvec3, 0. );
const cxtype chi[2] = { cxmake( fpsqrt( pp3 * (fptype)0.5 / pp ), 0. ),
Expand All @@ -207,8 +212,6 @@
fi[5] = sfomega[1] * chi[ip];
}
#else
const int ip = ( 1 + nh ) / 2;
const int im = ( 1 - nh ) / 2;
// Branch A: pp == 0.
// NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses (NB: SCALAR!)
Expand All @@ -224,10 +227,12 @@
omega[1] = fmass / omega[0];
const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
const fptype_v pp3 = fpmax( pp + pvec3, 0 );
const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / pp ), 0 ),
volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
cxternary( ( pp3 == 0. ),
cxmake( -nh, 0 ),
cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * pp * pp3 ) ) };
cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
const cxtype_v fiB_2 = sfomega[0] * chi[im];
const cxtype_v fiB_3 = sfomega[0] * chi[ip];
const cxtype_v fiB_4 = sfomega[1] * chi[im];
Expand All @@ -245,7 +250,16 @@
const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
fptype_sv{ 0 },
fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), cxternary( ( sqp0p3 == 0. ), cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ), cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
#ifdef MGONGPU_CPPSIMD
volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
cxternary( sqp0p3 == 0,
cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ),
cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
#else
const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
#endif
if( nh == 1 )
{
fi[2] = cxzero_sv();
Expand Down Expand Up @@ -396,6 +410,9 @@
const int ipar ) // input: particle# out of npar
{
mgDebug( 0, __FUNCTION__ );
// NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
// Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
// Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
Expand Down Expand Up @@ -446,13 +463,15 @@
const cxtype vcA_4 = cxmake( 0, nsvahl * sqh );
const cxtype vcA_5 = cxmake( hel0, 0 );
// Branch B: pp != 0.
const fptype_v emp = pvec0 / ( vmass * pp );
volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
const fptype_v emp = pvec0 / ( vmass * ppDENOM ); // hack: dummy[ieppV] is not used if pp[ieppV]==0
const cxtype_v vcB_2 = cxmake( hel0 * pp / vmass, 0 );
const cxtype_v vcB_5 = cxmake( hel0 * pvec3 * emp + hel * pt / pp * sqh, 0 );
const cxtype_v vcB_5 = cxmake( hel0 * pvec3 * emp + hel * pt / ppDENOM * sqh, 0 ); // hack: dummy[ieppV] is not used if pp[ieppV]==0
// Branch B1: pp != 0. and pt != 0.
const fptype_v pzpt = pvec3 / ( pp * pt ) * sqh * hel;
const cxtype_v vcB1_3 = cxmake( hel0 * pvec1 * emp - pvec1 * pzpt, -(fptype)nsvahl * pvec2 / pt * sqh );
const cxtype_v vcB1_4 = cxmake( hel0 * pvec2 * emp - pvec2 * pzpt, (fptype)nsvahl * pvec1 / pt * sqh );
volatile fptype_v ptDENOM = fpternary( pt != 0, pt, 1. ); // hack: ptDENOM[ieppV]=1 if pt[ieppV]==0
const fptype_v pzpt = pvec3 / ( ppDENOM * ptDENOM ) * sqh * hel; // hack: dummy[ieppV] is not used if pp[ieppV]==0
const cxtype_v vcB1_3 = cxmake( hel0 * pvec1 * emp - pvec1 * pzpt, -(fptype)nsvahl * pvec2 / ptDENOM * sqh ); // hack: dummy[ieppV] is not used if pt[ieppV]==0
const cxtype_v vcB1_4 = cxmake( hel0 * pvec2 * emp - pvec2 * pzpt, (fptype)nsvahl * pvec1 / ptDENOM * sqh ); // hack: dummy[ieppV] is not used if pt[ieppV]==0
// Branch B2: pp != 0. and pt == 0.
const cxtype vcB2_3 = cxmake( -hel * sqh, 0. );
const cxtype_v vcB2_4 = cxmake( 0., (fptype)nsvahl * fpternary( ( pvec3 < 0 ), -sqh, sqh ) ); // AV: removed an abs here
Expand Down Expand Up @@ -487,9 +506,10 @@
}
#else
// Branch A: pt != 0.
const fptype_v pzpt = pvec3 / ( pp * pt ) * sqh * hel;
const cxtype_v vcA_3 = cxmake( -pvec1 * pzpt, -(fptype)nsv * pvec2 / pt * sqh );
const cxtype_v vcA_4 = cxmake( -pvec2 * pzpt, (fptype)nsv * pvec1 / pt * sqh );
volatile fptype_v ptDENOM = fpternary( pt != 0, pt, 1. ); // hack: ptDENOM[ieppV]=1 if pt[ieppV]==0
const fptype_v pzpt = pvec3 / ( pp * ptDENOM ) * sqh * hel; // hack: dummy[ieppV] is not used if pt[ieppV]==0
const cxtype_v vcA_3 = cxmake( -pvec1 * pzpt, -(fptype)nsv * pvec2 / ptDENOM * sqh ); // hack: dummy[ieppV] is not used if pt[ieppV]==0
const cxtype_v vcA_4 = cxmake( -pvec2 * pzpt, (fptype)nsv * pvec1 / ptDENOM * sqh ); // hack: dummy[ieppV] is not used if pt[ieppV]==0
// Branch B: pt == 0.
const cxtype vcB_3 = cxmake( -(fptype)hel * sqh, 0 );
const cxtype_v vcB_4 = cxmake( 0, (fptype)nsv * fpternary( ( pvec3 < 0 ), -sqh, sqh ) ); // AV: removed an abs here
Expand Down Expand Up @@ -541,6 +561,9 @@
const int ipar ) // input: particle# out of npar
{
mgDebug( 0, __FUNCTION__ );
// NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
// Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
// Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
Expand Down Expand Up @@ -604,10 +627,12 @@
const int imB = ( 1 - nh ) / 2;
const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] };
const fptype_v pp3 = fpmax( pp + pvec3, 0. );
const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / pp ), 0. ),
volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
( cxternary( ( pp3 == 0. ),
cxmake( -nh, 0. ),
cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * pp * pp3 ) ) ) };
cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
const cxtype_v foB_2 = sfomeg[1] * chi[imB];
const cxtype_v foB_3 = sfomeg[1] * chi[ipB];
const cxtype_v foB_4 = sfomeg[0] * chi[imB];
Expand All @@ -625,10 +650,16 @@
const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
0,
fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
#ifdef MGONGPU_CPPSIMD
volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ),
cxternary( ( sqp0p3 == 0. ),
cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
#else
const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
cxternary( ( sqp0p3 == 0. ),
cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
#endif
if( nh == 1 )
{
fo[2] = chi[0];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@
#include <iomanip>
#include <iostream>

#ifdef __CUDACC__
using namespace mg5amcGpu;
#else
using namespace mg5amcCpu;
#endif

#ifndef MGONGPU_HARDCODE_PARAM

// Initialize static instance
Expand Down
Loading

0 comments on commit 39de7b3

Please sign in to comment.