Merge pull request #723 from valassi/fpe

Fixes in xxxxx for IEEE_DIVIDE_BY_ZERO FPE; separate cpu/gpu namespaces and fix runtest segfault
madgraph5 · Jul 21, 2023 · 39de7b3 · 39de7b3
2 parents d1d87b6 + 49f9d3f
commit 39de7b3
Show file tree

Hide file tree

Showing 383 changed files with 95,591 additions and 60,273 deletions.
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.cu b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.cu
@@ -1,3 +1,4 @@
+! Copyright (C) 2010 The ALOHA Development team and Contributors.
 ! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 ! Created by: J. Alwall (Sep 2010) for the MG5aMC CPP backend.
 !==========================================================================

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h
@@ -1,3 +1,4 @@
+! Copyright (C) 2010 The ALOHA Development team and Contributors.
 ! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 ! Created by: J. Alwall (Sep 2010) for the MG5aMC CPP backend.
 !==========================================================================
@@ -164,6 +165,9 @@
           const int ipar )        // input: particle# out of npar
   {
     mgDebug( 0, __FUNCTION__ );
+    // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
+    // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
+    // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -175,15 +179,18 @@
     if( fmass != 0. )
     {
       const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) );
+      // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx:
+      // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1
+      // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used
+      const int ip = ( 1 + nh ) / 2; // NB: same as in Fortran pp==0, differs from Fortran pp>0, which is (3+nh)/2 because omega(2) has indexes 1,2
+      const int im = ( 1 - nh ) / 2; // NB: same as in Fortran pp==0, differs from Fortran pp>0, which is (3-nh)/2 because omega(2) has indexes 1,2
 #ifndef MGONGPU_CPPSIMD
       if( pp == 0. )
       {
         // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
         fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0. }; // possibility of negative fermion masses
         //sqm[1] = ( fmass < 0. ? -abs( sqm[0] ) : abs( sqm[0] ) ); // AV: why abs here?
         sqm[1] = ( fmass < 0. ? -sqm[0] : sqm[0] ); // AV: removed an abs here
-        const int ip = ( 1 + nh ) / 2;              // NB: Fortran sqm(0:1) also has indexes 0,1 as in C++
-        const int im = ( 1 - nh ) / 2;              // NB: Fortran sqm(0:1) also has indexes 0,1 as in C++
         fi[2] = cxmake( ip * sqm[ip], 0 );
         fi[3] = cxmake( im * nsf * sqm[ip], 0 );
         fi[4] = cxmake( ip * nsf * sqm[im], 0 );
@@ -195,8 +202,6 @@
                                fptype( 1 + nsf - ( 1 - nsf ) * nh ) * (fptype)0.5 };
         fptype omega[2] = { fpsqrt( pvec0 + pp ), 0. };
         omega[1] = fmass / omega[0];
-        const int ip = ( 1 + nh ) / 2; // NB: Fortran is (3+nh)/2 because omega(2) has indexes 1,2 and not 0,1
-        const int im = ( 1 - nh ) / 2; // NB: Fortran is (3-nh)/2 because omega(2) has indexes 1,2 and not 0,1
         const fptype sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
         const fptype pp3 = fpmax( pp + pvec3, 0. );
         const cxtype chi[2] = { cxmake( fpsqrt( pp3 * (fptype)0.5 / pp ), 0. ),
@@ -207,8 +212,6 @@
         fi[5] = sfomega[1] * chi[ip];
       }
 #else
-      const int ip = ( 1 + nh ) / 2;
-      const int im = ( 1 - nh ) / 2;
       // Branch A: pp == 0.
       // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs!
       fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses (NB: SCALAR!)
@@ -224,10 +227,12 @@
       omega[1] = fmass / omega[0];
       const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0 );
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / pp ), 0 ),
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );             // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );          // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 cxternary( ( pp3 == 0. ),
                                            cxmake( -nh, 0 ),
-                                           cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * pp * pp3 ) ) };
+                                           cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
       const cxtype_v fiB_2 = sfomega[0] * chi[im];
       const cxtype_v fiB_3 = sfomega[0] * chi[ip];
       const cxtype_v fiB_4 = sfomega[1] * chi[im];
@@ -245,7 +250,16 @@
       const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ),
                                           fptype_sv{ 0 },
                                           fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
-      const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), cxternary( ( sqp0p3 == 0. ), cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ), cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
+#ifdef MGONGPU_CPPSIMD
+      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
+                           cxternary( sqp0p3 == 0,
+                                      cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ),
+                                      cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
+#else
+      const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
+                                 ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) };
+#endif
       if( nh == 1 )
       {
         fi[2] = cxzero_sv();
@@ -396,6 +410,9 @@
           const int ipar )        // input: particle# out of npar
   {
     mgDebug( 0, __FUNCTION__ );
+    // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
+    // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
+    // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -446,13 +463,15 @@
       const cxtype vcA_4 = cxmake( 0, nsvahl * sqh );
       const cxtype vcA_5 = cxmake( hel0, 0 );
       // Branch B: pp != 0.
-      const fptype_v emp = pvec0 / ( vmass * pp );
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      const fptype_v emp = pvec0 / ( vmass * ppDENOM );         // hack: dummy[ieppV] is not used if pp[ieppV]==0
       const cxtype_v vcB_2 = cxmake( hel0 * pp / vmass, 0 );
-      const cxtype_v vcB_5 = cxmake( hel0 * pvec3 * emp + hel * pt / pp * sqh, 0 );
+      const cxtype_v vcB_5 = cxmake( hel0 * pvec3 * emp + hel * pt / ppDENOM * sqh, 0 ); // hack: dummy[ieppV] is not used if pp[ieppV]==0
       // Branch B1: pp != 0. and pt != 0.
-      const fptype_v pzpt = pvec3 / ( pp * pt ) * sqh * hel;
-      const cxtype_v vcB1_3 = cxmake( hel0 * pvec1 * emp - pvec1 * pzpt, -(fptype)nsvahl * pvec2 / pt * sqh );
-      const cxtype_v vcB1_4 = cxmake( hel0 * pvec2 * emp - pvec2 * pzpt, (fptype)nsvahl * pvec1 / pt * sqh );
+      volatile fptype_v ptDENOM = fpternary( pt != 0, pt, 1. );                                                     // hack: ptDENOM[ieppV]=1 if pt[ieppV]==0
+      const fptype_v pzpt = pvec3 / ( ppDENOM * ptDENOM ) * sqh * hel;                                              // hack: dummy[ieppV] is not used if pp[ieppV]==0
+      const cxtype_v vcB1_3 = cxmake( hel0 * pvec1 * emp - pvec1 * pzpt, -(fptype)nsvahl * pvec2 / ptDENOM * sqh ); // hack: dummy[ieppV] is not used if pt[ieppV]==0
+      const cxtype_v vcB1_4 = cxmake( hel0 * pvec2 * emp - pvec2 * pzpt, (fptype)nsvahl * pvec1 / ptDENOM * sqh );  // hack: dummy[ieppV] is not used if pt[ieppV]==0
       // Branch B2: pp != 0. and pt == 0.
       const cxtype vcB2_3 = cxmake( -hel * sqh, 0. );
       const cxtype_v vcB2_4 = cxmake( 0., (fptype)nsvahl * fpternary( ( pvec3 < 0 ), -sqh, sqh ) ); // AV: removed an abs here
@@ -487,9 +506,10 @@
       }
 #else
       // Branch A: pt != 0.
-      const fptype_v pzpt = pvec3 / ( pp * pt ) * sqh * hel;
-      const cxtype_v vcA_3 = cxmake( -pvec1 * pzpt, -(fptype)nsv * pvec2 / pt * sqh );
-      const cxtype_v vcA_4 = cxmake( -pvec2 * pzpt, (fptype)nsv * pvec1 / pt * sqh );
+      volatile fptype_v ptDENOM = fpternary( pt != 0, pt, 1. );                             // hack: ptDENOM[ieppV]=1 if pt[ieppV]==0
+      const fptype_v pzpt = pvec3 / ( pp * ptDENOM ) * sqh * hel;                           // hack: dummy[ieppV] is not used if pt[ieppV]==0
+      const cxtype_v vcA_3 = cxmake( -pvec1 * pzpt, -(fptype)nsv * pvec2 / ptDENOM * sqh ); // hack: dummy[ieppV] is not used if pt[ieppV]==0
+      const cxtype_v vcA_4 = cxmake( -pvec2 * pzpt, (fptype)nsv * pvec1 / ptDENOM * sqh );  // hack: dummy[ieppV] is not used if pt[ieppV]==0
       // Branch B: pt == 0.
       const cxtype vcB_3 = cxmake( -(fptype)hel * sqh, 0 );
       const cxtype_v vcB_4 = cxmake( 0, (fptype)nsv * fpternary( ( pvec3 < 0 ), -sqh, sqh ) ); // AV: removed an abs here
@@ -541,6 +561,9 @@
           const int ipar )        // input: particle# out of npar
   {
     mgDebug( 0, __FUNCTION__ );
+    // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701)
+    // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727)
+    // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724)
     const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar );
     const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar );
     const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar );
@@ -604,10 +627,12 @@
       const int imB = ( 1 - nh ) / 2;
       const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] };
       const fptype_v pp3 = fpmax( pp + pvec3, 0. );
-      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / pp ), 0. ),
+      volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. );              // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0
+      volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. );           // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0
+      const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / ppDENOM ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0
                                 ( cxternary( ( pp3 == 0. ),
                                              cxmake( -nh, 0. ),
-                                             cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * pp * pp3 ) ) ) };
+                                             cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0
       const cxtype_v foB_2 = sfomeg[1] * chi[imB];
       const cxtype_v foB_3 = sfomeg[1] * chi[ipB];
       const cxtype_v foB_4 = sfomeg[0] * chi[imB];
@@ -625,10 +650,16 @@
       const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ),
                                           0,
                                           fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf );
+#ifdef MGONGPU_CPPSIMD
+      volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0
+      const cxtype_v chi[2] = { cxmake( sqp0p3, 0. ),
+                                cxternary( ( sqp0p3 == 0. ),
+                                           cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
+                                           cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0
+#else
       const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ),
-                                 cxternary( ( sqp0p3 == 0. ),
-                                            cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ),
-                                            cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
+                                 ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) };
+#endif
       if( nh == 1 )
       {
         fo[2] = chi[0];

diff --git a/...DEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc b/...DEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc
@@ -15,6 +15,12 @@
 #include <iomanip>
 #include <iostream>
 
+#ifdef __CUDACC__
+using namespace mg5amcGpu;
+#else
+using namespace mg5amcCpu;
+#endif
+
 #ifndef MGONGPU_HARDCODE_PARAM
 
 // Initialize static instance