Commit 3821d30

Merge remote-tracking branch 'upstream/master'

Jooorgen committed Aug 16, 2023
2 parents d889085 + fe3cdf7
Showing 284 changed files with 9,197 additions and 7,563 deletions.
2 changes: 1 addition & 1 deletion epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/commit.GIT
@@ -1 +1 @@
-91f37b6aa
+941832b9b
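Note: commit.GIT presumably records the upstream MG5aMC commit against which the PROD patches are maintained; this merge bumps the pinned hash from 91f37b6aa to 941832b9b.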
@@ -35,17 +35,6 @@ index 617f10b93..dbe08b846 100644
+
+cleanall: cleanSource
+ for i in `ls -d ../SubProcesses/P*`; do cd $$i; make cleanavxs; cd -; done;
diff --git b/epochX/cudacpp/gg_tt.mad/Source/vector.inc a/epochX/cudacpp/gg_tt.mad/Source/vector.inc
index 863eebbc7..92254c0f2 100644
--- b/epochX/cudacpp/gg_tt.mad/Source/vector.inc
+++ a/epochX/cudacpp/gg_tt.mad/Source/vector.inc
@@ -28,4 +28,5 @@ C BECAUSE IT DOES NOT GO THROUGH THE CPP PREPROCESSOR
C (see https://github.com/madgraph5/madgraph4gpu/issues/458).
C
INTEGER VECSIZE_MEMMAX
- PARAMETER (VECSIZE_MEMMAX=16384)
+ PARAMETER (VECSIZE_MEMMAX=16384) ! NB: 16k events per GPU grid is the minimum required to fill a V100 GPU
+c PARAMETER (VECSIZE_MEMMAX=32) ! NB: workaround for out-of-memory on Juwels: 32 is enough for no-CUDA builds (issue #498)
diff --git b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile
index 348c283be..74db44d84 100644
--- b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile
@@ -319,10 +308,10 @@ index 57f5f7bb9..bd3c24228 100644

BIASLIBDIR=../../../lib/
diff --git b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py
-index 27cd896a7..c1e54d3cb 100755
+index 8f8df219d..7624b9f55 100755
--- b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py
+++ a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py
-@@ -4164,7 +4164,8 @@ class RunCardLO(RunCard):
+@@ -4187,7 +4187,8 @@ class RunCardLO(RunCard):
self.add_param('mxxmin4pdg',[-1.], system=True)
self.add_param('mxxpart_antipart', [False], system=True)

@@ -18,6 +18,7 @@
#include <cassert>
#include <cmath>
#include <cstring>
+#include <filesystem>
#include <iostream>
#include <memory>
#include <type_traits>
@@ -244,14 +245,21 @@ namespace mg5amcCpu
}
std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads
<< ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl;
-CPPProcess process( /*verbose=*/false );
m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) );
#else
std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
-CPPProcess process( /*verbose=*/false );
m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
#endif // MGONGPUCPP_GPUIMPL
process.initProc( "../../Cards/param_card.dat" );
// Create a process object, read param card and set parameters
// FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
// FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads?
CPPProcess process( /*verbose=*/false );
std::string paramCard = "../../Cards/param_card.dat";
if( !std::filesystem::exists( paramCard ) )
{
paramCard = "../" + paramCard;
}
process.initProc( paramCard );
}

#ifdef MGONGPUCPP_GPUIMPL
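Note: the new param card lookup above simply falls back one directory level when the default relative path does not exist (presumably so that the same code works both from the usual build directory and from one level deeper, as in the madevent layout). A minimal standalone sketch of the same pattern, assuming only the C++17 standard library (resolveParamCard is a hypothetical helper name, not part of the Bridge API):

    #include <filesystem>
    #include <iostream>
    #include <string>

    // Return the param card path, falling back one directory level up if the
    // default relative location does not exist (the pattern used in the new
    // Bridge constructor above).
    std::string resolveParamCard()
    {
      std::string paramCard = "../../Cards/param_card.dat";
      if( !std::filesystem::exists( paramCard ) )
        paramCard = "../" + paramCard; // i.e. ../../../Cards/param_card.dat
      return paramCard;
    }

    int main()
    {
      std::cout << "Using param card: " << resolveParamCard() << std::endl;
      return 0;
    }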
@@ -278,10 +278,10 @@ main( int argc, char** argv )
const std::string procKey = "0a ProcInit";
timermap.start( procKey );

-// Create a process object
+// Create a process object, read param card and set parameters
+// FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
+// FIXME: the CPPProcess should really be a singleton? (for instance, in bridge mode this will be called twice here?)
CPPProcess process( verbose );
-
-// Read param_card and set parameters
process.initProc( "../../Cards/param_card.dat" );
const fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
//const fptype energy = 91.2; // Ecms = 91.2 GeV (Z peak)
@@ -49,11 +49,7 @@ extern "C"
#ifdef MGONGPUCPP_GPUIMPL
GpuRuntime::setUp();
#endif
-// Create a process object, read parm card and set parameters
-// FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
-// FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads?
-CPPProcess process( /*verbose=*/false );
-process.initProc( "../../Cards/param_card.dat" );
+// (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor)
// FIXME: disable OMP in Bridge when called from Fortran
*ppbridge = new Bridge<FORTRANFPTYPE>( *pnevtF, *pnparF, *pnp4F );
}
@@ -251,12 +251,12 @@ namespace mg5amcCpu
fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation)
-{ /* clang-format on */
-fptype allMEsLast = 0;
+{ /* clang-format on */
const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
-allMEs[ievt] = 0;
for( int ihel = 0; ihel < ncomb; ihel++ )
{
+// NEW IMPLEMENTATION OF GETGOODHEL (#630): RESET THE RUNNING SUM OVER HELICITIES TO 0 BEFORE ADDING A NEW HELICITY
+allMEs[ievt] = 0;
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
@@ -265,12 +265,11 @@
#else
calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv );
#endif
-if( allMEs[ievt] != allMEsLast )
+if( allMEs[ievt] != 0 ) // NEW IMPLEMENTATION OF GETGOODHEL (#630): COMPARE EACH HELICITY CONTRIBUTION TO 0
{
//if ( !isGoodHel[ihel] ) std::cout << "sigmaKin_getGoodHel ihel=" << ihel << " TRUE" << std::endl;
isGoodHel[ihel] = true;
}
-allMEsLast = allMEs[ievt]; // running sum up to helicity ihel for event ievt
}
}
#else
@@ -289,19 +288,11 @@ namespace mg5amcCpu
//assert( (size_t)(allMEs) %% mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
// Allocate arrays at build time to contain at least 16 events (or at least neppV events if neppV>16, e.g. in future VPUs)
constexpr int maxtry0 = std::max( 16, neppV ); // 16, but at least neppV (otherwise the npagV loop does not even start)
-fptype allMEsLast[maxtry0] = { 0 }; // allocated at build time: maxtry0 must be a constexpr
// Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV)
assert( nevt >= neppV );
const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)

-// PART 0 - INITIALISATION (before calculate_wavefunctions)
-// Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event
-for( int ievt = 0; ievt < maxtry; ++ievt )
-{
-allMEs[ievt] = 0; // all zeros
-}
-
-// PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS
+// HELICITY LOOP: CALCULATE WAVEFUNCTIONS
const int npagV = maxtry / neppV;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
// Mixed fptypes #537: float for color algebra and double elsewhere
@@ -320,6 +311,16 @@
#endif
for( int ihel = 0; ihel < ncomb; ihel++ )
{
+// NEW IMPLEMENTATION OF GETGOODHEL (#630): RESET THE RUNNING SUM OVER HELICITIES TO 0 BEFORE ADDING A NEW HELICITY
+for( int ieppV = 0; ieppV < neppV; ++ieppV )
+{
+const int ievt = ievt00 + ieppV;
+allMEs[ievt] = 0;
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+const int ievt2 = ievt00 + ieppV + neppV;
+allMEs[ievt2] = 0;
+#endif
+}
constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
//std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
@@ -331,22 +332,18 @@
for( int ieppV = 0; ieppV < neppV; ++ieppV )
{
const int ievt = ievt00 + ieppV;
-const bool differs = ( allMEs[ievt] != allMEsLast[ievt] );
-if( differs )
+if( allMEs[ievt] != 0 ) // NEW IMPLEMENTATION OF GETGOODHEL (#630): COMPARE EACH HELICITY CONTRIBUTION TO 0
{
//if ( !isGoodHel[ihel] ) std::cout << "sigmaKin_getGoodHel ihel=" << ihel << " TRUE" << std::endl;
isGoodHel[ihel] = true;
}
-allMEsLast[ievt] = allMEs[ievt]; // running sum up to helicity ihel
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
const int ievt2 = ievt00 + ieppV + neppV;
-const bool differs2 = ( allMEs[ievt2] != allMEsLast[ievt2] );
-if( differs2 )
+if( allMEs[ievt2] != 0 ) // NEW IMPLEMENTATION OF GETGOODHEL (#630): COMPARE EACH HELICITY CONTRIBUTION TO 0
{
//if ( !isGoodHel[ihel] ) std::cout << "sigmaKin_getGoodHel ihel=" << ihel << " TRUE" << std::endl;
isGoodHel[ihel] = true;
}
-allMEsLast[ievt2] = allMEs[ievt2]; // running sum up to helicity ihel
#endif
}
}
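Note: in essence, #630 replaces the old "did the running sum change?" test (via the allMEsLast bookkeeping removed above) with "is this helicity's own contribution nonzero?", by zeroing the accumulator before each helicity. A minimal sketch of the idea, where a toy contrib() stands in for calculate_wavefunctions (which ADDS |M|^2 for one helicity to the running sum); ncomb and the contribution values here are illustrative assumptions:

    #include <array>
    #include <cstdio>

    constexpr int ncomb = 4; // toy number of helicity combinations

    // Stand-in for calculate_wavefunctions: ADD the |M|^2 contribution of
    // helicity ihel to the running sum 'me' (zero for odd ihel in this toy).
    void contrib( int ihel, double& me ) { me += ( ihel % 2 == 0 ? 1.5 : 0. ); }

    int main()
    {
      std::array<bool, ncomb> isGoodHel{}; // all false initially
      double me = 0;
      for( int ihel = 0; ihel < ncomb; ihel++ )
      {
        me = 0; // NEW (#630): reset the running sum before each helicity...
        contrib( ihel, me );
        if( me != 0 ) isGoodHel[ihel] = true; // ...and compare its contribution to 0
      }
      for( int ihel = 0; ihel < ncomb; ihel++ )
        std::printf( "ihel=%d isGoodHel=%d\n", ihel, isGoodHel[ihel] ? 1 : 0 );
      return 0;
    }

This drops the per-event allMEsLast arrays and the PART 0 initialisation loop, since each helicity's contribution is now inspected directly.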
@@ -71,6 +71,8 @@ struct CPUTest : public CUDA_CPU_TestBase
, hstSelCol( nevt )
, hstIsGoodHel( CPPProcess::ncomb )
{
+// FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
+// FIXME: the CPPProcess should really be a singleton?
process.initProc( "../../Cards/param_card.dat" );
}

@@ -183,6 +185,8 @@ struct CUDATest : public CUDA_CPU_TestBase
, devSelCol( nevt )
, devIsGoodHel( CPPProcess::ncomb )
{
+// FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
+// FIXME: the CPPProcess should really be a singleton?
process.initProc( "../../Cards/param_card.dat" );
}

@@ -1040,6 +1040,7 @@ def __init__(self, *args, **kwargs):
for kwarg in kwargs: misc.sprint( 'kwargs[%s] = %s' %( kwarg, kwargs[kwarg] ) )
super().__init__(*args, **kwargs)
self.process_class = 'CPPProcess'
+###if self.in_madevent_mode: proc_id = kwargs['prefix']+1 # madevent+cudacpp (NB: HERE SELF.IN_MADEVENT_MODE DOES NOT WORK!)
if 'prefix' in kwargs: proc_id = kwargs['prefix']+1 # madevent+cudacpp (ime+1 from ProcessExporterFortranMEGroup.generate_subprocess_directory)
else: proc_id = 0 # standalone_cudacpp
misc.sprint(proc_id)
@@ -1145,7 +1146,7 @@ def get_sigmaKin_lines(self, color_amplitudes, write=True):
misc.sprint(self.support_multichannel)
replace_dict = super().get_sigmaKin_lines(color_amplitudes, write=False)
replace_dict['proc_id'] = self.proc_id if self.proc_id>0 else 1
-replace_dict['proc_id_source'] = 'madevent + cudacpp exporter' if self.proc_id>0 else 'standalone_cudacpp'
+replace_dict['proc_id_source'] = 'madevent + cudacpp exporter' if self.proc_id>0 else 'standalone_cudacpp' # FIXME? use self.in_madevent_mode instead?
if write:
file = self.read_template_file(self.process_sigmaKin_function_template) % replace_dict
file = '\n'.join( file.split('\n')[8:] ) # skip first 8 lines in process_sigmaKin_function.inc (copyright)
@@ -1296,7 +1297,7 @@ def generate_process_files(self):
"""Generate mgOnGpuConfig.h, CPPProcess.cc, CPPProcess.h, check_sa.cc, gXXX.cu links"""
misc.sprint('Entering PLUGIN_OneProcessExporter.generate_process_files')
if self.include_multi_channel:
-misc.sprint('self.include_multi_channel is already defined: this is madevent+second_exporter mode')
+misc.sprint('self.include_multi_channel is already defined: this is madevent+second_exporter mode') # FIXME? use self.in_madevent_mode instead?
else:
misc.sprint('self.include_multi_channel is not yet defined: this is standalone_cudacpp mode') # see issue #473
if self.matrix_elements[0].get('has_mirror_process'):
20 changes: 18 additions & 2 deletions epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py
@@ -1,7 +1,7 @@
# Copyright (C) 2020-2023 CERN and UCLouvain.
# Licensed under the GNU Lesser General Public License (version 3 or later).
# Created by: O. Mattelaer (Sep 2021) for the MG5aMC CUDACPP plugin.
-# Further modified by: O. Mattelaer, A. Valassi, J. Teig, Z. Wettersten (2021-2023) for the MG5aMC CUDACPP plugin.
+# Further modified by: O. Mattelaer, S. Roiser, A. Valassi, J. Teig, Z. Wettersten (2021-2023) for the MG5aMC CUDACPP plugin.

import os

@@ -149,6 +149,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU):

# AV (default from OM's tutorial) - add a debug printout
def __init__(self, *args, **kwargs):
+self.in_madevent_mode = False # see MR #747
misc.sprint('Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter)')
return super().__init__(*args, **kwargs)

@@ -200,7 +201,14 @@ def finalize(self, matrix_element, cmdhistory, MG5options, outputflag):
cmdhistory is the list of command used so far.
MG5options are all the options of the main interface
outputflags is a list of options provided when doing the output command"""
-misc.sprint('Entering PLUGIN_ProcessExporter.finalize')
+misc.sprint('Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode)
+if self.in_madevent_mode:
+self.add_input_for_banner()
+if 'CUDACPP_CODEGEN_PATCHLEVEL' in os.environ: patchlevel = os.environ['CUDACPP_CODEGEN_PATCHLEVEL']
+else: patchlevel = ''
+path = os.path.realpath(os.curdir + os.sep + 'PLUGIN' + os.sep + 'CUDACPP_SA_OUTPUT')
+if os.system(path + os.sep + 'patchMad.sh ' + self.dir_path + ' PROD ' + patchlevel) != 0:
+raise Exception('ERROR! the O/S call to patchMad.sh failed')
return super().finalize(matrix_element, cmdhistory, MG5options, outputflag)

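Note: the optional CUDACPP_CODEGEN_PATCHLEVEL environment variable is forwarded verbatim as the last argument of patchMad.sh, so exporting it as --nopatch or --upstream selects the corresponding patchlevel in the script (see the patchMad.sh changes below).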
# AV (default from OM's tutorial) - overload settings and add a debug printout
@@ -213,4 +221,12 @@ def modify_grouping(self, matrix_element):
misc.sprint('Entering PLUGIN_ProcessExporter.modify_grouping')
return False, matrix_element

+# OM additional fixes for madevent+cudacpp mode
+def add_input_for_banner(self):
+# Note: this is only called in madevent mode (self.in_madevent_mode = True)
+new_parameters = ["{'name':'cudacpp_backend', 'value':'CPP', 'include':False, 'hidden':False}"]
+finput = open(pjoin(self.dir_path, 'bin', 'internal', 'plugin_run_card'), 'w')
+for entry in new_parameters:
+finput.write(entry)
+
#------------------------------------------------------------------------------------
@@ -4,13 +4,16 @@
# Created by: A. Valassi (Mar 2022) for the MG5aMC CUDACPP plugin.
# Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

+set -e # immediate exit on error
+
status=0

scrdir=$(cd $(dirname $0); pwd)

function usage()
{
echo "Usage: $0 <process.[madonly|mad]> <vecsize> <patch_dir> [--nopatch|--upstream]"
echo "ERROR! Unknown command '$0 $*'"
echo "Usage: $0 <process_dir> <patch_dir> [--nopatch|--upstream]"
exit 1
}

@@ -19,22 +22,19 @@ function usage()
###patchlevel=1 # [--nopatch] modify upstream MG5AMC but do not apply patch commands (reference to prepare new patches)
patchlevel=2 # [DEFAULT] complete generation of cudacpp .sa/.mad (copy templates and apply patch commands)

if [ "${1%.madonly}" == "$1" ] && [ "${1%.mad}" == "$1" ]; then
usage
elif [ "$3" == "" ]; then
usage
elif [ "$4" == "--nopatch" ]; then
if [ "$5" != "" ]; then usage; fi
if [ "$2" == "" ]; then
usage $*
elif [ "$3" == "--nopatch" ]; then
if [ "$4" != "" ]; then usage; fi
patchlevel=1
elif [ "$4" == "--upstream" ]; then
if [ "$5" != "" ]; then usage; fi
elif [ "$3" == "--upstream" ]; then
if [ "$4" != "" ]; then usage; fi
patchlevel=0
elif [ "$4" != "" ]; then
usage
elif [ "$3" != "" ]; then
usage $*
fi
dir=$1
-vecsize=$2
-dir_patches=$3
+dir_patches=$2
###echo "Current dir: $pwd"
###echo "Input dir to patch: $dir"

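Note: with the vecsize argument dropped, a madevent-format process directory is now patched via, for instance, 'patchMad.sh gg_tt.mad PROD' rather than the previous 'patchMad.sh gg_tt.mad 16384 PROD' (gg_tt.mad and the 16384 vecsize are illustrative values taken from elsewhere in this diff).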
@@ -67,8 +67,9 @@ cat ${dir}/Source/make_opts >> ${dir}/Source/make_opts.new
# Patch the default Fortran code to provide the integration with the cudacpp plugin
# (1) Process-independent patches
touch ${dir}/Events/.keep # this file should already be present (mg5amcnlo copies it from Template/LO/Events/.keep)
+\cp -dpr ${scrdir}/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/.clang-format ${dir} # new file
\cp -dpr ${scrdir}/MG5aMC_patches/${dir_patches}/fbridge_common.inc ${dir}/SubProcesses # new file
-sed -i 's/2 = sde_strategy/1 = sde_strategy/' ${dir}/Cards/run_card.dat # use strategy SDE=1 in multichannel mode (see #419)
+sed -i 's/SDE_STRAT = 2/SDE_STRAT = 1/' ${dir}/Source/run_card.inc # use strategy SDE=1 in multichannel mode (see #419)
if [ "${patchlevel}" == "2" ]; then
cd ${dir}
sed -i 's/DEFAULT_F2PY_COMPILER=f2py3.*/DEFAULT_F2PY_COMPILER=f2py3/' Source/make_opts
@@ -93,9 +94,6 @@ for p1dir in ${dir}/SubProcesses/P*; do
ln -sf ../fbridge_common.inc . # new file
\cp -dpr ${scrdir}/MG5aMC_patches/${dir_patches}/counters.cc . # new file
\cp -dpr ${scrdir}/MG5aMC_patches/${dir_patches}/ompnumthreads.cc . # new file
if [ "${dir%.mad}" == "$1" ]; then
\cp -dpr ${scrdir}/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/timer.h . # new file, already present via cudacpp in *.mad
fi
if [ "${patchlevel}" == "2" ]; then
echo "DEBUG: cd ${PWD}; patch -p6 -i ${scrdir}/MG5aMC_patches/${dir_patches}/patch.P1"
if ! patch -p6 -i ${scrdir}/MG5aMC_patches/${dir_patches}/patch.P1; then status=1; fi
[Diff truncated by the page: the remaining changed files among the 284 are not shown.]