diff --git a/epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/patch.P1 b/epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/patch.P1 index 26924d7fab..78f1a95f23 100644 --- a/epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/patch.P1 +++ b/epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/patch.P1 @@ -1,5 +1,5 @@ diff --git b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f -index a5c686eb1..20d101dcf 100644 +index 27ed1439e..3b24a9924 100644 --- b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -469,23 +469,140 @@ C @@ -157,7 +157,7 @@ index a5c686eb1..20d101dcf 100644 END diff --git b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f -index a04c93011..2825f59d0 100644 +index 71fbf2b25..0f1d199fc 100644 --- b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f +++ a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f @@ -74,13 +74,77 @@ c common/to_colstats/ncols,ncolflow,ncolalt,ic @@ -239,7 +239,7 @@ index a04c93011..2825f59d0 100644 c c Read process number c -@@ -202,8 +266,33 @@ c call sample_result(xsec,xerr) +@@ -207,8 +271,33 @@ c call sample_result(xsec,xerr) c write(*,*) 'Final xsec: ',xsec rewind(lun) @@ -274,7 +274,7 @@ index a04c93011..2825f59d0 100644 end c $B$ get_user_params $B$ ! tag for MadWeight -@@ -381,7 +470,7 @@ c +@@ -386,7 +475,7 @@ c fopened=.false. tempname=filename fine=index(tempname,' ') @@ -284,7 +284,7 @@ index a04c93011..2825f59d0 100644 open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f -index b6b52f2df..582e2b564 100644 +index 3ac962688..ef18aff22 100644 --- b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -72,7 +72,10 @@ C diff --git a/epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/patch.common b/epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/patch.common index f638e0ff17..7e53d38f8e 100644 --- a/epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/patch.common +++ b/epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/patch.common @@ -47,17 +47,20 @@ index 863eebbc7..92254c0f2 100644 + PARAMETER (VECSIZE_MEMMAX=16384) ! NB: 16k events per GPU grid is the minimum required to fill a V100 GPU +c PARAMETER (VECSIZE_MEMMAX=32) ! NB: workaround for out-of-memory on Juwels: 32 is enough for no-CUDA builds (issue #498) diff --git b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile -index 348c283be..6999320d9 100644 +index 348c283be..74db44d84 100644 --- b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile +++ a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile -@@ -1,6 +1,19 @@ +@@ -1,6 +1,22 @@ +SHELL := /bin/bash + include ../../Source/make_opts FFLAGS+= -w +# Enable the C preprocessor https://gcc.gnu.org/onlinedocs/gfortran/Preprocessing-Options.html -+FFLAGS+= -cpp ++FFLAGS+= -cpp ++ ++# Compile counters with -O3 as in the cudacpp makefile (avoid being "unfair" to Fortran #740) ++CXXFLAGS = -O3 -Wall -Wshadow -Wextra + +# Enable ccache if USECCACHE=1 +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -70,7 +73,7 @@ index 348c283be..6999320d9 100644 # Load additional dependencies of the bias module, if present ifeq (,$(wildcard ../bias_dependencies)) BIASDEPENDENCIES = -@@ -24,7 +37,26 @@ else +@@ -24,7 +40,26 @@ else MADLOOP_LIB = endif @@ -98,7 +101,7 @@ index 348c283be..6999320d9 100644 LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUPC.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) -@@ -43,41 +75,112 @@ ifeq ($(strip $(MATRIX_HEL)),) +@@ -43,41 +78,112 @@ ifeq ($(strip $(MATRIX_HEL)),) endif @@ -179,23 +182,23 @@ index 348c283be..6999320d9 100644 +madevent_cuda_link: $(CUDACPP_BUILDDIR)/$(PROG)_cuda + rm -f $(PROG) + ln -s $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROG) -+ + +-$(LIBDIR)libpdf.$(libext): +- cd ../../Source/PDF; make +# Building $(PROG)_cpp also builds $(PROG)_cuda if $(CUDACPP_CULIB) exists (improved patch for cpp-only builds #503) +$(CUDACPP_BUILDDIR)/$(PROG)_cpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs + $(FC) -o $(CUDACPP_BUILDDIR)/$(PROG)_cpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CXXLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + if [ -f $(LIBDIR)/$(CUDACPP_BUILDDIR)/lib$(CUDACPP_CULIB).* ]; then $(FC) -o $(CUDACPP_BUILDDIR)/$(PROG)_cuda $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CULIB) $(LIBFLAGSRPATH) $(LDFLAGS); fi --$(LIBDIR)libpdf.$(libext): -- cd ../../Source/PDF; make -+$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(CUDACPP_BUILDDIR)/$(PROG)_cpp - -$(LIBDIR)libgammaUPC.$(libext): - cd ../../Source/PDF/gammaUPC; make ++$(CUDACPP_BUILDDIR)/$(PROG)_cuda: $(CUDACPP_BUILDDIR)/$(PROG)_cpp ++ +counters.o: counters.cc timer.h -+ $(CXX) -std=c++11 -Wall -Wshadow -Wextra -c $< -o $@ ++ $(CXX) $(CXXFLAGS) -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h -+ $(CXX) -I. -std=c++11 -Wall -Wshadow -Wextra $(OMPFLAGS) -c $< -o $@ ++ $(CXX) -I. $(CXXFLAGS) $(OMPFLAGS) -c $< -o $@ + +$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) @@ -227,7 +230,7 @@ index 348c283be..6999320d9 100644 # Dependencies -@@ -97,5 +200,61 @@ unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ +@@ -97,5 +203,61 @@ unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ run_config.inc initcluster.o: message.inc @@ -316,7 +319,7 @@ index 57f5f7bb9..bd3c24228 100644 BIASLIBDIR=../../../lib/ diff --git b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py -index 90434be1f..5d595b30b 100755 +index 27cd896a7..c1e54d3cb 100755 --- b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py +++ a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py @@ -4164,7 +4164,8 @@ class RunCardLO(RunCard): @@ -330,10 +333,10 @@ index 90434be1f..5d595b30b 100755 def check_validity(self): """ """ diff --git b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py -index 7d91ea6a1..ce7cb5735 100755 +index 4dd71db86..3b8ec3121 100755 --- b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py +++ a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py -@@ -367,8 +367,20 @@ class gensym(object): +@@ -380,8 +380,20 @@ class gensym(object): done = True if not done: raise Exception('Parsing error in gensym: %s' % stdout) @@ -357,7 +360,7 @@ index 7d91ea6a1..ce7cb5735 100755 self.submit_to_cluster(job_list) job_list = {} diff --git b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py -index a0dbe1766..f5e115dc3 100755 +index a056d3861..b70b548e5 100755 --- b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py +++ a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py @@ -3614,8 +3614,20 @@ Beware that this can be dangerous for local multicore runs.""")