From f174f4f854f9344ec96b8146a7a6627aeb155eb4 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 3 Jun 2024 18:48:02 +0200 Subject: [PATCH] [tmad] rerun 102 tput tests on itscrd90 - all ok (after merging #860 and #850 for Ccoeff #825) STARTED AT Mon Jun 3 05:51:20 PM CEST 2024 ./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean ENDED(1) AT Mon Jun 3 06:12:34 PM CEST 2024 [Status=0] ./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean ENDED(2) AT Mon Jun 3 06:20:51 PM CEST 2024 [Status=0] ./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg -ggttggg -flt -bridge -makeclean ENDED(3) AT Mon Jun 3 06:29:05 PM CEST 2024 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst ENDED(4) AT Mon Jun 3 06:31:55 PM CEST 2024 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -curhst ENDED(5) AT Mon Jun 3 06:34:42 PM CEST 2024 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -common ENDED(6) AT Mon Jun 3 06:37:37 PM CEST 2024 [Status=0] ./tput/teeThroughputX.sh -mix -hrd -makej -susyggtt -susyggt1t1 -smeftggtttt -heftggbb -makeclean ENDED(7) AT Mon Jun 3 06:47:12 PM CEST 2024 [Status=0] No errors found in logs --- .../log_eemumu_mad_d_inl0_hrd0.txt | 96 ++++++------ .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 96 ++++++------ .../log_eemumu_mad_d_inl0_hrd0_common.txt | 96 ++++++------ .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 96 ++++++------ .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 96 ++++++------ .../log_eemumu_mad_d_inl0_hrd1.txt | 96 ++++++------ .../log_eemumu_mad_d_inl1_hrd0.txt | 94 +++++------ .../log_eemumu_mad_d_inl1_hrd1.txt | 94 +++++------ .../log_eemumu_mad_f_inl0_hrd0.txt | 96 ++++++------ .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 96 ++++++------ .../log_eemumu_mad_f_inl0_hrd0_common.txt | 96 ++++++------ .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 96 ++++++------ .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 96 ++++++------ .../log_eemumu_mad_f_inl0_hrd1.txt | 94 +++++------ .../log_eemumu_mad_f_inl1_hrd0.txt | 96 ++++++------ .../log_eemumu_mad_f_inl1_hrd1.txt | 94 +++++------ .../log_eemumu_mad_m_inl0_hrd0.txt | 96 ++++++------ .../log_eemumu_mad_m_inl0_hrd1.txt | 94 +++++------ .../log_ggtt_mad_d_inl0_hrd0.txt | 100 ++++++------ .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 100 ++++++------ .../log_ggtt_mad_d_inl0_hrd0_common.txt | 100 ++++++------ .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 100 ++++++------ .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 100 ++++++------ .../log_ggtt_mad_d_inl0_hrd1.txt | 96 ++++++------ .../log_ggtt_mad_d_inl1_hrd0.txt | 104 ++++++------ .../log_ggtt_mad_d_inl1_hrd1.txt | 92 +++++------ .../log_ggtt_mad_f_inl0_hrd0.txt | 124 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 122 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 122 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 124 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 122 +++++++-------- .../log_ggtt_mad_f_inl0_hrd1.txt | 124 +++++++-------- .../log_ggtt_mad_f_inl1_hrd0.txt | 110 ++++++------- .../log_ggtt_mad_f_inl1_hrd1.txt | 108 ++++++------- .../log_ggtt_mad_m_inl0_hrd0.txt | 96 ++++++------ .../log_ggtt_mad_m_inl0_hrd1.txt | 96 ++++++------ .../log_ggttg_mad_d_inl0_hrd0.txt | 118 +++++++------- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 118 +++++++------- .../log_ggttg_mad_d_inl0_hrd1.txt | 118 +++++++------- .../log_ggttg_mad_f_inl0_hrd0.txt | 148 +++++++++--------- .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 148 +++++++++--------- .../log_ggttg_mad_f_inl0_hrd1.txt | 148 +++++++++--------- .../log_ggttg_mad_m_inl0_hrd0.txt | 110 ++++++------- .../log_ggttg_mad_m_inl0_hrd1.txt | 110 ++++++------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 110 ++++++------- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 110 ++++++------- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 110 ++++++------- .../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 110 ++++++------- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 110 ++++++------- .../log_ggttgg_mad_d_inl0_hrd1.txt | 110 ++++++------- .../log_ggttgg_mad_d_inl1_hrd0.txt | 114 +++++++------- .../log_ggttgg_mad_d_inl1_hrd1.txt | 114 +++++++------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 134 ++++++++-------- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 136 ++++++++-------- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 136 ++++++++-------- .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt | 134 ++++++++-------- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 136 ++++++++-------- .../log_ggttgg_mad_f_inl0_hrd1.txt | 134 ++++++++-------- .../log_ggttgg_mad_f_inl1_hrd0.txt | 142 ++++++++--------- .../log_ggttgg_mad_f_inl1_hrd1.txt | 142 ++++++++--------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 110 ++++++------- .../log_ggttgg_mad_m_inl0_hrd1.txt | 110 ++++++------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 110 ++++++------- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 110 ++++++------- .../log_ggttggg_mad_d_inl0_hrd1.txt | 110 ++++++------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 138 ++++++++-------- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 140 ++++++++--------- .../log_ggttggg_mad_f_inl0_hrd1.txt | 138 ++++++++-------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 110 ++++++------- .../log_ggttggg_mad_m_inl0_hrd1.txt | 110 ++++++------- .../log_gqttq_mad_d_inl0_hrd0.txt | 130 +++++++-------- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 130 +++++++-------- .../log_gqttq_mad_d_inl0_hrd1.txt | 130 +++++++-------- .../log_gqttq_mad_f_inl0_hrd0.txt | 142 ++++++++--------- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 144 ++++++++--------- .../log_gqttq_mad_f_inl0_hrd1.txt | 142 ++++++++--------- .../log_gqttq_mad_m_inl0_hrd0.txt | 110 ++++++------- .../log_gqttq_mad_m_inl0_hrd1.txt | 110 ++++++------- .../log_heftggbb_mad_d_inl0_hrd0.txt | 100 ++++++------ .../log_heftggbb_mad_d_inl0_hrd1.txt | 100 ++++++------ .../log_heftggbb_mad_f_inl0_hrd0.txt | 126 ++++++++------- .../log_heftggbb_mad_f_inl0_hrd1.txt | 122 +++++++-------- .../log_heftggbb_mad_m_inl0_hrd0.txt | 96 ++++++------ .../log_heftggbb_mad_m_inl0_hrd1.txt | 96 ++++++------ .../log_smeftggtttt_mad_d_inl0_hrd0.txt | 110 ++++++------- .../log_smeftggtttt_mad_d_inl0_hrd1.txt | 110 ++++++------- .../log_smeftggtttt_mad_f_inl0_hrd0.txt | 122 +++++++-------- .../log_smeftggtttt_mad_f_inl0_hrd1.txt | 122 +++++++-------- .../log_smeftggtttt_mad_m_inl0_hrd0.txt | 110 ++++++------- .../log_smeftggtttt_mad_m_inl0_hrd1.txt | 110 ++++++------- .../log_susyggt1t1_mad_d_inl0_hrd0.txt | 144 ++++++++--------- .../log_susyggt1t1_mad_d_inl0_hrd1.txt | 144 ++++++++--------- .../log_susyggt1t1_mad_f_inl0_hrd0.txt | 146 ++++++++--------- .../log_susyggt1t1_mad_f_inl0_hrd1.txt | 146 ++++++++--------- .../log_susyggt1t1_mad_m_inl0_hrd0.txt | 144 ++++++++--------- .../log_susyggt1t1_mad_m_inl0_hrd1.txt | 144 ++++++++--------- .../log_susyggtt_mad_d_inl0_hrd0.txt | 144 ++++++++--------- .../log_susyggtt_mad_d_inl0_hrd1.txt | 144 ++++++++--------- .../log_susyggtt_mad_f_inl0_hrd0.txt | 146 ++++++++--------- .../log_susyggtt_mad_f_inl0_hrd1.txt | 146 ++++++++--------- .../log_susyggtt_mad_m_inl0_hrd0.txt | 144 ++++++++--------- .../log_susyggtt_mad_m_inl0_hrd1.txt | 144 ++++++++--------- 102 files changed, 5969 insertions(+), 5973 deletions(-) diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index de0caca761..56ff30cafc 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_20:48:44 +DATE: 2024-06-03_17:55:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.447081e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.931434e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.173062e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.381319e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.697367e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.143178e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.676046 sec +TOTAL : 0.737241 sec INFO: No Floating Point Exceptions have been reported - 2,567,759,777 cycles # 2.819 GHz - 3,947,530,526 instructions # 1.54 insn per cycle - 0.969595478 seconds time elapsed + 2,630,475,030 cycles # 2.802 GHz + 4,094,804,374 instructions # 1.56 insn per cycle + 1.031371639 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.052568e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.236916e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.236916e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.012540e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.179773e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.179773e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.401690 sec +TOTAL : 6.634856 sec INFO: No Floating Point Exceptions have been reported - 18,320,184,384 cycles # 2.860 GHz - 43,970,344,438 instructions # 2.40 insn per cycle - 6.407522814 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) + 19,043,816,100 cycles # 2.871 GHz + 46,110,907,096 instructions # 2.42 insn per cycle + 6.640203777 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 476) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.556597e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.031950e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.031950e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.536456e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.001382e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.001382e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.466648 sec +TOTAL : 4.519211 sec INFO: No Floating Point Exceptions have been reported - 12,746,464,526 cycles # 2.851 GHz - 30,998,051,748 instructions # 2.43 insn per cycle - 4.472203598 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) + 12,941,312,867 cycles # 2.861 GHz + 31,615,854,685 instructions # 2.44 insn per cycle + 4.524499594 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1676) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.919243e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.664717e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.664717e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.916088e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.660561e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.660561e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.699016 sec +TOTAL : 3.706215 sec INFO: No Floating Point Exceptions have been reported - 10,057,139,705 cycles # 2.715 GHz - 19,364,699,903 instructions # 1.93 insn per cycle - 3.704443201 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) + 10,084,827,662 cycles # 2.718 GHz + 19,615,618,896 instructions # 1.95 insn per cycle + 3.711412090 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1972) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.993729e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.809982e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.809982e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.942239e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.710200e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.710200e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.577215 sec +TOTAL : 3.659343 sec INFO: No Floating Point Exceptions have been reported - 9,735,076,070 cycles # 2.718 GHz - 18,976,322,211 instructions # 1.95 insn per cycle - 3.583082575 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) + 9,850,079,029 cycles # 2.688 GHz + 19,274,334,982 instructions # 1.96 insn per cycle + 3.664605897 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1713) (512y: 178) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.660221e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.180702e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.180702e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.674322e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.201468e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.201468e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.214341 sec +TOTAL : 4.178974 sec INFO: No Floating Point Exceptions have been reported - 8,602,295,276 cycles # 2.039 GHz - 15,727,245,583 instructions # 1.83 insn per cycle - 4.219911758 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) + 8,571,642,787 cycles # 2.049 GHz + 15,729,577,049 instructions # 1.84 insn per cycle + 4.184518706 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 900) (512y: 156) (512z: 1257) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 7ea10d000a..e4612fa859 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:26:15 +DATE: 2024-06-03_18:23:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.576625e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.738660e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.738660e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.676827e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.079718e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.079718e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.288746 sec +TOTAL : 2.257615 sec INFO: No Floating Point Exceptions have been reported - 7,214,350,790 cycles # 2.846 GHz - 12,908,181,952 instructions # 1.79 insn per cycle - 2.590853100 seconds time elapsed + 7,105,304,866 cycles # 2.844 GHz + 12,695,101,201 instructions # 1.79 insn per cycle + 2.557593286 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -90,16 +90,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.021503e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.191283e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.191283e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.768920e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.132392e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.132392e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.785821 sec +TOTAL : 7.067791 sec INFO: No Floating Point Exceptions have been reported - 19,486,737,909 cycles # 2.870 GHz - 44,194,389,028 instructions # 2.27 insn per cycle - 6.792710044 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) + 20,274,216,454 cycles # 2.867 GHz + 46,340,905,474 instructions # 2.29 insn per cycle + 7.074366711 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 476) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -119,16 +119,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.480951e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.909694e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.909694e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.459618e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.878889e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.878889e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.890119 sec +TOTAL : 4.950844 sec INFO: No Floating Point Exceptions have been reported - 14,036,550,987 cycles # 2.867 GHz - 31,841,545,843 instructions # 2.27 insn per cycle - 4.897057684 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) + 14,186,861,569 cycles # 2.862 GHz + 32,460,104,656 instructions # 2.29 insn per cycle + 4.957781944 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1676) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -148,16 +148,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.794479e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.441373e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.441373e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.784896e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.422990e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.422990e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.161489 sec +TOTAL : 4.177427 sec INFO: No Floating Point Exceptions have been reported - 11,384,673,501 cycles # 2.732 GHz - 20,728,132,603 instructions # 1.82 insn per cycle - 4.168544668 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) + 11,344,391,753 cycles # 2.712 GHz + 20,973,942,956 instructions # 1.85 insn per cycle + 4.184302710 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1972) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -177,16 +177,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.875462e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.582851e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.582851e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.838122e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.522202e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.522202e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.002181 sec +TOTAL : 4.071273 sec INFO: No Floating Point Exceptions have been reported - 10,994,800,793 cycles # 2.743 GHz - 20,338,605,981 instructions # 1.85 insn per cycle - 4.009209282 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) + 11,127,064,908 cycles # 2.729 GHz + 20,623,612,207 instructions # 1.85 insn per cycle + 4.078239476 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1713) (512y: 178) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -206,16 +206,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.543831e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.990093e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.990093e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.580968e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.045754e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.045754e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.737577 sec +TOTAL : 4.634026 sec INFO: No Floating Point Exceptions have been reported - 9,979,903,374 cycles # 2.116 GHz - 16,882,096,595 instructions # 1.69 insn per cycle - 4.744772163 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) + 9,906,245,376 cycles # 2.135 GHz + 16,874,719,580 instructions # 1.70 insn per cycle + 4.640929218 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 900) (512y: 156) (512z: 1257) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index 5164f42c9d..d50297454d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:37:21 +DATE: 2024-06-03_18:34:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.007128e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.848481e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.122082e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.267055e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.830379e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.124220e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.379492 sec +TOTAL : 1.364180 sec INFO: No Floating Point Exceptions have been reported - 4,577,420,690 cycles # 2.842 GHz - 7,053,134,533 instructions # 1.54 insn per cycle - 1.667006809 seconds time elapsed + 4,532,864,593 cycles # 2.840 GHz + 7,022,211,486 instructions # 1.55 insn per cycle + 1.651988516 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.053204e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.237226e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.237226e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.009353e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.177013e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.177013e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.775998 sec +TOTAL : 7.030647 sec INFO: No Floating Point Exceptions have been reported - 19,412,850,180 cycles # 2.863 GHz - 44,070,335,531 instructions # 2.27 insn per cycle - 6.781477582 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) + 20,161,768,446 cycles # 2.866 GHz + 46,214,693,694 instructions # 2.29 insn per cycle + 7.036306592 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 476) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.545449e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.023167e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.023167e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.533517e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.003656e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.003656e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.877650 sec +TOTAL : 4.909085 sec INFO: No Floating Point Exceptions have been reported - 13,891,433,885 cycles # 2.845 GHz - 31,001,668,128 instructions # 2.23 insn per cycle - 4.883211371 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) + 14,057,102,230 cycles # 2.861 GHz + 31,619,083,498 instructions # 2.25 insn per cycle + 4.914930210 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1676) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.893576e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.631253e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.631253e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.914115e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.674802e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.674802e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.124694 sec +TOTAL : 4.087877 sec INFO: No Floating Point Exceptions have been reported - 11,185,517,102 cycles # 2.709 GHz - 19,267,834,957 instructions # 1.72 insn per cycle - 4.130263002 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) + 11,204,101,414 cycles # 2.738 GHz + 19,516,255,163 instructions # 1.74 insn per cycle + 4.093488605 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1972) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.993280e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.807930e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.807930e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.960651e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.761105e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.761105e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.962775 sec +TOTAL : 4.017981 sec INFO: No Floating Point Exceptions have been reported - 10,857,852,651 cycles # 2.737 GHz - 18,688,313,206 instructions # 1.72 insn per cycle - 3.968277953 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) + 11,005,784,213 cycles # 2.736 GHz + 18,974,990,518 instructions # 1.72 insn per cycle + 4.023446242 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1713) (512y: 178) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.669707e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.199425e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.199425e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.671209e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.198815e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.198815e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.577580 sec +TOTAL : 4.571742 sec INFO: No Floating Point Exceptions have been reported - 9,715,691,578 cycles # 2.121 GHz - 15,431,480,999 instructions # 1.59 insn per cycle - 4.583151536 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) + 9,691,143,886 cycles # 2.118 GHz + 15,430,112,477 instructions # 1.59 insn per cycle + 4.577138966 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 900) (512y: 156) (512z: 1257) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index cd3c8cd8c3..5ad03591f2 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:34:38 +DATE: 2024-06-03_18:31:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.026327e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.798452e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.142109e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.316519e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.874601e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.146758e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.998322 sec +TOTAL : 0.987264 sec INFO: No Floating Point Exceptions have been reported - 3,480,084,828 cycles # 2.833 GHz - 7,034,366,070 instructions # 2.02 insn per cycle - 1.285315972 seconds time elapsed + 3,447,960,914 cycles # 2.834 GHz + 6,935,540,646 instructions # 2.01 insn per cycle + 1.273699042 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.051590e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.235793e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.235793e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.009012e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.176925e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.176925e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.406221 sec +TOTAL : 6.658869 sec INFO: No Floating Point Exceptions have been reported - 18,351,317,436 cycles # 2.863 GHz - 43,970,738,136 instructions # 2.40 insn per cycle - 6.411801951 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) + 19,081,258,056 cycles # 2.864 GHz + 46,107,329,358 instructions # 2.42 insn per cycle + 6.664506569 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 476) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.546913e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.022660e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.022660e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.514278e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.976583e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.976583e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.494676 sec +TOTAL : 4.584116 sec INFO: No Floating Point Exceptions have been reported - 12,818,589,929 cycles # 2.849 GHz - 31,001,594,318 instructions # 2.42 insn per cycle - 4.500171814 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) + 13,015,516,432 cycles # 2.837 GHz + 31,615,544,326 instructions # 2.43 insn per cycle + 4.589745993 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1676) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.924966e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.685276e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.685276e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.910581e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.667694e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.667694e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.690851 sec +TOTAL : 3.717580 sec INFO: No Floating Point Exceptions have been reported - 10,087,697,509 cycles # 2.730 GHz - 19,365,345,065 instructions # 1.92 insn per cycle - 3.696395456 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) + 10,129,644,926 cycles # 2.722 GHz + 19,615,817,348 instructions # 1.94 insn per cycle + 3.723306833 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1972) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.949695e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.742629e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.742629e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.914181e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.703083e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.703083e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.652825 sec +TOTAL : 3.709507 sec INFO: No Floating Point Exceptions have been reported - 9,810,916,380 cycles # 2.684 GHz - 18,988,601,654 instructions # 1.94 insn per cycle - 3.658397686 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) + 10,001,772,202 cycles # 2.693 GHz + 19,274,333,460 instructions # 1.93 insn per cycle + 3.715129775 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1713) (512y: 178) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.670977e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.205176e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.205176e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.675494e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.205413e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.205413e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.189326 sec +TOTAL : 4.178251 sec INFO: No Floating Point Exceptions have been reported - 8,618,980,631 cycles # 2.055 GHz - 15,727,806,217 instructions # 1.82 insn per cycle - 4.194947819 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) + 8,596,800,326 cycles # 2.055 GHz + 15,729,129,551 instructions # 1.83 insn per cycle + 4.183796011 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 900) (512y: 156) (512z: 1257) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 3d612f0f8f..9f94eba974 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:31:52 +DATE: 2024-06-03_18:29:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,15 +50,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.966609e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.692288e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.029334e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.059269e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.762127e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.035370e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.920797 sec +TOTAL : 1.898879 sec INFO: No Floating Point Exceptions have been reported - 6,106,858,009 cycles # 2.839 GHz - 11,364,477,574 instructions # 1.86 insn per cycle - 2.208478557 seconds time elapsed + 6,080,782,058 cycles # 2.845 GHz + 11,333,399,425 instructions # 1.86 insn per cycle + 2.193723946 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -83,16 +83,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.046138e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.230727e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.230727e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.011160e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.180008e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.180008e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.439007 sec +TOTAL : 6.644599 sec INFO: No Floating Point Exceptions have been reported - 18,389,458,277 cycles # 2.854 GHz - 43,970,512,226 instructions # 2.39 insn per cycle - 6.444728897 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) + 19,072,626,755 cycles # 2.869 GHz + 46,106,399,416 instructions # 2.42 insn per cycle + 6.650013559 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 476) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -111,16 +111,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.536098e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.017790e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.017790e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.537609e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.007819e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.007819e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.525302 sec +TOTAL : 4.519754 sec INFO: No Floating Point Exceptions have been reported - 12,893,450,701 cycles # 2.847 GHz - 31,000,830,473 instructions # 2.40 insn per cycle - 4.530866155 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) + 12,966,403,150 cycles # 2.866 GHz + 31,615,629,074 instructions # 2.44 insn per cycle + 4.525378951 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1676) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -139,16 +139,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.928619e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.689762e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.689762e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.896438e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.633059e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.633059e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.685051 sec +TOTAL : 3.741423 sec INFO: No Floating Point Exceptions have been reported - 10,071,705,419 cycles # 2.730 GHz - 19,365,099,946 instructions # 1.92 insn per cycle - 3.690595449 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) + 10,139,443,489 cycles # 2.707 GHz + 19,615,728,681 instructions # 1.93 insn per cycle + 3.747032020 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1972) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -167,16 +167,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.001165e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.830714e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.830714e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.989752e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.800797e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.800797e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.567461 sec +TOTAL : 3.582955 sec INFO: No Floating Point Exceptions have been reported - 9,752,859,688 cycles # 2.730 GHz - 18,976,384,316 instructions # 1.95 insn per cycle - 3.573110488 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) + 9,847,950,220 cycles # 2.745 GHz + 19,262,900,417 instructions # 1.96 insn per cycle + 3.588451727 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1713) (512y: 178) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -195,16 +195,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.669903e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.201435e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.201435e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.681253e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.222096e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.222096e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.193613 sec +TOTAL : 4.166168 sec INFO: No Floating Point Exceptions have been reported - 8,620,136,128 cycles # 2.053 GHz - 15,727,513,221 instructions # 1.82 insn per cycle - 4.199142686 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) + 8,643,873,393 cycles # 2.073 GHz + 15,729,348,046 instructions # 1.82 insn per cycle + 4.171863724 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 900) (512y: 156) (512z: 1257) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 3617e224c6..36620ecacd 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_20:49:15 +DATE: 2024-06-03_17:55:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.362484e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.520610e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.206124e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.378306e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.711623e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.204981e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.676929 sec +TOTAL : 0.690663 sec INFO: No Floating Point Exceptions have been reported - 2,569,510,705 cycles # 2.820 GHz - 4,003,501,539 instructions # 1.56 insn per cycle - 0.973315338 seconds time elapsed + 2,634,366,044 cycles # 2.824 GHz + 4,060,642,595 instructions # 1.54 insn per cycle + 0.990576887 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.103860e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.307114e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.307114e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.004096e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.168038e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.168038e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.122439 sec +TOTAL : 6.686767 sec INFO: No Floating Point Exceptions have been reported - 17,532,692,730 cycles # 2.862 GHz - 41,814,035,675 instructions # 2.38 insn per cycle - 6.128383150 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 392) (avx2: 0) (512y: 0) (512z: 0) + 19,167,690,225 cycles # 2.865 GHz + 46,069,035,215 instructions # 2.40 insn per cycle + 6.692184734 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.581686e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.085119e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.085119e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.537206e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.002103e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.002103e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.404931 sec +TOTAL : 4.518313 sec INFO: No Floating Point Exceptions have been reported - 12,515,101,521 cycles # 2.838 GHz - 30,161,142,578 instructions # 2.41 insn per cycle - 4.410559397 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1612) (avx2: 0) (512y: 0) (512z: 0) + 12,954,375,073 cycles # 2.864 GHz + 31,589,759,365 instructions # 2.44 insn per cycle + 4.523729432 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.946029e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.718574e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.718574e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.927185e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.677512e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.677512e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.653893 sec +TOTAL : 3.686146 sec INFO: No Floating Point Exceptions have been reported - 9,961,431,996 cycles # 2.723 GHz - 19,096,639,277 instructions # 1.92 insn per cycle - 3.659578231 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1931) (512y: 0) (512z: 0) + 10,059,405,191 cycles # 2.726 GHz + 19,593,879,777 instructions # 1.95 insn per cycle + 3.691447203 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1955) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.016173e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.848698e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.848698e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.952035e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.725882e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.725882e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.542404 sec +TOTAL : 3.645301 sec INFO: No Floating Point Exceptions have been reported - 9,660,599,362 cycles # 2.725 GHz - 18,744,004,297 instructions # 1.94 insn per cycle - 3.547571851 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1662) (512y: 178) (512z: 0) + 9,902,836,539 cycles # 2.714 GHz + 19,290,819,520 instructions # 1.95 insn per cycle + 3.650635929 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 178) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.727713e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.296045e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.296045e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.701058e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.251875e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.251875e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.062176 sec +TOTAL : 4.122795 sec INFO: No Floating Point Exceptions have been reported - 8,450,337,585 cycles # 2.078 GHz - 15,603,422,783 instructions # 1.85 insn per cycle - 4.067782201 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 887) (512y: 156) (512z: 1239) + 8,456,560,073 cycles # 2.049 GHz + 15,601,817,159 instructions # 1.84 insn per cycle + 4.128003753 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 886) (512y: 156) (512z: 1237) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 501b51f71f..451f1ae6b2 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:16:43 +DATE: 2024-06-03_18:13:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.738504e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.732597e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.167044e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.826886e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.944644e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.167643e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.684793 sec +TOTAL : 0.667400 sec INFO: No Floating Point Exceptions have been reported - 2,601,107,988 cycles # 2.822 GHz - 4,061,282,635 instructions # 1.56 insn per cycle - 0.978559950 seconds time elapsed + 2,582,478,119 cycles # 2.831 GHz + 3,921,579,026 instructions # 1.52 insn per cycle + 0.972378951 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.575213e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.020448e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.020448e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.572927e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.021325e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.021325e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.416526 sec +TOTAL : 4.421917 sec INFO: No Floating Point Exceptions have been reported - 12,654,142,005 cycles # 2.862 GHz - 32,510,363,434 instructions # 2.57 insn per cycle - 4.422251656 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 296) (avx2: 0) (512y: 0) (512z: 0) + 12,670,048,279 cycles # 2.863 GHz + 32,460,799,656 instructions # 2.56 insn per cycle + 4.427297536 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 294) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.002255e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.874219e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.874219e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.982168e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.835238e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.835238e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.568265 sec +TOTAL : 3.599430 sec INFO: No Floating Point Exceptions have been reported - 10,224,593,553 cycles # 2.863 GHz - 24,472,095,992 instructions # 2.39 insn per cycle - 3.573538181 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1251) (avx2: 0) (512y: 0) (512z: 0) + 10,303,629,232 cycles # 2.859 GHz + 24,602,320,321 instructions # 2.39 insn per cycle + 3.604552236 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1263) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.169162e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.169863e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.169863e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.163312e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.175141e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.175141e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.324119 sec +TOTAL : 3.330877 sec INFO: No Floating Point Exceptions have been reported - 9,111,176,688 cycles # 2.737 GHz - 16,922,082,397 instructions # 1.86 insn per cycle - 3.329746327 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1631) (512y: 0) (512z: 0) + 9,113,641,103 cycles # 2.732 GHz + 16,920,127,372 instructions # 1.86 insn per cycle + 3.336119061 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1630) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.220354e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.270284e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.270284e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.226331e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.302500e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.302500e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.254253 sec +TOTAL : 3.247561 sec INFO: No Floating Point Exceptions have been reported - 8,910,060,786 cycles # 2.734 GHz - 16,345,046,075 instructions # 1.83 insn per cycle - 3.260025356 seconds time elapsed + 8,894,526,436 cycles # 2.735 GHz + 16,333,311,875 instructions # 1.84 insn per cycle + 3.252816409 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1370) (512y: 139) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest_cpp.exe @@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.878740e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.571241e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.571241e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.866287e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.571770e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.571770e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.773398 sec +TOTAL : 3.797015 sec INFO: No Floating Point Exceptions have been reported - 7,901,326,876 cycles # 2.092 GHz - 14,582,511,484 instructions # 1.85 insn per cycle - 3.778605571 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1015) (512y: 158) (512z: 955) + 7,945,914,620 cycles # 2.090 GHz + 14,570,610,289 instructions # 1.83 insn per cycle + 3.802243147 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1012) (512y: 158) (512z: 954) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index fa73177cd7..3a280ed2df 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:17:10 +DATE: 2024-06-03_18:14:19 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.732503e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.754634e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.215471e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.829864e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.970963e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.217527e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.688215 sec +TOTAL : 0.675302 sec INFO: No Floating Point Exceptions have been reported - 2,639,635,060 cycles # 2.820 GHz - 4,015,174,984 instructions # 1.52 insn per cycle - 0.994462896 seconds time elapsed + 2,543,380,020 cycles # 2.802 GHz + 3,941,057,239 instructions # 1.55 insn per cycle + 0.974156954 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.083631e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.949296e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.949296e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.036422e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.861638e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.861638e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.441397 sec +TOTAL : 3.511896 sec INFO: No Floating Point Exceptions have been reported - 9,846,812,518 cycles # 2.858 GHz - 25,386,191,431 instructions # 2.58 insn per cycle - 3.446476615 seconds time elapsed + 9,977,246,915 cycles # 2.837 GHz + 25,414,242,774 instructions # 2.55 insn per cycle + 3.517152120 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 249) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest_cpp.exe @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.316581e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.557276e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.557276e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.321836e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.587383e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.587383e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.142273 sec +TOTAL : 3.134839 sec INFO: No Floating Point Exceptions have been reported - 8,991,441,142 cycles # 2.857 GHz - 21,484,440,131 instructions # 2.39 insn per cycle - 3.147929478 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1119) (avx2: 0) (512y: 0) (512z: 0) + 8,976,689,142 cycles # 2.860 GHz + 21,408,195,057 instructions # 2.38 insn per cycle + 3.139945644 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1112) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.320671e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.503530e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.503530e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.287559e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.440355e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.440355e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.133297 sec +TOTAL : 3.172922 sec INFO: No Floating Point Exceptions have been reported - 8,580,721,113 cycles # 2.735 GHz - 15,811,719,082 instructions # 1.84 insn per cycle - 3.138961399 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1494) (512y: 0) (512z: 0) + 8,695,276,950 cycles # 2.737 GHz + 15,871,278,326 instructions # 1.83 insn per cycle + 3.178201663 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1503) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.375766e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.617874e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.617874e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.355752e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.594201e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.594201e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.069315 sec +TOTAL : 3.089945 sec INFO: No Floating Point Exceptions have been reported - 8,463,481,626 cycles # 2.754 GHz - 15,513,175,556 instructions # 1.83 insn per cycle - 3.074609069 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1268) (512y: 139) (512z: 0) + 8,475,803,379 cycles # 2.739 GHz + 15,579,989,322 instructions # 1.84 insn per cycle + 3.095117552 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1282) (512y: 141) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.011258e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.824275e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.824275e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.995463e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.811156e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.811156e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.549017 sec +TOTAL : 3.579131 sec INFO: No Floating Point Exceptions have been reported - 7,565,498,334 cycles # 2.129 GHz - 14,283,366,137 instructions # 1.89 insn per cycle - 3.554588261 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1041) (512y: 164) (512z: 874) + 7,587,197,159 cycles # 2.118 GHz + 14,284,005,803 instructions # 1.88 insn per cycle + 3.584393140 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1041) (512y: 164) (512z: 876) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 78b8b832b6..bcbeba2deb 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_20:49:45 +DATE: 2024-06-03_17:56:29 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.453404e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.301856e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.286518e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.164643e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.140650e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.147287e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.580838 sec +TOTAL : 0.590670 sec INFO: No Floating Point Exceptions have been reported - 2,286,209,161 cycles # 2.821 GHz - 3,532,764,689 instructions # 1.55 insn per cycle - 0.869255178 seconds time elapsed + 2,301,464,636 cycles # 2.819 GHz + 3,602,954,463 instructions # 1.57 insn per cycle + 0.873444960 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 121 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.078555e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.280307e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.280307e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.051421e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.239912e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.239912e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.210359 sec +TOTAL : 6.360765 sec INFO: No Floating Point Exceptions have been reported - 17,783,383,753 cycles # 2.862 GHz - 43,511,171,857 instructions # 2.45 insn per cycle - 6.215902116 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) + 18,243,657,346 cycles # 2.866 GHz + 45,005,960,239 instructions # 2.47 insn per cycle + 6.365748081 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 424) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.210434e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.377494e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.377494e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.202499e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.355957e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.355957e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.231501 sec +TOTAL : 3.238806 sec INFO: No Floating Point Exceptions have been reported - 9,255,830,965 cycles # 2.863 GHz - 21,906,871,719 instructions # 2.37 insn per cycle - 3.236386372 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) + 9,281,505,690 cycles # 2.862 GHz + 22,294,520,661 instructions # 2.40 insn per cycle + 3.243835029 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1969) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.371336e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.610412e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.610412e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.353660e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.572726e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.572726e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.030129 sec +TOTAL : 3.047299 sec INFO: No Floating Point Exceptions have been reported - 8,294,048,623 cycles # 2.733 GHz - 15,590,527,403 instructions # 1.88 insn per cycle - 3.035436377 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) + 8,361,633,099 cycles # 2.740 GHz + 15,758,092,056 instructions # 1.88 insn per cycle + 3.052295265 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2608) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.397036e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.670362e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.670362e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.399950e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.678079e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.678079e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.001269 sec +TOTAL : 2.995029 sec INFO: No Floating Point Exceptions have been reported - 8,226,933,402 cycles # 2.737 GHz - 15,430,117,600 instructions # 1.88 insn per cycle - 3.006462525 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) + 8,228,763,442 cycles # 2.744 GHz + 15,611,452,650 instructions # 1.90 insn per cycle + 3.000008543 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2516) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.353627e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.576193e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.576193e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.369890e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.596618e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.596618e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.054034 sec +TOTAL : 3.032697 sec INFO: No Floating Point Exceptions have been reported - 6,654,880,184 cycles # 2.176 GHz - 12,863,187,093 instructions # 1.93 insn per cycle - 3.059348788 seconds time elapsed + 6,617,314,348 cycles # 2.179 GHz + 12,864,001,473 instructions # 1.94 insn per cycle + 3.037918435 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index 65b53d740f..fd083bf0e0 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:26:49 +DATE: 2024-06-03_18:24:01 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,21 +53,21 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.060712e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.025766e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.025766e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.121418e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.423124e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.423124e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.728605 sec +TOTAL : 1.718861 sec INFO: No Floating Point Exceptions have been reported - 5,559,492,246 cycles # 2.840 GHz - 10,134,973,470 instructions # 1.82 insn per cycle - 2.015457056 seconds time elapsed + 5,539,636,530 cycles # 2.844 GHz + 10,034,793,253 instructions # 1.81 insn per cycle + 2.005948260 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 121 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -90,16 +90,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.061003e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.254218e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.254218e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.027970e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.209007e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.209007e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.420879 sec +TOTAL : 6.611368 sec INFO: No Floating Point Exceptions have been reported - 18,399,874,886 cycles # 2.863 GHz - 43,656,453,581 instructions # 2.37 insn per cycle - 6.427487997 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) + 18,923,826,319 cycles # 2.860 GHz + 45,154,643,508 instructions # 2.39 insn per cycle + 6.617847071 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 424) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -119,16 +119,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.114817e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.166060e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.166060e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.092220e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.146266e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.146266e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.495712 sec +TOTAL : 3.523089 sec INFO: No Floating Point Exceptions have been reported - 10,015,137,457 cycles # 2.860 GHz - 23,241,753,742 instructions # 2.32 insn per cycle - 3.502264923 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) + 10,101,089,413 cycles # 2.863 GHz + 23,628,753,394 instructions # 2.34 insn per cycle + 3.529482790 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1969) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -148,16 +148,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.231996e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.358012e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.358012e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.238833e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.347968e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.347968e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.334222 sec +TOTAL : 3.323626 sec INFO: No Floating Point Exceptions have been reported - 9,138,089,758 cycles # 2.736 GHz - 16,713,258,734 instructions # 1.83 insn per cycle - 3.340848954 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) + 9,165,072,954 cycles # 2.753 GHz + 16,876,243,244 instructions # 1.84 insn per cycle + 3.330172075 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2608) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -177,16 +177,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.270258e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.427950e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.427950e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.271402e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.437298e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.437298e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.281372 sec +TOTAL : 3.278898 sec INFO: No Floating Point Exceptions have been reported - 9,005,942,306 cycles # 2.740 GHz - 16,548,921,552 instructions # 1.84 insn per cycle - 3.287811990 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) + 9,037,907,486 cycles # 2.752 GHz + 16,730,036,801 instructions # 1.85 insn per cycle + 3.285218692 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2516) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -206,15 +206,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.240131e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.309974e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.309974e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.251664e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.326129e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.326129e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.325465 sec +TOTAL : 3.310369 sec INFO: No Floating Point Exceptions have been reported - 7,397,809,520 cycles # 2.221 GHz - 14,072,596,030 instructions # 1.90 insn per cycle - 3.332157901 seconds time elapsed + 7,379,156,811 cycles # 2.226 GHz + 14,070,522,690 instructions # 1.91 insn per cycle + 3.316794525 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index 41fcdf2cfe..352483b9b2 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:37:54 +DATE: 2024-06-03_18:35:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.389352e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.220547e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.248789e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.414064e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.187425e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.123209e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.212858 sec +TOTAL : 1.215276 sec INFO: No Floating Point Exceptions have been reported - 4,093,911,360 cycles # 2.853 GHz - 6,566,528,457 instructions # 1.60 insn per cycle - 1.491492030 seconds time elapsed + 4,088,679,623 cycles # 2.843 GHz + 6,534,825,098 instructions # 1.60 insn per cycle + 1.494464073 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 121 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.083434e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.285638e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.285638e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.048831e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.237717e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.237717e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 6.532719 sec +TOTAL : 6.724917 sec INFO: No Floating Point Exceptions have been reported - 18,783,324,742 cycles # 2.874 GHz - 43,693,376,231 instructions # 2.33 insn per cycle - 6.537953932 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) + 19,274,608,418 cycles # 2.865 GHz + 45,191,452,352 instructions # 2.34 insn per cycle + 6.730164945 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 424) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.210512e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.382085e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.382085e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.187170e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.339667e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.339667e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.582695 sec +TOTAL : 3.608148 sec INFO: No Floating Point Exceptions have been reported - 10,261,732,783 cycles # 2.861 GHz - 21,990,872,924 instructions # 2.14 insn per cycle - 3.587933515 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) + 10,343,597,433 cycles # 2.863 GHz + 22,378,027,937 instructions # 2.16 insn per cycle + 3.613416876 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1969) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.363828e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.635295e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.635295e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.332970e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.565979e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.565979e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.389987 sec +TOTAL : 3.421134 sec INFO: No Floating Point Exceptions have been reported - 9,345,873,446 cycles # 2.754 GHz - 15,502,334,673 instructions # 1.66 insn per cycle - 3.395365367 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) + 9,428,567,434 cycles # 2.753 GHz + 15,670,954,000 instructions # 1.66 insn per cycle + 3.426333943 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2608) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.383815e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.695694e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.695694e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.368735e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.670315e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.670315e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.376377 sec +TOTAL : 3.387000 sec INFO: No Floating Point Exceptions have been reported - 9,309,965,265 cycles # 2.754 GHz - 15,139,174,417 instructions # 1.63 insn per cycle - 3.381718320 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) + 9,333,760,322 cycles # 2.752 GHz + 15,321,820,627 instructions # 1.64 insn per cycle + 3.392134873 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2516) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.370232e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.594168e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.594168e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.361931e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.581750e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.581750e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.394429 sec +TOTAL : 3.402251 sec INFO: No Floating Point Exceptions have been reported - 7,646,568,006 cycles # 2.250 GHz - 12,573,843,987 instructions # 1.64 insn per cycle - 3.399861496 seconds time elapsed + 7,642,447,849 cycles # 2.243 GHz + 12,573,604,096 instructions # 1.65 insn per cycle + 3.407581538 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index cbd445fde8..b6efbb5a3c 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:35:09 +DATE: 2024-06-03_18:32:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.400821e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.237112e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.282800e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.442100e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.215180e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.122824e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.873575 sec +TOTAL : 0.869353 sec INFO: No Floating Point Exceptions have been reported - 3,103,401,533 cycles # 2.833 GHz - 6,356,049,584 instructions # 2.05 insn per cycle - 1.152258826 seconds time elapsed + 3,095,265,258 cycles # 2.834 GHz + 6,379,863,801 instructions # 2.06 insn per cycle + 1.148344255 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 121 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.080565e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.282306e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.282306e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.049503e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.238510e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.238510e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.200319 sec +TOTAL : 6.372456 sec INFO: No Floating Point Exceptions have been reported - 17,766,785,300 cycles # 2.864 GHz - 43,507,689,985 instructions # 2.45 insn per cycle - 6.205548449 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) + 18,270,854,403 cycles # 2.866 GHz + 45,003,372,558 instructions # 2.46 insn per cycle + 6.377274532 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 424) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.219205e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.391133e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.391133e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.202091e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.354764e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.354764e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.222734 sec +TOTAL : 3.241559 sec INFO: No Floating Point Exceptions have been reported - 9,245,511,720 cycles # 2.865 GHz - 21,907,133,008 instructions # 2.37 insn per cycle - 3.228067776 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) + 9,293,257,475 cycles # 2.863 GHz + 22,294,148,302 instructions # 2.40 insn per cycle + 3.246883723 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1969) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.362472e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.627969e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.627969e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.338458e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.580425e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.580425e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.040708 sec +TOTAL : 3.067414 sec INFO: No Floating Point Exceptions have been reported - 8,344,967,895 cycles # 2.740 GHz - 15,591,025,923 instructions # 1.87 insn per cycle - 3.046295562 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) + 8,429,082,509 cycles # 2.744 GHz + 15,756,097,527 instructions # 1.87 insn per cycle + 3.072640159 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2608) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.379116e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.690003e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.690003e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.368856e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.665021e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.665021e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.024127 sec +TOTAL : 3.033024 sec INFO: No Floating Point Exceptions have been reported - 8,310,000,031 cycles # 2.744 GHz - 15,436,072,136 instructions # 1.86 insn per cycle - 3.029766146 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) + 8,320,312,555 cycles # 2.740 GHz + 15,609,926,581 instructions # 1.88 insn per cycle + 3.037988723 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2516) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.361982e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.586100e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.586100e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.372805e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.593211e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.593211e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.044604 sec +TOTAL : 3.032379 sec INFO: No Floating Point Exceptions have been reported - 6,641,930,192 cycles # 2.179 GHz - 12,864,124,768 instructions # 1.94 insn per cycle - 3.049948851 seconds time elapsed + 6,617,917,246 cycles # 2.179 GHz + 12,863,669,922 instructions # 1.94 insn per cycle + 3.037700714 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index 275da8993d..07585ab3a9 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:32:24 +DATE: 2024-06-03_18:29:41 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,18 +50,18 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.808283e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.206237e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.142753e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.911492e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.182207e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.013995e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.519463 sec +TOTAL : 1.510644 sec INFO: No Floating Point Exceptions have been reported - 4,951,290,576 cycles # 2.843 GHz - 9,146,689,840 instructions # 1.85 insn per cycle - 1.798020957 seconds time elapsed + 4,929,344,895 cycles # 2.845 GHz + 9,108,041,198 instructions # 1.85 insn per cycle + 1.788993156 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 121 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -83,16 +83,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.076317e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.281294e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.281294e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.049889e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.239017e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.239017e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.297116 sec +TOTAL : 6.369929 sec INFO: No Floating Point Exceptions have been reported - 18,034,882,237 cycles # 2.862 GHz - 43,508,302,495 instructions # 2.41 insn per cycle - 6.302484278 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) + 18,276,108,910 cycles # 2.867 GHz + 45,009,820,973 instructions # 2.46 insn per cycle + 6.375254703 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 424) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -111,16 +111,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.210720e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.378151e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.378151e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.203215e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.358762e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.358762e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.233511 sec +TOTAL : 3.240583 sec INFO: No Floating Point Exceptions have been reported - 9,271,850,153 cycles # 2.864 GHz - 21,907,043,465 instructions # 2.36 insn per cycle - 3.238909660 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) + 9,298,780,281 cycles # 2.866 GHz + 22,295,651,617 instructions # 2.40 insn per cycle + 3.245919845 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1969) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -139,16 +139,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.352851e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.608306e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.608306e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.327930e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.549134e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.549134e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.052167 sec +TOTAL : 3.079866 sec INFO: No Floating Point Exceptions have been reported - 8,355,651,202 cycles # 2.734 GHz - 15,591,192,622 instructions # 1.87 insn per cycle - 3.057529422 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) + 8,420,044,894 cycles # 2.731 GHz + 15,757,149,809 instructions # 1.87 insn per cycle + 3.085217641 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2608) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -167,16 +167,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.379040e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.675709e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.675709e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.356494e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.624833e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.624833e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.022810 sec +TOTAL : 3.047415 sec INFO: No Floating Point Exceptions have been reported - 8,287,014,121 cycles # 2.737 GHz - 15,428,840,508 instructions # 1.86 insn per cycle - 3.028205243 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) + 8,310,845,470 cycles # 2.723 GHz + 15,615,712,930 instructions # 1.88 insn per cycle + 3.052828049 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2516) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -195,15 +195,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.364932e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.586134e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.586134e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.366193e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.582701e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.582701e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.042115 sec +TOTAL : 3.040896 sec INFO: No Floating Point Exceptions have been reported - 6,626,503,704 cycles # 2.175 GHz - 12,863,711,552 instructions # 1.94 insn per cycle - 3.047484573 seconds time elapsed + 6,624,272,469 cycles # 2.176 GHz + 12,864,098,950 instructions # 1.94 insn per cycle + 3.046289418 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index 3a0fd0a90a..f06578ead5 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_20:50:12 +DATE: 2024-06-03_17:56:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.463816e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.296908e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.328725e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.179867e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.152324e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.182937e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.577578 sec +TOTAL : 0.590804 sec INFO: No Floating Point Exceptions have been reported - 2,276,498,493 cycles # 2.822 GHz - 3,528,632,890 instructions # 1.55 insn per cycle - 0.863625263 seconds time elapsed + 2,308,504,890 cycles # 2.821 GHz + 3,608,810,202 instructions # 1.56 insn per cycle + 0.874918353 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.154961e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.387398e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.387398e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.051623e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.240991e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.240991e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.823373 sec +TOTAL : 6.357908 sec INFO: No Floating Point Exceptions have been reported - 16,689,004,614 cycles # 2.865 GHz - 41,263,252,653 instructions # 2.47 insn per cycle - 5.828275571 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) + 18,230,546,868 cycles # 2.866 GHz + 44,980,120,800 instructions # 2.47 insn per cycle + 6.363052088 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 410) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.271492e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.527103e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.527103e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.198260e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.343617e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.343617e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.154514 sec +TOTAL : 3.247815 sec INFO: No Floating Point Exceptions have been reported - 9,027,063,562 cycles # 2.858 GHz - 21,210,233,128 instructions # 2.35 insn per cycle - 3.159850522 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1843) (avx2: 0) (512y: 0) (512z: 0) + 9,312,738,759 cycles # 2.864 GHz + 22,262,519,463 instructions # 2.39 insn per cycle + 3.252975989 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1952) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.387654e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.648132e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.648132e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.359228e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.576517e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.576517e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.011648 sec +TOTAL : 3.047650 sec INFO: No Floating Point Exceptions have been reported - 8,243,811,729 cycles # 2.736 GHz - 15,422,236,523 instructions # 1.87 insn per cycle - 3.016578474 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2537) (512y: 0) (512z: 0) + 8,350,755,476 cycles # 2.736 GHz + 15,749,116,716 instructions # 1.89 insn per cycle + 3.052777331 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2583) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.441553e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.770792e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.770792e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.400758e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.672596e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.672596e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.953944 sec +TOTAL : 2.993253 sec INFO: No Floating Point Exceptions have been reported - 8,107,162,693 cycles # 2.740 GHz - 15,232,791,801 instructions # 1.88 insn per cycle - 2.959262610 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2423) (512y: 8) (512z: 0) + 8,214,715,534 cycles # 2.741 GHz + 15,591,753,579 instructions # 1.90 insn per cycle + 2.998240487 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2485) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.373912e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.600116e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.600116e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.375214e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.601309e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.601309e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.031130 sec +TOTAL : 3.028196 sec INFO: No Floating Point Exceptions have been reported - 6,600,460,683 cycles # 2.175 GHz - 12,841,921,234 instructions # 1.95 insn per cycle - 3.036614829 seconds time elapsed + 6,596,197,870 cycles # 2.176 GHz + 12,844,320,466 instructions # 1.95 insn per cycle + 3.033260897 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1706) (512y: 18) (512z: 1427) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index 9c3ce37c8b..53015944b4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:17:34 +DATE: 2024-06-03_18:14:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.329688e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.197353e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.294513e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.594191e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.255518e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.111244e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.584260 sec +TOTAL : 0.578532 sec INFO: No Floating Point Exceptions have been reported - 2,290,837,964 cycles # 2.826 GHz - 3,567,143,300 instructions # 1.56 insn per cycle - 0.867513946 seconds time elapsed + 2,276,560,334 cycles # 2.810 GHz + 3,509,958,267 instructions # 1.54 insn per cycle + 0.871311573 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 121 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/runTest_cuda.exe @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.595326e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.079631e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.079631e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.597537e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.078888e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.078888e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.323662 sec +TOTAL : 4.314114 sec INFO: No Floating Point Exceptions have been reported - 12,200,237,668 cycles # 2.819 GHz - 32,427,514,864 instructions # 2.66 insn per cycle - 4.329187293 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 312) (avx2: 0) (512y: 0) (512z: 0) + 12,143,989,139 cycles # 2.813 GHz + 32,189,649,214 instructions # 2.65 insn per cycle + 4.319203265 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 303) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.615965e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.427412e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.427412e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.630595e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.454503e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.454503e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.793932 sec +TOTAL : 2.775394 sec INFO: No Floating Point Exceptions have been reported - 7,996,574,580 cycles # 2.860 GHz - 18,655,165,559 instructions # 2.33 insn per cycle - 2.799028174 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1555) (avx2: 0) (512y: 0) (512z: 0) + 7,955,697,825 cycles # 2.862 GHz + 18,698,571,181 instructions # 2.35 insn per cycle + 2.780527030 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1560) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,15 +137,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.687284e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.399964e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.399964e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.690652e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.468642e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.468642e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.721615 sec +TOTAL : 2.714160 sec INFO: No Floating Point Exceptions have been reported - 7,454,408,451 cycles # 2.735 GHz - 14,253,415,404 instructions # 1.91 insn per cycle - 2.727046264 seconds time elapsed + 7,473,599,264 cycles # 2.749 GHz + 14,248,936,269 instructions # 1.91 insn per cycle + 2.719148041 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2237) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest_cpp.exe @@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.770821e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.626363e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.626363e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.730968e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.602800e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.602800e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.650953 sec +TOTAL : 2.680982 sec INFO: No Floating Point Exceptions have been reported - 7,318,335,472 cycles # 2.756 GHz - 13,948,037,827 instructions # 1.91 insn per cycle - 2.656532072 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2096) (512y: 3) (512z: 0) + 7,364,716,439 cycles # 2.743 GHz + 13,944,217,782 instructions # 1.89 insn per cycle + 2.686027677 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2094) (512y: 3) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,16 +193,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.428325e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.729451e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.729451e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.419153e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.705210e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.705210e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.970494 sec +TOTAL : 2.980985 sec INFO: No Floating Point Exceptions have been reported - 6,503,944,976 cycles # 2.187 GHz - 13,423,073,698 instructions # 2.06 insn per cycle - 2.975897923 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2071) (512y: 1) (512z: 1198) + 6,519,305,235 cycles # 2.184 GHz + 13,428,800,724 instructions # 2.06 insn per cycle + 2.985939217 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2073) (512y: 1) (512z: 1197) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index 76b55ad2e4..78aa8adf25 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_21:17:58 +DATE: 2024-06-03_18:15:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.324695e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.210479e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.333686e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.596788e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.293054e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.216099e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.587402 sec +TOTAL : 0.570515 sec INFO: No Floating Point Exceptions have been reported - 2,292,933,061 cycles # 2.822 GHz - 3,530,135,889 instructions # 1.54 insn per cycle - 0.871362762 seconds time elapsed + 2,247,271,854 cycles # 2.826 GHz + 3,528,743,047 instructions # 1.57 insn per cycle + 0.851595191 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.106790e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.056635e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.056635e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.134822e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.092898e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.092898e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.364784 sec +TOTAL : 3.323704 sec INFO: No Floating Point Exceptions have been reported - 9,485,686,184 cycles # 2.815 GHz - 25,263,356,042 instructions # 2.66 insn per cycle - 3.370276038 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 263) (avx2: 0) (512y: 0) (512z: 0) + 9,341,713,175 cycles # 2.807 GHz + 25,628,381,739 instructions # 2.74 insn per cycle + 3.328792293 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 256) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.961447e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.505102e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.505102e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.935423e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.423634e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.423634e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.509947 sec +TOTAL : 2.529370 sec INFO: No Floating Point Exceptions have been reported - 7,195,839,812 cycles # 2.862 GHz - 16,868,387,762 instructions # 2.34 insn per cycle - 2.515387369 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1360) (avx2: 0) (512y: 0) (512z: 0) + 7,246,753,871 cycles # 2.860 GHz + 16,870,793,706 instructions # 2.33 insn per cycle + 2.534506834 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1362) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.832943e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.808929e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.808929e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.821799e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.828846e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.828846e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.598583 sec +TOTAL : 2.608573 sec INFO: No Floating Point Exceptions have been reported - 7,142,287,207 cycles # 2.744 GHz - 13,617,950,967 instructions # 1.91 insn per cycle - 2.604113274 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2060) (512y: 0) (512z: 0) + 7,171,582,794 cycles # 2.745 GHz + 13,626,734,166 instructions # 1.90 insn per cycle + 2.613649240 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2061) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.905826e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.012392e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.012392e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.881454e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.039760e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.039760e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.542873 sec +TOTAL : 2.559465 sec INFO: No Floating Point Exceptions have been reported - 7,030,524,595 cycles # 2.760 GHz - 13,426,027,213 instructions # 1.91 insn per cycle - 2.548245628 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1945) (512y: 4) (512z: 0) + 7,076,560,187 cycles # 2.760 GHz + 13,426,992,808 instructions # 1.90 insn per cycle + 2.564578714 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1947) (512y: 4) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.533501e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.984604e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.984604e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.543259e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.003390e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.003390e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.864382 sec +TOTAL : 2.852367 sec INFO: No Floating Point Exceptions have been reported - 6,329,632,912 cycles # 2.206 GHz - 13,154,745,067 instructions # 2.08 insn per cycle - 2.870076647 seconds time elapsed + 6,308,825,768 cycles # 2.209 GHz + 13,154,958,113 instructions # 2.09 insn per cycle + 2.857417301 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2029) (512y: 1) (512z: 1083) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 30bc197182..18a6685695 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_20:50:38 +DATE: 2024-06-03_17:57:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.732879e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.705839e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.157350e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.318290e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.683416e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.140319e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.682384 sec +TOTAL : 0.694822 sec INFO: No Floating Point Exceptions have been reported - 2,597,825,046 cycles # 2.830 GHz - 4,090,205,188 instructions # 1.57 insn per cycle - 0.977759023 seconds time elapsed + 2,632,787,789 cycles # 2.825 GHz + 4,125,301,456 instructions # 1.57 insn per cycle + 0.988279407 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.031288e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.206220e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.206220e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.933181e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.153811e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.153811e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.522879 sec +TOTAL : 6.752917 sec INFO: No Floating Point Exceptions have been reported - 18,693,180,609 cycles # 2.864 GHz - 44,222,141,009 instructions # 2.37 insn per cycle - 6.528296312 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 439) (avx2: 0) (512y: 0) (512z: 0) + 19,322,413,781 cycles # 2.860 GHz + 46,295,992,950 instructions # 2.40 insn per cycle + 6.758378943 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 479) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.613969e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.133791e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.133791e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.594060e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.096178e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.096178e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.323747 sec +TOTAL : 4.370340 sec INFO: No Floating Point Exceptions have been reported - 12,389,439,717 cycles # 2.863 GHz - 30,920,154,197 instructions # 2.50 insn per cycle - 4.329261731 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1685) (avx2: 0) (512y: 0) (512z: 0) + 12,539,997,225 cycles # 2.867 GHz + 31,478,010,346 instructions # 2.51 insn per cycle + 4.375575128 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1732) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.921394e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.668547e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.668547e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.919152e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.671850e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.671850e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.698108 sec +TOTAL : 3.697636 sec INFO: No Floating Point Exceptions have been reported - 10,087,612,198 cycles # 2.725 GHz - 19,373,367,445 instructions # 1.92 insn per cycle - 3.703569148 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2130) (512y: 0) (512z: 0) + 10,080,701,963 cycles # 2.723 GHz + 19,468,552,192 instructions # 1.93 insn per cycle + 3.702944091 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2133) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.984824e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.800128e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.800128e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.949458e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.715469e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.715469e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.590549 sec +TOTAL : 3.644169 sec INFO: No Floating Point Exceptions have been reported - 9,780,533,240 cycles # 2.721 GHz - 18,954,616,108 instructions # 1.94 insn per cycle - 3.596015305 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1860) (512y: 188) (512z: 0) + 9,924,266,504 cycles # 2.720 GHz + 19,219,972,572 instructions # 1.94 insn per cycle + 3.649461576 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1874) (512y: 189) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,16 +193,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.719226e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.286801e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.286801e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.736545e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.311368e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.311368e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.082921 sec +TOTAL : 4.043422 sec INFO: No Floating Point Exceptions have been reported - 8,420,905,067 cycles # 2.060 GHz - 15,057,078,071 instructions # 1.79 insn per cycle - 4.088371462 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1024) (512y: 155) (512z: 1316) + 8,353,044,219 cycles # 2.064 GHz + 15,065,800,381 instructions # 1.80 insn per cycle + 4.048753454 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1028) (512y: 154) (512z: 1321) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 05d5e2d3d7..1519beb165 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-06-02_20:51:09 +DATE: 2024-06-03_17:57:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.579969e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.686020e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.157503e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.355716e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.694815e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.156755e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.687280 sec +TOTAL : 0.692525 sec INFO: No Floating Point Exceptions have been reported - 2,618,766,619 cycles # 2.826 GHz - 4,041,753,204 instructions # 1.54 insn per cycle - 0.983494772 seconds time elapsed + 2,622,244,403 cycles # 2.823 GHz + 4,091,592,081 instructions # 1.56 insn per cycle + 0.985574224 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.075110e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.266502e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.266502e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.849035e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.144096e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.144096e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.273174 sec +TOTAL : 6.811046 sec INFO: No Floating Point Exceptions have been reported - 17,976,200,983 cycles # 2.863 GHz - 42,467,527,484 instructions # 2.36 insn per cycle - 6.278790336 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) + 19,367,363,181 cycles # 2.842 GHz + 46,233,493,704 instructions # 2.39 insn per cycle + 6.816378070 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.651801e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.199325e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.199325e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.582631e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.079535e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.079535e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.233700 sec +TOTAL : 4.399702 sec INFO: No Floating Point Exceptions have been reported - 12,135,466,974 cycles # 2.863 GHz - 30,227,050,455 instructions # 2.49 insn per cycle - 4.239313548 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1692) (avx2: 0) (512y: 0) (512z: 0) + 12,583,865,657 cycles # 2.857 GHz + 31,452,363,261 instructions # 2.50 insn per cycle + 4.405099309 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1724) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.928405e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.679872e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.679872e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.918861e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.663659e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.663659e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.683536 sec +TOTAL : 3.700838 sec INFO: No Floating Point Exceptions have been reported - 10,048,331,425 cycles # 2.724 GHz - 19,255,994,226 instructions # 1.92 insn per cycle - 3.689022984 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2146) (512y: 0) (512z: 0) + 10,078,917,373 cycles # 2.720 GHz + 19,455,390,747 instructions # 1.93 insn per cycle + 3.706301399 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2117) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.015749e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.854641e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.854641e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.951655e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.728573e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.728573e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.541051 sec +TOTAL : 3.642043 sec INFO: No Floating Point Exceptions have been reported - 9,640,245,530 cycles # 2.719 GHz - 18,744,573,817 instructions # 1.94 insn per cycle - 3.546545299 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1834) (512y: 191) (512z: 0) + 9,923,186,488 cycles # 2.721 GHz + 19,285,155,635 instructions # 1.94 insn per cycle + 3.647395181 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1868) (512y: 189) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,15 +193,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.763865e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.360552e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.360552e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.767522e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.368955e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.368955e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.989990 sec +TOTAL : 3.981046 sec INFO: No Floating Point Exceptions have been reported - 8,258,527,369 cycles # 2.068 GHz - 14,978,587,265 instructions # 1.81 insn per cycle - 3.995476429 seconds time elapsed + 8,244,325,807 cycles # 2.069 GHz + 14,979,516,169 instructions # 1.82 insn per cycle + 3.986442883 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1021) (512y: 156) (512z: 1305) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index e6ca4b3727..330c65bf94 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_20:51:39 +DATE: 2024-06-03_17:58:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.571906e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.165675e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.278273e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.507794e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.163633e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275901e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.533824 sec +TOTAL : 0.531152 sec INFO: No Floating Point Exceptions have been reported - 2,160,338,708 cycles # 2.816 GHz - 3,108,947,549 instructions # 1.44 insn per cycle - 0.826788991 seconds time elapsed + 2,172,514,009 cycles # 2.825 GHz + 3,097,319,785 instructions # 1.43 insn per cycle + 0.826126707 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.052422e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.113200e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.113200e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.780038e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.825479e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.825479e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.219551 sec +TOTAL : 6.000160 sec INFO: No Floating Point Exceptions have been reported - 14,990,786,657 cycles # 2.870 GHz - 38,373,509,892 instructions # 2.56 insn per cycle - 5.225136790 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) + 17,224,093,003 cycles # 2.869 GHz + 45,944,443,660 instructions # 2.67 insn per cycle + 6.005443800 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 631) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -98,8 +98,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515645 -Relative difference = 3.258803994438787e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.420355e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.607688e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.607688e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.108544e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.261195e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.261195e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.176496 sec +TOTAL : 3.482499 sec INFO: No Floating Point Exceptions have been reported - 9,107,869,375 cycles # 2.863 GHz - 24,577,368,445 instructions # 2.70 insn per cycle - 3.182042391 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) + 10,010,591,709 cycles # 2.871 GHz + 27,842,727,910 instructions # 2.78 insn per cycle + 3.487798049 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2547) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.505330e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.986443e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.986443e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.875219e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.247933e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.247933e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.015782 sec +TOTAL : 2.261397 sec INFO: No Floating Point Exceptions have been reported - 5,458,675,505 cycles # 2.701 GHz - 11,252,130,547 instructions # 2.06 insn per cycle - 2.021434813 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) + 6,075,167,839 cycles # 2.681 GHz + 12,586,731,458 instructions # 2.07 insn per cycle + 2.266904739 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2695) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.084191e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.669186e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.669186e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.356278e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.809616e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.809616e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.833429 sec +TOTAL : 2.067294 sec INFO: No Floating Point Exceptions have been reported - 4,938,916,416 cycles # 2.687 GHz - 10,556,489,069 instructions # 2.14 insn per cycle - 1.839288416 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) + 5,563,695,224 cycles # 2.686 GHz + 12,021,605,529 instructions # 2.16 insn per cycle + 2.072698796 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2438) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.610931e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.807983e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.807983e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.399286e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.574109e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.574109e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.014130 sec +TOTAL : 3.194495 sec INFO: No Floating Point Exceptions have been reported - 5,379,542,844 cycles # 1.782 GHz - 7,793,225,348 instructions # 1.45 insn per cycle - 3.019714352 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) + 5,687,155,664 cycles # 1.778 GHz + 8,297,084,050 instructions # 1.46 insn per cycle + 3.199814441 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1459) (512y: 122) (512z: 1801) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 1fa6968bab..8d1d94459f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:27:19 +DATE: 2024-06-03_18:24:31 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.462166e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.205101e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.205101e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.531777e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.238320e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.238320e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.820247 sec +TOTAL : 0.815404 sec INFO: No Floating Point Exceptions have been reported - 3,038,196,334 cycles # 2.828 GHz - 4,716,972,261 instructions # 1.55 insn per cycle - 1.131805276 seconds time elapsed + 2,992,682,585 cycles # 2.828 GHz + 4,685,692,827 instructions # 1.57 insn per cycle + 1.115617940 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -90,16 +90,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.034127e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.094165e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.094165e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.765471e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.810865e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.810865e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.347576 sec +TOTAL : 6.134495 sec INFO: No Floating Point Exceptions have been reported - 15,332,963,137 cycles # 2.864 GHz - 38,433,385,565 instructions # 2.51 insn per cycle - 5.354839876 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) + 17,602,475,077 cycles # 2.867 GHz + 46,002,972,709 instructions # 2.61 insn per cycle + 6.141458193 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 631) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -107,8 +107,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515645 -Relative difference = 3.258803994438787e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= @@ -119,16 +119,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.391806e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.576311e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.576311e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.072742e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.223579e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.223579e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.286080 sec +TOTAL : 3.608235 sec INFO: No Floating Point Exceptions have been reported - 9,426,586,757 cycles # 2.864 GHz - 24,763,935,790 instructions # 2.63 insn per cycle - 3.293082254 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) + 10,371,923,734 cycles # 2.870 GHz + 28,025,852,686 instructions # 2.70 insn per cycle + 3.615212944 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2547) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -148,16 +148,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.330091e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.783188e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.783188e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.822050e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.191359e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.191359e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.166949 sec +TOTAL : 2.372316 sec INFO: No Floating Point Exceptions have been reported - 5,830,307,091 cycles # 2.683 GHz - 11,537,845,857 instructions # 1.98 insn per cycle - 2.174147046 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) + 6,428,545,735 cycles # 2.703 GHz + 12,872,354,230 instructions # 2.00 insn per cycle + 2.379155799 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2695) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -177,16 +177,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.935200e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.505542e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.505542e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.264619e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.698712e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.698712e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.966012 sec +TOTAL : 2.186815 sec INFO: No Floating Point Exceptions have been reported - 5,308,540,347 cycles # 2.692 GHz - 10,845,350,411 instructions # 2.04 insn per cycle - 1.973191716 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) + 5,919,973,980 cycles # 2.700 GHz + 12,311,338,626 instructions # 2.08 insn per cycle + 2.193659045 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2438) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -206,16 +206,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.557121e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.748967e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.748967e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.354371e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.543902e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.543902e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.148015 sec +TOTAL : 3.326227 sec INFO: No Floating Point Exceptions have been reported - 5,759,134,449 cycles # 1.827 GHz - 8,037,556,808 instructions # 1.40 insn per cycle - 3.155199268 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) + 6,132,874,621 cycles # 1.841 GHz + 8,544,916,384 instructions # 1.39 insn per cycle + 3.333259222 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1459) (512y: 122) (512z: 1801) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 564b56aaa2..965664b535 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:38:23 +DATE: 2024-06-03_18:35:50 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.834225e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.174853e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.276595e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.935211e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.173304e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276568e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.628184 sec +TOTAL : 0.629953 sec INFO: No Floating Point Exceptions have been reported - 2,420,276,537 cycles # 2.823 GHz - 3,537,411,505 instructions # 1.46 insn per cycle - 0.914703116 seconds time elapsed + 2,425,033,040 cycles # 2.821 GHz + 3,533,427,129 instructions # 1.46 insn per cycle + 0.916564365 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.042150e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.102920e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.102920e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.776195e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.821804e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.821804e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 5.309711 sec +TOTAL : 6.077661 sec INFO: No Floating Point Exceptions have been reported - 15,208,601,758 cycles # 2.862 GHz - 38,393,755,680 instructions # 2.52 insn per cycle - 5.315290014 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) + 17,427,004,730 cycles # 2.865 GHz + 45,959,309,920 instructions # 2.64 insn per cycle + 6.083278927 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 631) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -98,8 +98,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515645 -Relative difference = 3.258803994438787e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.440689e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.630223e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.630223e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.101458e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.257493e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.257493e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.221641 sec +TOTAL : 3.555023 sec INFO: No Floating Point Exceptions have been reported - 9,237,771,832 cycles # 2.863 GHz - 24,577,605,010 instructions # 2.66 insn per cycle - 3.227257120 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) + 10,225,297,905 cycles # 2.873 GHz + 27,842,100,711 instructions # 2.72 insn per cycle + 3.560578065 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2547) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.426554e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.901258e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.901258e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.882594e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.257869e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.257869e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.107966 sec +TOTAL : 2.322439 sec INFO: No Floating Point Exceptions have been reported - 5,642,384,352 cycles # 2.670 GHz - 11,234,139,166 instructions # 1.99 insn per cycle - 2.113587927 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) + 6,291,312,792 cycles # 2.703 GHz + 12,568,769,792 instructions # 2.00 insn per cycle + 2.328056676 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2695) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.038246e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.623434e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.623434e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.346787e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.802546e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.802546e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.913383 sec +TOTAL : 2.136932 sec INFO: No Floating Point Exceptions have been reported - 5,136,905,922 cycles # 2.679 GHz - 10,506,331,510 instructions # 2.05 insn per cycle - 1.919115165 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) + 5,796,366,053 cycles # 2.706 GHz + 11,970,782,633 instructions # 2.07 insn per cycle + 2.142516142 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2438) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.577150e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.773033e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.773033e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.432702e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.609165e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.609165e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.107472 sec +TOTAL : 3.229295 sec INFO: No Floating Point Exceptions have been reported - 5,603,483,278 cycles # 1.801 GHz - 7,744,927,992 instructions # 1.38 insn per cycle - 3.113151850 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) + 5,897,294,349 cycles # 1.824 GHz + 8,246,732,764 instructions # 1.40 insn per cycle + 3.234708724 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1459) (512y: 122) (512z: 1801) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index 1ac31e13f6..1740584361 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:35:36 +DATE: 2024-06-03_18:32:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.828351e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.175882e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.277851e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.946226e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.175204e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276988e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.567256 sec +TOTAL : 0.566359 sec INFO: No Floating Point Exceptions have been reported - 2,253,776,484 cycles # 2.822 GHz - 3,518,982,258 instructions # 1.56 insn per cycle - 0.855545893 seconds time elapsed + 2,244,660,155 cycles # 2.822 GHz + 3,509,815,718 instructions # 1.56 insn per cycle + 0.854244701 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.056356e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.117975e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.117975e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.777744e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.824118e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.824118e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.211101 sec +TOTAL : 6.010337 sec INFO: No Floating Point Exceptions have been reported - 15,016,692,408 cycles # 2.879 GHz - 38,373,187,740 instructions # 2.56 insn per cycle - 5.216643799 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) + 17,238,214,430 cycles # 2.866 GHz + 45,943,155,206 instructions # 2.67 insn per cycle + 6.015955555 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 631) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -98,8 +98,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515645 -Relative difference = 3.258803994438787e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.467027e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.660047e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.660047e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.100277e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.254705e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.254705e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.136723 sec +TOTAL : 3.494866 sec INFO: No Floating Point Exceptions have been reported - 9,074,004,106 cycles # 2.889 GHz - 24,577,979,212 instructions # 2.71 insn per cycle - 3.142372869 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) + 10,035,630,342 cycles # 2.868 GHz + 27,842,669,439 instructions # 2.77 insn per cycle + 3.500612923 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2547) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.443155e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.918966e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.918966e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.870990e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.243848e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.243848e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.039960 sec +TOTAL : 2.264831 sec INFO: No Floating Point Exceptions have been reported - 5,475,053,241 cycles # 2.677 GHz - 11,251,295,295 instructions # 2.06 insn per cycle - 2.045540905 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) + 6,092,216,208 cycles # 2.684 GHz + 12,585,707,297 instructions # 2.07 insn per cycle + 2.270557428 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2695) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.050564e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.643604e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.643604e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.353191e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.806875e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.806875e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.847523 sec +TOTAL : 2.072141 sec INFO: No Floating Point Exceptions have been reported - 4,963,669,399 cycles # 2.679 GHz - 10,556,626,951 instructions # 2.13 insn per cycle - 1.853394046 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) + 5,581,758,113 cycles # 2.688 GHz + 12,021,560,907 instructions # 2.15 insn per cycle + 2.077787033 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2438) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.592373e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.788368e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.788368e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.372806e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.548133e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.548133e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.031855 sec +TOTAL : 3.221900 sec INFO: No Floating Point Exceptions have been reported - 5,406,083,686 cycles # 1.780 GHz - 7,793,724,258 instructions # 1.44 insn per cycle - 3.037456860 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) + 5,733,219,688 cycles # 1.777 GHz + 8,300,008,708 instructions # 1.45 insn per cycle + 3.227549517 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1459) (512y: 122) (512z: 1801) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index 5a92d6747d..423530fe15 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:32:52 +DATE: 2024-06-03_18:30:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,15 +50,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.752555e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.167872e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.274609e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.772233e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.173217e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.274698e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.720859 sec +TOTAL : 0.721656 sec INFO: No Floating Point Exceptions have been reported - 2,687,445,313 cycles # 2.827 GHz - 4,266,641,488 instructions # 1.59 insn per cycle - 1.008523478 seconds time elapsed + 2,682,709,722 cycles # 2.825 GHz + 4,257,255,296 instructions # 1.59 insn per cycle + 1.008651062 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -83,16 +83,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.046564e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.107338e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.107338e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.781162e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.827105e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.827105e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.235890 sec +TOTAL : 5.999645 sec INFO: No Floating Point Exceptions have been reported - 15,009,426,379 cycles # 2.865 GHz - 38,373,420,682 instructions # 2.56 insn per cycle - 5.241597173 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) + 17,238,271,411 cycles # 2.871 GHz + 45,943,018,029 instructions # 2.67 insn per cycle + 6.005361351 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 631) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -100,8 +100,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515645 -Relative difference = 3.258803994438787e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= @@ -111,16 +111,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.437646e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.628123e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.628123e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.100108e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.254803e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.254803e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.163337 sec +TOTAL : 3.494980 sec INFO: No Floating Point Exceptions have been reported - 9,075,620,291 cycles # 2.865 GHz - 24,578,067,979 instructions # 2.71 insn per cycle - 3.168963563 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) + 10,040,303,278 cycles # 2.869 GHz + 27,845,061,153 instructions # 2.77 insn per cycle + 3.500633160 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2547) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -139,16 +139,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.418817e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.893074e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.893074e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.873993e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.251786e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.251786e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.047725 sec +TOTAL : 2.264207 sec INFO: No Floating Point Exceptions have been reported - 5,485,987,071 cycles # 2.673 GHz - 11,251,055,584 instructions # 2.05 insn per cycle - 2.053325818 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) + 6,092,809,767 cycles # 2.685 GHz + 12,585,659,630 instructions # 2.07 insn per cycle + 2.269920215 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2695) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -167,16 +167,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.062072e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.650490e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.650490e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.354382e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.806589e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.806589e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.841979 sec +TOTAL : 2.070043 sec INFO: No Floating Point Exceptions have been reported - 4,952,856,258 cycles # 2.682 GHz - 10,558,518,877 instructions # 2.13 insn per cycle - 1.847644905 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) + 5,575,902,651 cycles # 2.688 GHz + 12,022,417,349 instructions # 2.16 insn per cycle + 2.075736025 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2438) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -195,16 +195,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.597246e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.793746e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.793746e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.385452e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.559039e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.559039e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.028351 sec +TOTAL : 3.210851 sec INFO: No Floating Point Exceptions have been reported - 5,400,904,302 cycles # 1.781 GHz - 7,793,425,655 instructions # 1.44 insn per cycle - 3.034012590 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) + 5,716,187,306 cycles # 1.778 GHz + 8,297,674,838 instructions # 1.45 insn per cycle + 3.216588828 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1459) (512y: 122) (512z: 1801) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 58e2659367..8044c64e2b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_20:52:03 +DATE: 2024-06-03_17:58:50 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.589601e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.168593e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.279897e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.505313e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.168076e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.279932e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.529965 sec +TOTAL : 0.529980 sec INFO: No Floating Point Exceptions have been reported - 2,176,096,086 cycles # 2.821 GHz - 3,116,651,489 instructions # 1.43 insn per cycle - 0.828693316 seconds time elapsed + 2,177,992,433 cycles # 2.826 GHz + 3,144,999,861 instructions # 1.44 insn per cycle + 0.827728090 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.046807e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.107510e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.107510e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.835420e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.883637e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.883637e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.232787 sec +TOTAL : 5.821491 sec INFO: No Floating Point Exceptions have been reported - 15,011,109,733 cycles # 2.866 GHz - 40,100,143,330 instructions # 2.67 insn per cycle - 5.238308472 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) + 16,709,182,080 cycles # 2.868 GHz + 44,935,035,746 instructions # 2.69 insn per cycle + 5.826794315 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 581) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.594471e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.800449e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.800449e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.254078e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.422040e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.422040e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.028090 sec +TOTAL : 3.332001 sec INFO: No Floating Point Exceptions have been reported - 8,685,388,720 cycles # 2.864 GHz - 23,672,029,686 instructions # 2.73 insn per cycle - 3.033712399 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2072) (avx2: 0) (512y: 0) (512z: 0) + 9,562,607,479 cycles # 2.866 GHz + 26,700,619,348 instructions # 2.79 insn per cycle + 3.337351491 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2344) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.870985e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.244397e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.244397e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.461877e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.772979e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.772979e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.263752 sec +TOTAL : 2.460573 sec INFO: No Floating Point Exceptions have been reported - 6,080,549,535 cycles # 2.681 GHz - 13,060,990,924 instructions # 2.15 insn per cycle - 2.269275292 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2546) (512y: 0) (512z: 0) + 6,596,657,745 cycles # 2.676 GHz + 14,125,089,780 instructions # 2.14 insn per cycle + 2.466109830 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2786) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.126541e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.542200e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.542200e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.654021e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.990833e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.990833e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.155718 sec +TOTAL : 2.362948 sec INFO: No Floating Point Exceptions have been reported - 5,801,329,189 cycles # 2.685 GHz - 12,321,707,264 instructions # 2.12 insn per cycle - 2.161291942 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2093) (512y: 294) (512z: 0) + 6,346,647,925 cycles # 2.681 GHz + 13,710,696,355 instructions # 2.16 insn per cycle + 2.368361225 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2437) (512y: 297) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.300125e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.464144e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.464144e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.256087e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.415473e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.415473e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.288245 sec +TOTAL : 3.330240 sec INFO: No Floating Point Exceptions have been reported - 5,828,079,793 cycles # 1.770 GHz - 9,603,396,173 instructions # 1.65 insn per cycle - 3.293946839 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1509) (512y: 209) (512z: 1971) + 5,906,442,578 cycles # 1.771 GHz + 10,065,102,749 instructions # 1.70 insn per cycle + 3.335608318 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1291) (512y: 208) (512z: 1987) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index eacee14a97..73cacda685 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:18:20 +DATE: 2024-06-03_18:15:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.680825e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.167882e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.277071e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.200564e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.181073e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276469e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.530821 sec +TOTAL : 0.523685 sec INFO: No Floating Point Exceptions have been reported - 2,180,475,973 cycles # 2.822 GHz - 3,129,539,684 instructions # 1.44 insn per cycle - 0.829975432 seconds time elapsed + 2,132,146,236 cycles # 2.820 GHz + 3,089,550,200 instructions # 1.45 insn per cycle + 0.812891889 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.369428e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.450686e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.450686e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.369000e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.450802e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.450802e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.536778 sec +TOTAL : 4.537740 sec INFO: No Floating Point Exceptions have been reported - 13,015,636,000 cycles # 2.866 GHz - 34,387,703,055 instructions # 2.64 insn per cycle - 4.542541003 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 686) (avx2: 0) (512y: 0) (512z: 0) + 13,022,402,458 cycles # 2.867 GHz + 34,354,727,792 instructions # 2.64 insn per cycle + 4.543265370 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 678) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.924099e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.059855e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.059855e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.903350e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.037707e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.037707e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.697505 sec +TOTAL : 3.724484 sec INFO: No Floating Point Exceptions have been reported - 10,607,346,172 cycles # 2.865 GHz - 24,007,082,338 instructions # 2.26 insn per cycle - 3.703200013 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2582) (avx2: 0) (512y: 0) (512z: 0) + 10,686,935,794 cycles # 2.866 GHz + 24,007,583,642 instructions # 2.25 insn per cycle + 3.730002429 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2581) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.415164e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.721166e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.721166e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.455535e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.769238e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.769238e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.486573 sec +TOTAL : 2.467399 sec INFO: No Floating Point Exceptions have been reported - 6,676,542,764 cycles # 2.680 GHz - 12,401,383,261 instructions # 1.86 insn per cycle - 2.492408166 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3154) (512y: 0) (512z: 0) + 6,614,195,369 cycles # 2.676 GHz + 12,348,253,571 instructions # 1.87 insn per cycle + 2.472798844 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3121) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -154,8 +154,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516200 -Relative difference = 3.2588037208240405e-07 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.719363e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.070259e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.070259e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.784525e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.146203e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.146203e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.333222 sec +TOTAL : 2.304448 sec INFO: No Floating Point Exceptions have been reported - 6,249,988,604 cycles # 2.673 GHz - 11,572,934,567 instructions # 1.85 insn per cycle - 2.339105101 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2690) (512y: 239) (512z: 0) + 6,172,989,918 cycles # 2.673 GHz + 11,569,702,810 instructions # 1.87 insn per cycle + 2.309924380 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2671) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -182,8 +182,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516200 -Relative difference = 3.2588037208240405e-07 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.635891e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.836056e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.836056e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.608551e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.806496e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.806496e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.994997 sec +TOTAL : 3.017997 sec INFO: No Floating Point Exceptions have been reported - 5,329,559,296 cycles # 1.777 GHz - 9,295,784,708 instructions # 1.74 insn per cycle - 3.000603709 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2115) (512y: 282) (512z: 1958) + 5,387,022,252 cycles # 1.782 GHz + 9,287,765,087 instructions # 1.72 insn per cycle + 3.023735174 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2114) (512y: 282) (512z: 1954) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 2a7449ccf8..4a353d3560 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:18:45 +DATE: 2024-06-03_18:15:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.690021e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.170643e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.281113e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.200706e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.184031e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.279511e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.529016 sec +TOTAL : 0.522181 sec INFO: No Floating Point Exceptions have been reported - 2,181,786,873 cycles # 2.820 GHz - 3,123,798,739 instructions # 1.43 insn per cycle - 0.830314726 seconds time elapsed + 2,138,289,162 cycles # 2.834 GHz + 3,096,034,787 instructions # 1.45 insn per cycle + 0.811805326 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.498656e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.589488e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.589488e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.513254e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.605489e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.605489e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.307566 sec +TOTAL : 4.283257 sec INFO: No Floating Point Exceptions have been reported - 12,355,577,175 cycles # 2.865 GHz - 35,037,181,929 instructions # 2.84 insn per cycle - 4.313266681 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 457) (avx2: 0) (512y: 0) (512z: 0) + 12,289,516,686 cycles # 2.866 GHz + 34,923,121,667 instructions # 2.84 insn per cycle + 4.288584491 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 443) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.900568e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.034514e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.034514e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.900102e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.034022e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.034022e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.726420 sec +TOTAL : 3.727942 sec INFO: No Floating Point Exceptions have been reported - 10,682,800,271 cycles # 2.863 GHz - 23,083,133,822 instructions # 2.16 insn per cycle - 3.732131064 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2363) (avx2: 0) (512y: 0) (512z: 0) + 10,702,802,356 cycles # 2.868 GHz + 23,010,990,691 instructions # 2.15 insn per cycle + 3.733644631 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2349) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,15 +137,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.781411e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.142736e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.142736e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.700701e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.048209e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.048209e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.304487 sec +TOTAL : 2.342453 sec INFO: No Floating Point Exceptions have been reported - 6,156,370,918 cycles # 2.666 GHz - 11,956,053,429 instructions # 1.94 insn per cycle - 2.310098952 seconds time elapsed + 6,283,660,596 cycles # 2.677 GHz + 11,956,874,467 instructions # 1.90 insn per cycle + 2.347962428 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2509) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest_cpp.exe @@ -165,15 +165,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.888437e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.261805e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.261805e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.892088e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.267236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.267236e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.256063 sec +TOTAL : 2.256039 sec INFO: No Floating Point Exceptions have been reported - 6,010,669,476 cycles # 2.659 GHz - 11,128,968,945 instructions # 1.85 insn per cycle - 2.261765055 seconds time elapsed + 6,045,328,125 cycles # 2.674 GHz + 11,131,038,815 instructions # 1.84 insn per cycle + 2.261476465 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2126) (512y: 174) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest_cpp.exe @@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.710789e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.919169e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.919169e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.747196e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.960337e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.960337e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.936689 sec +TOTAL : 2.910095 sec INFO: No Floating Point Exceptions have been reported - 5,226,987,569 cycles # 1.777 GHz - 9,022,159,593 instructions # 1.73 insn per cycle - 2.942633203 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1650) (512y: 208) (512z: 1567) + 5,193,848,580 cycles # 1.782 GHz + 9,026,478,285 instructions # 1.74 insn per cycle + 2.915440965 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1650) (512y: 208) (512z: 1570) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 109477ba28..3358c1fcdf 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_20:52:27 +DATE: 2024-06-03_17:59:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.016847e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.697629e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.981893e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.482617 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.189397e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.170319e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.387519e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.487127 sec INFO: No Floating Point Exceptions have been reported - 1,998,642,469 cycles # 2.818 GHz - 2,880,747,261 instructions # 1.44 insn per cycle - 0.766260624 seconds time elapsed + 2,013,212,019 cycles # 2.819 GHz + 2,909,811,134 instructions # 1.45 insn per cycle + 0.770281917 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 149 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -68,8 +68,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/GPU) = 2.0288499356247485 +Relative difference = 1.9191351362116207e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.189515e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.261171e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.261171e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.885414e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.938356e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.938356e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.877134 sec +TOTAL : 5.649040 sec INFO: No Floating Point Exceptions have been reported - 13,994,891,793 cycles # 2.867 GHz - 38,340,768,488 instructions # 2.74 insn per cycle - 4.882458026 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) + 16,220,933,838 cycles # 2.870 GHz + 45,344,072,693 instructions # 2.80 insn per cycle + 5.654165812 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 605) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -98,8 +98,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.868397e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.268797e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.268797e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.438186e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.768939e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.768939e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.243083 sec +TOTAL : 2.450780 sec INFO: No Floating Point Exceptions have been reported - 6,441,514,611 cycles # 2.866 GHz - 15,815,172,638 instructions # 2.46 insn per cycle - 2.248472682 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) + 7,039,664,256 cycles # 2.868 GHz + 17,774,972,317 instructions # 2.52 insn per cycle + 2.455771294 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3151) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.776329e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.004987e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.004987e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.082016e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.154135e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.154135e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.283215 sec +TOTAL : 1.386052 sec INFO: No Floating Point Exceptions have been reported - 3,465,053,995 cycles # 2.691 GHz - 7,593,444,901 instructions # 2.19 insn per cycle - 1.288569725 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) + 3,733,426,497 cycles # 2.685 GHz + 8,265,424,322 instructions # 2.21 insn per cycle + 1.391191714 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3374) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -154,8 +154,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.465176e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.096622e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.096622e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.446730e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.624818e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.624818e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.196628 sec +TOTAL : 1.329069 sec INFO: No Floating Point Exceptions have been reported - 3,245,802,002 cycles # 2.702 GHz - 7,203,049,725 instructions # 2.22 insn per cycle - 1.202115916 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) + 3,545,954,783 cycles # 2.659 GHz + 7,920,530,780 instructions # 2.23 insn per cycle + 1.334352866 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3226) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -182,8 +182,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -193,16 +193,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.681015e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.390488e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.390488e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.261718e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.888190e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.888190e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.658088 sec +TOTAL : 1.762412 sec INFO: No Floating Point Exceptions have been reported - 3,068,008,054 cycles # 1.846 GHz - 5,834,677,054 instructions # 1.90 insn per cycle - 1.663441620 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 1888) + 3,252,410,609 cycles # 1.841 GHz + 6,100,423,263 instructions # 1.88 insn per cycle + 1.767662641 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2400) (512y: 24) (512z: 2152) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -210,8 +210,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index ecf1f25eca..97dec5a86e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:27:43 +DATE: 2024-06-03_18:24:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,21 +53,21 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.942422e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.805237e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.805237e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.981473e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.818775e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.818775e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.682307 sec +TOTAL : 0.680933 sec INFO: No Floating Point Exceptions have been reported - 2,573,557,720 cycles # 2.826 GHz - 4,026,704,050 instructions # 1.56 insn per cycle - 0.968234758 seconds time elapsed + 2,573,133,936 cycles # 2.826 GHz + 4,034,036,104 instructions # 1.57 insn per cycle + 0.968043602 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 149 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -76,8 +76,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/GPU) = 2.0288499356247485 +Relative difference = 1.9191351362116207e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe @@ -90,16 +90,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.164148e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.234526e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.234526e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.876784e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.929482e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.929482e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.979751 sec +TOTAL : 5.721392 sec INFO: No Floating Point Exceptions have been reported - 14,265,943,518 cycles # 2.864 GHz - 38,385,772,523 instructions # 2.69 insn per cycle - 4.986458560 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) + 16,410,500,959 cycles # 2.866 GHz + 45,384,280,796 instructions # 2.77 insn per cycle + 5.727881558 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 605) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -107,8 +107,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= @@ -119,16 +119,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.818621e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.210781e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.210781e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.387912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.711236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.711236e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.313915 sec +TOTAL : 2.528580 sec INFO: No Floating Point Exceptions have been reported - 6,643,916,276 cycles # 2.864 GHz - 16,095,500,762 instructions # 2.42 insn per cycle - 2.320575344 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) + 7,256,716,109 cycles # 2.864 GHz + 18,057,230,835 instructions # 2.49 insn per cycle + 2.535013143 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3151) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -136,8 +136,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= @@ -148,16 +148,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.616473e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.887402e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.887402e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.916935e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.978265e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.978265e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.355624 sec +TOTAL : 1.463017 sec INFO: No Floating Point Exceptions have been reported - 3,674,656,527 cycles # 2.699 GHz - 7,830,907,550 instructions # 2.13 insn per cycle - 1.362279184 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) + 3,946,830,503 cycles # 2.687 GHz + 8,502,419,899 instructions # 2.15 insn per cycle + 1.469468319 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3374) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,8 +165,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= @@ -177,16 +177,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.223119e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.069627e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.069627e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.391597e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.599398e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.599398e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.276248 sec +TOTAL : 1.387422 sec INFO: No Floating Point Exceptions have been reported - 3,470,678,953 cycles # 2.706 GHz - 7,438,963,141 instructions # 2.14 insn per cycle - 1.283084734 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) + 3,764,832,151 cycles # 2.703 GHz + 8,158,322,672 instructions # 2.17 insn per cycle + 1.393974805 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3226) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -194,8 +194,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= @@ -206,16 +206,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.605496e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.298191e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.298191e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.166687e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.764799e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.764799e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.726954 sec +TOTAL : 1.839563 sec INFO: No Floating Point Exceptions have been reported - 3,271,998,736 cycles # 1.889 GHz - 6,089,399,163 instructions # 1.86 insn per cycle - 1.733677408 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 1888) + 3,463,042,965 cycles # 1.877 GHz + 6,355,050,636 instructions # 1.84 insn per cycle + 1.845923010 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2400) (512y: 24) (512z: 2152) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -223,8 +223,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index 0ad3eafec8..b054003c65 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:38:47 +DATE: 2024-06-03_18:36:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.959750e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.678565e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.983275e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.193088e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.180658e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.376626e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.579074 sec +TOTAL : 0.579235 sec INFO: No Floating Point Exceptions have been reported - 2,257,408,465 cycles # 2.820 GHz - 3,319,397,953 instructions # 1.47 insn per cycle - 0.858489340 seconds time elapsed + 2,259,058,959 cycles # 2.822 GHz + 3,332,330,914 instructions # 1.48 insn per cycle + 0.858238193 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 149 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -68,8 +68,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/GPU) = 2.0288499356247485 +Relative difference = 1.9191351362116207e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.184974e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.256690e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.256690e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.881822e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.935460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.935460e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.945704 sec +TOTAL : 5.718812 sec INFO: No Floating Point Exceptions have been reported - 14,170,625,155 cycles # 2.863 GHz - 38,370,527,150 instructions # 2.71 insn per cycle - 4.951076630 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) + 16,400,409,861 cycles # 2.866 GHz + 45,373,617,800 instructions # 2.77 insn per cycle + 5.724065508 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 605) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -98,8 +98,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.860658e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.261752e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.261752e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.427518e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.757072e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.757072e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 2.305703 sec +TOTAL : 2.515997 sec INFO: No Floating Point Exceptions have been reported - 6,613,265,265 cycles # 2.862 GHz - 15,828,283,667 instructions # 2.39 insn per cycle - 2.311146715 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) + 7,219,245,576 cycles # 2.864 GHz + 17,789,385,116 instructions # 2.46 insn per cycle + 2.521192317 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3151) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= @@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.686618e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.970674e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.970674e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.058523e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.147643e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.147643e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.355317 sec +TOTAL : 1.448735 sec INFO: No Floating Point Exceptions have been reported - 3,633,491,212 cycles # 2.672 GHz - 7,578,117,090 instructions # 2.09 insn per cycle - 1.360829033 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) + 3,909,707,878 cycles # 2.690 GHz + 8,249,725,200 instructions # 2.11 insn per cycle + 1.454021088 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3374) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -154,8 +154,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= @@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.321381e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.082734e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.082734e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.543553e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.792551e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.792551e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.273623 sec +TOTAL : 1.375214 sec INFO: No Floating Point Exceptions have been reported - 3,431,066,713 cycles # 2.684 GHz - 7,153,252,647 instructions # 2.08 insn per cycle - 1.278962900 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) + 3,723,811,727 cycles # 2.699 GHz + 7,870,129,072 instructions # 2.11 insn per cycle + 1.380400518 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3226) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -182,8 +182,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= @@ -193,16 +193,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.705447e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.424288e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.424288e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.254541e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.873169e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.873169e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.712725 sec +TOTAL : 1.824987 sec INFO: No Floating Point Exceptions have been reported - 3,232,442,354 cycles # 1.882 GHz - 5,785,846,161 instructions # 1.79 insn per cycle - 1.718151231 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 1888) + 3,423,559,926 cycles # 1.872 GHz + 6,050,752,549 instructions # 1.77 insn per cycle + 1.830318343 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2400) (512y: 24) (512z: 2152) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -210,8 +210,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 4e4b68c02e..c835253773 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:35:59 +DATE: 2024-06-03_18:33:22 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.829888e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.672325e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.968559e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.521072 sec +EvtsPerSec[Rmb+ME] (23) = ( 9.306399e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.181630e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.378764e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.518721 sec INFO: No Floating Point Exceptions have been reported - 2,127,774,211 cycles # 2.822 GHz - 3,322,390,866 instructions # 1.56 insn per cycle - 0.812598155 seconds time elapsed + 2,091,185,845 cycles # 2.826 GHz + 3,304,915,518 instructions # 1.58 insn per cycle + 0.797417153 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 149 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -68,8 +68,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/GPU) = 2.0288499356247485 +Relative difference = 1.9191351362116207e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.188202e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.259969e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.259969e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.881998e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.934912e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934912e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.880866 sec +TOTAL : 5.660184 sec INFO: No Floating Point Exceptions have been reported - 14,003,432,297 cycles # 2.867 GHz - 38,341,113,505 instructions # 2.74 insn per cycle - 4.886313267 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) + 16,228,253,084 cycles # 2.865 GHz + 45,339,793,508 instructions # 2.79 insn per cycle + 5.665482518 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 605) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -98,8 +98,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.865782e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.266690e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.266690e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.430385e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.762372e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.762372e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.245588 sec +TOTAL : 2.456993 sec INFO: No Floating Point Exceptions have been reported - 6,448,223,959 cycles # 2.866 GHz - 15,815,680,836 instructions # 2.45 insn per cycle - 2.250939876 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) + 7,054,966,301 cycles # 2.866 GHz + 17,775,169,086 instructions # 2.52 insn per cycle + 2.462396323 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3151) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= @@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.646940e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.924791e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.924791e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.096232e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.198906e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.198906e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.302035 sec +TOTAL : 1.384702 sec INFO: No Floating Point Exceptions have been reported - 3,468,694,819 cycles # 2.655 GHz - 7,593,779,362 instructions # 2.19 insn per cycle - 1.307417897 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) + 3,747,560,129 cycles # 2.697 GHz + 8,265,172,826 instructions # 2.21 insn per cycle + 1.390118552 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3374) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -154,8 +154,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= @@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.341284e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.086225e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.086225e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.514573e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.761076e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.761076e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.212139 sec +TOTAL : 1.321389 sec INFO: No Floating Point Exceptions have been reported - 3,264,189,304 cycles # 2.683 GHz - 7,202,777,705 instructions # 2.21 insn per cycle - 1.217586001 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) + 3,562,769,632 cycles # 2.687 GHz + 7,920,504,953 instructions # 2.22 insn per cycle + 1.326818698 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3226) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -182,8 +182,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= @@ -193,16 +193,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.695951e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.442709e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.442709e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.212259e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.822619e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.822619e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.656278 sec +TOTAL : 1.777315 sec INFO: No Floating Point Exceptions have been reported - 3,079,117,307 cycles # 1.854 GHz - 5,835,044,949 instructions # 1.90 insn per cycle - 1.661779093 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 1888) + 3,264,947,794 cycles # 1.833 GHz + 6,099,951,354 instructions # 1.87 insn per cycle + 1.782787601 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2400) (512y: 24) (512z: 2152) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -210,8 +210,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index 7d521e9bea..58e49d86b2 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:33:16 +DATE: 2024-06-03_18:30:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,18 +50,18 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.584988e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.675915e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.967452e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.624537e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.179236e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.368664e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.626563 sec +TOTAL : 0.626368 sec INFO: No Floating Point Exceptions have been reported - 2,397,623,441 cycles # 2.823 GHz - 3,754,246,774 instructions # 1.57 insn per cycle - 0.906158297 seconds time elapsed + 2,398,947,371 cycles # 2.829 GHz + 3,752,782,194 instructions # 1.56 insn per cycle + 0.905540211 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 149 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -70,8 +70,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/GPU) = 2.0288499356247485 +Relative difference = 1.9191351362116207e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe @@ -83,16 +83,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.188098e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.260287e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.260287e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.881457e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.935038e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.935038e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.881127 sec +TOTAL : 5.662100 sec INFO: No Floating Point Exceptions have been reported - 14,001,505,232 cycles # 2.866 GHz - 38,340,997,313 instructions # 2.74 insn per cycle - 4.886650264 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) + 16,227,209,162 cycles # 2.864 GHz + 45,340,117,828 instructions # 2.79 insn per cycle + 5.667445307 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 605) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -100,8 +100,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= @@ -111,16 +111,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.864903e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.265431e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.265431e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.425382e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.754402e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.754402e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.245406 sec +TOTAL : 2.460362 sec INFO: No Floating Point Exceptions have been reported - 6,447,851,141 cycles # 2.865 GHz - 15,815,588,340 instructions # 2.45 insn per cycle - 2.250959020 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) + 7,057,208,189 cycles # 2.863 GHz + 17,774,934,002 instructions # 2.52 insn per cycle + 2.465805803 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3151) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -128,8 +128,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= @@ -139,16 +139,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.822705e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.013257e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.013257e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.060158e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.151567e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.151567e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.277005 sec +TOTAL : 1.390519 sec INFO: No Floating Point Exceptions have been reported - 3,462,079,610 cycles # 2.701 GHz - 7,593,569,628 instructions # 2.19 insn per cycle - 1.282260547 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) + 3,750,841,770 cycles # 2.689 GHz + 8,266,163,693 instructions # 2.20 insn per cycle + 1.395877768 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3374) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -156,8 +156,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= @@ -167,16 +167,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.456827e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.099917e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.099917e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.541765e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.784606e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.784606e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.197770 sec +TOTAL : 1.316558 sec INFO: No Floating Point Exceptions have been reported - 3,264,255,856 cycles # 2.715 GHz - 7,202,978,053 instructions # 2.21 insn per cycle - 1.203170555 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) + 3,557,676,159 cycles # 2.693 GHz + 7,920,448,927 instructions # 2.23 insn per cycle + 1.321911082 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3226) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -184,8 +184,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= @@ -195,16 +195,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.703762e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.418726e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.418726e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.236074e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.852334e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.852334e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.654681 sec +TOTAL : 1.771281 sec INFO: No Floating Point Exceptions have been reported - 3,071,792,415 cycles # 1.854 GHz - 5,835,969,355 instructions # 1.90 insn per cycle - 1.660064901 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2368) (512y: 24) (512z: 1888) + 3,260,314,929 cycles # 1.836 GHz + 6,099,684,038 instructions # 1.87 insn per cycle + 1.776894036 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2400) (512y: 24) (512z: 2152) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -212,8 +212,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index 8b44c0445b..4f2f03096c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_20:52:47 +DATE: 2024-06-03_17:59:37 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.016058e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.669695e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.038322e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.483252 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.575450e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.442883e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.719385e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.491778 sec INFO: No Floating Point Exceptions have been reported - 1,998,362,251 cycles # 2.820 GHz - 2,889,875,535 instructions # 1.45 insn per cycle - 0.765688803 seconds time elapsed + 2,019,426,299 cycles # 2.818 GHz + 2,897,302,470 instructions # 1.43 insn per cycle + 0.776260193 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe @@ -68,8 +68,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/GPU) = 2.0288499356247485 +Relative difference = 1.9191351362116207e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.138992e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.207513e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.207513e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.915291e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.969931e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.969931e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.990603 sec +TOTAL : 5.561920 sec INFO: No Floating Point Exceptions have been reported - 14,312,095,417 cycles # 2.865 GHz - 39,833,075,351 instructions # 2.78 insn per cycle - 4.996093045 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 570) (avx2: 0) (512y: 0) (512z: 0) + 15,980,534,668 cycles # 2.871 GHz + 44,449,710,623 instructions # 2.78 insn per cycle + 5.567058871 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 550) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -98,8 +98,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199028000236 -Relative difference = 4.790961076489297e-08 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.631923e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.176181e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.176181e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.192586e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.649473e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.649473e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.951672 sec +TOTAL : 2.107774 sec INFO: No Floating Point Exceptions have been reported - 5,596,939,158 cycles # 2.861 GHz - 15,284,742,297 instructions # 2.73 insn per cycle - 1.956996618 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2474) (avx2: 0) (512y: 0) (512z: 0) + 6,057,620,501 cycles # 2.868 GHz + 17,081,636,340 instructions # 2.82 insn per cycle + 2.112968987 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2882) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.218680e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.834401e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.834401e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.911096e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.464242e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.464242e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.775280 sec +TOTAL : 1.861423 sec INFO: No Floating Point Exceptions have been reported - 4,757,273,897 cycles # 2.673 GHz - 9,734,524,466 instructions # 2.05 insn per cycle - 1.780718117 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3708) (512y: 0) (512z: 0) + 5,014,435,050 cycles # 2.687 GHz + 10,230,180,171 instructions # 2.04 insn per cycle + 1.866720473 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -154,8 +154,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182108197361 -Relative difference = 1.0391259163456515e-07 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.294887e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.927248e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.927248e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.975955e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.541648e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.541648e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.754842 sec +TOTAL : 1.842744 sec INFO: No Floating Point Exceptions have been reported - 4,627,741,209 cycles # 2.630 GHz - 9,326,813,558 instructions # 2.02 insn per cycle - 1.760340221 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3496) (512y: 0) (512z: 0) + 4,967,642,651 cycles # 2.690 GHz + 10,001,951,713 instructions # 2.01 insn per cycle + 1.847768308 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3824) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -182,8 +182,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182108197361 -Relative difference = 1.0391259163456515e-07 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -193,16 +193,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.443462e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.905052e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.905052e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.478820e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.786579e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.786579e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.015644 sec +TOTAL : 2.429927 sec INFO: No Floating Point Exceptions have been reported - 3,664,457,552 cycles # 1.814 GHz - 7,034,592,113 instructions # 1.92 insn per cycle - 2.021014520 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2610) (512y: 12) (512z: 2220) + 4,360,807,561 cycles # 1.792 GHz + 8,449,096,692 instructions # 1.94 insn per cycle + 2.435243753 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2897) (512y: 4) (512z: 2751) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -210,8 +210,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183459779248 -Relative difference = 1.7053177021099307e-07 +Avg ME (F77/C++) = 2.0288183148950338 +Relative difference = 1.5521108056421764e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index 0b4aad6d48..116d48b4c8 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:19:09 +DATE: 2024-06-03_18:16:18 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.536351e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.649763e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.975434e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.490081 sec +EvtsPerSec[Rmb+ME] (23) = ( 9.893743e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.209215e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.382460e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.485128 sec INFO: No Floating Point Exceptions have been reported - 2,012,235,376 cycles # 2.813 GHz - 2,887,278,665 instructions # 1.43 insn per cycle - 0.773980012 seconds time elapsed + 1,998,933,549 cycles # 2.818 GHz + 2,886,521,387 instructions # 1.44 insn per cycle + 0.766563752 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 149 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/runTest_cuda.exe @@ -68,8 +68,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/GPU) = 2.0288499356247485 +Relative difference = 1.9191351362116207e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/check_hip.exe @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.387689e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.473438e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.473438e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.442511e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.532646e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.532646e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.481561 sec +TOTAL : 4.381681 sec INFO: No Floating Point Exceptions have been reported - 12,595,769,062 cycles # 2.808 GHz - 34,371,859,733 instructions # 2.73 insn per cycle - 4.486981898 seconds time elapsed + 12,577,682,953 cycles # 2.868 GHz + 34,626,967,775 instructions # 2.75 insn per cycle + 4.386763817 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 696) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest_cpp.exe @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.156182e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.609115e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.609115e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.188953e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.647273e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.647273e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.124099 sec +TOTAL : 2.110381 sec INFO: No Floating Point Exceptions have been reported - 6,097,938,896 cycles # 2.864 GHz - 14,860,412,482 instructions # 2.44 insn per cycle - 2.129667458 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3009) (avx2: 0) (512y: 0) (512z: 0) + 6,062,164,948 cycles # 2.867 GHz + 14,850,935,565 instructions # 2.45 insn per cycle + 2.115526112 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2993) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193803280592 -Relative difference = 1.8746278463897685e-07 +Avg ME (F77/C++) = 2.0288193414453417 +Relative difference = 1.6829758681196702e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.956935e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.736211e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.736211e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.948644e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.746008e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.746008e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.596219 sec +TOTAL : 1.598307 sec INFO: No Floating Point Exceptions have been reported - 4,283,389,233 cycles # 2.675 GHz - 9,028,537,855 instructions # 2.11 insn per cycle - 1.601731069 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4443) (512y: 0) (512z: 0) + 4,310,676,440 cycles # 2.690 GHz + 9,056,395,427 instructions # 2.10 insn per cycle + 1.603504485 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4470) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -154,8 +154,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181999931112 -Relative difference = 9.857617164523888e-08 +Avg ME (F77/C++) = 2.0288181974319741 +Relative difference = 9.731379272303266e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -165,15 +165,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.081544e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.893467e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.893467e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.149704e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.008490e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.008490e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.569456 sec +TOTAL : 1.555197 sec INFO: No Floating Point Exceptions have been reported - 4,194,103,331 cycles # 2.666 GHz - 8,663,712,018 instructions # 2.07 insn per cycle - 1.575061670 seconds time elapsed + 4,190,493,094 cycles # 2.688 GHz + 8,664,572,975 instructions # 2.07 insn per cycle + 1.560446046 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4243) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest_cpp.exe @@ -182,8 +182,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181999931112 -Relative difference = 9.857617164523888e-08 +Avg ME (F77/C++) = 2.0288181974319741 +Relative difference = 9.731379272303266e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.158310e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.571602e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.571602e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.176266e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.591500e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.591500e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.121701 sec +TOTAL : 2.115034 sec INFO: No Floating Point Exceptions have been reported - 3,840,319,125 cycles # 1.806 GHz - 7,808,340,953 instructions # 2.03 insn per cycle - 2.127208383 seconds time elapsed + 3,838,944,051 cycles # 1.811 GHz + 7,808,393,263 instructions # 2.03 insn per cycle + 2.120392458 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4424) (512y: 0) (512z: 2555) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index 68145ed810..71aecc0e65 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_21:19:29 +DATE: 2024-06-03_18:16:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.581017e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.707013e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.045464e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.486421 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.042762e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.498495e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.721786e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.481742 sec INFO: No Floating Point Exceptions have been reported - 2,005,170,721 cycles # 2.817 GHz - 2,885,331,094 instructions # 1.44 insn per cycle - 0.769664626 seconds time elapsed + 1,994,870,659 cycles # 2.822 GHz + 2,887,372,486 instructions # 1.45 insn per cycle + 0.763571243 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/runTest_cuda.exe @@ -68,8 +68,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +Avg ME (F77/GPU) = 2.0288499356247485 +Relative difference = 1.9191351362116207e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/check_hip.exe @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.592369e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.693697e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.693697e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.582587e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.683226e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.683226e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.133554 sec +TOTAL : 4.148957 sec INFO: No Floating Point Exceptions have been reported - 11,751,435,134 cycles # 2.840 GHz - 35,107,900,053 instructions # 2.99 insn per cycle - 4.139079374 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 470) (avx2: 0) (512y: 0) (512z: 0) + 11,805,670,893 cycles # 2.843 GHz + 35,094,586,200 instructions # 2.97 insn per cycle + 4.154168605 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,15 +109,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.304158e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.779755e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.779755e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.294390e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.771165e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.771165e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.066374 sec +TOTAL : 2.069902 sec INFO: No Floating Point Exceptions have been reported - 5,955,734,514 cycles # 2.876 GHz - 14,470,820,860 instructions # 2.43 insn per cycle - 2.071726681 seconds time elapsed + 5,951,056,214 cycles # 2.869 GHz + 14,470,509,640 instructions # 2.43 insn per cycle + 2.075071006 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 2572) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest_cpp.exe @@ -137,16 +137,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.262038e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.115917e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.115917e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.194905e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.053976e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.053976e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.531929 sec +TOTAL : 1.546412 sec INFO: No Floating Point Exceptions have been reported - 4,141,893,808 cycles # 2.695 GHz - 8,874,492,613 instructions # 2.14 insn per cycle - 1.537451545 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3574) (512y: 0) (512z: 0) + 4,171,092,335 cycles # 2.689 GHz + 8,883,426,870 instructions # 2.13 insn per cycle + 1.551600108 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3580) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -154,8 +154,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182107033208 -Relative difference = 1.0385521077446488e-07 +Avg ME (F77/C++) = 2.0288182104704902 +Relative difference = 1.0374044905426431e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -165,16 +165,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.243724e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.092067e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.092067e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.232209e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.105029e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.105029e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.535382 sec +TOTAL : 1.537952 sec INFO: No Floating Point Exceptions have been reported - 4,153,568,465 cycles # 2.697 GHz - 8,412,828,463 instructions # 2.03 insn per cycle - 1.540818202 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3319) (512y: 0) (512z: 0) + 4,138,913,630 cycles # 2.684 GHz + 8,410,635,266 instructions # 2.03 insn per cycle + 1.543068814 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3314) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -182,8 +182,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182107033208 -Relative difference = 1.0385521077446488e-07 +Avg ME (F77/C++) = 2.0288182104704902 +Relative difference = 1.0374044905426431e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -193,15 +193,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.345549e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.788380e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.788380e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.264406e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.694500e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.694500e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.050082 sec +TOTAL : 2.081032 sec INFO: No Floating Point Exceptions have been reported - 3,776,688,425 cycles # 1.838 GHz - 7,700,644,489 instructions # 2.04 insn per cycle - 2.055466903 seconds time elapsed + 3,784,430,137 cycles # 1.815 GHz + 7,701,504,575 instructions # 2.04 insn per cycle + 2.086318740 seconds time elapsed =Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3440) (512y: 0) (512z: 2107) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest_cpp.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index ac74dccede..a7f36bcee1 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_20:53:07 +DATE: 2024-06-03_17:59:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.554923e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.166179e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.277525e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.507163e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.163262e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276787e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.530564 sec +TOTAL : 0.538303 sec INFO: No Floating Point Exceptions have been reported - 2,183,555,188 cycles # 2.822 GHz - 3,144,815,139 instructions # 1.44 insn per cycle - 0.830734274 seconds time elapsed + 2,165,151,740 cycles # 2.801 GHz + 3,099,392,657 instructions # 1.43 insn per cycle + 0.834137975 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.007712e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.065929e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.065929e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.771690e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.816324e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.816324e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.334891 sec +TOTAL : 6.027557 sec INFO: No Floating Point Exceptions have been reported - 15,289,472,900 cycles # 2.864 GHz - 38,577,953,274 instructions # 2.52 insn per cycle - 5.340497151 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 672) (avx2: 0) (512y: 0) (512z: 0) + 17,393,761,998 cycles # 2.884 GHz + 46,095,398,077 instructions # 2.65 insn per cycle + 6.033053485 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 631) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.464249e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.655908e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.655908e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.091341e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.242349e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.242349e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.137831 sec +TOTAL : 3.502392 sec INFO: No Floating Point Exceptions have been reported - 8,973,826,641 cycles # 2.856 GHz - 24,223,107,065 instructions # 2.70 insn per cycle - 3.143393706 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0) + 9,983,647,191 cycles # 2.847 GHz + 27,594,046,832 instructions # 2.76 insn per cycle + 3.507812390 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2593) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.480038e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.956414e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.956414e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.924038e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.302955e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.302955e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.024686 sec +TOTAL : 2.239342 sec INFO: No Floating Point Exceptions have been reported - 5,399,489,629 cycles # 2.661 GHz - 11,276,345,804 instructions # 2.09 insn per cycle - 2.030286202 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2480) (512y: 0) (512z: 0) + 6,002,425,078 cycles # 2.675 GHz + 12,490,276,444 instructions # 2.08 insn per cycle + 2.244729020 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2780) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.087786e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.678670e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.678670e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.391265e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.848955e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.848955e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.832786 sec +TOTAL : 2.055015 sec INFO: No Floating Point Exceptions have been reported - 4,867,249,834 cycles # 2.649 GHz - 10,525,904,310 instructions # 2.16 insn per cycle - 1.838359941 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2167) (512y: 148) (512z: 0) + 5,505,020,906 cycles # 2.674 GHz + 11,927,639,782 instructions # 2.17 insn per cycle + 2.060518547 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2531) (512y: 146) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,16 +193,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.703127e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.911454e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.911454e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.468701e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.651354e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.651354e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.942466 sec +TOTAL : 3.133138 sec INFO: No Floating Point Exceptions have been reported - 5,244,087,950 cycles # 1.780 GHz - 7,604,896,768 instructions # 1.45 insn per cycle - 2.947952496 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1633) (512y: 126) (512z: 1608) + 5,572,843,285 cycles # 1.776 GHz + 8,116,354,124 instructions # 1.46 insn per cycle + 3.138888887 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1668) (512y: 126) (512z: 1862) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index e93587dbb1..0d17bfb092 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_20:53:31 +DATE: 2024-06-03_18:00:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.488756e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.158798e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.278167e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.507488e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.165492e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276653e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.532803 sec +TOTAL : 0.530194 sec INFO: No Floating Point Exceptions have been reported - 2,183,879,430 cycles # 2.808 GHz - 3,134,949,776 instructions # 1.44 insn per cycle - 0.834793767 seconds time elapsed + 2,167,878,364 cycles # 2.823 GHz + 3,109,854,489 instructions # 1.43 insn per cycle + 0.825240075 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.997980e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.055486e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.055486e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.805887e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.852897e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.852897e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.359665 sec +TOTAL : 5.915886 sec INFO: No Floating Point Exceptions have been reported - 15,360,836,715 cycles # 2.864 GHz - 40,374,148,262 instructions # 2.63 insn per cycle - 5.365170950 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) + 16,977,594,162 cycles # 2.868 GHz + 45,124,282,986 instructions # 2.66 insn per cycle + 5.921217258 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 582) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.653686e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.867948e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.867948e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.290200e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.461592e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.461592e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.980978 sec +TOTAL : 3.297067 sec INFO: No Floating Point Exceptions have been reported - 8,537,424,840 cycles # 2.860 GHz - 23,255,933,901 instructions # 2.72 insn per cycle - 2.986536514 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2091) (avx2: 0) (512y: 0) (512z: 0) + 9,474,741,584 cycles # 2.870 GHz + 26,246,059,686 instructions # 2.77 insn per cycle + 3.302677343 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2397) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -137,16 +137,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.675454e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.025638e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.025638e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.354754e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.651843e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.651843e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.354095 sec +TOTAL : 2.518683 sec INFO: No Floating Point Exceptions have been reported - 6,271,057,831 cycles # 2.659 GHz - 12,961,948,705 instructions # 2.07 insn per cycle - 2.359820621 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2669) (512y: 0) (512z: 0) + 6,735,700,717 cycles # 2.669 GHz + 14,039,091,079 instructions # 2.08 insn per cycle + 2.524292410 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2902) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.936868e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.317677e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.317677e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.594752e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.925919e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.925919e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.234617 sec +TOTAL : 2.392171 sec INFO: No Floating Point Exceptions have been reported - 5,930,155,286 cycles # 2.648 GHz - 12,239,916,481 instructions # 2.06 insn per cycle - 2.240091992 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2209) (512y: 296) (512z: 0) + 6,402,125,730 cycles # 2.671 GHz + 13,528,701,240 instructions # 2.11 insn per cycle + 2.397739364 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2547) (512y: 302) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,16 +193,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.444871e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.625260e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.625260e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.464006e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.644808e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.644808e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.154913 sec +TOTAL : 3.137581 sec INFO: No Floating Point Exceptions have been reported - 5,603,434,208 cycles # 1.774 GHz - 8,746,306,669 instructions # 1.56 insn per cycle - 3.160521781 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1490) (512y: 183) (512z: 1909) + 5,585,907,221 cycles # 1.778 GHz + 9,215,484,758 instructions # 1.65 insn per cycle + 3.143155754 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1453) (512y: 212) (512z: 2059) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index d2a9436bac..ebda5e548c 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-06-02_20:53:55 +DATE: 2024-06-03_18:00:50 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.828792e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.058201e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.072605e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.496575e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.048174e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.064535e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.469991 sec +TOTAL : 0.472291 sec INFO: No Floating Point Exceptions have been reported - 1,950,502,519 cycles # 2.813 GHz - 2,806,448,865 instructions # 1.44 insn per cycle - 0.749484229 seconds time elapsed + 1,944,574,836 cycles # 2.817 GHz + 2,773,147,259 instructions # 1.43 insn per cycle + 0.747270416 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.080589e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.327376e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.341190e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.086792e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.326818e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.340800e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.609725 sec +TOTAL : 0.615400 sec INFO: No Floating Point Exceptions have been reported - 2,396,738,847 cycles # 2.820 GHz - 3,668,256,125 instructions # 1.53 insn per cycle - 0.908964727 seconds time elapsed + 2,409,226,641 cycles # 2.819 GHz + 3,595,245,677 instructions # 1.49 insn per cycle + 0.915650252 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.392275e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.404239e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.404239e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.374863e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.386689e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.386689e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.873703 sec +TOTAL : 6.922997 sec INFO: No Floating Point Exceptions have been reported - 19,791,549,123 cycles # 2.878 GHz - 59,606,317,603 instructions # 3.01 insn per cycle - 6.878041961 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) + 19,884,239,052 cycles # 2.871 GHz + 59,920,809,546 instructions # 3.01 insn per cycle + 6.927215258 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1212) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.568257e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.612303e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.612303e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.463550e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.505286e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.505286e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.610171 sec +TOTAL : 3.694475 sec INFO: No Floating Point Exceptions have been reported - 10,370,530,942 cycles # 2.870 GHz - 30,676,186,235 instructions # 2.96 insn per cycle - 3.614641811 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) + 10,604,185,605 cycles # 2.867 GHz + 31,094,882,685 instructions # 2.93 insn per cycle + 3.698744087 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5233) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.953227e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.119594e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.119594e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.858143e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.020160e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.020160e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.853859 sec +TOTAL : 1.873129 sec INFO: No Floating Point Exceptions have been reported - 4,895,759,086 cycles # 2.636 GHz - 11,018,740,137 instructions # 2.25 insn per cycle - 1.858168783 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) + 4,996,519,277 cycles # 2.663 GHz + 11,413,075,235 instructions # 2.28 insn per cycle + 1.877381447 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4653) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 +Avg ME (F77/C++) = 1.4131213684416466 +Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.002285e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.022719e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.022719e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.939252e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.013970e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.013970e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.658467 sec +TOTAL : 1.671902 sec INFO: No Floating Point Exceptions have been reported - 4,377,789,034 cycles # 2.634 GHz - 10,296,146,857 instructions # 2.35 insn per cycle - 1.662826466 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) + 4,459,214,327 cycles # 2.662 GHz + 10,671,791,558 instructions # 2.39 insn per cycle + 1.676188201 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4395) (512y: 91) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 +Avg ME (F77/C++) = 1.4131213684416466 +Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.862327e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.962239e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.962239e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.825648e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.923292e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.923292e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.413722 sec +TOTAL : 2.426057 sec INFO: No Floating Point Exceptions have been reported - 4,103,494,859 cycles # 1.698 GHz - 5,842,470,718 instructions # 1.42 insn per cycle - 2.418058321 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) + 4,141,058,833 cycles # 1.704 GHz + 5,974,244,939 instructions # 1.44 insn per cycle + 2.430391084 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1629) (512y: 95) (512z: 3576) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index a85c881c90..eb34157c53 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-06-02_21:28:04 +DATE: 2024-06-03_18:25:19 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.555829e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.818637e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.818637e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.584531e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.976756e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.976756e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.500626 sec +TOTAL : 0.499776 sec INFO: No Floating Point Exceptions have been reported - 2,009,885,240 cycles # 2.812 GHz - 3,066,226,195 instructions # 1.53 insn per cycle - 0.771725422 seconds time elapsed + 2,012,995,564 cycles # 2.821 GHz + 3,041,094,374 instructions # 1.51 insn per cycle + 0.770525018 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,15 +79,15 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.701616e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.932321e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.932321e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.717101e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.982761e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.982761e+06 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.829861 sec +TOTAL : 0.825846 sec INFO: No Floating Point Exceptions have been reported - 3,061,531,665 cycles # 2.832 GHz - 4,949,277,454 instructions # 1.62 insn per cycle - 1.138627743 seconds time elapsed + 3,047,783,214 cycles # 2.833 GHz + 4,940,094,414 instructions # 1.62 insn per cycle + 1.136340341 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.385150e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.397380e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.397380e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.364897e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.377090e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.377090e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.902584 sec +TOTAL : 6.960938 sec INFO: No Floating Point Exceptions have been reported - 19,794,872,013 cycles # 2.866 GHz - 59,611,558,170 instructions # 3.01 insn per cycle - 6.907141027 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) + 19,949,623,161 cycles # 2.865 GHz + 59,928,901,076 instructions # 3.00 insn per cycle + 6.965534824 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1212) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -138,16 +138,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.557869e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.602026e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.602026e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.452940e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.495356e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.495356e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.626237 sec +TOTAL : 3.712004 sec INFO: No Floating Point Exceptions have been reported - 10,407,659,030 cycles # 2.867 GHz - 30,722,342,234 instructions # 2.95 insn per cycle - 3.630843347 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) + 10,658,944,463 cycles # 2.869 GHz + 31,146,122,122 instructions # 2.92 insn per cycle + 3.716787966 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5233) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -167,16 +167,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.913116e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.082779e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.082779e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.837623e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.005313e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.005313e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.870398 sec +TOTAL : 1.887457 sec INFO: No Floating Point Exceptions have been reported - 4,945,675,416 cycles # 2.639 GHz - 11,067,795,090 instructions # 2.24 insn per cycle - 1.874949750 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) + 5,042,590,042 cycles # 2.666 GHz + 11,463,290,873 instructions # 2.27 insn per cycle + 1.892080136 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4653) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -184,8 +184,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 +Avg ME (F77/C++) = 1.4131213684416466 +Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -196,16 +196,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.991691e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.020412e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.020412e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.912443e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.011944e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.011944e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.672210 sec +TOTAL : 1.684689 sec INFO: No Floating Point Exceptions have been reported - 4,419,204,279 cycles # 2.637 GHz - 10,345,034,833 instructions # 2.34 insn per cycle - 1.676771727 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) + 4,502,957,413 cycles # 2.667 GHz + 10,720,861,961 instructions # 2.38 insn per cycle + 1.689237193 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4395) (512y: 91) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -213,8 +213,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 +Avg ME (F77/C++) = 1.4131213684416466 +Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -225,16 +225,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.843006e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.942637e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.942637e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.781909e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.879331e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.879331e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.429398 sec +TOTAL : 2.451274 sec INFO: No Floating Point Exceptions have been reported - 4,154,030,623 cycles # 1.707 GHz - 5,882,157,165 instructions # 1.42 insn per cycle - 2.434135528 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) + 4,196,086,430 cycles # 1.709 GHz + 6,013,875,420 instructions # 1.43 insn per cycle + 2.455998982 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1629) (512y: 95) (512z: 3576) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 9c5400dc3c..a30e15379c 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-06-02_20:54:21 +DATE: 2024-06-03_18:01:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.728787e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.040296e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.054902e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.445164e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.040455e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056751e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.469961 sec +TOTAL : 0.474176 sec INFO: No Floating Point Exceptions have been reported - 1,945,140,914 cycles # 2.814 GHz - 2,797,132,683 instructions # 1.44 insn per cycle - 0.748195009 seconds time elapsed + 1,943,847,656 cycles # 2.811 GHz + 2,740,790,877 instructions # 1.41 insn per cycle + 0.749383492 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.071125e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.312843e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.325872e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.078272e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.313524e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.326650e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.609321 sec +TOTAL : 0.608270 sec INFO: No Floating Point Exceptions have been reported - 2,388,183,446 cycles # 2.820 GHz - 3,624,721,355 instructions # 1.52 insn per cycle - 0.907535221 seconds time elapsed + 2,398,054,354 cycles # 2.828 GHz + 3,623,922,979 instructions # 1.51 insn per cycle + 0.907154023 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.416052e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.428253e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.428253e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.369615e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.381392e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.381392e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.805799 sec +TOTAL : 6.938575 sec INFO: No Floating Point Exceptions have been reported - 19,509,656,309 cycles # 2.865 GHz - 58,797,581,425 instructions # 3.01 insn per cycle - 6.810019541 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1313) (avx2: 0) (512y: 0) (512z: 0) + 19,924,877,146 cycles # 2.871 GHz + 60,135,086,712 instructions # 3.02 insn per cycle + 6.942820252 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1335) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.624636e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.669607e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.669607e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.537192e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.580141e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.580141e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.566261 sec +TOTAL : 3.634362 sec INFO: No Floating Point Exceptions have been reported - 10,224,672,260 cycles # 2.864 GHz - 30,345,523,778 instructions # 2.97 insn per cycle - 3.570675168 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4970) (avx2: 0) (512y: 0) (512z: 0) + 10,428,527,121 cycles # 2.867 GHz + 30,694,765,319 instructions # 2.94 insn per cycle + 3.638602748 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5059) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.622392e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.780322e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.780322e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.601162e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.754382e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.754382e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.924079 sec +TOTAL : 1.928356 sec INFO: No Floating Point Exceptions have been reported - 5,063,711,278 cycles # 2.628 GHz - 11,483,381,207 instructions # 2.27 insn per cycle - 1.928457003 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4591) (512y: 0) (512z: 0) + 5,139,069,824 cycles # 2.660 GHz + 11,845,314,592 instructions # 2.30 insn per cycle + 1.932635801 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4759) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 +Avg ME (F77/C++) = 1.4131213684416466 +Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.428126e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.609483e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.609483e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.349025e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.527469e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.527469e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.762110 sec +TOTAL : 1.775911 sec INFO: No Floating Point Exceptions have been reported - 4,654,381,781 cycles # 2.637 GHz - 10,841,512,729 instructions # 2.33 insn per cycle - 1.766492012 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4183) (512y: 244) (512z: 0) + 4,732,346,737 cycles # 2.660 GHz + 11,170,822,632 instructions # 2.36 insn per cycle + 1.780229441 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4420) (512y: 245) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 +Avg ME (F77/C++) = 1.4131213684416466 +Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.831561e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.933974e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.933974e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.768854e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.866037e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.866037e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.424320 sec +TOTAL : 2.446344 sec INFO: No Floating Point Exceptions have been reported - 4,122,188,832 cycles # 1.698 GHz - 6,106,386,209 instructions # 1.48 insn per cycle - 2.428663337 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1457) (512y: 139) (512z: 3568) + 4,167,220,480 cycles # 1.701 GHz + 6,225,852,056 instructions # 1.49 insn per cycle + 2.450525643 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1525) (512y: 140) (512z: 3678) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 25f5a9a1db..d5011c542d 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-06-02_20:54:47 +DATE: 2024-06-03_18:01:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.457737e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.273716e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.366330e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.164978e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.938816e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.022860e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.454346 sec +TOTAL : 0.455835 sec INFO: No Floating Point Exceptions have been reported - 1,882,812,006 cycles # 2.812 GHz - 2,667,678,817 instructions # 1.42 insn per cycle - 0.728316472 seconds time elapsed + 1,889,397,429 cycles # 2.815 GHz + 2,651,505,840 instructions # 1.40 insn per cycle + 0.728542305 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 227 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= @@ -67,24 +67,24 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.267220e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.427979e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.526801e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.499240 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.909385e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.914593e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.984981e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.630097e+02 +- 4.770717e+02 ) GeV^-2 +TOTAL : 0.505996 sec INFO: No Floating Point Exceptions have been reported - 2,052,043,922 cycles # 2.819 GHz - 2,965,868,392 instructions # 1.45 insn per cycle - 0.784455397 seconds time elapsed + 2,072,435,695 cycles # 2.821 GHz + 3,006,042,205 instructions # 1.45 insn per cycle + 0.792850900 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 1.412608e+00 -Avg ME (F77/GPU) = 1.4132214346515752 -Relative difference = 0.00043425681546129636 +Avg ME (C++/GPU) = 1.412607e+00 +Avg ME (F77/GPU) = 1.4132214305330990 +Relative difference = 0.0004349621183379836 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe @@ -96,25 +96,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.468259e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.481049e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.481049e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.455284e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.468142e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.468142e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.660760 sec +TOTAL : 6.697187 sec INFO: No Floating Point Exceptions have been reported - 19,087,341,831 cycles # 2.864 GHz - 58,960,382,092 instructions # 3.09 insn per cycle - 6.664849133 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) + 19,231,773,979 cycles # 2.870 GHz + 59,620,868,157 instructions # 3.10 insn per cycle + 6.701301879 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 972) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129858051842916 -Relative difference = 1.3787518662898538e-07 +Avg ME (C++/C++) = 1.412995e+00 +Avg ME (F77/C++) = 1.4129949096991936 +Relative difference = 6.390737857384068e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -124,25 +124,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.119436e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.261857e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.261857e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.883262e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.018595e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.018595e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.039807 sec +TOTAL : 2.099868 sec INFO: No Floating Point Exceptions have been reported - 5,851,713,678 cycles # 2.864 GHz - 16,693,562,801 instructions # 2.85 insn per cycle - 2.044009980 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) + 6,023,746,068 cycles # 2.864 GHz + 17,069,553,194 instructions # 2.83 insn per cycle + 2.104057020 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5867) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412987e+00 -Avg ME (F77/C++) = 1.4129865669244737 -Relative difference = 3.06496469061158e-07 +Avg ME (C++/C++) = 1.412995e+00 +Avg ME (F77/C++) = 1.4129954647353316 +Relative difference = 3.2890090308261873e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -152,25 +152,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.728939e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.791004e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.791004e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.969206 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.693129e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.752626e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.752626e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 0.989464 sec INFO: No Floating Point Exceptions have been reported - 2,595,644,078 cycles # 2.669 GHz - 5,979,320,953 instructions # 2.30 insn per cycle - 0.973332836 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) + 2,644,653,272 cycles # 2.663 GHz + 6,193,862,153 instructions # 2.34 insn per cycle + 0.993621228 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5109) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (C++/C++) = 1.413313e+00 +Avg ME (F77/C++) = 1.4133132969790267 +Relative difference = 2.1012969292986113e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -180,25 +180,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.907804e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.986792e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.986792e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.881217 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.865244e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.937490e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.937490e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 0.900025 sec INFO: No Floating Point Exceptions have been reported - 2,345,880,719 cycles # 2.652 GHz - 5,602,748,051 instructions # 2.39 insn per cycle - 0.885574348 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) + 2,409,522,680 cycles # 2.666 GHz + 5,798,061,377 instructions # 2.41 insn per cycle + 0.904266583 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4920) (512y: 36) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (C++/C++) = 1.413313e+00 +Avg ME (F77/C++) = 1.4133132969790267 +Relative difference = 2.1012969292986113e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.406889e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.449235e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.449235e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.387108e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.427869e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.427869e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.188517 sec +TOTAL : 1.205183 sec INFO: No Floating Point Exceptions have been reported - 2,058,034,828 cycles # 1.727 GHz - 3,333,328,616 instructions # 1.62 insn per cycle - 1.192698457 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2144) (512y: 39) (512z: 3675) + 2,084,041,087 cycles # 1.725 GHz + 3,398,650,768 instructions # 1.63 insn per cycle + 1.209392897 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2238) (512y: 39) (512z: 3787) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index e87a092429..27d39c227c 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-06-02_21:28:30 +DATE: 2024-06-03_18:25:45 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,21 +53,21 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.706813e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.038920e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.038920e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.785942e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.033343e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.033343e+07 ) sec^-1 MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.466185 sec +TOTAL : 0.464883 sec INFO: No Floating Point Exceptions have been reported - 1,911,951,129 cycles # 2.814 GHz - 2,841,120,455 instructions # 1.49 insn per cycle - 0.735601781 seconds time elapsed + 1,914,723,949 cycles # 2.822 GHz + 2,841,749,042 instructions # 1.48 insn per cycle + 0.734851797 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 227 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= @@ -79,24 +79,24 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.584141e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.645916e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.645916e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 -TOTAL : 0.648144 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.584091e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.526997e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.526997e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.737499e+02 +- 4.776369e+02 ) GeV^-2 +TOTAL : 0.646853 sec INFO: No Floating Point Exceptions have been reported - 2,489,864,035 cycles # 2.825 GHz - 3,827,644,098 instructions # 1.54 insn per cycle - 0.938216487 seconds time elapsed + 2,489,386,165 cycles # 2.829 GHz + 3,848,805,319 instructions # 1.55 insn per cycle + 0.936355064 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 1.412608e+00 -Avg ME (F77/GPU) = 1.4132214346515752 -Relative difference = 0.00043425681546129636 +Avg ME (C++/GPU) = 1.412607e+00 +Avg ME (F77/GPU) = 1.4132214305330990 +Relative difference = 0.0004349621183379836 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe @@ -109,25 +109,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.480910e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.493960e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.493960e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.451669e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.464494e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.464494e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.630728 sec +TOTAL : 6.709628 sec INFO: No Floating Point Exceptions have been reported - 19,100,779,078 cycles # 2.879 GHz - 58,964,120,971 instructions # 3.09 insn per cycle - 6.635020831 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) + 19,244,071,354 cycles # 2.867 GHz + 59,624,819,172 instructions # 3.10 insn per cycle + 6.714018744 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 972) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129858051842916 -Relative difference = 1.3787518662898538e-07 +Avg ME (C++/C++) = 1.412995e+00 +Avg ME (F77/C++) = 1.4129949096991936 +Relative difference = 6.390737857384068e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -138,25 +138,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.112476e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.258255e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.258255e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.876682e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.018166e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.018166e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.046410 sec +TOTAL : 2.106720 sec INFO: No Floating Point Exceptions have been reported - 5,888,238,325 cycles # 2.872 GHz - 16,741,878,300 instructions # 2.84 insn per cycle - 2.050817243 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) + 6,048,127,238 cycles # 2.866 GHz + 17,116,783,460 instructions # 2.83 insn per cycle + 2.111089880 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5867) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412987e+00 -Avg ME (F77/C++) = 1.4129865669244737 -Relative difference = 3.06496469061158e-07 +Avg ME (C++/C++) = 1.412995e+00 +Avg ME (F77/C++) = 1.4129954647353316 +Relative difference = 3.2890090308261873e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -167,25 +167,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.723304e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.786360e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.786360e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.977338 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.653152e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.712585e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.712585e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.017936 sec INFO: No Floating Point Exceptions have been reported - 2,615,739,499 cycles # 2.667 GHz - 6,017,192,558 instructions # 2.30 insn per cycle - 0.981679944 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) + 2,674,239,143 cycles # 2.617 GHz + 6,230,764,637 instructions # 2.33 insn per cycle + 1.022355051 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5109) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (C++/C++) = 1.413313e+00 +Avg ME (F77/C++) = 1.4133132969790267 +Relative difference = 2.1012969292986113e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -196,25 +196,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.915953e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.994516e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.994516e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.881911 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.856094e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.929527e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.929527e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 0.910035 sec INFO: No Floating Point Exceptions have been reported - 2,367,964,767 cycles # 2.674 GHz - 5,639,235,283 instructions # 2.38 insn per cycle - 0.886220730 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) + 2,430,364,587 cycles # 2.661 GHz + 5,835,243,014 instructions # 2.40 insn per cycle + 0.914365378 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4920) (512y: 36) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (C++/C++) = 1.413313e+00 +Avg ME (F77/C++) = 1.4133132969790267 +Relative difference = 2.1012969292986113e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -225,16 +225,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.400847e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.442998e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.442998e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.383498e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.424523e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.424523e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.198499 sec +TOTAL : 1.213191 sec INFO: No Floating Point Exceptions have been reported - 2,084,095,957 cycles # 1.733 GHz - 3,374,916,702 instructions # 1.62 insn per cycle - 1.202990814 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2144) (512y: 39) (512z: 3675) + 2,110,463,788 cycles # 1.734 GHz + 3,440,119,960 instructions # 1.63 insn per cycle + 1.217545966 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2238) (512y: 39) (512z: 3787) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index b3b78f68de..016a71e02c 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-06-02_20:55:08 +DATE: 2024-06-03_18:02:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.497603e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.303121e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.398772e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.157640e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.930293e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.025008e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.451033 sec +TOTAL : 0.454926 sec INFO: No Floating Point Exceptions have been reported - 1,878,344,567 cycles # 2.819 GHz - 2,673,672,832 instructions # 1.42 insn per cycle - 0.723231427 seconds time elapsed + 1,891,883,564 cycles # 2.818 GHz + 2,657,665,460 instructions # 1.40 insn per cycle + 0.728906539 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 221 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= @@ -67,24 +67,24 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.242647e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.389213e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.475124e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.498086 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.937550e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.960296e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.028764e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.630097e+02 +- 4.770717e+02 ) GeV^-2 +TOTAL : 0.506114 sec INFO: No Floating Point Exceptions have been reported - 2,053,404,426 cycles # 2.826 GHz - 2,966,260,637 instructions # 1.44 insn per cycle - 0.782979940 seconds time elapsed + 2,076,281,969 cycles # 2.822 GHz + 3,000,969,388 instructions # 1.45 insn per cycle + 0.793032794 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 1.412608e+00 -Avg ME (F77/GPU) = 1.4132214346515752 -Relative difference = 0.00043425681546129636 +Avg ME (C++/GPU) = 1.412607e+00 +Avg ME (F77/GPU) = 1.4132214305330990 +Relative difference = 0.0004349621183379836 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/check_hip.exe @@ -96,25 +96,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.479080e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.492104e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.492104e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.439985e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.452524e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.452524e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.631451 sec +TOTAL : 6.737070 sec INFO: No Floating Point Exceptions have been reported - 18,984,604,647 cycles # 2.862 GHz - 58,702,110,153 instructions # 3.09 insn per cycle - 6.635623922 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1029) (avx2: 0) (512y: 0) (512z: 0) + 19,431,566,977 cycles # 2.883 GHz + 59,361,173,188 instructions # 3.05 insn per cycle + 6.741226799 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1040) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129858051842916 -Relative difference = 1.3787518662898538e-07 +Avg ME (C++/C++) = 1.412995e+00 +Avg ME (F77/C++) = 1.4129949096991936 +Relative difference = 6.390737857384068e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -124,25 +124,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.506447e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.662703e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.662703e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.308400e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.454140e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.454140e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.947376 sec +TOTAL : 1.992790 sec INFO: No Floating Point Exceptions have been reported - 5,589,202,869 cycles # 2.865 GHz - 16,510,174,954 instructions # 2.95 insn per cycle - 1.951631264 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5551) (avx2: 0) (512y: 0) (512z: 0) + 5,753,899,727 cycles # 2.883 GHz + 16,856,331,975 instructions # 2.93 insn per cycle + 1.996902777 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5622) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.412987e+00 -Avg ME (F77/C++) = 1.4129865669244737 -Relative difference = 3.06496469061158e-07 +Avg ME (C++/C++) = 1.412995e+00 +Avg ME (F77/C++) = 1.4129954647353316 +Relative difference = 3.2890090308261873e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -152,25 +152,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.493410e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.539841e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.539841e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 1.118677 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.500767e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.547334e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.547334e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.113231 sec INFO: No Floating Point Exceptions have been reported - 2,976,233,441 cycles # 2.652 GHz - 6,633,667,708 instructions # 2.23 insn per cycle - 1.122890193 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5568) (512y: 0) (512z: 0) + 3,022,289,723 cycles # 2.706 GHz + 6,854,476,939 instructions # 2.27 insn per cycle + 1.117430952 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5739) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (C++/C++) = 1.413313e+00 +Avg ME (F77/C++) = 1.4133132969790267 +Relative difference = 2.1012969292986113e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -180,25 +180,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.625860e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.680500e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.680500e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 1.029211 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.597334e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.650106e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.650106e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.047323 sec INFO: No Floating Point Exceptions have been reported - 2,757,551,412 cycles # 2.670 GHz - 6,254,933,924 instructions # 2.27 insn per cycle - 1.033320955 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5279) (512y: 25) (512z: 0) + 2,810,159,694 cycles # 2.674 GHz + 6,444,290,635 instructions # 2.29 insn per cycle + 1.051666815 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5521) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (C++/C++) = 1.413313e+00 +Avg ME (F77/C++) = 1.4133132969790267 +Relative difference = 2.1012969292986113e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.290714e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.325632e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.325632e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.272649e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.306381e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.306381e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.293262 sec +TOTAL : 1.311231 sec INFO: No Floating Point Exceptions have been reported - 2,228,539,636 cycles # 1.719 GHz - 3,697,845,631 instructions # 1.66 insn per cycle - 1.297458184 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2391) (512y: 29) (512z: 3970) + 2,260,530,890 cycles # 1.720 GHz + 3,762,323,072 instructions # 1.66 insn per cycle + 1.315383432 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2469) (512y: 29) (512z: 4082) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 1aea1ca46b..7c9cec7e5b 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-06-02_20:55:29 +DATE: 2024-06-03_18:02:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.705675e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.040108e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.054231e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.364718e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.031397e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.047638e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.467689 sec +TOTAL : 0.473744 sec INFO: No Floating Point Exceptions have been reported - 1,951,012,614 cycles # 2.822 GHz - 2,780,514,605 instructions # 1.43 insn per cycle - 0.747566497 seconds time elapsed + 1,947,725,894 cycles # 2.818 GHz + 2,805,290,478 instructions # 1.44 insn per cycle + 0.747729470 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.071549e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.315004e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.328475e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.079036e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.315033e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.328388e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.609446 sec +TOTAL : 0.611082 sec INFO: No Floating Point Exceptions have been reported - 2,398,520,319 cycles # 2.827 GHz - 3,697,559,551 instructions # 1.54 insn per cycle - 0.906881942 seconds time elapsed + 2,407,302,120 cycles # 2.826 GHz + 3,697,067,837 instructions # 1.54 insn per cycle + 0.910744181 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.346860e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.358448e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.358448e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.334345e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.345812e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.345812e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 7.005881 sec +TOTAL : 7.042999 sec INFO: No Floating Point Exceptions have been reported - 20,061,545,492 cycles # 2.863 GHz - 60,534,513,586 instructions # 3.02 insn per cycle - 7.010263470 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1399) (avx2: 0) (512y: 0) (512z: 0) + 20,221,850,441 cycles # 2.870 GHz + 60,954,392,946 instructions # 3.01 insn per cycle + 7.047197015 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1233) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.628417e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.673382e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.673382e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.519746e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.562485e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.562485e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.563203 sec +TOTAL : 3.648318 sec INFO: No Floating Point Exceptions have been reported - 10,193,843,427 cycles # 2.858 GHz - 30,384,715,959 instructions # 2.98 insn per cycle - 3.567486704 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5280) (avx2: 0) (512y: 0) (512z: 0) + 10,489,870,662 cycles # 2.873 GHz + 30,832,759,019 instructions # 2.94 insn per cycle + 3.652585279 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5362) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.060307e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.230512e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.230512e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.905378e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.068897e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.068897e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.832180 sec +TOTAL : 1.863571 sec INFO: No Floating Point Exceptions have been reported - 4,873,743,401 cycles # 2.655 GHz - 10,979,146,931 instructions # 2.25 insn per cycle - 1.836546702 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4624) (512y: 0) (512z: 0) + 4,962,857,454 cycles # 2.658 GHz + 11,366,629,197 instructions # 2.29 insn per cycle + 1.867862143 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4782) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.032510e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.054207e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.054207e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.014145e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.035087e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.035087e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.610269 sec +TOTAL : 1.638933 sec INFO: No Floating Point Exceptions have been reported - 4,286,427,813 cycles # 2.656 GHz - 10,247,731,306 instructions # 2.39 insn per cycle - 1.614556045 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4280) (512y: 82) (512z: 0) + 4,385,977,782 cycles # 2.670 GHz + 10,616,380,005 instructions # 2.42 insn per cycle + 1.643255169 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4515) (512y: 83) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.692625e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.784575e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.784575e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.631134e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.722110e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.722110e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.474016 sec +TOTAL : 2.496704 sec INFO: No Floating Point Exceptions have been reported - 4,210,263,291 cycles # 1.700 GHz - 6,043,220,655 instructions # 1.44 insn per cycle - 2.478297594 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2066) (512y: 117) (512z: 3540) + 4,254,957,859 cycles # 1.702 GHz + 6,172,800,294 instructions # 1.45 insn per cycle + 2.500959660 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2141) (512y: 117) (512z: 3652) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index 1c6d0ff5f8..7f0d5e8677 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-06-02_20:55:55 +DATE: 2024-06-03_18:02:51 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.735452e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.041244e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.055299e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.462653e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.041036e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.057390e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.469831 sec +TOTAL : 0.473993 sec INFO: No Floating Point Exceptions have been reported - 1,948,866,708 cycles # 2.819 GHz - 2,802,491,668 instructions # 1.44 insn per cycle - 0.748686662 seconds time elapsed + 1,945,651,451 cycles # 2.818 GHz + 2,806,210,093 instructions # 1.44 insn per cycle + 0.748369834 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.070810e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.308874e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.321967e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.075770e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.309834e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.323116e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.605052 sec +TOTAL : 0.609160 sec INFO: No Floating Point Exceptions have been reported - 2,392,252,084 cycles # 2.830 GHz - 3,645,625,496 instructions # 1.52 insn per cycle - 0.903478683 seconds time elapsed + 2,396,352,595 cycles # 2.825 GHz + 3,579,944,538 instructions # 1.49 insn per cycle + 0.909918067 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.367403e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.379210e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.379210e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.327535e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.338914e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.338914e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.944976 sec +TOTAL : 7.062977 sec INFO: No Floating Point Exceptions have been reported - 19,868,797,568 cycles # 2.860 GHz - 59,935,823,047 instructions # 3.02 insn per cycle - 6.949220462 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1276) (avx2: 0) (512y: 0) (512z: 0) + 20,277,498,382 cycles # 2.870 GHz + 61,179,962,230 instructions # 3.02 insn per cycle + 7.067219454 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1285) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.689813e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.736104e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.736104e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.581110e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.625265e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.625265e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.516734 sec +TOTAL : 3.599791 sec INFO: No Floating Point Exceptions have been reported - 10,083,295,126 cycles # 2.864 GHz - 30,097,719,684 instructions # 2.98 insn per cycle - 3.521023820 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5082) (avx2: 0) (512y: 0) (512z: 0) + 10,346,310,423 cycles # 2.871 GHz + 30,542,909,720 instructions # 2.95 insn per cycle + 3.604137951 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5166) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.780572e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.943341e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.943341e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.602120e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.754758e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.754758e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.889615 sec +TOTAL : 1.928065 sec INFO: No Floating Point Exceptions have been reported - 5,024,798,861 cycles # 2.654 GHz - 11,482,219,428 instructions # 2.29 insn per cycle - 1.893950854 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4723) (512y: 0) (512z: 0) + 5,154,340,530 cycles # 2.668 GHz + 11,880,615,676 instructions # 2.30 insn per cycle + 1.932456074 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4893) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.644667e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.830260e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.830260e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.479310e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.664048e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.664048e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.722143 sec +TOTAL : 1.751853 sec INFO: No Floating Point Exceptions have been reported - 4,588,199,336 cycles # 2.659 GHz - 10,809,611,838 instructions # 2.36 insn per cycle - 1.726402099 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4285) (512y: 234) (512z: 0) + 4,683,675,227 cycles # 2.668 GHz + 11,173,952,183 instructions # 2.39 insn per cycle + 1.756187164 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4520) (512y: 238) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.668589e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.758731e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.758731e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.604900e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.695150e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.695150e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.482767 sec +TOTAL : 2.505893 sec INFO: No Floating Point Exceptions have been reported - 4,227,913,144 cycles # 1.701 GHz - 6,273,317,964 instructions # 1.48 insn per cycle - 2.486971348 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1961) (512y: 163) (512z: 3617) + 4,267,946,917 cycles # 1.701 GHz + 6,413,270,989 instructions # 1.50 insn per cycle + 2.510099754 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2037) (512y: 163) (512z: 3730) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 06aa0981a7..bae1743dda 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_20:56:21 +DATE: 2024-06-03_18:03:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.484170e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.510124e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.512593e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.448075e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.480692e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.483372e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.529714 sec +TOTAL : 0.534727 sec INFO: No Floating Point Exceptions have been reported - 2,187,150,016 cycles # 2.828 GHz - 3,407,204,108 instructions # 1.56 insn per cycle - 0.831981040 seconds time elapsed + 2,160,892,959 cycles # 2.821 GHz + 3,385,854,093 instructions # 1.57 insn per cycle + 0.824886873 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.126702e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.160633e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.162012e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.128072e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.161312e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.162714e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.045826 sec +TOTAL : 3.052385 sec INFO: No Floating Point Exceptions have been reported - 9,422,847,840 cycles # 2.851 GHz - 20,052,737,736 instructions # 2.13 insn per cycle - 3.360343374 seconds time elapsed + 9,488,215,870 cycles # 2.858 GHz + 20,102,748,501 instructions # 2.12 insn per cycle + 3.378618806 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.837101e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.837996e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.837996e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.833875e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.834753e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.834753e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.937233 sec +TOTAL : 8.952951 sec INFO: No Floating Point Exceptions have been reported - 25,623,864,923 cycles # 2.867 GHz - 78,942,890,669 instructions # 3.08 insn per cycle - 8.941589016 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) + 25,692,730,603 cycles # 2.869 GHz + 78,963,630,993 instructions # 3.07 insn per cycle + 8.957178674 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.527505e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.530758e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.530758e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.418565e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.421561e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.421561e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.659217 sec +TOTAL : 4.806739 sec INFO: No Floating Point Exceptions have been reported - 12,887,852,449 cycles # 2.765 GHz - 39,283,888,678 instructions # 3.05 insn per cycle - 4.663553220 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) + 13,124,597,322 cycles # 2.729 GHz + 39,567,241,453 instructions # 3.01 insn per cycle + 4.811058536 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13211) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.819266e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.834708e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.834708e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.807184e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.822253e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.822253e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.106980 sec +TOTAL : 2.109980 sec INFO: No Floating Point Exceptions have been reported - 5,581,493,843 cycles # 2.645 GHz - 13,685,869,165 instructions # 2.45 insn per cycle - 2.111397973 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) + 5,625,324,473 cycles # 2.662 GHz + 13,831,103,677 instructions # 2.46 insn per cycle + 2.114152989 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11548) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.940031e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.960919e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.960919e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.917724e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.938311e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.938311e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.844023 sec +TOTAL : 1.848430 sec INFO: No Floating Point Exceptions have been reported - 4,890,407,622 cycles # 2.647 GHz - 12,340,850,912 instructions # 2.52 insn per cycle - 1.848367657 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) + 4,937,289,641 cycles # 2.666 GHz + 12,512,672,274 instructions # 2.53 insn per cycle + 1.852715240 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10473) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.735647e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.747350e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.747350e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.698211e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.709522e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.709522e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.444477 sec +TOTAL : 2.458283 sec INFO: No Floating Point Exceptions have been reported - 4,109,625,734 cycles # 1.679 GHz - 6,334,694,015 instructions # 1.54 insn per cycle - 2.448820329 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) + 4,149,503,000 cycles # 1.686 GHz + 6,398,619,235 instructions # 1.54 insn per cycle + 2.462674392 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1998) (512y: 102) (512z: 9391) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index 638dc04e22..591f3c3a40 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:29:17 +DATE: 2024-06-03_18:26:32 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.091974e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.434049e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.434049e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.120114e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.465842e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.465842e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.520202 sec +TOTAL : 0.523164 sec INFO: No Floating Point Exceptions have been reported - 2,115,997,044 cycles # 2.821 GHz - 3,356,177,989 instructions # 1.59 insn per cycle - 0.810142197 seconds time elapsed + 2,123,157,197 cycles # 2.820 GHz + 3,317,966,729 instructions # 1.56 insn per cycle + 0.813410467 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,15 +79,15 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.632273e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.129586e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.129586e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.633710e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.125236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.125236e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.322225 sec +TOTAL : 3.321653 sec INFO: No Floating Point Exceptions have been reported - 10,271,397,573 cycles # 2.856 GHz - 22,004,537,141 instructions # 2.14 insn per cycle - 3.652772092 seconds time elapsed + 10,250,803,498 cycles # 2.854 GHz + 21,387,097,503 instructions # 2.09 insn per cycle + 3.648104166 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.833131e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.834044e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.834044e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.828668e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.829556e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.829556e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.960944 sec +TOTAL : 8.983076 sec INFO: No Floating Point Exceptions have been reported - 25,662,418,157 cycles # 2.863 GHz - 78,944,265,965 instructions # 3.08 insn per cycle - 8.965544443 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) + 25,767,042,680 cycles # 2.867 GHz + 78,969,761,034 instructions # 3.06 insn per cycle + 8.987670049 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -138,16 +138,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.536360e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.539829e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.539829e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.417531e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.420733e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.420733e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.652216 sec +TOTAL : 4.812631 sec INFO: No Floating Point Exceptions have been reported - 12,900,905,409 cycles # 2.771 GHz - 39,296,118,040 instructions # 3.05 insn per cycle - 4.656875062 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) + 13,140,367,247 cycles # 2.728 GHz + 39,580,393,488 instructions # 3.01 insn per cycle + 4.817209290 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13211) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -167,16 +167,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.851670e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.867737e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.867737e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.785368e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.801748e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.801748e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.103024 sec +TOTAL : 2.120902 sec INFO: No Floating Point Exceptions have been reported - 5,594,201,816 cycles # 2.655 GHz - 13,697,712,232 instructions # 2.45 insn per cycle - 2.107624615 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) + 5,646,935,532 cycles # 2.658 GHz + 13,843,170,636 instructions # 2.45 insn per cycle + 2.125599955 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11548) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -196,16 +196,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.921358e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.943578e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.943578e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.853249e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.874783e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.874783e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.852398 sec +TOTAL : 1.866411 sec INFO: No Floating Point Exceptions have been reported - 4,909,478,389 cycles # 2.645 GHz - 12,351,405,876 instructions # 2.52 insn per cycle - 1.857024744 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) + 4,959,147,596 cycles # 2.652 GHz + 12,524,330,168 instructions # 2.53 insn per cycle + 1.870952591 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10473) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -225,16 +225,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.741453e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.753934e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.753934e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.744564e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.757032e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.757032e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.447091 sec +TOTAL : 2.446868 sec INFO: No Floating Point Exceptions have been reported - 4,126,055,402 cycles # 1.684 GHz - 6,345,698,997 instructions # 1.54 insn per cycle - 2.451723990 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) + 4,167,603,212 cycles # 1.701 GHz + 6,410,053,608 instructions # 1.54 insn per cycle + 2.451645185 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1998) (512y: 102) (512z: 9391) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index 79d60d2a9e..65f6877a1f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:39:07 +DATE: 2024-06-03_18:36:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.458071e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.485561e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.487880e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.459269e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.484247e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.486579e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.516495 sec +TOTAL : 0.512560 sec INFO: No Floating Point Exceptions have been reported - 2,100,340,953 cycles # 2.817 GHz - 3,318,240,864 instructions # 1.58 insn per cycle - 0.807243807 seconds time elapsed + 2,110,715,830 cycles # 2.825 GHz + 3,341,378,210 instructions # 1.58 insn per cycle + 0.806813764 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.151628e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.181415e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.182666e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.128324e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.157006e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.158214e+05 ) sec^-1 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.126580 sec +TOTAL : 3.135265 sec INFO: No Floating Point Exceptions have been reported - 9,651,663,769 cycles # 2.855 GHz - 21,411,369,286 instructions # 2.22 insn per cycle - 3.436400390 seconds time elapsed + 9,696,524,262 cycles # 2.859 GHz + 21,940,401,561 instructions # 2.26 insn per cycle + 3.448862398 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.838040e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.838971e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.838971e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.832359e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.833217e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.833217e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.934375 sec +TOTAL : 8.961711 sec INFO: No Floating Point Exceptions have been reported - 25,608,373,833 cycles # 2.865 GHz - 78,937,451,677 instructions # 3.08 insn per cycle - 8.938566207 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) + 25,704,324,268 cycles # 2.867 GHz + 78,963,978,235 instructions # 3.07 insn per cycle + 8.965873340 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.525117e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.528335e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.528335e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.407652e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.410676e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.410676e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.663427 sec +TOTAL : 4.823773 sec INFO: No Floating Point Exceptions have been reported - 12,892,979,270 cycles # 2.763 GHz - 39,279,722,056 instructions # 3.05 insn per cycle - 4.667648484 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) + 13,137,198,616 cycles # 2.722 GHz + 39,569,230,845 instructions # 3.01 insn per cycle + 4.827968795 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13211) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.833327e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.849001e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.849001e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.808509e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.824569e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.824569e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.104949 sec +TOTAL : 2.111587 sec INFO: No Floating Point Exceptions have been reported - 5,585,628,818 cycles # 2.649 GHz - 13,686,707,794 instructions # 2.45 insn per cycle - 2.109194961 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) + 5,632,091,917 cycles # 2.663 GHz + 13,830,351,821 instructions # 2.46 insn per cycle + 2.115903971 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11548) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.959882e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.980659e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.980659e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.875781e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.896132e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.896132e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.842075 sec +TOTAL : 1.859252 sec INFO: No Floating Point Exceptions have been reported - 4,892,059,435 cycles # 2.651 GHz - 12,339,041,510 instructions # 2.52 insn per cycle - 1.846337430 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) + 4,941,133,634 cycles # 2.653 GHz + 12,511,050,210 instructions # 2.53 insn per cycle + 1.863775304 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10473) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.749292e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.761080e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.761080e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.689168e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.701153e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.701153e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.441489 sec +TOTAL : 2.463375 sec INFO: No Floating Point Exceptions have been reported - 4,113,508,290 cycles # 1.683 GHz - 6,332,907,864 instructions # 1.54 insn per cycle - 2.445760630 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) + 4,157,918,797 cycles # 1.686 GHz + 6,398,953,341 instructions # 1.54 insn per cycle + 2.467565422 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1998) (512y: 102) (512z: 9391) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 5745d06e17..1adc1b429a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:36:19 +DATE: 2024-06-03_18:33:43 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.461663e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.488298e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.490692e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.490991e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.516812e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.519478e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.512096 sec +TOTAL : 0.511721 sec INFO: No Floating Point Exceptions have been reported - 2,118,130,475 cycles # 2.818 GHz - 3,288,416,689 instructions # 1.55 insn per cycle - 0.809966158 seconds time elapsed + 2,115,984,574 cycles # 2.820 GHz + 3,347,070,767 instructions # 1.58 insn per cycle + 0.810000053 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.134536e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.164023e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.165296e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.120312e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.148695e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.149900e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.077228 sec +TOTAL : 3.081960 sec INFO: No Floating Point Exceptions have been reported - 9,556,153,983 cycles # 2.858 GHz - 21,726,674,576 instructions # 2.27 insn per cycle - 3.399985544 seconds time elapsed + 9,549,075,862 cycles # 2.861 GHz + 21,937,059,442 instructions # 2.30 insn per cycle + 3.392837005 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.838183e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.839050e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.839050e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.839428e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.840299e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.840299e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.931733 sec +TOTAL : 8.925724 sec INFO: No Floating Point Exceptions have been reported - 25,607,680,070 cycles # 2.866 GHz - 78,937,604,302 instructions # 3.08 insn per cycle - 8.935953129 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) + 25,698,291,007 cycles # 2.878 GHz + 78,968,762,593 instructions # 3.07 insn per cycle + 8.929919118 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.531271e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.534495e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.534495e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.420392e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.423509e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.423509e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.653724 sec +TOTAL : 4.804169 sec INFO: No Floating Point Exceptions have been reported - 12,891,706,587 cycles # 2.769 GHz - 39,279,955,585 instructions # 3.05 insn per cycle - 4.658078583 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) + 13,120,732,392 cycles # 2.730 GHz + 39,569,936,739 instructions # 3.02 insn per cycle + 4.808564438 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13211) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.750526e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.765709e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.765709e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.723476e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.738556e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.738556e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.125753 sec +TOTAL : 2.133009 sec INFO: No Floating Point Exceptions have been reported - 5,636,716,298 cycles # 2.647 GHz - 13,685,667,157 instructions # 2.43 insn per cycle - 2.130052622 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) + 5,628,049,025 cycles # 2.634 GHz + 13,831,258,234 instructions # 2.46 insn per cycle + 2.137317610 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11548) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.975427e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.996737e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.996737e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.911711e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.931877e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.931877e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.836722 sec +TOTAL : 1.849874 sec INFO: No Floating Point Exceptions have been reported - 4,887,653,189 cycles # 2.656 GHz - 12,340,725,033 instructions # 2.52 insn per cycle - 1.841006928 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) + 4,937,906,884 cycles # 2.664 GHz + 12,512,636,347 instructions # 2.53 insn per cycle + 1.854242976 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10473) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.741757e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.753368e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.753368e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.695018e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.706430e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.706430e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.442506 sec +TOTAL : 2.459636 sec INFO: No Floating Point Exceptions have been reported - 4,119,197,476 cycles # 1.684 GHz - 6,334,707,467 instructions # 1.54 insn per cycle - 2.446806756 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) + 4,146,970,086 cycles # 1.684 GHz + 6,398,911,395 instructions # 1.54 insn per cycle + 2.463931312 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1998) (512y: 102) (512z: 9391) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index 845fe92d47..be059bacfa 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:33:36 +DATE: 2024-06-03_18:30:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,15 +50,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.175037e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.487201e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.490172e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.179061e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.492181e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.494484e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.519308 sec +TOTAL : 0.516700 sec INFO: No Floating Point Exceptions have been reported - 2,132,091,270 cycles # 2.847 GHz - 3,400,172,905 instructions # 1.59 insn per cycle - 0.809359327 seconds time elapsed + 2,106,227,613 cycles # 2.820 GHz + 3,284,065,817 instructions # 1.56 insn per cycle + 0.806223623 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -70,15 +70,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.733086e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.182437e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.183712e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.730205e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.176852e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.178090e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.211390 sec +TOTAL : 3.209108 sec INFO: No Floating Point Exceptions have been reported - 9,891,188,972 cycles # 2.857 GHz - 21,285,655,080 instructions # 2.15 insn per cycle - 3.520480517 seconds time elapsed + 9,879,829,969 cycles # 2.855 GHz + 22,117,916,176 instructions # 2.24 insn per cycle + 3.518614714 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -99,16 +99,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.838729e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.839597e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.839597e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.833069e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.833958e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.833958e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.929008 sec +TOTAL : 8.957266 sec INFO: No Floating Point Exceptions have been reported - 25,610,442,672 cycles # 2.867 GHz - 78,938,090,928 instructions # 3.08 insn per cycle - 8.933282594 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) + 25,705,366,952 cycles # 2.869 GHz + 78,968,969,046 instructions # 3.07 insn per cycle + 8.962060815 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -127,16 +127,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.506228e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.509410e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.509410e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.422242e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.425294e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.425294e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.687492 sec +TOTAL : 4.801839 sec INFO: No Floating Point Exceptions have been reported - 12,899,492,836 cycles # 2.750 GHz - 39,283,102,027 instructions # 3.05 insn per cycle - 4.691777787 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) + 13,111,523,729 cycles # 2.729 GHz + 39,567,220,863 instructions # 3.02 insn per cycle + 4.806097144 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13211) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -155,16 +155,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.752261e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.768022e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.768022e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.775140e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.790396e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.790396e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.125292 sec +TOTAL : 2.119070 sec INFO: No Floating Point Exceptions have been reported - 5,579,027,054 cycles # 2.621 GHz - 13,686,176,373 instructions # 2.45 insn per cycle - 2.129518303 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) + 5,626,691,162 cycles # 2.651 GHz + 13,831,085,723 instructions # 2.46 insn per cycle + 2.123497883 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11548) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -183,16 +183,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.957937e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.978146e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.978146e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.910298e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.931005e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.931005e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.840124 sec +TOTAL : 1.850169 sec INFO: No Floating Point Exceptions have been reported - 4,887,669,768 cycles # 2.651 GHz - 12,340,977,183 instructions # 2.52 insn per cycle - 1.844385583 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) + 4,938,091,387 cycles # 2.664 GHz + 12,512,642,240 instructions # 2.53 insn per cycle + 1.854518481 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10473) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -211,16 +211,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.740559e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.752226e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.752226e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.690516e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.701832e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.701832e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.443234 sec +TOTAL : 2.461169 sec INFO: No Floating Point Exceptions have been reported - 4,110,489,538 cycles # 1.680 GHz - 6,334,661,219 instructions # 1.54 insn per cycle - 2.447553100 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) + 4,150,873,146 cycles # 1.684 GHz + 6,398,872,000 instructions # 1.54 insn per cycle + 2.465432241 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1998) (512y: 102) (512z: 9391) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index f7617fa14d..2910777a3a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_20:56:54 +DATE: 2024-06-03_18:03:50 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.465250e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.491924e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.494561e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.460518e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.493733e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.496369e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.531384 sec +TOTAL : 0.528534 sec INFO: No Floating Point Exceptions have been reported - 2,176,903,857 cycles # 2.812 GHz - 3,360,015,570 instructions # 1.54 insn per cycle - 0.832792471 seconds time elapsed + 2,182,805,282 cycles # 2.827 GHz + 3,340,889,449 instructions # 1.53 insn per cycle + 0.831226796 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.144291e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.179073e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.180483e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.142925e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.176448e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.177839e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.021369 sec +TOTAL : 3.031527 sec INFO: No Floating Point Exceptions have been reported - 9,368,773,778 cycles # 2.857 GHz - 21,286,316,558 instructions # 2.27 insn per cycle - 3.335014178 seconds time elapsed + 9,413,255,455 cycles # 2.860 GHz + 21,563,433,975 instructions # 2.29 insn per cycle + 3.350618061 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.844326e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.845217e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.845217e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.839296e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.840216e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.840216e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.901599 sec +TOTAL : 8.925746 sec INFO: No Floating Point Exceptions have been reported - 25,466,417,477 cycles # 2.860 GHz - 78,709,901,314 instructions # 3.09 insn per cycle - 8.905880803 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4264) (avx2: 0) (512y: 0) (512z: 0) + 25,644,420,769 cycles # 2.872 GHz + 78,708,718,415 instructions # 3.07 insn per cycle + 8.929928283 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4205) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.439967e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.443121e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.443121e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.453191e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.456333e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.456333e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.776530 sec +TOTAL : 4.758278 sec INFO: No Floating Point Exceptions have been reported - 12,973,172,137 cycles # 2.714 GHz - 39,229,674,228 instructions # 3.02 insn per cycle - 4.780939174 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:12951) (avx2: 0) (512y: 0) (512z: 0) + 13,060,724,173 cycles # 2.743 GHz + 39,458,534,044 instructions # 3.02 insn per cycle + 4.762512071 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12985) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.803639e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.819236e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.819236e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.693265e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.707995e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.707995e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.110784 sec +TOTAL : 2.141153 sec INFO: No Floating Point Exceptions have been reported - 5,623,478,205 cycles # 2.660 GHz - 13,801,627,183 instructions # 2.45 insn per cycle - 2.115128076 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11422) (512y: 0) (512z: 0) + 5,709,775,869 cycles # 2.662 GHz + 13,917,563,949 instructions # 2.44 insn per cycle + 2.145429673 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11610) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.797294e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.817444e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.817444e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.788217e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.807623e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.807623e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.873371 sec +TOTAL : 1.875389 sec INFO: No Floating Point Exceptions have been reported - 4,983,137,076 cycles # 2.655 GHz - 12,465,949,717 instructions # 2.50 insn per cycle - 1.877612639 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10258) (512y: 240) (512z: 0) + 4,999,929,304 cycles # 2.661 GHz + 12,609,755,396 instructions # 2.52 insn per cycle + 1.879646139 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10457) (512y: 240) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.717520e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.728878e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.728878e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.675653e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.686951e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.686951e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.451074 sec +TOTAL : 2.466427 sec INFO: No Floating Point Exceptions have been reported - 4,120,290,741 cycles # 1.679 GHz - 6,458,681,411 instructions # 1.57 insn per cycle - 2.455362823 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 192) (512z: 9375) + 4,153,899,937 cycles # 1.682 GHz + 6,506,899,667 instructions # 1.57 insn per cycle + 2.470734551 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1774) (512y: 194) (512z: 9387) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 0fe5c16438..9680f74ecf 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:19:48 +DATE: 2024-06-03_18:16:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.246479e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.270244e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.272477e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.245303e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.266293e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.268380e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.541806 sec +TOTAL : 0.540877 sec INFO: No Floating Point Exceptions have been reported - 2,171,468,423 cycles # 2.820 GHz - 3,387,210,535 instructions # 1.56 insn per cycle - 0.829471359 seconds time elapsed + 2,168,292,720 cycles # 2.822 GHz + 3,388,946,935 instructions # 1.56 insn per cycle + 0.827313099 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.755733e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.780823e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.781841e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.760235e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.786269e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.787497e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.311285 sec +TOTAL : 3.314609 sec INFO: No Floating Point Exceptions have been reported - 10,187,766,661 cycles # 2.856 GHz - 22,069,205,702 instructions # 2.17 insn per cycle - 3.622898713 seconds time elapsed + 10,192,422,594 cycles # 2.858 GHz + 22,975,782,663 instructions # 2.25 insn per cycle + 3.624626973 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.133683e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.134122e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.134122e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.102297e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.102729e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.102729e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 39.685051 sec +TOTAL : 39.987442 sec INFO: No Floating Point Exceptions have been reported - 113,512,735,354 cycles # 2.860 GHz - 144,824,168,290 instructions # 1.28 insn per cycle - 39.689377450 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:21353) (avx2: 0) (512y: 0) (512z: 0) + 113,516,605,337 cycles # 2.839 GHz + 144,863,586,012 instructions # 1.28 insn per cycle + 39.991768743 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:21407) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198140450E-004 -Relative difference = 2.83729918072716e-07 +Avg ME (F77/C++) = 6.6266731198140461E-004 +Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.009048e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.011479e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.011479e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.998649e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.000991e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.000991e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.460357 sec +TOTAL : 5.479213 sec INFO: No Floating Point Exceptions have been reported - 14,780,198,562 cycles # 2.706 GHz - 37,576,710,982 instructions # 2.54 insn per cycle - 5.464730306 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:68119) (avx2: 0) (512y: 0) (512z: 0) + 14,791,666,519 cycles # 2.698 GHz + 37,656,320,268 instructions # 2.55 insn per cycle + 5.483581524 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:68265) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.168780e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.181780e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.181780e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.130527e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.143433e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.143433e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.297592 sec +TOTAL : 2.309741 sec INFO: No Floating Point Exceptions have been reported - 6,127,083,636 cycles # 2.663 GHz - 13,063,845,546 instructions # 2.13 insn per cycle - 2.302025246 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:46960) (512y: 0) (512z: 0) + 6,126,510,346 cycles # 2.648 GHz + 13,068,223,479 instructions # 2.13 insn per cycle + 2.314176862 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:46983) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.695008e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.714356e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.714356e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.503559e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.521907e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.521907e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.895827 sec +TOTAL : 1.938323 sec INFO: No Floating Point Exceptions have been reported - 5,063,974,681 cycles # 2.666 GHz - 11,441,302,228 instructions # 2.26 insn per cycle - 1.900215464 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40434) (512y: 285) (512z: 0) + 5,136,068,700 cycles # 2.645 GHz + 11,461,772,305 instructions # 2.23 insn per cycle + 1.942656144 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40514) (512y: 285) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.958952e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.971780e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.971780e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.006827e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.019287e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.019287e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.366561 sec +TOTAL : 2.350376 sec INFO: No Floating Point Exceptions have been reported - 3,976,192,244 cycles # 1.678 GHz - 5,945,001,398 instructions # 1.50 insn per cycle - 2.371015031 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2455) (512y: 337) (512z:39411) + 3,963,784,764 cycles # 1.684 GHz + 5,935,907,800 instructions # 1.50 insn per cycle + 2.354741258 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2456) (512y: 337) (512z:39348) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index eab4a6ad11..6eb1688819 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:20:56 +DATE: 2024-06-03_18:18:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.265532e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.290379e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.292594e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.268679e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.289756e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.291684e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.539246 sec +TOTAL : 0.538173 sec INFO: No Floating Point Exceptions have been reported - 2,164,648,283 cycles # 2.819 GHz - 3,395,016,138 instructions # 1.57 insn per cycle - 0.825962894 seconds time elapsed + 2,184,059,608 cycles # 2.818 GHz + 3,376,427,953 instructions # 1.55 insn per cycle + 0.832264536 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.767479e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.792561e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.793589e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.761939e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.788694e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.789907e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.296186 sec +TOTAL : 3.298105 sec INFO: No Floating Point Exceptions have been reported - 10,182,455,226 cycles # 2.857 GHz - 22,751,947,545 instructions # 2.23 insn per cycle - 3.619847665 seconds time elapsed + 10,257,333,842 cycles # 2.884 GHz + 22,698,193,148 instructions # 2.21 insn per cycle + 3.611392054 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.098681e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.099131e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.099131e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.142882e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.143319e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.143319e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 40.023617 sec +TOTAL : 39.595989 sec INFO: No Floating Point Exceptions have been reported - 114,408,903,354 cycles # 2.859 GHz - 144,789,258,871 instructions # 1.27 insn per cycle - 40.028023083 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20719) (avx2: 0) (512y: 0) (512z: 0) + 113,429,020,583 cycles # 2.865 GHz + 144,293,514,459 instructions # 1.27 insn per cycle + 39.600246121 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:21037) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198140461E-004 -Relative difference = 2.8372991790910424e-07 +Avg ME (F77/C++) = 6.6266731198140450E-004 +Relative difference = 2.83729918072716e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.944993e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.947319e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.947319e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.929742e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.931957e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.931957e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.578054 sec +TOTAL : 5.607131 sec INFO: No Floating Point Exceptions have been reported - 15,223,576,233 cycles # 2.728 GHz - 37,762,970,352 instructions # 2.48 insn per cycle - 5.582406080 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:68447) (avx2: 0) (512y: 0) (512z: 0) + 15,313,512,776 cycles # 2.730 GHz + 38,398,326,248 instructions # 2.51 insn per cycle + 5.611482587 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:69655) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.278833e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.292510e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.292510e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.272721e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.286026e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.286026e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.262588 sec +TOTAL : 2.265052 sec INFO: No Floating Point Exceptions have been reported - 6,007,020,457 cycles # 2.651 GHz - 12,896,115,872 instructions # 2.15 insn per cycle - 2.266904685 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45929) (512y: 0) (512z: 0) + 6,015,873,656 cycles # 2.652 GHz + 12,943,956,040 instructions # 2.15 insn per cycle + 2.269278552 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:46109) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.679346e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.698475e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.698475e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.622078e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.641542e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.641542e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.899294 sec +TOTAL : 1.911674 sec INFO: No Floating Point Exceptions have been reported - 5,094,216,811 cycles # 2.677 GHz - 11,448,333,625 instructions # 2.25 insn per cycle - 1.903608667 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40123) (512y: 219) (512z: 0) + 5,095,328,711 cycles # 2.660 GHz + 11,457,094,811 instructions # 2.25 insn per cycle + 1.916057589 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40158) (512y: 219) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.004138e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.016815e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.016815e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.994860e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.007651e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.007651e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.351445 sec +TOTAL : 2.354136 sec INFO: No Floating Point Exceptions have been reported - 3,952,954,226 cycles # 1.679 GHz - 5,896,992,592 instructions # 1.49 insn per cycle - 2.356043721 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1971) (512y: 259) (512z:38937) + 3,958,563,795 cycles # 1.679 GHz + 5,898,244,361 instructions # 1.49 insn per cycle + 2.358438675 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1971) (512y: 259) (512z:38926) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index fac6650d6a..bde0297986 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_20:57:27 +DATE: 2024-06-03_18:04:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.356553e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.410019e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.415334e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.993391e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.048362e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.053874e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.487456 sec +TOTAL : 0.488917 sec INFO: No Floating Point Exceptions have been reported - 1,983,251,875 cycles # 2.820 GHz - 2,928,209,714 instructions # 1.48 insn per cycle - 0.764609113 seconds time elapsed + 1,999,590,506 cycles # 2.833 GHz + 2,960,294,533 instructions # 1.48 insn per cycle + 0.765856045 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.608392e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.688100e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.691532e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.124915e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.193549e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.196589e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.725977 sec +TOTAL : 1.798975 sec INFO: No Floating Point Exceptions have been reported - 5,578,929,082 cycles # 2.849 GHz - 11,014,391,874 instructions # 1.97 insn per cycle - 2.015111587 seconds time elapsed + 5,823,539,302 cycles # 2.863 GHz + 12,401,394,438 instructions # 2.13 insn per cycle + 2.090093066 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -83,8 +83,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +Avg ME (F77/GPU) = 6.6262667672387088E-004 +Relative difference = 2.825534762507892e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.908177e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.909097e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.909097e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.845616e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.846469e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.846469e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.603316 sec +TOTAL : 8.894726 sec INFO: No Floating Point Exceptions have been reported - 24,630,404,396 cycles # 2.862 GHz - 78,128,784,942 instructions # 3.17 insn per cycle - 8.607433386 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) + 24,938,295,137 cycles # 2.803 GHz + 79,123,203,849 instructions # 3.17 insn per cycle + 8.898862244 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3586) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +Avg ME (F77/C++) = 6.6274863312764526E-004 +Relative difference = 4.998523613136231e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.834941e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.847074e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.847074e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.866908e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.879125e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.879125e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.408187 sec +TOTAL : 2.396402 sec INFO: No Floating Point Exceptions have been reported - 6,475,075,186 cycles # 2.685 GHz - 20,120,578,414 instructions # 3.11 insn per cycle - 2.412367232 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) + 6,527,381,936 cycles # 2.720 GHz + 20,278,170,595 instructions # 3.11 insn per cycle + 2.400661491 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13795) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +Avg ME (F77/C++) = 6.6274861442972011E-004 +Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.547510e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.553645e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.553645e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.565130e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.571406e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.571406e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.067919 sec +TOTAL : 1.055961 sec INFO: No Floating Point Exceptions have been reported - 2,818,351,962 cycles # 2.631 GHz - 6,988,245,481 instructions # 2.48 insn per cycle - 1.072071344 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) + 2,869,911,542 cycles # 2.709 GHz + 7,073,058,843 instructions # 2.46 insn per cycle + 1.060231350 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12076) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.757892e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.765838e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.765838e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.761782e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.769925e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.769925e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.941407 sec +TOTAL : 0.939080 sec INFO: No Floating Point Exceptions have been reported - 2,493,554,219 cycles # 2.639 GHz - 6,295,971,949 instructions # 2.52 insn per cycle - 0.945627547 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) + 2,557,200,794 cycles # 2.713 GHz + 6,411,348,860 instructions # 2.51 insn per cycle + 0.943228450 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11050) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.363379e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.368145e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.368145e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.391062e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.395939e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.395939e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.211254 sec +TOTAL : 1.187645 sec INFO: No Floating Point Exceptions have been reported - 2,046,343,979 cycles # 1.685 GHz - 3,265,913,971 instructions # 1.60 insn per cycle - 1.215412846 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2427) (512y: 46) (512z: 9578) + 2,079,255,997 cycles # 1.746 GHz + 3,311,166,731 instructions # 1.59 insn per cycle + 1.191833308 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 46) (512z: 9609) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 +Avg ME (F77/C++) = 6.6271952779718007E-004 +Relative difference = 4.194411063934945e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index bcf7be18e6..e9aab55893 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:29:50 +DATE: 2024-06-03_18:27:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.598474e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.301712e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.301712e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.298686e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.931151e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.931151e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.472439 sec +TOTAL : 0.480027 sec INFO: No Floating Point Exceptions have been reported - 1,958,040,860 cycles # 2.820 GHz - 2,939,762,551 instructions # 1.50 insn per cycle - 0.750896192 seconds time elapsed + 1,949,076,607 cycles # 2.814 GHz + 2,936,227,158 instructions # 1.51 insn per cycle + 0.751094441 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,15 +79,15 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.273826e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.571327e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.571327e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.901552 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.950050e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.119314e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.119314e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.641709e+00 +- 4.994248e+00 ) GeV^-4 +TOTAL : 1.970885 sec INFO: No Floating Point Exceptions have been reported - 6,095,137,537 cycles # 2.849 GHz - 12,940,780,690 instructions # 2.12 insn per cycle - 2.195795529 seconds time elapsed + 6,318,815,618 cycles # 2.853 GHz + 13,486,857,827 instructions # 2.13 insn per cycle + 2.273531724 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -95,8 +95,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +Avg ME (F77/GPU) = 6.6262667672387088E-004 +Relative difference = 2.825534762507892e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.910575e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.911506e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.911506e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.921328e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.922325e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.922325e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.594991 sec +TOTAL : 8.547068 sec INFO: No Floating Point Exceptions have been reported - 24,652,141,911 cycles # 2.867 GHz - 78,137,160,167 instructions # 3.17 insn per cycle - 8.599344489 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) + 24,942,697,613 cycles # 2.917 GHz + 79,121,197,548 instructions # 3.17 insn per cycle + 8.551612978 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3586) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +Avg ME (F77/C++) = 6.6274863312764526E-004 +Relative difference = 4.998523613136231e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= @@ -138,16 +138,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.898082e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.911105e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.911105e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.885373e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.897651e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.897651e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.388247 sec +TOTAL : 2.392647 sec INFO: No Floating Point Exceptions have been reported - 6,478,062,029 cycles # 2.708 GHz - 20,129,777,692 instructions # 3.11 insn per cycle - 2.392637226 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) + 6,537,624,163 cycles # 2.728 GHz + 20,287,271,158 instructions # 3.10 insn per cycle + 2.396994035 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13795) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -155,8 +155,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +Avg ME (F77/C++) = 6.6274861442972011E-004 +Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= @@ -167,16 +167,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.558890e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.565336e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.565336e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.525564e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.531703e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.531703e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.063397 sec +TOTAL : 1.086336 sec INFO: No Floating Point Exceptions have been reported - 2,829,584,006 cycles # 2.652 GHz - 6,998,429,462 instructions # 2.47 insn per cycle - 1.067880984 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) + 2,877,297,833 cycles # 2.640 GHz + 7,083,001,521 instructions # 2.46 insn per cycle + 1.090815935 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12076) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -184,8 +184,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= @@ -196,16 +196,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.773701e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.782227e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.782227e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.729035e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.737098e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.737098e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.935665 sec +TOTAL : 0.959489 sec INFO: No Floating Point Exceptions have been reported - 2,499,925,740 cycles # 2.661 GHz - 6,304,962,307 instructions # 2.52 insn per cycle - 0.940287567 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) + 2,542,436,755 cycles # 2.639 GHz + 6,420,635,281 instructions # 2.53 insn per cycle + 0.963863512 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11050) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -213,8 +213,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= @@ -225,16 +225,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.365132e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.370126e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.370126e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.342357e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.347052e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.347052e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.212746 sec +TOTAL : 1.233007 sec INFO: No Floating Point Exceptions have been reported - 2,057,028,781 cycles # 1.691 GHz - 3,276,379,459 instructions # 1.59 insn per cycle - 1.217212255 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2427) (512y: 46) (512z: 9578) + 2,091,037,827 cycles # 1.691 GHz + 3,321,502,227 instructions # 1.59 insn per cycle + 1.237437594 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 46) (512z: 9609) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -242,8 +242,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 +Avg ME (F77/C++) = 6.6271952779718007E-004 +Relative difference = 4.194411063934945e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index b890671a07..dc0cb0757b 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:39:40 +DATE: 2024-06-03_18:37:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.323640e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.375162e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.380930e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 -TOTAL : 0.470190 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.987947e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.031953e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.037166e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.159396e-01 +- 3.238803e-01 ) GeV^-4 +TOTAL : 0.476174 sec INFO: No Floating Point Exceptions have been reported - 1,954,941,965 cycles # 2.821 GHz - 2,911,153,946 instructions # 1.49 insn per cycle - 0.750497586 seconds time elapsed + 1,936,582,609 cycles # 2.812 GHz + 2,924,836,850 instructions # 1.51 insn per cycle + 0.746067574 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.584082e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.652641e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.655733e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.198293e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.256948e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.259713e+05 ) sec^-1 MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 -TOTAL : 1.807227 sec +TOTAL : 1.871961 sec INFO: No Floating Point Exceptions have been reported - 5,802,713,802 cycles # 2.849 GHz - 11,535,404,543 instructions # 1.99 insn per cycle - 2.092606216 seconds time elapsed + 5,971,916,027 cycles # 2.846 GHz + 12,313,771,451 instructions # 2.06 insn per cycle + 2.156592917 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -83,8 +83,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +Avg ME (F77/GPU) = 6.6262667672387088E-004 +Relative difference = 2.825534762507892e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.913976e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.914935e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.914935e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.890245e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.891148e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.891148e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 8.577750 sec +TOTAL : 8.685533 sec INFO: No Floating Point Exceptions have been reported - 24,611,965,012 cycles # 2.868 GHz - 78,127,241,298 instructions # 3.17 insn per cycle - 8.581839983 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) + 24,908,953,921 cycles # 2.867 GHz + 79,116,868,705 instructions # 3.18 insn per cycle + 8.689669358 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3586) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +Avg ME (F77/C++) = 6.6274863312764526E-004 +Relative difference = 4.998523613136231e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.890077e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.902996e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.902996e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.776340e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.788373e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.788373e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 2.389920 sec +TOTAL : 2.429835 sec INFO: No Floating Point Exceptions have been reported - 6,479,004,824 cycles # 2.707 GHz - 20,120,753,195 instructions # 3.11 insn per cycle - 2.394143360 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) + 6,530,091,398 cycles # 2.684 GHz + 20,280,205,711 instructions # 3.11 insn per cycle + 2.433950240 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13795) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +Avg ME (F77/C++) = 6.6274861442972011E-004 +Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.544733e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.550971e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.550971e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.522176e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.528327e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.528327e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 1.071318 sec +TOTAL : 1.086681 sec INFO: No Floating Point Exceptions have been reported - 2,822,669,649 cycles # 2.626 GHz - 6,987,403,130 instructions # 2.48 insn per cycle - 1.075405956 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) + 2,870,476,193 cycles # 2.633 GHz + 7,070,613,855 instructions # 2.46 insn per cycle + 1.090889376 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12076) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.762670e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.770899e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.770899e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.741342e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.749550e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.749550e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.940633 sec +TOTAL : 0.951048 sec INFO: No Floating Point Exceptions have been reported - 2,495,155,242 cycles # 2.643 GHz - 6,294,152,477 instructions # 2.52 insn per cycle - 0.944657528 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) + 2,535,390,814 cycles # 2.657 GHz + 6,407,801,612 instructions # 2.53 insn per cycle + 0.955161053 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11050) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.367006e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.372042e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.372042e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.340513e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.345204e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.345204e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 1.209697 sec +TOTAL : 1.233803 sec INFO: No Floating Point Exceptions have been reported - 2,049,421,235 cycles # 1.690 GHz - 3,264,511,946 instructions # 1.59 insn per cycle - 1.213845698 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2427) (512y: 46) (512z: 9578) + 2,084,630,811 cycles # 1.685 GHz + 3,309,613,890 instructions # 1.59 insn per cycle + 1.238007402 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 46) (512z: 9609) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 +Avg ME (F77/C++) = 6.6271952779718007E-004 +Relative difference = 4.194411063934945e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index d9b7ee3321..dd57b89c3f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:36:52 +DATE: 2024-06-03_18:34:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.354433e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.405954e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.411991e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.977039e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.020614e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.025828e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.469295 sec +TOTAL : 0.474208 sec INFO: No Floating Point Exceptions have been reported - 1,931,447,531 cycles # 2.818 GHz - 2,857,411,874 instructions # 1.48 insn per cycle - 0.742623214 seconds time elapsed + 1,932,320,281 cycles # 2.821 GHz + 2,899,665,099 instructions # 1.50 insn per cycle + 0.743582642 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.583355e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.650562e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.653652e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.198516e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.257721e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.260421e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.754167 sec +TOTAL : 1.817458 sec INFO: No Floating Point Exceptions have been reported - 5,647,206,768 cycles # 2.850 GHz - 12,402,873,577 instructions # 2.20 insn per cycle - 2.038447332 seconds time elapsed + 5,848,497,460 cycles # 2.852 GHz + 12,362,000,534 instructions # 2.11 insn per cycle + 2.107105881 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -83,8 +83,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +Avg ME (F77/GPU) = 6.6262667672387088E-004 +Relative difference = 2.825534762507892e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.908988e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.909932e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.909932e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.892402e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.893307e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.893307e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.599963 sec +TOTAL : 8.674955 sec INFO: No Floating Point Exceptions have been reported - 24,610,286,619 cycles # 2.861 GHz - 78,133,539,217 instructions # 3.17 insn per cycle - 8.604049688 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) + 24,880,877,717 cycles # 2.867 GHz + 79,117,977,201 instructions # 3.18 insn per cycle + 8.679047102 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3586) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +Avg ME (F77/C++) = 6.6274863312764526E-004 +Relative difference = 4.998523613136231e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.879217e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.891579e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.891579e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.793441e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.805458e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.805458e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.392369 sec +TOTAL : 2.422124 sec INFO: No Floating Point Exceptions have been reported - 6,476,796,255 cycles # 2.704 GHz - 20,121,504,943 instructions # 3.11 insn per cycle - 2.396533403 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) + 6,538,684,193 cycles # 2.696 GHz + 20,279,294,211 instructions # 3.10 insn per cycle + 2.426280911 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13795) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +Avg ME (F77/C++) = 6.6274861442972011E-004 +Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.559523e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.565982e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.565982e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.535010e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.541066e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.541066e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.059899 sec +TOTAL : 1.076749 sec INFO: No Floating Point Exceptions have been reported - 2,823,568,100 cycles # 2.655 GHz - 6,988,803,220 instructions # 2.48 insn per cycle - 1.064076387 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) + 2,872,784,220 cycles # 2.660 GHz + 7,073,630,125 instructions # 2.46 insn per cycle + 1.080883194 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12076) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.763562e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.771848e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.771848e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.747305e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.755134e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.755134e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.938280 sec +TOTAL : 0.946884 sec INFO: No Floating Point Exceptions have been reported - 2,491,711,137 cycles # 2.646 GHz - 6,295,398,273 instructions # 2.53 insn per cycle - 0.942474198 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) + 2,530,495,182 cycles # 2.662 GHz + 6,410,891,269 instructions # 2.53 insn per cycle + 0.951073221 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11050) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.365021e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.369781e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.369781e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.350589e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.355391e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.355391e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.209851 sec +TOTAL : 1.222830 sec INFO: No Floating Point Exceptions have been reported - 2,048,944,436 cycles # 1.689 GHz - 3,266,101,120 instructions # 1.59 insn per cycle - 1.214099702 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2427) (512y: 46) (512z: 9578) + 2,079,597,483 cycles # 1.696 GHz + 3,311,319,686 instructions # 1.59 insn per cycle + 1.227064652 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 46) (512z: 9609) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 +Avg ME (F77/C++) = 6.6271952779718007E-004 +Relative difference = 4.194411063934945e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index ae89ba0a21..9ad1cc5540 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:34:09 +DATE: 2024-06-03_18:31:29 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,15 +50,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.743567e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.402148e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.408206e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.421409e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.005306e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.010390e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.471325 sec +TOTAL : 0.478335 sec INFO: No Floating Point Exceptions have been reported - 1,966,995,104 cycles # 2.818 GHz - 2,861,570,291 instructions # 1.45 insn per cycle - 0.755170556 seconds time elapsed + 1,939,991,217 cycles # 2.815 GHz + 2,927,824,252 instructions # 1.51 insn per cycle + 0.747853851 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -70,15 +70,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.483014e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.687507e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.690776e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.838348 sec +EvtsPerSec[Rmb+ME] (23) = ( 7.141822e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.222275e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.224942e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.641709e+00 +- 4.994248e+00 ) GeV^-4 +TOTAL : 1.901980 sec INFO: No Floating Point Exceptions have been reported - 5,879,672,141 cycles # 2.848 GHz - 11,762,772,012 instructions # 2.00 insn per cycle - 2.123420033 seconds time elapsed + 6,075,195,260 cycles # 2.850 GHz + 12,958,083,094 instructions # 2.13 insn per cycle + 2.187986875 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -86,8 +86,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +Avg ME (F77/GPU) = 6.6262667672387088E-004 +Relative difference = 2.825534762507892e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe @@ -99,16 +99,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.911656e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.912578e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.912578e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.900045e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.900956e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.900956e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.587423 sec +TOTAL : 8.639797 sec INFO: No Floating Point Exceptions have been reported - 24,606,124,860 cycles # 2.865 GHz - 78,133,915,634 instructions # 3.18 insn per cycle - 8.591534118 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) + 24,862,319,107 cycles # 2.877 GHz + 79,117,882,595 instructions # 3.18 insn per cycle + 8.644019221 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3586) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -116,8 +116,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +Avg ME (F77/C++) = 6.6274863312764526E-004 +Relative difference = 4.998523613136231e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= @@ -127,16 +127,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.884793e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.897138e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.897138e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.841326e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.853653e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.853653e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.390128 sec +TOTAL : 2.405532 sec INFO: No Floating Point Exceptions have been reported - 6,472,493,425 cycles # 2.704 GHz - 20,120,111,462 instructions # 3.11 insn per cycle - 2.394291914 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) + 6,525,406,357 cycles # 2.709 GHz + 20,278,336,627 instructions # 3.11 insn per cycle + 2.409634043 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13795) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -144,8 +144,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +Avg ME (F77/C++) = 6.6274861442972011E-004 +Relative difference = 2.1772539563413118e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= @@ -155,16 +155,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.539420e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.545531e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.545531e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.529972e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.535987e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.535987e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.073549 sec +TOTAL : 1.080386 sec INFO: No Floating Point Exceptions have been reported - 2,823,771,290 cycles # 2.622 GHz - 6,988,702,898 instructions # 2.47 insn per cycle - 1.077710790 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) + 2,870,228,383 cycles # 2.648 GHz + 7,072,925,245 instructions # 2.46 insn per cycle + 1.084779868 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12076) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -172,8 +172,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= @@ -183,16 +183,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.753139e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.761048e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.761048e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.740540e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.748411e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.748411e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.944047 sec +TOTAL : 0.950693 sec INFO: No Floating Point Exceptions have been reported - 2,494,675,517 cycles # 2.633 GHz - 6,296,230,342 instructions # 2.52 insn per cycle - 0.948274903 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) + 2,529,265,476 cycles # 2.651 GHz + 6,410,644,765 instructions # 2.53 insn per cycle + 0.954844718 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11050) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -200,8 +200,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (F77/C++) = 6.6271938174396888E-004 +Relative difference = 2.7547150614455683e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= @@ -211,16 +211,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.362634e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.367367e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.367367e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.342143e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.346738e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.346738e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.211984 sec +TOTAL : 1.230394 sec INFO: No Floating Point Exceptions have been reported - 2,049,642,332 cycles # 1.687 GHz - 3,266,281,603 instructions # 1.59 insn per cycle - 1.216098332 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2427) (512y: 46) (512z: 9578) + 2,079,600,495 cycles # 1.686 GHz + 3,311,340,203 instructions # 1.59 insn per cycle + 1.234597523 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 46) (512z: 9609) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -228,8 +228,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 +Avg ME (F77/C++) = 6.6271952779718007E-004 +Relative difference = 4.194411063934945e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index 2894e34cf4..4b1379d26e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_20:57:53 +DATE: 2024-06-03_18:04:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.333854e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.386711e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.394716e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.944277e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.998202e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.004125e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.491038 sec +TOTAL : 0.491300 sec INFO: No Floating Point Exceptions have been reported - 1,996,644,525 cycles # 2.816 GHz - 2,959,911,635 instructions # 1.48 insn per cycle - 0.767468034 seconds time elapsed + 2,027,457,518 cycles # 2.855 GHz + 3,013,938,753 instructions # 1.49 insn per cycle + 0.770346504 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.597534e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.679033e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.682352e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.155146e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.224565e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.227706e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.726608 sec +TOTAL : 1.795306 sec INFO: No Floating Point Exceptions have been reported - 5,570,117,632 cycles # 2.845 GHz - 11,507,962,990 instructions # 2.07 insn per cycle - 2.016820610 seconds time elapsed + 5,836,009,627 cycles # 2.860 GHz + 11,928,682,704 instructions # 2.04 insn per cycle + 2.097501087 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -83,8 +83,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +Avg ME (F77/GPU) = 6.6262667672387088E-004 +Relative difference = 2.825534762507892e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.915894e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.916848e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.916848e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.894741e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.895642e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.895642e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.568155 sec +TOTAL : 8.664226 sec INFO: No Floating Point Exceptions have been reported - 24,541,635,551 cycles # 2.863 GHz - 77,860,582,476 instructions # 3.17 insn per cycle - 8.572306446 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3114) (avx2: 0) (512y: 0) (512z: 0) + 24,819,484,072 cycles # 2.864 GHz + 78,852,465,652 instructions # 3.18 insn per cycle + 8.668329453 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3106) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866268634797E-004 -Relative difference = 5.630135835748959e-08 +Avg ME (F77/C++) = 6.6274866250177339E-004 +Relative difference = 5.65798569465384e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.989453e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.002228e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.002228e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.915710e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.928145e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.928145e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.354308 sec +TOTAL : 2.379658 sec INFO: No Floating Point Exceptions have been reported - 6,429,555,449 cycles # 2.727 GHz - 20,085,437,100 instructions # 3.12 insn per cycle - 2.358577961 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13452) (avx2: 0) (512y: 0) (512z: 0) + 6,479,109,742 cycles # 2.719 GHz + 20,237,907,077 instructions # 3.12 insn per cycle + 2.383835406 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13507) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861465384638E-004 -Relative difference = 2.211071647257023e-08 +Avg ME (F77/C++) = 6.6274861448331612E-004 +Relative difference = 2.1853408865157068e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.495665e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.501413e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.501413e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.473012e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.478589e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.478589e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.104553 sec +TOTAL : 1.121480 sec INFO: No Floating Point Exceptions have been reported - 2,915,650,989 cycles # 2.631 GHz - 7,129,883,095 instructions # 2.45 insn per cycle - 1.108735215 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12261) (512y: 0) (512z: 0) + 2,980,057,411 cycles # 2.650 GHz + 7,214,022,820 instructions # 2.42 insn per cycle + 1.125580830 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12458) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271939668077068E-004 -Relative difference = 5.008498817890231e-09 +Avg ME (F77/C++) = 6.6271939668088170E-004 +Relative difference = 5.008331292535666e-09 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.680175e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.687445e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.687445e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.689100e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.696678e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.696678e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.984092 sec +TOTAL : 0.978913 sec INFO: No Floating Point Exceptions have been reported - 2,594,901,147 cycles # 2.627 GHz - 6,438,491,817 instructions # 2.48 insn per cycle - 0.988307717 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11276) (512y: 27) (512z: 0) + 2,616,026,965 cycles # 2.663 GHz + 6,551,988,105 instructions # 2.50 insn per cycle + 0.983164160 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11478) (512y: 26) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271939668077068E-004 -Relative difference = 5.008498817890231e-09 +Avg ME (F77/C++) = 6.6271939668088170E-004 +Relative difference = 5.008331292535666e-09 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.316604e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.321189e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.321189e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.298030e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.302319e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.302319e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.253756 sec +TOTAL : 1.271823 sec INFO: No Floating Point Exceptions have been reported - 2,120,187,945 cycles # 1.687 GHz - 3,427,717,458 instructions # 1.62 insn per cycle - 1.258078621 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2924) (512y: 22) (512z: 9654) + 2,150,298,528 cycles # 1.687 GHz + 3,469,612,781 instructions # 1.61 insn per cycle + 1.275947754 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3051) (512y: 25) (512z: 9681) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952032322112E-004 -Relative difference = 3.066639970473621e-08 +Avg ME (F77/C++) = 6.6271952032316561E-004 +Relative difference = 3.066631594207157e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index f9728316f5..9a7511ccb1 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:22:03 +DATE: 2024-06-03_18:19:13 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.562593e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.602761e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.607258e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.496319 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.040325e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.086216e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.091080e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059597e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.493370 sec INFO: No Floating Point Exceptions have been reported - 2,073,990,194 cycles # 2.815 GHz - 3,053,942,926 instructions # 1.47 insn per cycle - 0.794515563 seconds time elapsed + 1,994,267,279 cycles # 2.817 GHz + 2,950,946,274 instructions # 1.48 insn per cycle + 0.766690512 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.711536e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.769520e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.772095e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.679705e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.740696e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.743534e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.869015 sec +TOTAL : 1.728122 sec INFO: No Floating Point Exceptions have been reported - 6,036,319,674 cycles # 2.845 GHz - 12,339,909,244 instructions # 2.04 insn per cycle - 2.177861676 seconds time elapsed + 5,590,233,725 cycles # 2.853 GHz + 11,001,068,330 instructions # 1.97 insn per cycle + 2.016085871 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -83,8 +83,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262660579844562E-004 -Relative difference = 2.836238137986709e-05 +Avg ME (F77/GPU) = 6.6262669162351490E-004 +Relative difference = 2.8232862531213374e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.442190e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.442945e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.442945e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.326582e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.327296e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.327296e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 30.143467 sec +TOTAL : 30.797428 sec INFO: No Floating Point Exceptions have been reported - 86,238,493,092 cycles # 2.861 GHz - 135,582,429,521 instructions # 1.57 insn per cycle - 30.147691683 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:15593) (avx2: 0) (512y: 0) (512z: 0) + 88,271,755,592 cycles # 2.866 GHz + 135,713,283,036 instructions # 1.54 insn per cycle + 30.801639573 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:15654) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275351196781740E-004 -Relative difference = 1.805772034719401e-08 +Avg ME (F77/C++) = 6.6275351083142087E-004 +Relative difference = 1.6343060926412837e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.622664e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.634101e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.634101e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.650422e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.662101e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.662101e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.485418 sec +TOTAL : 2.475028 sec INFO: No Floating Point Exceptions have been reported - 6,780,106,144 cycles # 2.725 GHz - 19,386,070,044 instructions # 2.86 insn per cycle - 2.489679561 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:69681) (avx2: 0) (512y: 0) (512z: 0) + 6,775,368,084 cycles # 2.734 GHz + 19,365,438,660 instructions # 2.86 insn per cycle + 2.479304920 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:69602) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274862707273868E-004 -Relative difference = 4.0849182767952624e-08 +Avg ME (F77/C++) = 6.6274862748188362E-004 +Relative difference = 4.14665283800746e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.375981e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.380844e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.380844e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.200183 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.379846e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.384758e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.384758e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.197144 sec INFO: No Floating Point Exceptions have been reported - 3,187,701,871 cycles # 2.648 GHz - 6,807,898,152 instructions # 2.14 insn per cycle - 1.204433728 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:49077) (512y: 0) (512z: 0) + 3,171,868,252 cycles # 2.642 GHz + 6,800,239,710 instructions # 2.14 insn per cycle + 1.201396205 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:49016) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731558747466E-004 -Relative difference = 2.3520194007978538e-08 +Avg ME (F77/C++) = 6.6272731568543797E-004 +Relative difference = 2.3668012430631962e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.667985e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.675164e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.675164e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.991406 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.672800e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.680200e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.680200e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 0.988606 sec INFO: No Floating Point Exceptions have been reported - 2,635,968,315 cycles # 2.649 GHz - 5,985,925,835 instructions # 2.27 insn per cycle - 0.995667999 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:42677) (512y: 11) (512z: 0) + 2,634,024,890 cycles # 2.655 GHz + 5,977,125,707 instructions # 2.27 insn per cycle + 0.992732467 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:42613) (512y: 11) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731558747466E-004 -Relative difference = 2.3520194007978538e-08 +Avg ME (F77/C++) = 6.6272731568543797E-004 +Relative difference = 2.3668012430631962e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.343303e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.347932e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.347932e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.229182 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.340599e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.345196e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.345196e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.232185 sec INFO: No Floating Point Exceptions have been reported - 2,077,787,049 cycles # 1.685 GHz - 3,500,922,258 instructions # 1.68 insn per cycle - 1.233472370 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5210) (512y: 3) (512z:44829) + 2,080,640,847 cycles # 1.685 GHz + 3,501,935,156 instructions # 1.68 insn per cycle + 1.236390991 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5210) (512y: 3) (512z:44834) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627275e-04 -Avg ME (F77/C++) = 6.6272750363879224E-004 -Relative difference = 5.490631193034436e-09 +Avg ME (F77/C++) = 6.6272750237027223E-004 +Relative difference = 3.5765412974815996e-09 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index a6e1efe771..b7d395e1d4 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_21:22:53 +DATE: 2024-06-03_18:20:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.474201e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.511992e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.516380e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.495445 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.115525e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.158112e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.163416e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059597e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.488524 sec INFO: No Floating Point Exceptions have been reported - 2,069,875,918 cycles # 2.815 GHz - 3,088,038,049 instructions # 1.49 insn per cycle - 0.793826258 seconds time elapsed + 1,995,967,010 cycles # 2.825 GHz + 2,980,745,793 instructions # 1.49 insn per cycle + 0.765930733 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.633274e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.688888e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.691415e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.783687e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.846234e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.849143e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.883548 sec +TOTAL : 1.714935 sec INFO: No Floating Point Exceptions have been reported - 6,034,409,459 cycles # 2.847 GHz - 12,602,199,510 instructions # 2.09 insn per cycle - 2.178280060 seconds time elapsed + 5,548,221,436 cycles # 2.845 GHz + 11,425,921,065 instructions # 2.06 insn per cycle + 2.006998021 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -83,8 +83,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262660579844562E-004 -Relative difference = 2.836238137986709e-05 +Avg ME (F77/GPU) = 6.6262669162351490E-004 +Relative difference = 2.8232862531213374e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.435127e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.435900e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.435900e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.490308e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.491091e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.491091e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 30.183033 sec +TOTAL : 29.879379 sec INFO: No Floating Point Exceptions have been reported - 86,348,373,585 cycles # 2.861 GHz - 135,991,147,369 instructions # 1.57 insn per cycle - 30.187286457 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:15571) (avx2: 0) (512y: 0) (512z: 0) + 85,803,548,967 cycles # 2.872 GHz + 135,586,619,473 instructions # 1.58 insn per cycle + 29.883595944 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:15696) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275348988418387E-004 -Relative difference = 1.5263316105958472e-08 +Avg ME (F77/C++) = 6.6275346699767868E-004 +Relative difference = 4.979577076821206e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.576661e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.588266e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.588266e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.541123e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.552430e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.552430e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.501741 sec +TOTAL : 2.515548 sec INFO: No Floating Point Exceptions have been reported - 6,860,063,616 cycles # 2.739 GHz - 19,439,732,968 instructions # 2.83 insn per cycle - 2.505990169 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:69723) (avx2: 0) (512y: 0) (512z: 0) + 6,837,292,011 cycles # 2.714 GHz + 19,414,639,291 instructions # 2.84 insn per cycle + 2.519892605 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:69633) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274862764021530E-004 -Relative difference = 4.170542995014107e-08 +Avg ME (F77/C++) = 6.6274862799683282E-004 +Relative difference = 4.2243518621014775e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.407660e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.412779e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.412779e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.173483 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.406201e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.411445e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.411445e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.174301 sec INFO: No Floating Point Exceptions have been reported - 3,113,664,715 cycles # 2.645 GHz - 6,718,777,649 instructions # 2.16 insn per cycle - 1.177759140 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47667) (512y: 0) (512z: 0) + 3,108,934,296 cycles # 2.640 GHz + 6,722,953,423 instructions # 2.16 insn per cycle + 1.178539821 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47703) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731651051409E-004 -Relative difference = 2.4912983202981302e-08 +Avg ME (F77/C++) = 6.6272731623419345E-004 +Relative difference = 2.449603850635964e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.667132e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.674306e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.674306e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.991992 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.665805e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.673199e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.673199e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 0.992561 sec INFO: No Floating Point Exceptions have been reported - 2,638,087,053 cycles # 2.650 GHz - 5,969,912,308 instructions # 2.26 insn per cycle - 0.996231534 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:41842) (512y: 13) (512z: 0) + 2,633,671,965 cycles # 2.644 GHz + 5,976,623,304 instructions # 2.27 insn per cycle + 0.996855743 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:41894) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731651051409E-004 -Relative difference = 2.4912983202981302e-08 +Avg ME (F77/C++) = 6.6272731623419345E-004 +Relative difference = 2.449603850635964e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.342386e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.347018e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.347018e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.229963 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.344162e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.348769e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.348769e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.228300 sec INFO: No Floating Point Exceptions have been reported - 2,078,559,427 cycles # 1.687 GHz - 3,494,531,487 instructions # 1.68 insn per cycle - 1.234278676 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4174) (512y: 4) (512z:44472) + 2,078,335,327 cycles # 1.687 GHz + 3,494,575,133 instructions # 1.68 insn per cycle + 1.232506015 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4174) (512y: 4) (512z:44485) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627275e-04 -Avg ME (F77/C++) = 6.6272750384530066E-004 -Relative difference = 5.80223501432476e-09 +Avg ME (F77/C++) = 6.6272750247886592E-004 +Relative difference = 3.740400032174438e-09 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 7c14a2e7fb..ac0969612a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_20:58:18 +DATE: 2024-06-03_18:05:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.482581e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.509636e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.512012e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.453427e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.486035e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.488786e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.527538 sec +TOTAL : 0.536243 sec INFO: No Floating Point Exceptions have been reported - 2,176,696,799 cycles # 2.823 GHz - 3,403,965,501 instructions # 1.56 insn per cycle - 0.831403987 seconds time elapsed + 2,164,059,676 cycles # 2.821 GHz + 3,333,455,852 instructions # 1.54 insn per cycle + 0.826098229 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.140636e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.174184e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.175593e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.145692e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.179118e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.180551e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.042724 sec +TOTAL : 3.039539 sec INFO: No Floating Point Exceptions have been reported - 9,426,429,638 cycles # 2.856 GHz - 21,229,330,812 instructions # 2.25 insn per cycle - 3.355690618 seconds time elapsed + 9,431,161,386 cycles # 2.859 GHz + 21,152,322,108 instructions # 2.24 insn per cycle + 3.354444882 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.825999e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.826873e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.826873e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.813946e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.814806e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.814806e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.991087 sec +TOTAL : 9.050488 sec INFO: No Floating Point Exceptions have been reported - 25,893,198,103 cycles # 2.879 GHz - 79,438,485,543 instructions # 3.07 insn per cycle - 8.995289929 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4858) (avx2: 0) (512y: 0) (512z: 0) + 25,976,132,367 cycles # 2.869 GHz + 79,434,004,937 instructions # 3.06 insn per cycle + 9.054767785 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4789) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.417587e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.420701e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.420701e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.393355e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.396331e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.396331e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.808506 sec +TOTAL : 4.842609 sec INFO: No Floating Point Exceptions have been reported - 12,725,370,972 cycles # 2.645 GHz - 38,549,760,913 instructions # 3.03 insn per cycle - 4.812856911 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13163) (avx2: 0) (512y: 0) (512z: 0) + 12,860,713,919 cycles # 2.654 GHz + 38,831,953,159 instructions # 3.02 insn per cycle + 4.846945234 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.894022e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.909715e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.909715e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.861975e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.877645e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.877645e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.087240 sec +TOTAL : 2.096124 sec INFO: No Floating Point Exceptions have been reported - 5,528,399,244 cycles # 2.644 GHz - 13,481,627,455 instructions # 2.44 insn per cycle - 2.091666528 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11242) (512y: 0) (512z: 0) + 5,571,143,137 cycles # 2.654 GHz + 13,624,556,258 instructions # 2.45 insn per cycle + 2.100393193 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11434) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.891097e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.911587e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.911587e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.943351e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.963560e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.963560e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.854687 sec +TOTAL : 1.843305 sec INFO: No Floating Point Exceptions have been reported - 4,870,728,509 cycles # 2.622 GHz - 12,137,042,883 instructions # 2.49 insn per cycle - 1.858964728 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10154) (512y: 79) (512z: 0) + 4,890,128,479 cycles # 2.648 GHz + 12,303,900,159 instructions # 2.52 insn per cycle + 1.847571545 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10344) (512y: 79) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.691394e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.702803e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.702803e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.627810e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.638960e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.638960e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.460755 sec +TOTAL : 2.484305 sec INFO: No Floating Point Exceptions have been reported - 4,149,120,121 cycles # 1.684 GHz - 6,337,745,344 instructions # 1.53 insn per cycle - 2.465117818 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1803) (512y: 93) (512z: 9358) + 4,186,038,092 cycles # 1.683 GHz + 6,400,000,279 instructions # 1.53 insn per cycle + 2.488700095 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1982) (512y: 93) (512z: 9359) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 1d3301fafa..91d2dbe837 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-06-02_20:58:52 +DATE: 2024-06-03_18:05:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.467668e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.493937e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.496432e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.467855e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.500671e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.503291e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.527278 sec +TOTAL : 0.529906 sec INFO: No Floating Point Exceptions have been reported - 2,157,795,556 cycles # 2.825 GHz - 3,403,572,710 instructions # 1.58 insn per cycle - 0.822587991 seconds time elapsed + 2,180,833,265 cycles # 2.825 GHz + 3,396,189,647 instructions # 1.56 insn per cycle + 0.831257190 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.147774e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.181635e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.182999e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.143984e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.177475e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.178851e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.025468 sec +TOTAL : 3.032584 sec INFO: No Floating Point Exceptions have been reported - 9,383,589,958 cycles # 2.858 GHz - 21,474,425,018 instructions # 2.29 insn per cycle - 3.338591075 seconds time elapsed + 9,433,736,246 cycles # 2.856 GHz + 19,935,455,469 instructions # 2.11 insn per cycle + 3.360609480 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.819625e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.820503e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.820503e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.807867e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.808713e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.808713e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 9.022232 sec +TOTAL : 9.081153 sec INFO: No Floating Point Exceptions have been reported - 25,882,405,875 cycles # 2.868 GHz - 79,448,983,201 instructions # 3.07 insn per cycle - 9.026481543 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4505) (avx2: 0) (512y: 0) (512z: 0) + 26,018,917,813 cycles # 2.864 GHz + 79,465,278,181 instructions # 3.05 insn per cycle + 9.085407536 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4445) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.446182e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.449327e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.449327e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.392154e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.395092e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.395092e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.768299 sec +TOTAL : 4.844253 sec INFO: No Floating Point Exceptions have been reported - 12,681,708,725 cycles # 2.658 GHz - 38,523,479,653 instructions # 3.04 insn per cycle - 4.772643197 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:12930) (avx2: 0) (512y: 0) (512z: 0) + 12,831,884,182 cycles # 2.647 GHz + 38,790,118,592 instructions # 3.02 insn per cycle + 4.848502841 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12946) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.783293e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.799085e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.799085e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.835313e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.851068e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.851068e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.116349 sec +TOTAL : 2.102167 sec INFO: No Floating Point Exceptions have been reported - 5,573,346,630 cycles # 2.629 GHz - 13,607,371,055 instructions # 2.44 insn per cycle - 2.120654201 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11327) (512y: 0) (512z: 0) + 5,610,923,922 cycles # 2.665 GHz + 13,739,248,056 instructions # 2.45 insn per cycle + 2.106309675 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11517) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.950987e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.971335e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.971335e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.885385e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.905960e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.905960e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.841569 sec +TOTAL : 1.854910 sec INFO: No Floating Point Exceptions have been reported - 4,914,422,282 cycles # 2.663 GHz - 12,272,016,530 instructions # 2.50 insn per cycle - 1.845933307 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10143) (512y: 239) (512z: 0) + 4,944,878,602 cycles # 2.661 GHz + 12,428,707,476 instructions # 2.51 insn per cycle + 1.859196994 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10335) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.694812e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.706153e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.706153e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.620737e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.632179e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.632179e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.459293 sec +TOTAL : 2.486586 sec INFO: No Floating Point Exceptions have been reported - 4,148,774,251 cycles # 1.685 GHz - 6,442,210,372 instructions # 1.55 insn per cycle - 2.463549102 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1628) (512y: 191) (512z: 9356) + 4,188,153,219 cycles # 1.682 GHz + 6,501,924,669 instructions # 1.55 insn per cycle + 2.490871580 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1805) (512y: 191) (512z: 9368) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 2e640fb20e..78b5d57214 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-06-02_21:00:43 +DATE: 2024-06-03_18:07:41 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.070046e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.070489e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.070728e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.072465e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.072863e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.073011e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.432122 sec +TOTAL : 2.429567 sec INFO: No Floating Point Exceptions have been reported - 7,909,240,752 cycles # 2.872 GHz - 18,000,344,677 instructions # 2.28 insn per cycle - 2.812238946 seconds time elapsed + 7,873,742,017 cycles # 2.858 GHz + 17,189,890,626 instructions # 2.18 insn per cycle + 2.810614216 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.257630e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.259727e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.259989e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.252226e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.254836e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.255128e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.998685 sec +TOTAL : 3.999219 sec INFO: No Floating Point Exceptions have been reported - 12,358,861,001 cycles # 2.857 GHz - 27,265,356,364 instructions # 2.21 insn per cycle - 4.380623068 seconds time elapsed + 12,383,566,367 cycles # 2.861 GHz + 28,539,894,752 instructions # 2.30 insn per cycle + 4.383788497 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.364227e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.364435e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.364435e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.486298e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.486497e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.486497e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.176405 sec +TOTAL : 7.044010 sec INFO: No Floating Point Exceptions have been reported - 18,821,710,602 cycles # 2.622 GHz - 53,917,723,661 instructions # 2.86 insn per cycle - 7.180812312 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) + 18,875,422,058 cycles # 2.679 GHz + 53,914,980,760 instructions # 2.86 insn per cycle + 7.048043451 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32438) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.537482e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.537565e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.537565e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.528426e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.528507e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.528507e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.438733 sec +TOTAL : 3.459359 sec INFO: No Floating Point Exceptions have been reported - 9,825,974,360 cycles # 2.855 GHz - 27,092,527,909 instructions # 2.76 insn per cycle - 3.442940335 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) + 9,903,492,345 cycles # 2.860 GHz + 27,159,679,932 instructions # 2.74 insn per cycle + 3.463482853 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96511) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.317063e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.317444e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.317444e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.294340e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.294743e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.294743e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.596530 sec +TOTAL : 1.608857 sec INFO: No Floating Point Exceptions have been reported - 4,226,902,337 cycles # 2.642 GHz - 9,560,928,493 instructions # 2.26 insn per cycle - 1.600685064 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) + 4,251,273,738 cycles # 2.637 GHz + 9,597,252,474 instructions # 2.26 insn per cycle + 1.613007793 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84989) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.692227e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.692715e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.692715e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.710001e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.710494e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.710494e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.434526 sec +TOTAL : 1.427734 sec INFO: No Floating Point Exceptions have been reported - 3,746,125,551 cycles # 2.606 GHz - 8,486,014,947 instructions # 2.27 insn per cycle - 1.438619859 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) + 3,763,325,270 cycles # 2.630 GHz + 8,521,622,001 instructions # 2.26 insn per cycle + 1.431927276 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80643) (512y: 89) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.273656e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.274120e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.274120e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.262704e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.263213e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.263213e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.618156 sec +TOTAL : 1.623854 sec INFO: No Floating Point Exceptions have been reported - 2,695,756,195 cycles # 1.663 GHz - 4,273,774,333 instructions # 1.59 insn per cycle - 1.622321455 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) + 2,708,113,519 cycles # 1.665 GHz + 4,288,403,954 instructions # 1.58 insn per cycle + 1.627965968 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2876) (512y: 103) (512z:79119) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index 1fadaabb4f..36e5ddb3e5 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-06-02_21:30:16 +DATE: 2024-06-03_18:27:32 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.065370e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.066381e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.066381e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.069807e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.070797e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.070797e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.397754 sec +TOTAL : 2.390412 sec INFO: No Floating Point Exceptions have been reported - 7,762,497,768 cycles # 2.853 GHz - 16,203,483,295 instructions # 2.09 insn per cycle - 2.776448372 seconds time elapsed + 7,772,262,333 cycles # 2.856 GHz + 17,485,384,479 instructions # 2.25 insn per cycle + 2.777088442 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,15 +79,15 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.238795e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.273940e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.273940e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.214032e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.248382e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.248382e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.985494 sec +TOTAL : 3.992588 sec INFO: No Floating Point Exceptions have been reported - 12,320,118,228 cycles # 2.858 GHz - 28,700,363,846 instructions # 2.33 insn per cycle - 4.365416695 seconds time elapsed + 12,381,100,845 cycles # 2.860 GHz + 28,809,274,855 instructions # 2.33 insn per cycle + 4.384322703 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.431447e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.431652e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.431652e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.534429e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.534647e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.534647e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.108151 sec +TOTAL : 7.010505 sec INFO: No Floating Point Exceptions have been reported - 18,792,897,710 cycles # 2.643 GHz - 53,918,227,536 instructions # 2.87 insn per cycle - 7.112225621 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) + 19,040,063,694 cycles # 2.716 GHz + 53,915,532,539 instructions # 2.83 insn per cycle + 7.014673205 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32438) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -138,16 +138,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.546658e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.546743e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.546743e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.522796e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.522880e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.522880e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.418887 sec +TOTAL : 3.472375 sec INFO: No Floating Point Exceptions have been reported - 9,791,096,347 cycles # 2.861 GHz - 27,093,479,045 instructions # 2.77 insn per cycle - 3.423017015 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) + 9,921,044,067 cycles # 2.854 GHz + 27,159,090,171 instructions # 2.74 insn per cycle + 3.476580021 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96511) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -167,16 +167,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.304069e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.304481e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.304481e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.284550e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.284932e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.284932e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.603100 sec +TOTAL : 1.612989 sec INFO: No Floating Point Exceptions have been reported - 4,241,407,347 cycles # 2.640 GHz - 9,561,955,028 instructions # 2.25 insn per cycle - 1.607196815 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) + 4,260,178,809 cycles # 2.636 GHz + 9,599,298,178 instructions # 2.25 insn per cycle + 1.617114232 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84989) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -196,16 +196,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.744578e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.745149e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.745149e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.698370e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.698859e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.698859e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.415314 sec +TOTAL : 1.433048 sec INFO: No Floating Point Exceptions have been reported - 3,737,020,554 cycles # 2.634 GHz - 8,486,765,632 instructions # 2.27 insn per cycle - 1.419451948 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) + 3,772,233,442 cycles # 2.626 GHz + 8,522,495,369 instructions # 2.26 insn per cycle + 1.437211202 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80643) (512y: 89) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -225,16 +225,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.289384e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.289882e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.289882e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.276746e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.277245e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.277245e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.610890 sec +TOTAL : 1.616963 sec INFO: No Floating Point Exceptions have been reported - 2,696,211,668 cycles # 1.670 GHz - 4,273,881,889 instructions # 1.59 insn per cycle - 1.615050450 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) + 2,704,381,455 cycles # 1.669 GHz + 4,289,252,981 instructions # 1.59 insn per cycle + 1.621067952 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2876) (512y: 103) (512z:79119) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index b7c9be9361..586e2a3c40 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-06-02_21:01:38 +DATE: 2024-06-03_18:08:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.065481e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.065914e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.066085e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.065479e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.065879e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.066080e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.431533 sec +TOTAL : 2.433636 sec INFO: No Floating Point Exceptions have been reported - 7,864,728,245 cycles # 2.855 GHz - 16,581,142,625 instructions # 2.11 insn per cycle - 2.810896011 seconds time elapsed + 7,867,649,256 cycles # 2.853 GHz + 17,930,493,406 instructions # 2.28 insn per cycle + 2.813625761 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.234334e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.236420e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.236694e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.251408e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.254022e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.254315e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.002654 sec +TOTAL : 3.999341 sec INFO: No Floating Point Exceptions have been reported - 12,362,983,148 cycles # 2.857 GHz - 26,818,544,684 instructions # 2.17 insn per cycle - 4.385714343 seconds time elapsed + 12,370,081,693 cycles # 2.859 GHz + 29,532,832,747 instructions # 2.39 insn per cycle + 4.381754418 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.360451e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.360670e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.360670e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.604921e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.605129e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.605129e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.178712 sec +TOTAL : 6.946520 sec INFO: No Floating Point Exceptions have been reported - 18,903,471,951 cycles # 2.632 GHz - 53,926,959,837 instructions # 2.85 insn per cycle - 7.182807918 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32063) (avx2: 0) (512y: 0) (512z: 0) + 18,898,258,212 cycles # 2.720 GHz + 53,941,018,950 instructions # 2.85 insn per cycle + 6.950480868 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32036) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.554111e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.554204e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.554204e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.536847e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.536936e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.536936e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.401831 sec +TOTAL : 3.440816 sec INFO: No Floating Point Exceptions have been reported - 9,728,814,018 cycles # 2.857 GHz - 27,089,535,875 instructions # 2.78 insn per cycle - 3.405876690 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96286) (avx2: 0) (512y: 0) (512z: 0) + 9,897,425,323 cycles # 2.874 GHz + 27,136,976,374 instructions # 2.74 insn per cycle + 3.445026034 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96387) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.279450e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.279875e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.279875e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.330836e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.331235e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.331235e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.614318 sec +TOTAL : 1.589285 sec INFO: No Floating Point Exceptions have been reported - 4,271,811,996 cycles # 2.641 GHz - 9,560,879,429 instructions # 2.24 insn per cycle - 1.618409778 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84478) (512y: 0) (512z: 0) + 4,264,338,826 cycles # 2.677 GHz + 9,591,625,358 instructions # 2.25 insn per cycle + 1.593576676 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84996) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.724611e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.725106e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.725106e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.750204e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.750700e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.750700e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.422019 sec +TOTAL : 1.413085 sec INFO: No Floating Point Exceptions have been reported - 3,745,973,725 cycles # 2.628 GHz - 8,485,619,535 instructions # 2.27 insn per cycle - 1.426190804 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80014) (512y: 241) (512z: 0) + 3,752,884,907 cycles # 2.650 GHz + 8,514,358,197 instructions # 2.27 insn per cycle + 1.417168539 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80666) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.244405e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.244886e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.244886e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.285502e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.285994e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.285994e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.632336 sec +TOTAL : 1.612194 sec INFO: No Floating Point Exceptions have been reported - 2,716,368,412 cycles # 1.661 GHz - 4,277,085,599 instructions # 1.57 insn per cycle - 1.636609876 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2169) (512y: 187) (512z:79110) + 2,702,142,076 cycles # 1.673 GHz + 4,289,047,207 instructions # 1.59 insn per cycle + 1.616308896 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2713) (512y: 185) (512z:79103) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 9454f64bcc..28e6c74910 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-06-02_21:02:33 +DATE: 2024-06-03_18:09:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.559368e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.560230e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.560569e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.282693e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.283455e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.283883e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.689610 sec +TOTAL : 1.743706 sec INFO: No Floating Point Exceptions have been reported - 5,589,146,316 cycles # 2.841 GHz - 11,119,486,865 instructions # 1.99 insn per cycle - 2.023529940 seconds time elapsed + 5,709,155,447 cycles # 2.851 GHz + 12,383,853,252 instructions # 2.17 insn per cycle + 2.059074311 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,24 +67,24 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.312667e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.313583e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.313721e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.937585 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.136623e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.137393e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.137518e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856829e-04 +- 8.333437e-05 ) GeV^-6 +TOTAL : 2.053767 sec INFO: No Floating Point Exceptions have been reported - 6,322,260,150 cycles # 2.852 GHz - 12,991,612,203 instructions # 2.05 insn per cycle - 2.273050569 seconds time elapsed + 6,600,822,454 cycles # 2.853 GHz + 14,650,396,107 instructions # 2.22 insn per cycle + 2.369651749 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849636e-03 -Avg ME (F77/GPU) = 9.8712405367667715E-003 -Relative difference = 0.0021934350433631634 +Avg ME (C++/GPU) = 9.849635e-03 +Avg ME (F77/GPU) = 9.8712451931260159E-003 +Relative difference = 0.0021940095370046923 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.449095e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.449366e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.449366e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.350768e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.351017e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.351017e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.254155 sec +TOTAL : 6.328437 sec INFO: No Floating Point Exceptions have been reported - 17,924,189,515 cycles # 2.865 GHz - 53,589,289,728 instructions # 2.99 insn per cycle - 6.258212218 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) + 18,112,133,095 cycles # 2.861 GHz + 53,917,761,859 instructions # 2.98 insn per cycle + 6.332410128 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20155) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087541066E-003 -Relative difference = 2.1197698286506752e-08 +Avg ME (F77/C++) = 9.8479612087551509E-003 +Relative difference = 2.119780432912131e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.313591e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.314004e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.314004e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.302446e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.302838e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.302838e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.598328 sec +TOTAL : 1.603618 sec INFO: No Floating Point Exceptions have been reported - 4,580,110,582 cycles # 2.860 GHz - 13,761,912,039 instructions # 3.00 insn per cycle - 1.602548280 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) + 4,595,250,388 cycles # 2.859 GHz + 13,814,198,944 instructions # 3.01 insn per cycle + 1.607761589 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:97032) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896527003E-003 -Relative difference = 3.151388282563952e-08 +Avg ME (F77/C++) = 9.8479546896367235E-003 +Relative difference = 3.1515505172940424e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.493632e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.495239e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.495239e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.461361e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.463009e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.463009e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.817743 sec +TOTAL : 0.824355 sec INFO: No Floating Point Exceptions have been reported - 2,143,262,417 cycles # 2.610 GHz - 4,816,174,375 instructions # 2.25 insn per cycle - 0.821948335 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) + 2,147,702,206 cycles # 2.594 GHz + 4,843,386,973 instructions # 2.26 insn per cycle + 0.828633688 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85515) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070551E-003 -Relative difference = 1.858823877057982e-08 +Avg ME (F77/C++) = 9.8929728161091246E-003 +Relative difference = 1.8588029579156084e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.535714e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.537917e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.537917e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.529713e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.531799e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.531799e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.705994 sec +TOTAL : 0.706721 sec INFO: No Floating Point Exceptions have been reported - 1,872,359,401 cycles # 2.639 GHz - 4,273,597,055 instructions # 2.28 insn per cycle - 0.710000254 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) + 1,879,943,341 cycles # 2.647 GHz + 4,297,878,746 instructions # 2.29 insn per cycle + 0.710927830 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81214) (512y: 44) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070551E-003 -Relative difference = 1.858823877057982e-08 +Avg ME (F77/C++) = 9.8929728161091246E-003 +Relative difference = 1.8588029579156084e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.586004e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.587962e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.587962e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.551582e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.553505e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.553505e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.806945 sec +TOTAL : 0.810975 sec INFO: No Floating Point Exceptions have been reported - 1,355,141,176 cycles # 1.672 GHz - 2,158,222,960 instructions # 1.59 insn per cycle - 0.811081253 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2890) (512y: 49) (512z:79305) + 1,364,375,785 cycles # 1.675 GHz + 2,169,142,049 instructions # 1.59 insn per cycle + 0.815064535 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3493) (512y: 47) (512z:79334) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982958280E-003 -Relative difference = 2.0044092642523172e-08 +Avg ME (F77/C++) = 9.8929811982676284E-003 +Relative difference = 2.004124217057488e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index c3dad58c83..859b38d5c1 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-06-02_21:31:10 +DATE: 2024-06-03_18:28:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.586248e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.588038e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.588038e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 -TOTAL : 1.639511 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.295855e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.297495e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.297495e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187093e-05 +- 9.825663e-06 ) GeV^-6 +TOTAL : 1.706058 sec INFO: No Floating Point Exceptions have been reported - 5,458,859,849 cycles # 2.849 GHz - 11,717,497,877 instructions # 2.15 insn per cycle - 1.972700319 seconds time elapsed + 5,593,764,153 cycles # 2.850 GHz + 11,978,491,776 instructions # 2.14 insn per cycle + 2.019336398 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,24 +79,24 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.304060e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.317709e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.317709e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 -TOTAL : 1.924656 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.117757e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.129477e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.129477e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856440e-04 +- 8.331091e-05 ) GeV^-6 +TOTAL : 2.042960 sec INFO: No Floating Point Exceptions have been reported - 6,299,130,122 cycles # 2.862 GHz - 13,932,257,788 instructions # 2.21 insn per cycle - 2.257564113 seconds time elapsed + 6,558,229,128 cycles # 2.852 GHz + 14,467,859,228 instructions # 2.21 insn per cycle + 2.357922051 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849636e-03 -Avg ME (F77/GPU) = 9.8712405367667715E-003 -Relative difference = 0.0021934350433631634 +Avg ME (C++/GPU) = 9.849635e-03 +Avg ME (F77/GPU) = 9.8712451931260159E-003 +Relative difference = 0.0021940095370046923 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.449386e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.449634e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.449634e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.173840e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.174090e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.174090e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.252998 sec +TOTAL : 6.464931 sec INFO: No Floating Point Exceptions have been reported - 17,934,331,616 cycles # 2.867 GHz - 53,590,587,156 instructions # 2.99 insn per cycle - 6.257054326 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) + 18,464,188,622 cycles # 2.855 GHz + 53,919,219,439 instructions # 2.92 insn per cycle + 6.469025984 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20155) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087541066E-003 -Relative difference = 2.1197698286506752e-08 +Avg ME (F77/C++) = 9.8479612087551509E-003 +Relative difference = 2.119780432912131e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= @@ -138,16 +138,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.318510e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.318903e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.318903e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.299445e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.299838e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.299838e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.596218 sec +TOTAL : 1.605494 sec INFO: No Floating Point Exceptions have been reported - 4,578,862,735 cycles # 2.862 GHz - 13,762,757,180 instructions # 3.01 insn per cycle - 1.600379529 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) + 4,600,449,973 cycles # 2.860 GHz + 13,814,867,008 instructions # 3.00 insn per cycle + 1.609640985 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:97032) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -155,8 +155,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896527003E-003 -Relative difference = 3.151388282563952e-08 +Avg ME (F77/C++) = 9.8479546896367235E-003 +Relative difference = 3.1515505172940424e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= @@ -167,16 +167,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.561862e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.563430e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.563430e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.565702e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.567481e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.567481e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.810097 sec +TOTAL : 0.809807 sec INFO: No Floating Point Exceptions have been reported - 2,150,908,758 cycles # 2.644 GHz - 4,817,064,263 instructions # 2.24 insn per cycle - 0.814268690 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) + 2,149,625,871 cycles # 2.644 GHz + 4,844,274,687 instructions # 2.25 insn per cycle + 0.813973066 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85515) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -184,8 +184,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070551E-003 -Relative difference = 1.858823877057982e-08 +Avg ME (F77/C++) = 9.8929728161091246E-003 +Relative difference = 1.8588029579156084e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= @@ -196,16 +196,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.557169e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.559485e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.559485e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.487567e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.489594e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.489594e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.704248 sec +TOTAL : 0.710407 sec INFO: No Floating Point Exceptions have been reported - 1,860,720,782 cycles # 2.629 GHz - 4,274,198,133 instructions # 2.30 insn per cycle - 0.708306150 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) + 1,891,140,135 cycles # 2.649 GHz + 4,298,530,156 instructions # 2.27 insn per cycle + 0.714547469 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81214) (512y: 44) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -213,8 +213,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070551E-003 -Relative difference = 1.858823877057982e-08 +Avg ME (F77/C++) = 9.8929728161091246E-003 +Relative difference = 1.8588029579156084e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= @@ -225,16 +225,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.582585e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.584548e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.584548e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.527055e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.529167e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.529167e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.807471 sec +TOTAL : 0.814394 sec INFO: No Floating Point Exceptions have been reported - 1,357,966,185 cycles # 1.671 GHz - 2,159,181,276 instructions # 1.59 insn per cycle - 0.813237032 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2890) (512y: 49) (512z:79305) + 1,366,391,214 cycles # 1.671 GHz + 2,170,869,275 instructions # 1.59 insn per cycle + 0.818526272 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3493) (512y: 47) (512z:79334) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -242,8 +242,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982958280E-003 -Relative difference = 2.0044092642523172e-08 +Avg ME (F77/C++) = 9.8929811982676284E-003 +Relative difference = 2.004124217057488e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 5816b2c2c2..a86a223e97 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-06-02_21:03:12 +DATE: 2024-06-03_18:10:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.532712e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.533546e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.533906e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.288627e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.289386e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.289766e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.691126 sec +TOTAL : 1.745254 sec INFO: No Floating Point Exceptions have been reported - 5,622,580,002 cycles # 2.845 GHz - 11,510,934,287 instructions # 2.05 insn per cycle - 2.034693744 seconds time elapsed + 5,703,053,044 cycles # 2.850 GHz + 11,232,321,378 instructions # 1.97 insn per cycle + 2.059584789 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,24 +67,24 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.322031e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.322933e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.323068e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.930959 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.130933e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.131711e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.131834e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856829e-04 +- 8.333437e-05 ) GeV^-6 +TOTAL : 2.058680 sec INFO: No Floating Point Exceptions have been reported - 6,307,622,079 cycles # 2.854 GHz - 13,926,153,465 instructions # 2.21 insn per cycle - 2.267828569 seconds time elapsed + 6,607,736,050 cycles # 2.851 GHz + 13,912,202,043 instructions # 2.11 insn per cycle + 2.376581429 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849636e-03 -Avg ME (F77/GPU) = 9.8712405367667715E-003 -Relative difference = 0.0021934350433631634 +Avg ME (C++/GPU) = 9.849635e-03 +Avg ME (F77/GPU) = 9.8712451931260107E-003 +Relative difference = 0.0021940095370041636 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd1/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.476563e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.476825e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.476825e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.251653e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.251887e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.251887e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.234385 sec +TOTAL : 6.403228 sec INFO: No Floating Point Exceptions have been reported - 17,827,850,406 cycles # 2.859 GHz - 53,580,311,893 instructions # 3.01 insn per cycle - 6.238431546 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) + 18,326,271,023 cycles # 2.861 GHz + 53,904,115,772 instructions # 2.94 insn per cycle + 6.407222242 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20155) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087582491E-003 -Relative difference = 2.1198118933954545e-08 +Avg ME (F77/C++) = 9.8479612087572898E-003 +Relative difference = 2.1198021522715588e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.320430e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.320855e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.320855e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.311127e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.311516e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.311516e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.594879 sec +TOTAL : 1.599283 sec INFO: No Floating Point Exceptions have been reported - 4,567,314,747 cycles # 2.858 GHz - 13,755,226,123 instructions # 3.01 insn per cycle - 1.598972704 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96606) (avx2: 0) (512y: 0) (512z: 0) + 4,591,437,389 cycles # 2.865 GHz + 13,807,198,493 instructions # 3.01 insn per cycle + 1.603441295 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96667) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896225560E-003 -Relative difference = 3.151694379513441e-08 +Avg ME (F77/C++) = 9.8479546896065809E-003 +Relative difference = 3.151856596628469e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.611819e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.613638e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.613638e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.548285e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.549831e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.549831e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.803348 sec +TOTAL : 0.811520 sec INFO: No Floating Point Exceptions have been reported - 2,141,149,617 cycles # 2.654 GHz - 4,818,402,736 instructions # 2.25 insn per cycle - 0.807515784 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85359) (512y: 0) (512z: 0) + 2,161,880,934 cycles # 2.653 GHz + 4,847,309,344 instructions # 2.24 insn per cycle + 0.815674039 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85905) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070967E-003 -Relative difference = 1.8588234562202478e-08 +Avg ME (F77/C++) = 9.8929728161091923E-003 +Relative difference = 1.85880227405429e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.568391e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.570476e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.570476e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.566981e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.569078e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.569078e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.704023 sec +TOTAL : 0.702924 sec INFO: No Floating Point Exceptions have been reported - 1,875,444,352 cycles # 2.651 GHz - 4,275,225,721 instructions # 2.28 insn per cycle - 0.708142027 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81075) (512y: 26) (512z: 0) + 1,875,062,433 cycles # 2.655 GHz + 4,301,131,886 instructions # 2.29 insn per cycle + 0.707001616 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81754) (512y: 24) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070967E-003 -Relative difference = 1.8588234562202478e-08 +Avg ME (F77/C++) = 9.8929728161091923E-003 +Relative difference = 1.85880227405429e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.586825e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.588943e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.588943e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.542502e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.544426e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.544426e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.806684 sec +TOTAL : 0.812292 sec INFO: No Floating Point Exceptions have been reported - 1,360,116,629 cycles # 1.679 GHz - 2,164,473,202 instructions # 1.59 insn per cycle - 0.810866699 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3487) (512y: 34) (512z:79499) + 1,367,647,712 cycles # 1.677 GHz + 2,175,950,494 instructions # 1.59 insn per cycle + 0.816336991 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4106) (512y: 32) (512z:79555) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982955140E-003 -Relative difference = 2.0044060904369713e-08 +Avg ME (F77/C++) = 9.8929811982957326E-003 +Relative difference = 2.0044082998332894e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 74b152faa4..12658e1990 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-06-02_21:03:51 +DATE: 2024-06-03_18:10:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.688684e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.689237e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.689554e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.691014e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.691544e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.691771e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.181191 sec +TOTAL : 2.180865 sec INFO: No Floating Point Exceptions have been reported - 7,135,001,611 cycles # 2.850 GHz - 13,803,494,373 instructions # 1.93 insn per cycle - 2.561335539 seconds time elapsed + 7,147,217,142 cycles # 2.856 GHz + 15,944,601,902 instructions # 2.23 insn per cycle + 2.561081631 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.110278e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.110643e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.110684e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.111225e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.111549e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111585e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.414487 sec +TOTAL : 3.420115 sec INFO: No Floating Point Exceptions have been reported - 10,695,558,623 cycles # 2.859 GHz - 24,933,179,401 instructions # 2.33 insn per cycle - 3.798291401 seconds time elapsed + 10,709,106,790 cycles # 2.860 GHz + 22,014,750,790 instructions # 2.06 insn per cycle + 3.803081141 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.308623e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.308815e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.308815e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.215233e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.215412e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.215412e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 7.229792 sec +TOTAL : 7.320173 sec INFO: No Floating Point Exceptions have been reported - 19,160,887,501 cycles # 2.649 GHz - 54,158,064,644 instructions # 2.83 insn per cycle - 7.233871441 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32067) (avx2: 0) (512y: 0) (512z: 0) + 19,183,009,981 cycles # 2.620 GHz + 54,142,256,875 instructions # 2.82 insn per cycle + 7.324194367 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32014) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.495915e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.496004e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.496004e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.502066e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.502148e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.502148e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.534835 sec +TOTAL : 3.520744 sec INFO: No Floating Point Exceptions have been reported - 9,323,019,385 cycles # 2.635 GHz - 26,159,152,582 instructions # 2.81 insn per cycle - 3.538902916 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96007) (avx2: 0) (512y: 0) (512z: 0) + 9,378,849,301 cycles # 2.662 GHz + 26,196,672,297 instructions # 2.79 insn per cycle + 3.524768328 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:96060) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.446264e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.446692e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.446692e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.457096e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.457531e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.457531e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.538022 sec +TOTAL : 1.532469 sec INFO: No Floating Point Exceptions have been reported - 4,070,123,740 cycles # 2.641 GHz - 9,227,321,198 instructions # 2.27 insn per cycle - 1.542213209 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84155) (512y: 0) (512z: 0) + 4,070,106,033 cycles # 2.650 GHz + 9,257,201,603 instructions # 2.27 insn per cycle + 1.536642002 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84397) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.010931e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.011595e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.011595e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.982544e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.983110e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.983110e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.321905 sec +TOTAL : 1.331088 sec INFO: No Floating Point Exceptions have been reported - 3,507,528,638 cycles # 2.647 GHz - 8,174,534,380 instructions # 2.33 insn per cycle - 1.326023150 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79844) (512y: 79) (512z: 0) + 3,549,141,158 cycles # 2.659 GHz + 8,190,051,527 instructions # 2.31 insn per cycle + 1.335300673 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80028) (512y: 79) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.380863e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.381372e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.381372e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.356629e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.357180e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.357180e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.567452 sec +TOTAL : 1.578537 sec INFO: No Floating Point Exceptions have been reported - 2,624,094,649 cycles # 1.671 GHz - 4,154,491,609 instructions # 1.58 insn per cycle - 1.571532309 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2046) (512y: 93) (512z:78760) + 2,632,257,170 cycles # 1.664 GHz + 4,179,824,767 instructions # 1.59 insn per cycle + 1.582628942 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2614) (512y: 93) (512z:78909) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 8617043553..3ead6e031a 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-06-02_21:04:44 +DATE: 2024-06-03_18:11:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.686084e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.686624e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.686882e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.676327e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.676864e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.677096e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.181122 sec +TOTAL : 2.185697 sec INFO: No Floating Point Exceptions have been reported - 7,147,611,369 cycles # 2.854 GHz - 14,853,271,942 instructions # 2.08 insn per cycle - 2.560951446 seconds time elapsed + 7,180,493,419 cycles # 2.851 GHz + 16,069,308,155 instructions # 2.24 insn per cycle + 2.576733860 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.108087e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.108460e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.108504e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.112258e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.112588e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112629e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.416139 sec +TOTAL : 3.410661 sec INFO: No Floating Point Exceptions have been reported - 10,697,523,108 cycles # 2.858 GHz - 25,124,574,585 instructions # 2.35 insn per cycle - 3.798642472 seconds time elapsed + 10,670,880,741 cycles # 2.855 GHz + 24,536,777,363 instructions # 2.30 insn per cycle + 3.793405431 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.809282e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.809493e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.809493e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.751942e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.752149e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.752149e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.765846 sec +TOTAL : 6.826971 sec INFO: No Floating Point Exceptions have been reported - 19,320,733,855 cycles # 2.854 GHz - 54,152,931,560 instructions # 2.80 insn per cycle - 6.769909136 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32244) (avx2: 0) (512y: 0) (512z: 0) + 19,135,732,377 cycles # 2.802 GHz + 54,164,572,300 instructions # 2.83 insn per cycle + 6.831071371 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:32216) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.498471e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.498552e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.498552e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.488222e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.488300e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.488300e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.528542 sec +TOTAL : 3.552677 sec INFO: No Floating Point Exceptions have been reported - 9,412,357,855 cycles # 2.665 GHz - 26,078,069,796 instructions # 2.77 insn per cycle - 3.532621942 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:95901) (avx2: 0) (512y: 0) (512z: 0) + 9,347,642,331 cycles # 2.629 GHz + 26,094,609,262 instructions # 2.79 insn per cycle + 3.556816913 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:95949) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.535316e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.535751e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.535751e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.489469e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.489938e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.489938e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.497994 sec +TOTAL : 1.518082 sec INFO: No Floating Point Exceptions have been reported - 4,026,588,228 cycles # 2.682 GHz - 9,213,775,354 instructions # 2.29 insn per cycle - 1.502163303 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83776) (512y: 0) (512z: 0) + 4,031,634,285 cycles # 2.650 GHz + 9,220,204,880 instructions # 2.29 insn per cycle + 1.522254552 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83871) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.075585e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.076181e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.076181e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.987027e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.987615e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.987615e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.300193 sec +TOTAL : 1.329462 sec INFO: No Floating Point Exceptions have been reported - 3,527,583,777 cycles # 2.706 GHz - 8,167,337,738 instructions # 2.32 insn per cycle - 1.304488773 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79373) (512y: 229) (512z: 0) + 3,525,556,122 cycles # 2.645 GHz + 8,174,876,823 instructions # 2.32 insn per cycle + 1.333461169 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79434) (512y: 229) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.471286e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.471806e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.471806e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.361926e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.362434e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.362434e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.527099 sec +TOTAL : 1.576463 sec INFO: No Floating Point Exceptions have been reported - 2,623,859,326 cycles # 1.714 GHz - 4,153,167,835 instructions # 1.58 insn per cycle - 1.531362339 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1493) (512y: 175) (512z:78776) + 2,635,723,343 cycles # 1.668 GHz + 4,174,517,203 instructions # 1.58 insn per cycle + 1.580636581 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1878) (512y: 175) (512z:78883) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index e2998d6ab4..22bd4ba540 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-06-02_20:59:25 +DATE: 2024-06-03_18:06:22 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.755226e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.274442e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.625510e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.420650e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.186340e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.600584e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.455971 sec +TOTAL : 0.457505 sec INFO: No Floating Point Exceptions have been reported - 1,884,775,023 cycles # 2.810 GHz - 2,642,675,236 instructions # 1.40 insn per cycle - 0.728830794 seconds time elapsed + 1,899,272,623 cycles # 2.815 GHz + 2,683,069,645 instructions # 1.41 insn per cycle + 0.731629286 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.160169e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.139027e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.542035e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.262197e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.143942e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.550325e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.535733 sec +TOTAL : 0.536930 sec INFO: No Floating Point Exceptions have been reported - 2,191,012,800 cycles # 2.827 GHz - 3,140,675,750 instructions # 1.43 insn per cycle - 0.832367820 seconds time elapsed + 2,194,237,717 cycles # 2.824 GHz + 3,155,658,053 instructions # 1.44 insn per cycle + 0.833915066 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.013225e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.034611e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.034611e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.033633e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.055928e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.055928e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.640289 sec +TOTAL : 1.608174 sec INFO: No Floating Point Exceptions have been reported - 4,709,353,007 cycles # 2.865 GHz - 13,462,429,209 instructions # 2.86 insn per cycle - 1.644426545 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) + 4,627,352,906 cycles # 2.871 GHz + 13,198,524,909 instructions # 2.85 insn per cycle + 1.612420837 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 720) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499484 -Relative difference = 5.286896509487005e-07 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.842535e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.913005e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.913005e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.824468e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.893565e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.893565e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.912012 sec +TOTAL : 0.920644 sec INFO: No Floating Point Exceptions have been reported - 2,622,620,970 cycles # 2.864 GHz - 7,552,013,729 instructions # 2.88 insn per cycle - 0.916398164 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) + 2,651,938,832 cycles # 2.869 GHz + 7,562,803,805 instructions # 2.85 insn per cycle + 0.924952570 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3116) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499478 -Relative difference = 5.28689651338321e-07 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.080547e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.281001e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.281001e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.066566e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.264728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.264728e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.555084 sec +TOTAL : 0.557193 sec INFO: No Floating Point Exceptions have been reported - 1,478,897,839 cycles # 2.647 GHz - 3,119,129,700 instructions # 2.11 insn per cycle - 0.559398478 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) + 1,496,586,992 cycles # 2.668 GHz + 3,166,660,688 instructions # 2.12 insn per cycle + 0.561453315 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3002) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.427781e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.674591e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.674591e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.416441e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.661152e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.661152e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.501239 sec +TOTAL : 0.502498 sec INFO: No Floating Point Exceptions have been reported - 1,340,705,970 cycles # 2.655 GHz - 2,981,253,669 instructions # 2.22 insn per cycle - 0.505572840 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) + 1,353,923,140 cycles # 2.675 GHz + 3,021,240,550 instructions # 2.23 insn per cycle + 0.506695350 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2769) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.239263e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.345599e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.345599e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.242663e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.346403e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.346403e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.757058 sec +TOTAL : 0.755886 sec INFO: No Floating Point Exceptions have been reported - 1,333,648,350 cycles # 1.754 GHz - 1,953,454,025 instructions # 1.46 insn per cycle - 0.761354043 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) + 1,335,151,347 cycles # 1.758 GHz + 1,969,686,710 instructions # 1.48 insn per cycle + 0.760145815 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1391) (512y: 106) (512z: 2217) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index ea21ef5e35..bf06ae833c 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-06-02_21:28:51 +DATE: 2024-06-03_18:26:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,15 +53,15 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.419906e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.143705e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.143705e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.507743e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.244408e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.244408e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.485647 sec +TOTAL : 0.483649 sec INFO: No Floating Point Exceptions have been reported - 1,977,356,966 cycles # 2.831 GHz - 2,947,602,370 instructions # 1.49 insn per cycle - 0.756831394 seconds time elapsed + 1,963,989,630 cycles # 2.822 GHz + 2,915,685,197 instructions # 1.48 insn per cycle + 0.754321456 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,15 +79,15 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.235107e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.556413e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.556413e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.256162e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.571113e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.571113e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.757410 sec +TOTAL : 0.756334 sec INFO: No Floating Point Exceptions have been reported - 2,885,831,764 cycles # 2.864 GHz - 4,411,934,467 instructions # 1.53 insn per cycle - 1.066148439 seconds time elapsed + 2,857,254,096 cycles # 2.830 GHz + 4,390,239,241 instructions # 1.54 insn per cycle + 1.068102530 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.023172e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.045212e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.045212e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.028953e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.051590e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.051590e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.632598 sec +TOTAL : 1.623491 sec INFO: No Floating Point Exceptions have been reported - 4,757,113,003 cycles # 2.909 GHz - 13,469,643,583 instructions # 2.83 insn per cycle - 1.637219700 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) + 4,669,332,077 cycles # 2.869 GHz + 13,205,693,220 instructions # 2.83 insn per cycle + 1.628130363 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 720) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499484 -Relative difference = 5.286896509487005e-07 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -138,16 +138,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.853468e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.927865e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.927865e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.810728e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.881304e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.881304e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.915572 sec +TOTAL : 0.936476 sec INFO: No Floating Point Exceptions have been reported - 2,673,568,028 cycles # 2.908 GHz - 7,602,475,789 instructions # 2.84 insn per cycle - 0.920287554 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) + 2,697,621,166 cycles # 2.868 GHz + 7,612,311,404 instructions # 2.82 insn per cycle + 0.941119380 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3116) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -155,8 +155,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499478 -Relative difference = 5.28689651338321e-07 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -167,16 +167,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.103941e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.314249e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.314249e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.030271e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.230400e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.230400e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.559050 sec +TOTAL : 0.572120 sec INFO: No Floating Point Exceptions have been reported - 1,524,948,008 cycles # 2.709 GHz - 3,168,482,011 instructions # 2.08 insn per cycle - 0.563565721 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) + 1,540,597,129 cycles # 2.674 GHz + 3,215,714,008 instructions # 2.09 insn per cycle + 0.576670244 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3002) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -184,8 +184,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -196,16 +196,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.488503e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.747329e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.747329e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.370959e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.617466e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.617466e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.500126 sec +TOTAL : 0.518302 sec INFO: No Floating Point Exceptions have been reported - 1,382,997,758 cycles # 2.744 GHz - 3,030,723,769 instructions # 2.19 insn per cycle - 0.504745068 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) + 1,404,029,571 cycles # 2.688 GHz + 3,072,237,447 instructions # 2.19 insn per cycle + 0.523023062 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2769) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -213,8 +213,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -225,16 +225,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.290455e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.403634e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.403634e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.218921e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.326156e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.326156e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.749461 sec +TOTAL : 0.772656 sec INFO: No Floating Point Exceptions have been reported - 1,376,974,835 cycles # 1.828 GHz - 1,993,483,040 instructions # 1.45 insn per cycle - 0.754191421 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) + 1,381,653,206 cycles # 1.779 GHz + 2,009,158,338 instructions # 1.45 insn per cycle + 0.777302997 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1391) (512y: 106) (512z: 2217) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -242,8 +242,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index e245581a8d..528c14820e 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-06-02_20:59:38 +DATE: 2024-06-03_18:06:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.726582e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.144906e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.472843e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.387528e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.087353e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.479798e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.456749 sec +TOTAL : 0.454641 sec INFO: No Floating Point Exceptions have been reported - 1,887,155,378 cycles # 2.815 GHz - 2,674,611,026 instructions # 1.42 insn per cycle - 0.729330088 seconds time elapsed + 1,892,592,028 cycles # 2.818 GHz + 2,644,916,478 instructions # 1.40 insn per cycle + 0.728910769 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.182415e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.046068e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.438938e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.251456e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.055592e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.448912e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.537611 sec +TOTAL : 0.537942 sec INFO: No Floating Point Exceptions have been reported - 2,190,802,810 cycles # 2.823 GHz - 3,133,468,280 instructions # 1.43 insn per cycle - 0.833932090 seconds time elapsed + 2,196,827,706 cycles # 2.825 GHz + 3,163,011,259 instructions # 1.44 insn per cycle + 0.835779565 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.009988e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.031220e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.031220e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.028992e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.051109e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.051109e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.645685 sec +TOTAL : 1.615636 sec INFO: No Floating Point Exceptions have been reported - 4,722,807,018 cycles # 2.864 GHz - 13,456,640,489 instructions # 2.85 insn per cycle - 1.649922962 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 849) (avx2: 0) (512y: 0) (512z: 0) + 4,630,123,707 cycles # 2.860 GHz + 13,187,048,762 instructions # 2.85 insn per cycle + 1.619942385 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 705) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499484 -Relative difference = 5.286896509487005e-07 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.814484e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.883785e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.883785e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.814458e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.882953e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.882953e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.926401 sec +TOTAL : 0.925381 sec INFO: No Floating Point Exceptions have been reported - 2,657,840,392 cycles # 2.859 GHz - 7,551,476,794 instructions # 2.84 insn per cycle - 0.930660111 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3092) (avx2: 0) (512y: 0) (512z: 0) + 2,650,904,425 cycles # 2.853 GHz + 7,560,878,860 instructions # 2.85 insn per cycle + 0.929809663 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3110) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499478 -Relative difference = 5.28689651338321e-07 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.099699e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.310121e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.310121e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.042543e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.239403e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.239403e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.551668 sec +TOTAL : 0.561428 sec INFO: No Floating Point Exceptions have been reported - 1,479,744,333 cycles # 2.665 GHz - 3,118,004,055 instructions # 2.11 insn per cycle - 0.555928182 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2900) (512y: 0) (512z: 0) + 1,506,189,984 cycles # 2.666 GHz + 3,165,859,184 instructions # 2.10 insn per cycle + 0.565660418 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2987) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.438094e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.687392e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.687392e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.386752e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.627703e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.627703e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.499527 sec +TOTAL : 0.507110 sec INFO: No Floating Point Exceptions have been reported - 1,342,931,484 cycles # 2.669 GHz - 2,978,966,446 instructions # 2.22 insn per cycle - 0.503782569 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2670) (512y: 104) (512z: 0) + 1,357,913,949 cycles # 2.659 GHz + 3,018,493,039 instructions # 2.22 insn per cycle + 0.511448840 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2743) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.249330e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.355241e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.355241e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.238275e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.344573e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.344573e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.753724 sec +TOTAL : 0.757097 sec INFO: No Floating Point Exceptions have been reported - 1,329,841,706 cycles # 1.756 GHz - 1,951,471,549 instructions # 1.47 insn per cycle - 0.758013007 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1348) (512y: 106) (512z: 2173) + 1,334,322,654 cycles # 1.754 GHz + 1,967,860,641 instructions # 1.47 insn per cycle + 0.761402568 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1368) (512y: 106) (512z: 2217) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 +Avg ME (F77/C++) = 0.14247482467492589 +Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 3a86532d9d..7c8aa5030d 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-06-02_20:59:52 +DATE: 2024-06-03_18:06:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.556743e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.220581e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.348241e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.550928e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.024345e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.134616e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.446170 sec +TOTAL : 0.449386 sec INFO: No Floating Point Exceptions have been reported - 1,860,217,734 cycles # 2.814 GHz - 2,643,257,485 instructions # 1.42 insn per cycle - 0.717600829 seconds time elapsed + 1,888,054,726 cycles # 2.813 GHz + 2,654,260,383 instructions # 1.41 insn per cycle + 0.728565297 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 165 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.900642e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.805238e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.969673e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.486879 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.491395e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.519769e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.624892e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.571360e+02 +- 2.114020e+02 ) GeV^-2 +TOTAL : 0.490219 sec INFO: No Floating Point Exceptions have been reported - 2,017,900,407 cycles # 2.822 GHz - 2,883,772,994 instructions # 1.43 insn per cycle - 0.772554185 seconds time elapsed + 2,021,512,627 cycles # 2.815 GHz + 2,891,282,771 instructions # 1.43 insn per cycle + 0.776542002 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -83,8 +83,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424226e-01 -Avg ME (F77/GPU) = 0.14247488790821983 -Relative difference = 0.00036713209996037764 +Avg ME (F77/GPU) = 0.14247487904286338 +Relative difference = 0.0003670698531228044 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.071641e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.095909e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.095909e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.550578 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.082711e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.107977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107977e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.534634 sec INFO: No Floating Point Exceptions have been reported - 4,458,106,622 cycles # 2.869 GHz - 13,047,664,900 instructions # 2.93 insn per cycle - 1.554777575 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) + 4,415,097,381 cycles # 2.871 GHz + 12,958,570,213 instructions # 2.94 insn per cycle + 1.538797105 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 658) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246857540270419 -Relative difference = 1.7265064590569047e-07 +Avg ME (F77/C++) = 0.14246861273719524 +Relative difference = 8.940352641194861e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.867972e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.052825e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.052825e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.592805 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.819766e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.993390e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.993390e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 0.601639 sec INFO: No Floating Point Exceptions have been reported - 1,702,248,935 cycles # 2.855 GHz - 4,512,704,282 instructions # 2.65 insn per cycle - 0.597040153 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) + 1,735,482,071 cycles # 2.869 GHz + 4,549,139,048 instructions # 2.62 insn per cycle + 0.605662961 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3638) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246859631675157 -Relative difference = 2.5853054135974944e-08 +Avg ME (F77/C++) = 0.14246862329122401 +Relative difference = 1.6348320966878032e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.475375e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.145092e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.145092e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.533626e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.225203e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.225203e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.320934 sec +TOTAL : 0.317106 sec INFO: No Floating Point Exceptions have been reported - 853,448,782 cycles # 2.630 GHz - 1,896,008,778 instructions # 2.22 insn per cycle - 0.325158867 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) + 862,393,564 cycles # 2.690 GHz + 1,924,665,702 instructions # 2.23 insn per cycle + 0.321250486 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3584) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247491543012991 +Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.798508e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.559778e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.559778e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.943619e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.727716e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.727716e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.304197 sec +TOTAL : 0.296810 sec INFO: No Floating Point Exceptions have been reported - 803,501,124 cycles # 2.610 GHz - 1,818,839,783 instructions # 2.26 insn per cycle - 0.308391634 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) + 810,008,261 cycles # 2.697 GHz + 1,841,327,409 instructions # 2.27 insn per cycle + 0.300975383 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3414) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247491543012991 +Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.329166e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.749966e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.749966e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.370645e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.799268e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.799268e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.402147 sec +TOTAL : 0.398078 sec INFO: No Floating Point Exceptions have been reported - 735,010,684 cycles # 1.812 GHz - 1,304,684,504 instructions # 1.78 insn per cycle - 0.406336425 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1973) (512y: 32) (512z: 2382) + 737,475,489 cycles # 1.836 GHz + 1,315,863,036 instructions # 1.78 insn per cycle + 0.402348444 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2005) (512y: 32) (512z: 2432) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489383243206 -Relative difference = 4.32888033512879e-08 +Avg ME (F77/C++) = 0.14247491576758442 +Relative difference = 1.1066920862943416e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 94d91d36db..b474391c5e 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-06-02_21:29:04 +DATE: 2024-06-03_18:26:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -53,21 +53,21 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.395370e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.197522e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.197522e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 -TOTAL : 0.460501 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.399533e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.035371e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.035371e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.017654e+01 +- 1.429183e+01 ) GeV^-2 +TOTAL : 0.461527 sec INFO: No Floating Point Exceptions have been reported - 1,922,773,738 cycles # 2.852 GHz - 2,838,003,502 instructions # 1.48 insn per cycle - 0.731092352 seconds time elapsed + 1,898,709,085 cycles # 2.819 GHz + 2,794,611,500 instructions # 1.47 insn per cycle + 0.730805065 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 165 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= @@ -79,15 +79,15 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.025472e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.955203e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.955203e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 -TOTAL : 0.631520 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.018371e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.847717e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.847717e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.609941e+02 +- 2.115589e+02 ) GeV^-2 +TOTAL : 0.631803 sec INFO: No Floating Point Exceptions have been reported - 2,501,267,965 cycles # 2.864 GHz - 3,790,017,450 instructions # 1.52 insn per cycle - 0.930310379 seconds time elapsed + 2,475,233,605 cycles # 2.829 GHz + 3,755,433,498 instructions # 1.52 insn per cycle + 0.931704895 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -95,8 +95,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424226e-01 -Avg ME (F77/GPU) = 0.14247488790821983 -Relative difference = 0.00036713209996037764 +Avg ME (F77/GPU) = 0.14247487904286338 +Relative difference = 0.0003670698531228044 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe @@ -109,16 +109,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.083915e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.108811e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.108811e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.537914 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.077062e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.103594e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.103594e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.546741 sec INFO: No Floating Point Exceptions have been reported - 4,486,361,889 cycles # 2.911 GHz - 13,052,814,653 instructions # 2.91 insn per cycle - 1.542180616 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) + 4,439,504,684 cycles # 2.864 GHz + 12,962,981,414 instructions # 2.92 insn per cycle + 1.551131545 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 658) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246857540270419 -Relative difference = 1.7265064590569047e-07 +Avg ME (F77/C++) = 0.14246861273719524 +Relative difference = 8.940352641194861e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -138,16 +138,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.870842e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.059379e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.059379e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.597066 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.800148e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.977418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.977418e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 0.611068 sec INFO: No Floating Point Exceptions have been reported - 1,730,283,965 cycles # 2.880 GHz - 4,559,978,438 instructions # 2.64 insn per cycle - 0.601626043 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) + 1,762,180,341 cycles # 2.866 GHz + 4,596,893,163 instructions # 2.61 insn per cycle + 0.615465189 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3638) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -155,8 +155,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246859631675157 -Relative difference = 2.5853054135974944e-08 +Avg ME (F77/C++) = 0.14246862329122401 +Relative difference = 1.6348320966878032e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -167,16 +167,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.495825e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.177328e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.177328e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.468716e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.144921e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.144921e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.324345 sec +TOTAL : 0.325673 sec INFO: No Floating Point Exceptions have been reported - 873,246,909 cycles # 2.662 GHz - 1,932,851,891 instructions # 2.21 insn per cycle - 0.328693833 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) + 885,472,071 cycles # 2.688 GHz + 1,961,444,173 instructions # 2.22 insn per cycle + 0.330024390 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3584) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -184,8 +184,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247491543012991 +Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -196,16 +196,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.853914e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.645046e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.645046e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.852754e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.629189e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.629189e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.305954 sec +TOTAL : 0.306077 sec INFO: No Floating Point Exceptions have been reported - 825,418,764 cycles # 2.665 GHz - 1,855,748,763 instructions # 2.25 insn per cycle - 0.310269538 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) + 831,476,352 cycles # 2.684 GHz + 1,878,263,824 instructions # 2.26 insn per cycle + 0.310443537 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3414) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -213,8 +213,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247491543012991 +Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= @@ -225,16 +225,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.313371e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.736001e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.736001e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.306370e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.724359e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.724359e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.408387 sec +TOTAL : 0.408894 sec INFO: No Floating Point Exceptions have been reported - 758,493,445 cycles # 1.840 GHz - 1,345,737,027 instructions # 1.77 insn per cycle - 0.412742502 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1973) (512y: 32) (512z: 2382) + 761,582,213 cycles # 1.846 GHz + 1,357,292,696 instructions # 1.78 insn per cycle + 0.413233854 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2005) (512y: 32) (512z: 2432) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -242,8 +242,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489383243206 -Relative difference = 4.32888033512879e-08 +Avg ME (F77/C++) = 0.14247491576758442 +Relative difference = 1.1066920862943416e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 05c0e197eb..dad301565b 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-06-02_21:00:04 +DATE: 2024-06-03_18:07:02 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.474693e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.189633e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.316136e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.607493e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.041375e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.154202e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.447182 sec +TOTAL : 0.451358 sec INFO: No Floating Point Exceptions have been reported - 1,893,556,574 cycles # 2.816 GHz - 2,664,714,918 instructions # 1.41 insn per cycle - 0.729010855 seconds time elapsed + 1,877,588,397 cycles # 2.819 GHz + 2,627,708,031 instructions # 1.40 insn per cycle + 0.724593614 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 164 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.711155e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.788726e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.932305e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.487543 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.561024e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.555409e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.667951e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.571360e+02 +- 2.114020e+02 ) GeV^-2 +TOTAL : 0.490635 sec INFO: No Floating Point Exceptions have been reported - 2,013,294,748 cycles # 2.813 GHz - 2,871,604,214 instructions # 1.43 insn per cycle - 0.773944888 seconds time elapsed + 2,044,448,827 cycles # 2.838 GHz + 2,884,451,461 instructions # 1.41 insn per cycle + 0.778204852 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -83,8 +83,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424226e-01 -Avg ME (F77/GPU) = 0.14247488790821983 -Relative difference = 0.00036713209996037764 +Avg ME (F77/GPU) = 0.14247487904286338 +Relative difference = 0.0003670698531228044 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.069069e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.093696e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.093696e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.554121 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.081260e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.105951e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.105951e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.536942 sec INFO: No Floating Point Exceptions have been reported - 4,457,740,413 cycles # 2.864 GHz - 13,029,198,665 instructions # 2.92 insn per cycle - 1.558378600 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 727) (avx2: 0) (512y: 0) (512z: 0) + 4,412,513,718 cycles # 2.865 GHz + 12,934,261,743 instructions # 2.93 insn per cycle + 1.541139293 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 643) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246857540270419 -Relative difference = 1.7265064590569047e-07 +Avg ME (F77/C++) = 0.14246861273719524 +Relative difference = 8.940352641194861e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.886512e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.070924e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.070924e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.588580 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.813029e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.990415e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.990415e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 0.603038 sec INFO: No Floating Point Exceptions have been reported - 1,693,887,931 cycles # 2.861 GHz - 4,507,886,410 instructions # 2.66 insn per cycle - 0.592760485 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3588) (avx2: 0) (512y: 0) (512z: 0) + 1,737,431,119 cycles # 2.864 GHz + 4,543,468,610 instructions # 2.62 insn per cycle + 0.607274552 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3622) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,8 +141,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246859631675157 -Relative difference = 2.5853054135974944e-08 +Avg ME (F77/C++) = 0.14246862329122401 +Relative difference = 1.6348320966878032e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.575079e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.270984e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.270984e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.462861e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.127062e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.127062e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.315242 sec +TOTAL : 0.321234 sec INFO: No Floating Point Exceptions have been reported - 850,868,776 cycles # 2.668 GHz - 1,892,927,301 instructions # 2.22 insn per cycle - 0.319517780 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3461) (512y: 0) (512z: 0) + 866,319,085 cycles # 2.667 GHz + 1,921,313,606 instructions # 2.22 insn per cycle + 0.325461095 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3554) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -169,8 +169,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247491543012991 +Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.005606e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.810836e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.810836e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.908411e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.688910e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.688910e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.293957 sec +TOTAL : 0.298501 sec INFO: No Floating Point Exceptions have been reported - 798,554,126 cycles # 2.684 GHz - 1,814,787,943 instructions # 2.27 insn per cycle - 0.298209331 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3298) (512y: 22) (512z: 0) + 810,564,237 cycles # 2.685 GHz + 1,837,469,078 instructions # 2.27 insn per cycle + 0.302652949 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3378) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -197,8 +197,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247491543012991 +Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.328007e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.744710e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.744710e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.339310e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.756992e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.756992e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.402038 sec +TOTAL : 0.400857 sec INFO: No Floating Point Exceptions have been reported - 736,423,468 cycles # 1.816 GHz - 1,301,837,346 instructions # 1.77 insn per cycle - 0.406202485 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1936) (512y: 32) (512z: 2382) + 737,225,912 cycles # 1.823 GHz + 1,313,545,461 instructions # 1.78 insn per cycle + 0.404928677 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1968) (512y: 32) (512z: 2435) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -225,8 +225,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489383243206 -Relative difference = 4.32888033512879e-08 +Avg ME (F77/C++) = 0.14247491576758442 +Relative difference = 1.1066920862943416e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 0c9965805b..7de221c727 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-06-02_21:00:16 +DATE: 2024-06-03_18:07:14 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.769571e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.350019e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.722049e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.438101e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.278946e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.705921e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.450882 sec +TOTAL : 0.458495 sec INFO: No Floating Point Exceptions have been reported - 1,880,734,206 cycles # 2.821 GHz - 2,660,717,871 instructions # 1.41 insn per cycle - 0.723822735 seconds time elapsed + 1,899,751,176 cycles # 2.820 GHz + 2,680,130,955 instructions # 1.41 insn per cycle + 0.732599330 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.238115e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.167791e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.574879e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.270088e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.149686e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.558133e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.536771 sec +TOTAL : 0.538291 sec INFO: No Floating Point Exceptions have been reported - 2,185,532,891 cycles # 2.814 GHz - 3,100,266,437 instructions # 1.42 insn per cycle - 0.833787821 seconds time elapsed + 2,196,213,159 cycles # 2.822 GHz + 3,128,470,327 instructions # 1.42 insn per cycle + 0.835056301 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.000819e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.021896e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.021896e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.022751e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.044788e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.044788e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.660472 sec +TOTAL : 1.625328 sec INFO: No Floating Point Exceptions have been reported - 4,752,171,523 cycles # 2.856 GHz - 13,466,883,992 instructions # 2.83 insn per cycle - 1.664681738 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 840) (avx2: 0) (512y: 0) (512z: 0) + 4,655,424,429 cycles # 2.864 GHz + 13,186,683,602 instructions # 2.83 insn per cycle + 1.629533606 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 694) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.835752e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.905918e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.905918e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.824561e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.893791e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.893791e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.915273 sec +TOTAL : 0.920546 sec INFO: No Floating Point Exceptions have been reported - 2,607,019,310 cycles # 2.837 GHz - 7,384,430,613 instructions # 2.83 insn per cycle - 0.919590344 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3073) (avx2: 0) (512y: 0) (512z: 0) + 2,650,850,704 cycles # 2.868 GHz + 7,482,411,808 instructions # 2.82 insn per cycle + 0.924860801 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3164) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.096035e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.299954e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.299954e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.105613e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.308591e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.308591e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.552392 sec +TOTAL : 0.550662 sec INFO: No Floating Point Exceptions have been reported - 1,470,210,795 cycles # 2.644 GHz - 3,054,979,092 instructions # 2.08 insn per cycle - 0.556662961 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3013) (512y: 0) (512z: 0) + 1,483,239,290 cycles # 2.676 GHz + 3,134,997,530 instructions # 2.11 insn per cycle + 0.554907910 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3137) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.519152e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.779200e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.779200e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.488353e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.745095e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.745095e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.488880 sec +TOTAL : 0.492475 sec INFO: No Floating Point Exceptions have been reported - 1,309,516,900 cycles # 2.658 GHz - 2,929,953,488 instructions # 2.24 insn per cycle - 0.493142435 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2799) (512y: 110) (512z: 0) + 1,327,041,740 cycles # 2.675 GHz + 2,988,516,661 instructions # 2.25 insn per cycle + 0.496736378 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2905) (512y: 110) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.182113e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.279394e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.279394e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.181984e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.280563e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.280563e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.776228 sec +TOTAL : 0.776078 sec INFO: No Floating Point Exceptions have been reported - 1,367,728,530 cycles # 1.754 GHz - 1,969,246,999 instructions # 1.44 insn per cycle - 0.780474856 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1700) (512y: 114) (512z: 2171) + 1,372,083,988 cycles # 1.760 GHz + 1,997,444,155 instructions # 1.46 insn per cycle + 0.780337995 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1737) (512y: 114) (512z: 2251) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index 9ad9b977c8..04dcf15a7d 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-06-02_21:00:30 +DATE: 2024-06-03_18:07:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.725668e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.143482e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.487281e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.392663e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.089452e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.491656e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.455675 sec +TOTAL : 0.456745 sec INFO: No Floating Point Exceptions have been reported - 1,885,065,360 cycles # 2.812 GHz - 2,651,104,600 instructions # 1.41 insn per cycle - 0.728600735 seconds time elapsed + 1,899,216,031 cycles # 2.821 GHz + 2,684,077,764 instructions # 1.41 insn per cycle + 0.731359464 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.168974e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.005384e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.392584e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.246268e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.016380e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.406292e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.537275 sec +TOTAL : 0.539355 sec INFO: No Floating Point Exceptions have been reported - 2,188,989,186 cycles # 2.824 GHz - 3,146,111,555 instructions # 1.44 insn per cycle - 0.833354304 seconds time elapsed + 2,201,245,823 cycles # 2.828 GHz + 3,151,089,400 instructions # 1.43 insn per cycle + 0.836766488 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.007306e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.028735e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.028735e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.028331e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.050829e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.050829e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.649832 sec +TOTAL : 1.616534 sec INFO: No Floating Point Exceptions have been reported - 4,736,304,640 cycles # 2.865 GHz - 13,451,261,336 instructions # 2.84 insn per cycle - 1.654076225 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 827) (avx2: 0) (512y: 0) (512z: 0) + 4,647,619,159 cycles # 2.869 GHz + 13,174,118,623 instructions # 2.83 insn per cycle + 1.620696566 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 679) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.856269e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.927884e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.927884e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.811251e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.880051e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.880051e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.905701 sec +TOTAL : 0.927238 sec INFO: No Floating Point Exceptions have been reported - 2,609,412,517 cycles # 2.870 GHz - 7,388,220,177 instructions # 2.83 insn per cycle - 0.909956215 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) + 2,650,771,569 cycles # 2.848 GHz + 7,484,677,618 instructions # 2.82 insn per cycle + 0.931497652 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.132823e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.336633e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.336633e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.110568e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.314066e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.314066e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.545981 sec +TOTAL : 0.549601 sec INFO: No Floating Point Exceptions have been reported - 1,469,511,109 cycles # 2.674 GHz - 3,055,566,040 instructions # 2.08 insn per cycle - 0.550174078 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2990) (512y: 0) (512z: 0) + 1,478,789,605 cycles # 2.673 GHz + 3,134,923,716 instructions # 2.12 insn per cycle + 0.553915068 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3115) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.536658e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.797118e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.797118e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.471635e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.726223e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.726223e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.486288 sec +TOTAL : 0.494844 sec INFO: No Floating Point Exceptions have been reported - 1,307,118,138 cycles # 2.675 GHz - 2,931,084,096 instructions # 2.24 insn per cycle - 0.490604178 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2775) (512y: 110) (512z: 0) + 1,330,853,141 cycles # 2.670 GHz + 2,989,219,775 instructions # 2.25 insn per cycle + 0.499076847 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2881) (512y: 110) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.194909e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.292767e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.292767e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.171319e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.270839e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.270839e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.771280 sec +TOTAL : 0.779784 sec INFO: No Floating Point Exceptions have been reported - 1,369,689,187 cycles # 1.768 GHz - 1,969,498,668 instructions # 1.44 insn per cycle - 0.775554324 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1676) (512y: 114) (512z: 2171) + 1,374,242,106 cycles # 1.754 GHz + 1,997,349,171 instructions # 1.45 insn per cycle + 0.784044628 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1713) (512y: 114) (512z: 2251) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index 529929a5c3..d754778efa 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-06-02_22:05:44 +DATE: 2024-06-03_18:44:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.419887e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.089577e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.185165e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.328854e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.085787e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.186541e+08 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.533774 sec +TOTAL : 0.531742 sec INFO: No Floating Point Exceptions have been reported - 2,165,329,100 cycles # 2.817 GHz - 3,116,509,991 instructions # 1.44 insn per cycle - 0.826546475 seconds time elapsed + 2,178,901,628 cycles # 2.821 GHz + 3,143,409,478 instructions # 1.44 insn per cycle + 0.829473381 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 226 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.865718e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.915718e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.915718e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.580845e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.617291e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.617291e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 5.734321 sec +TOTAL : 6.748885 sec INFO: No Floating Point Exceptions have been reported - 16,442,910,174 cycles # 2.865 GHz - 42,483,732,959 instructions # 2.58 insn per cycle - 5.739954746 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 711) (avx2: 0) (512y: 0) (512z: 0) + 19,295,144,002 cycles # 2.857 GHz + 51,955,343,402 instructions # 2.69 insn per cycle + 6.754523888 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.238803e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.404775e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.404775e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.814420e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.940644e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.940644e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.350989 sec +TOTAL : 3.841500 sec INFO: No Floating Point Exceptions have been reported - 9,605,090,400 cycles # 2.862 GHz - 26,316,930,760 instructions # 2.74 insn per cycle - 3.356479084 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2388) (avx2: 0) (512y: 0) (512z: 0) + 10,991,556,497 cycles # 2.858 GHz + 30,794,493,414 instructions # 2.80 insn per cycle + 3.846988521 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2929) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134710926105804 -Relative difference = 2.103617270732513e-07 +Avg ME (F77/C++) = 4.3134710926105795 +Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.211332e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.644021e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.644021e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.529737e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.853643e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.853643e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.125658 sec +TOTAL : 2.429427 sec INFO: No Floating Point Exceptions have been reported - 5,695,554,831 cycles # 2.674 GHz - 12,026,163,349 instructions # 2.11 insn per cycle - 2.131034660 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2532) (512y: 0) (512z: 0) + 6,484,403,568 cycles # 2.664 GHz + 13,670,821,767 instructions # 2.11 insn per cycle + 2.434855691 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2952) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.667087e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.177658e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.177658e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.971751e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.359885e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.359885e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 1.963849 sec +TOTAL : 2.222913 sec INFO: No Floating Point Exceptions have been reported - 5,196,538,426 cycles # 2.640 GHz - 11,156,532,822 instructions # 2.15 insn per cycle - 1.969477022 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2195) (512y: 148) (512z: 0) + 5,945,532,829 cycles # 2.669 GHz + 13,012,290,212 instructions # 2.19 insn per cycle + 2.228300354 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2684) (512y: 146) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.473166e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.654961e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.654961e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.316769e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.483180e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.483180e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.132981 sec +TOTAL : 3.275285 sec INFO: No Floating Point Exceptions have been reported - 5,562,641,747 cycles # 1.773 GHz - 8,071,126,847 instructions # 1.45 insn per cycle - 3.138593943 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1471) (512y: 129) (512z: 1684) + 5,818,933,451 cycles # 1.775 GHz + 8,593,198,148 instructions # 1.48 insn per cycle + 3.280653540 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1518) (512y: 128) (512z: 1942) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt index 50bff49e4f..d9a36dcf38 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-06-02_22:06:09 +DATE: 2024-06-03_18:45:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.425736e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.093257e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.189942e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.309379e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.084437e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.188085e+08 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.532512 sec +TOTAL : 0.534384 sec INFO: No Floating Point Exceptions have been reported - 2,171,060,205 cycles # 2.823 GHz - 3,109,256,727 instructions # 1.43 insn per cycle - 0.825854836 seconds time elapsed + 2,166,481,115 cycles # 2.819 GHz + 3,106,540,627 instructions # 1.43 insn per cycle + 0.827144727 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.886140e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.937260e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.937260e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.668736e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.708844e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.708844e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 5.672763 sec +TOTAL : 6.397768 sec INFO: No Floating Point Exceptions have been reported - 16,265,601,075 cycles # 2.865 GHz - 43,265,334,700 instructions # 2.66 insn per cycle - 5.678175360 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 662) (avx2: 0) (512y: 0) (512z: 0) + 18,446,624,473 cycles # 2.881 GHz + 50,082,150,196 instructions # 2.71 insn per cycle + 6.403499792 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 639) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -109,16 +109,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.297388e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.470079e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.470079e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.001026e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.144373e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.144373e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.294375 sec +TOTAL : 3.608611 sec INFO: No Floating Point Exceptions have been reported - 9,446,450,203 cycles # 2.864 GHz - 25,429,379,126 instructions # 2.69 insn per cycle - 3.299817979 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2268) (avx2: 0) (512y: 0) (512z: 0) + 10,359,715,515 cycles # 2.867 GHz + 29,167,539,829 instructions # 2.82 insn per cycle + 3.614119568 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2747) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -126,8 +126,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 -Avg ME (F77/C++) = 4.3134710926105804 -Relative difference = 2.103617270732513e-07 +Avg ME (F77/C++) = 4.3134710926105795 +Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -137,16 +137,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.653725e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.997172e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.997172e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.233371e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.513596e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.513596e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.366874 sec +TOTAL : 2.591194 sec INFO: No Floating Point Exceptions have been reported - 6,282,545,209 cycles # 2.649 GHz - 13,637,137,621 instructions # 2.17 insn per cycle - 2.372362603 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2629) (512y: 0) (512z: 0) + 6,943,168,004 cycles # 2.675 GHz + 15,152,528,398 instructions # 2.18 insn per cycle + 2.596694726 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3032) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -165,16 +165,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.883827e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.255659e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.255659e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.379993e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.676958e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.676958e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.260406 sec +TOTAL : 2.507726 sec INFO: No Floating Point Exceptions have been reported - 6,053,972,710 cycles # 2.673 GHz - 12,722,135,295 instructions # 2.10 insn per cycle - 2.265888998 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2146) (512y: 296) (512z: 0) + 6,714,039,983 cycles # 2.673 GHz + 14,625,463,624 instructions # 2.18 insn per cycle + 2.513162943 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2634) (512y: 302) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -193,16 +193,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.425130e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.601710e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.601710e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.180932e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.333512e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.333512e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.176107 sec +TOTAL : 3.410152 sec INFO: No Floating Point Exceptions have been reported - 5,633,600,912 cycles # 1.772 GHz - 8,927,465,538 instructions # 1.58 insn per cycle - 3.181704052 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1357) (512y: 171) (512z: 1777) + 6,047,366,940 cycles # 1.771 GHz + 10,343,477,286 instructions # 1.71 insn per cycle + 3.415537139 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1280) (512y: 214) (512z: 2129) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index 2f0a202d23..9958b76d33 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-06-02_22:06:34 +DATE: 2024-06-03_18:45:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.564465e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.483919e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.773757e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.537240e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.970306e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.166568e+08 ) sec^-1 MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 -TOTAL : 0.486107 sec +TOTAL : 0.488119 sec INFO: No Floating Point Exceptions have been reported - 2,007,999,810 cycles # 2.821 GHz - 2,899,840,704 instructions # 1.44 insn per cycle - 0.768580858 seconds time elapsed + 2,010,457,548 cycles # 2.816 GHz + 2,901,449,651 instructions # 1.44 insn per cycle + 0.771150125 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 157 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -68,8 +68,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313490e+00 -Avg ME (F77/GPU) = 4.3136695463908836 -Relative difference = 4.162439020000051e-05 +Avg ME (F77/GPU) = 4.3136695491848513 +Relative difference = 4.162503792787837e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/check_hip.exe @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.944128e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.000566e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.000566e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.637958e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.678542e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.678542e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 5.484463 sec +TOTAL : 6.492871 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 15,737,215,094 cycles # 2.868 GHz - 42,223,129,627 instructions # 2.68 insn per cycle - 5.489744872 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 601) (avx2: 0) (512y: 0) (512z: 0) + 18,609,085,451 cycles # 2.864 GHz + 51,236,813,347 instructions # 2.75 insn per cycle + 6.497972416 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 638) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -100,8 +100,8 @@ cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313574e+00 -Avg ME (F77/C++) = 4.3135739049175754 -Relative difference = 2.2042608890083832e-08 +Avg ME (F77/C++) = 4.3135738277342170 +Relative difference = 3.9935743068669333e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -111,16 +111,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.521211e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.866032e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.866032e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.908557e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.163450e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.163450e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 2.410232 sec +TOTAL : 2.773748 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,949,798,990 cycles # 2.877 GHz - 16,918,935,545 instructions # 2.43 insn per cycle - 2.415585542 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2983) (avx2: 0) (512y: 0) (512z: 0) + 7,945,872,155 cycles # 2.860 GHz + 19,321,820,400 instructions # 2.43 insn per cycle + 2.778949610 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3555) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -130,8 +130,8 @@ cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313572e+00 -Avg ME (F77/C++) = 4.3135722205042839 -Relative difference = 5.111872113533787e-08 +Avg ME (F77/C++) = 4.3135722697479650 +Relative difference = 6.253470796314402e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -141,27 +141,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.866213e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.881041e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.881041e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.601030e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.562195e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.562195e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.422293 sec +TOTAL : 1.469342 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,860,334,790 cycles # 2.706 GHz - 7,989,354,890 instructions # 2.07 insn per cycle - 1.427639931 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3289) (512y: 0) (512z: 0) + 3,953,006,654 cycles # 2.682 GHz + 8,836,458,479 instructions # 2.24 insn per cycle + 1.474496107 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3719) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645699221641 -Relative difference = 9.97035713074993e-08 +Avg ME (F77/C++) = 4.3135645242873579 +Relative difference = 1.1028294269894893e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -171,27 +169,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.312467e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.446145e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.446145e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.080394e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.190045e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.190045e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.350232 sec +TOTAL : 1.388542 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,671,588,520 cycles # 2.710 GHz - 7,492,175,118 instructions # 2.04 insn per cycle - 1.355454952 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3036) (512y: 23) (512z: 0) + 3,738,146,624 cycles # 2.684 GHz + 8,439,003,248 instructions # 2.26 insn per cycle + 1.393671238 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3555) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645699221641 -Relative difference = 9.97035713074993e-08 +Avg ME (F77/C++) = 4.3135645242873579 +Relative difference = 1.1028294269894893e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -201,16 +197,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.291103e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.907078e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.907078e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.739991e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.256621e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.256621e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.755277 sec +TOTAL : 1.918383 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,329,330,756 cycles # 1.892 GHz - 5,989,173,339 instructions # 1.80 insn per cycle - 1.760500099 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2418) (512y: 32) (512z: 2031) + 3,506,051,293 cycles # 1.824 GHz + 6,249,171,634 instructions # 1.78 insn per cycle + 1.923680281 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2453) (512y: 32) (512z: 2288) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -220,8 +216,8 @@ cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313564e+00 -Avg ME (F77/C++) = 4.3135643783025444 -Relative difference = 8.770069111236825e-08 +Avg ME (F77/C++) = 4.3135643536224961 +Relative difference = 8.197919301304478e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt index 947a9772a4..1171fdfffd 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-06-02_22:06:54 +DATE: 2024-06-03_18:45:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.661514e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.491247e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.780216e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.154335e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.008994e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.197996e+08 ) sec^-1 MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 -TOTAL : 0.485941 sec +TOTAL : 0.491386 sec INFO: No Floating Point Exceptions have been reported - 2,010,449,048 cycles # 2.824 GHz - 2,892,288,272 instructions # 1.44 insn per cycle - 0.769427686 seconds time elapsed + 2,017,077,998 cycles # 2.815 GHz + 2,893,778,786 instructions # 1.43 insn per cycle + 0.775392827 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 131 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/runTest_cuda.exe @@ -68,8 +68,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313490e+00 -Avg ME (F77/GPU) = 4.3136695463908836 -Relative difference = 4.162439020000051e-05 +Avg ME (F77/GPU) = 4.3136695491848513 +Relative difference = 4.162503792787837e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/check_hip.exe @@ -81,16 +81,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.994628e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.054364e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.054364e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.692645e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.735566e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.735566e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 5.347389 sec +TOTAL : 6.286011 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 15,338,140,112 cycles # 2.867 GHz - 42,471,920,214 instructions # 2.77 insn per cycle - 5.352646805 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 559) (avx2: 0) (512y: 0) (512z: 0) + 18,009,509,756 cycles # 2.863 GHz + 49,623,104,208 instructions # 2.76 insn per cycle + 6.291263696 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -100,8 +100,8 @@ cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313574e+00 -Avg ME (F77/C++) = 4.3135739491553977 -Relative difference = 1.1787117204016727e-08 +Avg ME (F77/C++) = 4.3135738277342170 +Relative difference = 3.9935743068669333e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -111,16 +111,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.117220e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.566866e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.566866e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.394813e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.720777e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.720777e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 2.140174 sec +TOTAL : 2.477788 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,135,855,566 cycles # 2.861 GHz - 16,262,350,066 instructions # 2.65 insn per cycle - 2.145522102 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2702) (avx2: 0) (512y: 0) (512z: 0) + 7,111,768,090 cycles # 2.865 GHz + 18,489,144,559 instructions # 2.60 insn per cycle + 2.483157584 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3247) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -130,8 +130,8 @@ cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313572e+00 -Avg ME (F77/C++) = 4.3135722205042839 -Relative difference = 5.111872113533787e-08 +Avg ME (F77/C++) = 4.3135722697479650 +Relative difference = 6.253470796314402e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -141,16 +141,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.475476e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.144559e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.144559e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.190401e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.626712e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.626712e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.709024 sec +TOTAL : 2.111082 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 4,596,639,303 cycles # 2.683 GHz - 9,041,859,622 instructions # 1.97 insn per cycle - 1.714357652 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3558) (512y: 0) (512z: 0) + 5,653,716,321 cycles # 2.672 GHz + 10,852,271,306 instructions # 1.92 insn per cycle + 2.116527759 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4278) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -160,8 +160,8 @@ cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645687580109 -Relative difference = 9.997345323075056e-08 +Avg ME (F77/C++) = 4.3135645242873579 +Relative difference = 1.1028294269894893e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -171,16 +171,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.643193e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.350005e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.350005e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.201622e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.641929e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.641929e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.667959 sec +TOTAL : 2.105368 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 4,476,334,932 cycles # 2.676 GHz - 8,532,641,638 instructions # 1.91 insn per cycle - 1.673325231 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3311) (512y: 10) (512z: 0) + 5,581,069,156 cycles # 2.645 GHz + 10,551,499,792 instructions # 1.89 insn per cycle + 2.110664204 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4147) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -190,8 +190,8 @@ cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645687580109 -Relative difference = 9.997345323075056e-08 +Avg ME (F77/C++) = 4.3135645242873579 +Relative difference = 1.1028294269894893e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -201,16 +201,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.116930e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.706081e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.706081e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.153868e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.455195e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.455195e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.804265 sec +TOTAL : 2.614812 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,310,050,790 cycles # 1.830 GHz - 5,957,409,151 instructions # 1.80 insn per cycle - 1.809617116 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2357) (512y: 32) (512z: 2014) + 4,680,931,815 cycles # 1.787 GHz + 8,665,037,174 instructions # 1.85 insn per cycle + 2.620154476 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2929) (512y: 8) (512z: 2883) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -220,8 +220,8 @@ cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubPro INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW Avg ME (C++/C++) = 4.313564e+00 -Avg ME (F77/C++) = 4.3135643783025444 -Relative difference = 8.770069111236825e-08 +Avg ME (F77/C++) = 4.3135643536224961 +Relative difference = 8.197919301304478e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt index 4f4847b6b6..85c03177a5 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-06-02_22:07:15 +DATE: 2024-06-03_18:46:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.488531e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.088944e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.184171e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.264249e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.082334e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.184481e+08 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.530669 sec +TOTAL : 0.532072 sec INFO: No Floating Point Exceptions have been reported - 2,163,522,979 cycles # 2.824 GHz - 3,135,447,150 instructions # 1.45 insn per cycle - 0.823757872 seconds time elapsed + 2,180,273,787 cycles # 2.817 GHz + 3,157,278,745 instructions # 1.45 insn per cycle + 0.830334061 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 226 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.700696e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.742660e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.742660e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.495210e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.527281e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.527281e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.280235 sec +TOTAL : 7.127701 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 17,574,906,678 cycles # 2.797 GHz - 41,764,388,623 instructions # 2.38 insn per cycle - 6.285714574 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 655) (avx2: 0) (512y: 0) (512z: 0) + 20,457,240,037 cycles # 2.868 GHz + 51,952,278,015 instructions # 2.54 insn per cycle + 7.133276776 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -111,16 +111,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.952464e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.091303e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.091303e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.663839e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.776043e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.776043e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.665315 sec +TOTAL : 4.050680 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 10,271,038,566 cycles # 2.799 GHz - 26,354,751,502 instructions # 2.57 insn per cycle - 3.670825968 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2438) (avx2: 0) (512y: 0) (512z: 0) + 11,522,261,452 cycles # 2.841 GHz + 30,595,506,119 instructions # 2.66 insn per cycle + 4.056343844 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2982) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,16 +141,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.531327e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.854267e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.854267e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.373632e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.674233e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.674233e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.428471 sec +TOTAL : 2.511484 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,507,055,363 cycles # 2.675 GHz - 12,119,284,734 instructions # 1.86 insn per cycle - 2.434107519 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2718) (512y: 0) (512z: 0) + 6,732,763,495 cycles # 2.676 GHz + 13,614,893,320 instructions # 2.02 insn per cycle + 2.516932126 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3124) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -171,16 +171,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.902513e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.278233e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.278233e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.783746e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.145465e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.145465e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.252469 sec +TOTAL : 2.305308 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,021,388,550 cycles # 2.667 GHz - 11,226,998,655 instructions # 1.86 insn per cycle - 2.257985842 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2369) (512y: 150) (512z: 0) + 6,178,690,068 cycles # 2.675 GHz + 12,983,785,016 instructions # 2.10 insn per cycle + 2.310871293 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2863) (512y: 150) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -201,16 +201,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.151456e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.300586e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.300586e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.991726e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.126206e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.126206e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.440903 sec +TOTAL : 3.618891 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,082,239,634 cycles # 1.765 GHz - 8,214,081,257 instructions # 1.35 insn per cycle - 3.446457125 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1787) (512y: 134) (512z: 1755) + 6,406,437,804 cycles # 1.768 GHz + 8,706,435,895 instructions # 1.36 insn per cycle + 3.624513725 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1815) (512y: 134) (512z: 2012) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt index a2ade5f790..6045bdf498 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-06-02_22:07:42 +DATE: 2024-06-03_18:46:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.485563e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.095251e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.191531e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.277612e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.084295e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.186968e+08 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.530779 sec +TOTAL : 0.535350 sec INFO: No Floating Point Exceptions have been reported - 2,189,344,982 cycles # 2.825 GHz - 3,162,111,956 instructions # 1.44 insn per cycle - 0.831473815 seconds time elapsed + 2,169,547,372 cycles # 2.814 GHz + 3,115,345,457 instructions # 1.44 insn per cycle + 0.828151696 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -81,16 +81,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.752561e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.796583e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.796583e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.568605e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.604083e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.604083e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.097431 sec +TOTAL : 6.798758 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 17,482,218,633 cycles # 2.865 GHz - 43,049,154,317 instructions # 2.46 insn per cycle - 6.102997010 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 651) (avx2: 0) (512y: 0) (512z: 0) + 19,500,149,045 cycles # 2.866 GHz + 49,982,389,934 instructions # 2.56 insn per cycle + 6.804368339 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 612) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -111,16 +111,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.172478e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.332063e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.332063e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.816408e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.942934e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.942934e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.418913 sec +TOTAL : 3.837058 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 9,801,825,304 cycles # 2.863 GHz - 25,166,361,997 instructions # 2.57 insn per cycle - 3.424631390 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2276) (avx2: 0) (512y: 0) (512z: 0) + 10,983,586,791 cycles # 2.859 GHz + 29,101,586,876 instructions # 2.65 insn per cycle + 3.842851713 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2818) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -141,16 +141,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.163570e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.437386e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.437386e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.607773e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.808836e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.808836e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.632138 sec +TOTAL : 3.019734 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 7,035,871,657 cycles # 2.669 GHz - 12,789,981,390 instructions # 1.82 insn per cycle - 2.637561799 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2699) (512y: 0) (512z: 0) + 8,052,880,399 cycles # 2.663 GHz + 15,178,369,602 instructions # 1.88 insn per cycle + 3.025353618 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3208) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -171,16 +171,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.487333e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.798535e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.798535e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.779624e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.999859e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.999859e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.449076 sec +TOTAL : 2.887642 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,545,009,203 cycles # 2.667 GHz - 12,105,117,349 instructions # 1.85 insn per cycle - 2.454490824 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2351) (512y: 227) (512z: 0) + 7,712,100,280 cycles # 2.667 GHz + 14,487,564,678 instructions # 1.88 insn per cycle + 2.893320746 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2786) (512y: 304) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -201,16 +201,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.973614e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.106585e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.106585e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.893138e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.019391e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.019391e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.639191 sec +TOTAL : 3.738117 sec INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 6,417,633,310 cycles # 1.761 GHz - 8,985,370,621 instructions # 1.40 insn per cycle - 3.644757809 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1892) (512y: 178) (512z: 2083) + 6,563,142,303 cycles # 1.754 GHz + 9,902,891,266 instructions # 1.51 insn per cycle + 3.743637878 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1588) (512y: 220) (512z: 2216) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt index 3909c2de90..9658756422 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-06-02_22:04:42 +DATE: 2024-06-03_18:43:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.208514e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.234770e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.239030e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.191492e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.216787e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.220642e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.465857 sec +TOTAL : 0.467466 sec INFO: No Floating Point Exceptions have been reported - 1,954,339,048 cycles # 2.816 GHz - 2,841,818,682 instructions # 1.45 insn per cycle - 0.751002948 seconds time elapsed + 1,933,277,234 cycles # 2.822 GHz + 2,826,212,304 instructions # 1.46 insn per cycle + 0.744269904 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.793700e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.949156e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.959741e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.846498e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.968500e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.976840e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.483528 sec +TOTAL : 0.480910 sec INFO: No Floating Point Exceptions have been reported - 1,990,538,703 cycles # 2.818 GHz - 2,859,145,317 instructions # 1.44 insn per cycle - 0.763660982 seconds time elapsed + 2,022,662,983 cycles # 2.822 GHz + 2,921,094,655 instructions # 1.44 insn per cycle + 0.772867570 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.333630e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.336951e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.336951e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.345591e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.348861e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.348861e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.163551 sec +TOTAL : 0.162979 sec INFO: No Floating Point Exceptions have been reported - 475,454,071 cycles # 2.848 GHz - 1,396,942,135 instructions # 2.94 insn per cycle - 0.167517734 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3991) (avx2: 0) (512y: 0) (512z: 0) + 475,099,311 cycles # 2.856 GHz + 1,396,895,904 instructions # 2.94 insn per cycle + 0.166930533 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3921) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.379547e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.391026e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.391026e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.297415e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.309092e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.309092e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.087740 sec +TOTAL : 0.088761 sec INFO: No Floating Point Exceptions have been reported - 245,466,319 cycles # 2.692 GHz - 699,170,520 instructions # 2.85 insn per cycle - 0.091863831 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9501) (avx2: 0) (512y: 0) (512z: 0) + 248,572,980 cycles # 2.693 GHz + 700,241,386 instructions # 2.82 insn per cycle + 0.092789911 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9495) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.397405e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.403101e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.403101e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.400019e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.405543e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.405543e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.042780 sec +TOTAL : 0.042745 sec INFO: No Floating Point Exceptions have been reported - 121,204,590 cycles # 2.623 GHz - 260,141,578 instructions # 2.15 insn per cycle - 0.046815365 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8227) (512y: 0) (512z: 0) + 121,819,291 cycles # 2.642 GHz + 265,166,117 instructions # 2.18 insn per cycle + 0.046685672 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8514) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.586602e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.593859e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.593859e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.572077e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.579933e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.579933e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.038358 sec +TOTAL : 0.038595 sec INFO: No Floating Point Exceptions have been reported - 108,920,181 cycles # 2.610 GHz - 240,176,540 instructions # 2.21 insn per cycle - 0.042378717 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7348) (512y: 150) (512z: 0) + 110,467,988 cycles # 2.628 GHz + 247,221,317 instructions # 2.24 insn per cycle + 0.042621267 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8157) (512y: 150) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.175182e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.180276e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.180276e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.155645e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.160467e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.160467e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.050174 sec +TOTAL : 0.050919 sec INFO: No Floating Point Exceptions have been reported - 97,067,712 cycles # 1.812 GHz - 138,415,288 instructions # 1.43 insn per cycle - 0.054229752 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1692) (512y: 126) (512z: 6592) + 98,056,565 cycles # 1.804 GHz + 141,486,289 instructions # 1.44 insn per cycle + 0.054927179 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1955) (512y: 126) (512z: 7089) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt index 65eb2e6009..64cee9c3b3 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-06-02_22:04:52 +DATE: 2024-06-03_18:43:45 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.239779e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.264296e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.268375e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.233887e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.259221e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.263483e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.465734 sec +TOTAL : 0.465295 sec INFO: No Floating Point Exceptions have been reported - 1,944,767,230 cycles # 2.813 GHz - 2,830,393,799 instructions # 1.46 insn per cycle - 0.748614808 seconds time elapsed + 1,958,806,784 cycles # 2.817 GHz + 2,840,787,434 instructions # 1.45 insn per cycle + 0.751914831 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.968348e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.116174e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.125970e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.956639e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.091509e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.101091e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.483276 sec +TOTAL : 0.486580 sec INFO: No Floating Point Exceptions have been reported - 2,014,046,726 cycles # 2.816 GHz - 2,960,547,521 instructions # 1.47 insn per cycle - 0.771364825 seconds time elapsed + 1,989,010,639 cycles # 2.812 GHz + 2,958,637,747 instructions # 1.49 insn per cycle + 0.766005331 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.352099e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.355611e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.355611e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.332669e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.336081e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.336081e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.161935 sec +TOTAL : 0.163052 sec INFO: No Floating Point Exceptions have been reported - 471,513,746 cycles # 2.852 GHz - 1,391,998,687 instructions # 2.95 insn per cycle - 0.165915651 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3869) (avx2: 0) (512y: 0) (512z: 0) + 472,941,785 cycles # 2.842 GHz + 1,392,300,118 instructions # 2.94 insn per cycle + 0.167088631 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3809) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.340411e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.352264e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.352264e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.188233e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.199153e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.199153e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.087524 sec +TOTAL : 0.089766 sec INFO: No Floating Point Exceptions have been reported - 244,692,087 cycles # 2.689 GHz - 695,265,791 instructions # 2.84 insn per cycle - 0.091623255 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9537) (avx2: 0) (512y: 0) (512z: 0) + 248,021,944 cycles # 2.658 GHz + 696,332,728 instructions # 2.81 insn per cycle + 0.093962685 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9540) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.384610e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.390602e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.390602e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.381818e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.387327e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.387327e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.042428 sec +TOTAL : 0.042683 sec INFO: No Floating Point Exceptions have been reported - 120,402,247 cycles # 2.621 GHz - 255,771,436 instructions # 2.12 insn per cycle - 0.046482789 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8181) (512y: 0) (512z: 0) + 120,036,632 cycles # 2.612 GHz + 260,692,299 instructions # 2.17 insn per cycle + 0.046713736 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8469) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.577119e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.592040e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.592040e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.560836e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.567795e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.567795e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.037809 sec +TOTAL : 0.038118 sec INFO: No Floating Point Exceptions have been reported - 106,943,960 cycles # 2.590 GHz - 235,812,455 instructions # 2.21 insn per cycle - 0.041801607 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7301) (512y: 150) (512z: 0) + 108,246,886 cycles # 2.601 GHz + 242,814,438 instructions # 2.24 insn per cycle + 0.042161703 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8115) (512y: 150) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.052569e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.057306e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.057306e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.157650e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.162537e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.162537e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.054678 sec +TOTAL : 0.050130 sec INFO: No Floating Point Exceptions have been reported - 95,978,810 cycles # 1.760 GHz - 134,249,554 instructions # 1.40 insn per cycle - 0.058819108 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1641) (512y: 126) (512z: 6597) + 95,930,950 cycles # 1.791 GHz + 136,895,076 instructions # 1.43 insn per cycle + 0.054115966 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1911) (512y: 126) (512z: 7093) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index a147c96b16..f6523a4ed4 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-06-02_22:05:03 +DATE: 2024-06-03_18:43:55 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.545160e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.557368e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.560350e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.188142e-04 +- 6.565203e-04 ) GeV^-4 -TOTAL : 0.469580 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.441793e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.452389e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.454845e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.188141e-04 +- 6.565202e-04 ) GeV^-4 +TOTAL : 0.469243 sec INFO: No Floating Point Exceptions have been reported - 1,952,036,300 cycles # 2.812 GHz - 2,839,447,182 instructions # 1.45 insn per cycle - 0.751257373 seconds time elapsed + 1,955,791,300 cycles # 2.819 GHz + 2,846,567,055 instructions # 1.46 insn per cycle + 0.750731805 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.618765e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.738189e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.751774e+05 ) sec^-1 -MeanMatrixElemValue = ( 8.020493e-03 +- 4.025604e-03 ) GeV^-4 -TOTAL : 0.470396 sec +EvtsPerSec[Rmb+ME] (23) = ( 9.093405e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.186877e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.197998e+05 ) sec^-1 +MeanMatrixElemValue = ( 8.020494e-03 +- 4.025605e-03 ) GeV^-4 +TOTAL : 0.468927 sec INFO: No Floating Point Exceptions have been reported - 1,933,532,708 cycles # 2.817 GHz - 2,825,179,750 instructions # 1.46 insn per cycle - 0.744816373 seconds time elapsed + 1,947,947,883 cycles # 2.819 GHz + 2,845,586,918 instructions # 1.46 insn per cycle + 0.747736312 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -83,8 +83,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/Sub cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127250e-06 -Avg ME (F77/GPU) = 8.1272870954487585E-006 -Relative difference = 4.564329725014175e-06 +Avg ME (F77/GPU) = 8.1272869669930272E-006 +Relative difference = 4.548524165778887e-06 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.450156e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.453792e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.453792e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.365791e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.369019e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.369019e+03 ) sec^-1 MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.158288 sec +TOTAL : 0.162125 sec INFO: No Floating Point Exceptions have been reported - 461,148,667 cycles # 2.852 GHz - 1,393,475,309 instructions # 3.02 insn per cycle - 0.162260134 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3070) (avx2: 0) (512y: 0) (512z: 0) + 472,335,702 cycles # 2.852 GHz + 1,389,145,792 instructions # 2.94 insn per cycle + 0.166189548 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3071) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/Sub cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127811e-06 -Avg ME (F77/C++) = 8.1278105211728276E-006 -Relative difference = 5.891219330978222e-08 +Avg ME (F77/C++) = 8.1278105271212486E-006 +Relative difference = 5.8180333155894157e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.199587e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.203982e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.203982e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.177410e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.181512e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.181512e+04 ) sec^-1 MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.048879 sec +TOTAL : 0.049884 sec INFO: No Floating Point Exceptions have been reported - 138,617,500 cycles # 2.649 GHz - 375,838,324 instructions # 2.71 insn per cycle - 0.052819431 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:10134) (avx2: 0) (512y: 0) (512z: 0) + 140,700,616 cycles # 2.641 GHz + 379,285,257 instructions # 2.70 insn per cycle + 0.053843733 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:10152) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.585276e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.607346e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.607346e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.685774e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.707905e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.707905e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.025535 sec +TOTAL : 0.024610 sec INFO: No Floating Point Exceptions have been reported - 73,091,500 cycles # 2.523 GHz - 146,753,019 instructions # 2.01 insn per cycle - 0.029504478 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8933) (512y: 0) (512z: 0) + 73,040,255 cycles # 2.606 GHz + 149,958,625 instructions # 2.05 insn per cycle + 0.028633410 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9255) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.061402e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.092590e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.092590e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.962518e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.988659e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.988659e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.022191 sec +TOTAL : 0.022813 sec INFO: No Floating Point Exceptions have been reported - 67,057,729 cycles # 2.606 GHz - 136,530,201 instructions # 2.04 insn per cycle - 0.026240821 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8164) (512y: 28) (512z: 0) + 67,549,740 cycles # 2.579 GHz + 139,989,278 instructions # 2.07 insn per cycle + 0.026775785 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8975) (512y: 28) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.320153e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.340741e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.340741e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.278588e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.299871e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.299871e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 -TOTAL : 0.027976 sec +TOTAL : 0.028313 sec INFO: No Floating Point Exceptions have been reported - 59,523,378 cycles # 1.900 GHz - 85,246,359 instructions # 1.43 insn per cycle - 0.031991723 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2572) (512y: 32) (512z: 6935) + 60,108,907 cycles # 1.896 GHz + 86,712,066 instructions # 1.44 insn per cycle + 0.032360710 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2837) (512y: 32) (512z: 7440) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt index 6d3597262c..5ad5bc88ac 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-06-02_22:05:13 +DATE: 2024-06-03_18:44:05 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.556636e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.568185e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.572444e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.188142e-04 +- 6.565203e-04 ) GeV^-4 -TOTAL : 0.472720 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.478245e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.489517e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.493671e+04 ) sec^-1 +MeanMatrixElemValue = ( 7.188141e-04 +- 6.565202e-04 ) GeV^-4 +TOTAL : 0.468633 sec INFO: No Floating Point Exceptions have been reported - 1,931,223,021 cycles # 2.806 GHz - 2,809,227,604 instructions # 1.45 insn per cycle - 0.747270154 seconds time elapsed + 1,959,695,231 cycles # 2.816 GHz + 2,840,090,818 instructions # 1.45 insn per cycle + 0.752893998 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.951614e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.008354e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.009763e+06 ) sec^-1 -MeanMatrixElemValue = ( 8.020495e-03 +- 4.025606e-03 ) GeV^-4 -TOTAL : 0.469854 sec +EvtsPerSec[Rmb+ME] (23) = ( 9.318325e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.411516e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.422873e+05 ) sec^-1 +MeanMatrixElemValue = ( 8.020496e-03 +- 4.025606e-03 ) GeV^-4 +TOTAL : 0.471146 sec INFO: No Floating Point Exceptions have been reported - 1,934,807,792 cycles # 2.816 GHz - 2,823,952,331 instructions # 1.46 insn per cycle - 0.744383978 seconds time elapsed + 1,938,806,864 cycles # 2.813 GHz + 2,859,507,249 instructions # 1.47 insn per cycle + 0.745707568 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -83,8 +83,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/Sub cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127250e-06 -Avg ME (F77/GPU) = 8.1272870252982758E-006 -Relative difference = 4.555698209723637e-06 +Avg ME (F77/GPU) = 8.1272866419447706E-006 +Relative difference = 4.508529302013153e-06 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/check_hip.exe @@ -96,16 +96,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.440467e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.444153e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.444153e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.357321e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.360536e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.360536e+03 ) sec^-1 MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.157850 sec +TOTAL : 0.161848 sec INFO: No Floating Point Exceptions have been reported - 459,198,020 cycles # 2.845 GHz - 1,388,550,014 instructions # 3.02 insn per cycle - 0.161959627 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2959) (avx2: 0) (512y: 0) (512z: 0) + 470,663,785 cycles # 2.849 GHz + 1,384,011,638 instructions # 2.94 insn per cycle + 0.165810869 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2943) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -113,8 +113,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/Sub cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127811e-06 -Avg ME (F77/C++) = 8.1278105211728276E-006 -Relative difference = 5.891219330978222e-08 +Avg ME (F77/C++) = 8.1278105271212486E-006 +Relative difference = 5.8180333155894157e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= @@ -124,16 +124,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.193461e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.198367e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.198367e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.174470e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.178652e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.178652e+04 ) sec^-1 MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.048387 sec +TOTAL : 0.049176 sec INFO: No Floating Point Exceptions have been reported - 136,709,201 cycles # 2.638 GHz - 370,998,148 instructions # 2.71 insn per cycle - 0.052404685 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:10117) (avx2: 0) (512y: 0) (512z: 0) + 138,396,950 cycles # 2.632 GHz + 374,468,590 instructions # 2.71 insn per cycle + 0.053090230 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:10135) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.706186e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.728329e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.728329e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.706805e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.730168e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.730168e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.023794 sec +TOTAL : 0.023755 sec INFO: No Floating Point Exceptions have been reported - 70,529,419 cycles # 2.593 GHz - 141,874,277 instructions # 2.01 insn per cycle - 0.027836206 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8887) (512y: 0) (512z: 0) + 70,854,023 cycles # 2.612 GHz + 145,181,313 instructions # 2.05 insn per cycle + 0.027711456 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9209) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.065896e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.094176e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.094176e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.969928e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.998045e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.998045e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.022167 sec +TOTAL : 0.022034 sec INFO: No Floating Point Exceptions have been reported - 65,090,187 cycles # 2.575 GHz - 131,753,137 instructions # 2.02 insn per cycle - 0.026221124 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8117) (512y: 28) (512z: 0) + 65,659,386 cycles # 2.581 GHz + 135,178,550 instructions # 2.06 insn per cycle + 0.026035849 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8931) (512y: 28) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.325267e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.345484e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.345484e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.281314e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.301558e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.301558e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 -TOTAL : 0.027163 sec +TOTAL : 0.027495 sec INFO: No Floating Point Exceptions have been reported - 57,486,714 cycles # 1.879 GHz - 80,476,258 instructions # 1.40 insn per cycle - 0.031188983 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2521) (512y: 32) (512z: 6939) + 57,968,608 cycles # 1.874 GHz + 82,072,103 instructions # 1.42 insn per cycle + 0.031472874 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2792) (512y: 32) (512z: 7442) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index f1bf8ae1ae..8c3296e4df 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-06-02_22:05:23 +DATE: 2024-06-03_18:44:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.186832e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.210139e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.214000e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.182508e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.205667e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.209569e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.466969 sec +TOTAL : 0.464547 sec INFO: No Floating Point Exceptions have been reported - 1,929,963,905 cycles # 2.810 GHz - 2,820,321,257 instructions # 1.46 insn per cycle - 0.744660265 seconds time elapsed + 1,952,246,686 cycles # 2.820 GHz + 2,850,005,557 instructions # 1.46 insn per cycle + 0.748916127 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.774215e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.915971e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.925293e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.811825e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.935505e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.943802e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.481813 sec +TOTAL : 0.484992 sec INFO: No Floating Point Exceptions have been reported - 2,018,144,507 cycles # 2.824 GHz - 2,988,593,572 instructions # 1.48 insn per cycle - 0.771128476 seconds time elapsed + 2,003,054,759 cycles # 2.819 GHz + 2,980,865,994 instructions # 1.49 insn per cycle + 0.768589369 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.318424e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.321576e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.321576e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.311983e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.315138e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.315138e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.164181 sec +TOTAL : 0.164815 sec INFO: No Floating Point Exceptions have been reported - 478,545,974 cycles # 2.854 GHz - 1,405,298,148 instructions # 2.94 insn per cycle - 0.168196808 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3977) (avx2: 0) (512y: 0) (512z: 0) + 480,558,988 cycles # 2.855 GHz + 1,405,529,926 instructions # 2.92 insn per cycle + 0.169036832 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3912) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.576930e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.588828e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.588828e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.486300e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.498254e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.498254e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.085353 sec +TOTAL : 0.086377 sec INFO: No Floating Point Exceptions have been reported - 242,856,261 cycles # 2.738 GHz - 691,007,271 instructions # 2.85 insn per cycle - 0.089392731 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9324) (avx2: 0) (512y: 0) (512z: 0) + 245,115,567 cycles # 2.731 GHz + 695,255,062 instructions # 2.84 insn per cycle + 0.090377006 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9339) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.350712e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.355921e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.355921e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.312936e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.317784e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.317784e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.044021 sec +TOTAL : 0.045252 sec INFO: No Floating Point Exceptions have been reported - 120,750,481 cycles # 2.554 GHz - 257,896,528 instructions # 2.14 insn per cycle - 0.048182370 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8244) (512y: 0) (512z: 0) + 121,783,811 cycles # 2.496 GHz + 260,306,932 instructions # 2.14 insn per cycle + 0.049283272 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8369) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.596995e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.604840e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.604840e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.551630e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.558637e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.558637e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.038244 sec +TOTAL : 0.039016 sec INFO: No Floating Point Exceptions have been reported - 108,668,422 cycles # 2.610 GHz - 238,349,934 instructions # 2.19 insn per cycle - 0.042362101 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7342) (512y: 146) (512z: 0) + 109,019,559 cycles # 2.565 GHz + 240,830,620 instructions # 2.21 insn per cycle + 0.043032343 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7513) (512y: 146) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.152016e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.156769e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.156769e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.142604e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.147315e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.147315e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.051128 sec +TOTAL : 0.051566 sec INFO: No Floating Point Exceptions have been reported - 98,593,656 cycles # 1.806 GHz - 139,368,043 instructions # 1.41 insn per cycle - 0.055141464 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1953) (512y: 122) (512z: 6323) + 98,898,934 cycles # 1.798 GHz + 140,464,966 instructions # 1.42 insn per cycle + 0.055620860 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2085) (512y: 122) (512z: 6355) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt index 1674ae1a31..9498116dcd 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-06-02_22:05:34 +DATE: 2024-06-03_18:44:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.211002e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.235184e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.239290e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.212183e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.236266e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.240018e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.468026 sec +TOTAL : 0.464032 sec INFO: No Floating Point Exceptions have been reported - 1,934,707,675 cycles # 2.807 GHz - 2,829,874,008 instructions # 1.46 insn per cycle - 0.746355889 seconds time elapsed + 1,950,674,091 cycles # 2.818 GHz + 2,829,648,398 instructions # 1.45 insn per cycle + 0.749030588 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,15 +67,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.925457e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.068678e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.078821e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.950257e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.072861e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.081462e+05 ) sec^-1 MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.484376 sec +TOTAL : 0.483935 sec INFO: No Floating Point Exceptions have been reported - 1,990,471,362 cycles # 2.818 GHz - 2,966,958,009 instructions # 1.49 insn per cycle - 0.763465136 seconds time elapsed + 2,006,012,361 cycles # 2.809 GHz + 2,974,994,139 instructions # 1.48 insn per cycle + 0.771162547 seconds time elapsed ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 3 tests. @@ -96,16 +96,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.320537e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.323936e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.323936e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.237548e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.240992e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.240992e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.163600 sec +TOTAL : 0.167612 sec INFO: No Floating Point Exceptions have been reported - 475,927,096 cycles # 2.852 GHz - 1,400,684,973 instructions # 2.94 insn per cycle - 0.167595349 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3871) (avx2: 0) (512y: 0) (512z: 0) + 477,099,035 cycles # 2.792 GHz + 1,400,932,341 instructions # 2.94 insn per cycle + 0.171602717 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3813) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -124,16 +124,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.590632e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.602811e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.602811e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.145549e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.157756e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.157756e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.084445 sec +TOTAL : 0.090540 sec INFO: No Floating Point Exceptions have been reported - 241,578,930 cycles # 2.751 GHz - 687,384,148 instructions # 2.85 insn per cycle - 0.088448470 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9365) (avx2: 0) (512y: 0) (512z: 0) + 245,660,090 cycles # 2.600 GHz + 691,394,625 instructions # 2.81 insn per cycle + 0.095105056 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 9372) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -152,16 +152,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.419553e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.425604e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.425604e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.392384e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.397823e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.397823e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.041668 sec +TOTAL : 0.042222 sec INFO: No Floating Point Exceptions have been reported - 118,041,093 cycles # 2.620 GHz - 253,446,942 instructions # 2.15 insn per cycle - 0.045688456 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8196) (512y: 0) (512z: 0) + 119,541,895 cycles # 2.616 GHz + 255,861,520 instructions # 2.14 insn per cycle + 0.046263283 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8322) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -180,16 +180,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.610064e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.617322e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.617322e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.596967e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.604589e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.604589e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.037145 sec +TOTAL : 0.037416 sec INFO: No Floating Point Exceptions have been reported - 106,387,352 cycles # 2.623 GHz - 233,796,871 instructions # 2.20 insn per cycle - 0.041248038 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7292) (512y: 146) (512z: 0) + 106,767,617 cycles # 2.615 GHz + 236,442,043 instructions # 2.21 insn per cycle + 0.041453124 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7464) (512y: 146) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. @@ -208,16 +208,16 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.153901e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.158908e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.158908e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.147136e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.152341e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.152341e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.050243 sec +TOTAL : 0.050589 sec INFO: No Floating Point Exceptions have been reported - 96,297,242 cycles # 1.793 GHz - 134,709,358 instructions # 1.40 insn per cycle - 0.054323733 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1904) (512y: 122) (512z: 6323) + 96,886,279 cycles # 1.795 GHz + 135,816,090 instructions # 1.40 insn per cycle + 0.054606545 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2036) (512y: 122) (512z: 6355) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index c46a8918fe..8e958dea3f 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-06-02_22:03:31 +DATE: 2024-06-03_18:42:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.623490e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.780511e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.419250e+08 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.527999 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.162731e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.647642e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.389698e+08 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.523677 sec INFO: No Floating Point Exceptions have been reported - 2,127,703,778 cycles # 2.804 GHz - 3,037,503,741 instructions # 1.43 insn per cycle - 0.819938887 seconds time elapsed + 2,134,696,018 cycles # 2.814 GHz + 3,031,565,143 instructions # 1.42 insn per cycle + 0.817462514 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 132 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubP ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 4.232897e-01 -Avg ME (F77/GPU) = 0.42328961386341946 -Relative difference = 2.034932117056294e-07 +Avg ME (C++/GPU) = 4.213632e-01 +Avg ME (F77/GPU) = 0.42136314203117931 +Relative difference = 1.37574474202048e-07 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.629121e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.112830e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.112830e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 1.204706 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.860735e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.012248e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.012248e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 1.299639 sec INFO: No Floating Point Exceptions have been reported - 3,459,527,100 cycles # 2.861 GHz - 8,713,936,767 instructions # 2.52 insn per cycle - 1.210090686 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 458) (avx2: 0) (512y: 0) (512z: 0) + 3,734,089,788 cycles # 2.863 GHz + 9,713,794,615 instructions # 2.60 insn per cycle + 1.305110900 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 427) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328961386341951 -Relative difference = 2.0349321157448718e-07 +Avg ME (C++/C++) = 4.213632e-01 +Avg ME (F77/C++) = 0.42136314203117919 +Relative difference = 1.3757447446553162e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.614799e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.138803e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.138803e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.766288 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.453119e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.863695e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.863695e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.838764 sec INFO: No Floating Point Exceptions have been reported - 2,201,462,395 cycles # 2.855 GHz - 5,464,414,082 instructions # 2.48 insn per cycle - 0.771678284 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1298) (avx2: 0) (512y: 0) (512z: 0) + 2,332,040,160 cycles # 2.765 GHz + 5,935,472,258 instructions # 2.55 insn per cycle + 0.844206086 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1392) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328961386341951 -Relative difference = 2.0349321157448718e-07 +Avg ME (C++/C++) = 4.213632e-01 +Avg ME (F77/C++) = 0.42136314203117919 +Relative difference = 1.3757447446553162e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.243332e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.350818e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.350818e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.584870 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.167991e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.174056e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.174056e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.601519 sec INFO: No Floating Point Exceptions have been reported - 1,605,862,312 cycles # 2.723 GHz - 3,180,962,176 instructions # 1.98 insn per cycle - 0.590347744 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1459) (512y: 0) (512z: 0) + 1,662,054,757 cycles # 2.741 GHz + 3,319,057,632 instructions # 2.00 insn per cycle + 0.607052491 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1551) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328961386341946 -Relative difference = 2.034932117056294e-07 +Avg ME (C++/C++) = 4.213632e-01 +Avg ME (F77/C++) = 0.42136314203117925 +Relative difference = 1.375744743337898e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.328694e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.520640e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.520640e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.567215 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.223901e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.317323e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.317323e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.591038 sec INFO: No Floating Point Exceptions have been reported - 1,559,045,940 cycles # 2.726 GHz - 3,082,232,081 instructions # 1.98 insn per cycle - 0.572568359 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1274) (512y: 95) (512z: 0) + 1,627,445,040 cycles # 2.731 GHz + 3,290,175,146 instructions # 2.02 insn per cycle + 0.596727910 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 96) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328961386341946 -Relative difference = 2.034932117056294e-07 +Avg ME (C++/C++) = 4.213632e-01 +Avg ME (F77/C++) = 0.42136314203117925 +Relative difference = 1.375744743337898e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.106594e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.015389e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.015389e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.614754 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.089949e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.989851e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.989851e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.619709 sec INFO: No Floating Point Exceptions have been reported - 1,347,809,988 cycles # 2.176 GHz - 2,375,607,493 instructions # 1.76 insn per cycle - 0.620258555 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 584) (512y: 62) (512z: 953) + 1,359,372,785 cycles # 2.176 GHz + 2,427,696,314 instructions # 1.79 insn per cycle + 0.625192962 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 598) (512y: 60) (512z: 1020) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328961386341946 -Relative difference = 2.034932117056294e-07 +Avg ME (C++/C++) = 4.213632e-01 +Avg ME (F77/C++) = 0.42136314203117925 +Relative difference = 1.375744743337898e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt index e5d6236670..a8c2ab23a4 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-06-02_22:03:43 +DATE: 2024-06-03_18:42:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.746443e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.322888e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.759689e+08 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.521311 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.277953e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.135668e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.738374e+08 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.522097 sec INFO: No Floating Point Exceptions have been reported - 2,128,407,529 cycles # 2.814 GHz - 3,052,405,520 instructions # 1.43 insn per cycle - 0.813389325 seconds time elapsed + 2,135,028,596 cycles # 2.824 GHz + 3,021,285,386 instructions # 1.42 insn per cycle + 0.815068044 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubP ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 4.232897e-01 -Avg ME (F77/GPU) = 0.42328961386341946 -Relative difference = 2.034932117056294e-07 +Avg ME (C++/GPU) = 4.213632e-01 +Avg ME (F77/GPU) = 0.42136314203117919 +Relative difference = 1.3757447446553162e-07 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.688675e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.121548e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.121548e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 1.196984 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.902017e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.017998e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.017998e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 1.294184 sec INFO: No Floating Point Exceptions have been reported - 3,435,642,752 cycles # 2.859 GHz - 8,628,896,472 instructions # 2.51 insn per cycle - 1.202405155 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 403) (avx2: 0) (512y: 0) (512z: 0) + 3,718,952,494 cycles # 2.864 GHz + 9,608,091,878 instructions # 2.58 insn per cycle + 1.299745147 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 368) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328961386341951 -Relative difference = 2.0349321157448718e-07 +Avg ME (C++/C++) = 4.213632e-01 +Avg ME (F77/C++) = 0.42136314203117919 +Relative difference = 1.3757447446553162e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.637775e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.172160e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.172160e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.755876 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.491839e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.928644e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.928644e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.819333 sec INFO: No Floating Point Exceptions have been reported - 2,176,531,869 cycles # 2.862 GHz - 5,398,906,105 instructions # 2.48 insn per cycle - 0.761260827 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1258) (avx2: 0) (512y: 0) (512z: 0) + 2,348,682,704 cycles # 2.850 GHz + 5,882,248,118 instructions # 2.50 insn per cycle + 0.824898694 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1355) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328961386341951 -Relative difference = 2.0349321157448718e-07 +Avg ME (C++/C++) = 4.213632e-01 +Avg ME (F77/C++) = 0.42136314203117919 +Relative difference = 1.3757447446553162e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.236743e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.324010e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.324010e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.584681 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.158538e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.187920e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.187920e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.603487 sec INFO: No Floating Point Exceptions have been reported - 1,593,673,714 cycles # 2.704 GHz - 3,147,296,381 instructions # 1.97 insn per cycle - 0.590069472 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1386) (512y: 0) (512z: 0) + 1,667,796,633 cycles # 2.742 GHz + 3,291,585,576 instructions # 1.97 insn per cycle + 0.609046146 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1488) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328961386341946 -Relative difference = 2.034932117056294e-07 +Avg ME (C++/C++) = 4.213632e-01 +Avg ME (F77/C++) = 0.42136314203117925 +Relative difference = 1.375744743337898e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.274934e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.428395e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.428395e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.577971 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.235804e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.333150e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.333150e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.584918 sec INFO: No Floating Point Exceptions have been reported - 1,554,970,499 cycles # 2.667 GHz - 3,061,298,117 instructions # 1.97 insn per cycle - 0.583755416 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1220) (512y: 95) (512z: 0) + 1,615,178,094 cycles # 2.739 GHz + 3,267,070,231 instructions # 2.02 insn per cycle + 0.590494284 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1394) (512y: 96) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328961386341946 -Relative difference = 2.034932117056294e-07 +Avg ME (C++/C++) = 4.213632e-01 +Avg ME (F77/C++) = 0.42136314203117925 +Relative difference = 1.375744743337898e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.106748e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.027961e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.027961e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.613683 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.105460e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.013661e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.013661e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.614015 sec INFO: No Floating Point Exceptions have been reported - 1,361,641,398 cycles # 2.201 GHz - 2,360,583,231 instructions # 1.73 insn per cycle - 0.619282503 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 557) (512y: 62) (512z: 944) + 1,365,598,832 cycles # 2.207 GHz + 2,412,888,019 instructions # 1.77 insn per cycle + 0.619396780 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 565) (512y: 60) (512z: 1006) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328961386341946 -Relative difference = 2.034932117056294e-07 +Avg ME (C++/C++) = 4.213632e-01 +Avg ME (F77/C++) = 0.42136314203117925 +Relative difference = 1.375744743337898e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index e5e2512c5d..6d06ae0cff 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-06-02_22:03:56 +DATE: 2024-06-03_18:42:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.267825e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155988e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.256672e+09 ) sec^-1 -MeanMatrixElemValue = ( 4.240325e-01 +- 1.231174e-04 ) GeV^0 -TOTAL : 0.482081 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.443384e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.324007e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.721983e+09 ) sec^-1 +MeanMatrixElemValue = ( 4.221160e-01 +- 1.229724e-04 ) GeV^0 +TOTAL : 0.483044 sec INFO: No Floating Point Exceptions have been reported - 2,007,174,940 cycles # 2.814 GHz - 2,862,530,986 instructions # 1.43 insn per cycle - 0.770265614 seconds time elapsed + 1,993,113,759 cycles # 2.820 GHz + 2,854,035,752 instructions # 1.43 insn per cycle + 0.765257157 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 72 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 100 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubP ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 4.232893e-01 -Avg ME (F77/GPU) = 0.42328959883889183 -Relative difference = 7.059920764700599e-07 +Avg ME (C++/GPU) = 4.213628e-01 +Avg ME (F77/GPU) = 0.42136313809896819 +Relative difference = 8.023939659863929e-07 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.657361e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.123963e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.123963e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 1.176818 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.983263e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.033842e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.033842e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221172e-01 +- 1.229727e-04 ) GeV^0 +TOTAL : 1.258899 sec INFO: No Floating Point Exceptions have been reported - 3,379,168,439 cycles # 2.861 GHz - 8,663,925,346 instructions # 2.56 insn per cycle - 1.182131834 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 464) (avx2: 0) (512y: 0) (512z: 0) + 3,639,115,885 cycles # 2.880 GHz + 9,590,423,758 instructions # 2.64 insn per cycle + 1.264094441 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328961598104797 -Relative difference = 3.775440734888737e-08 +Avg ME (C++/C++) = 4.213631e-01 +Avg ME (F77/C++) = 0.42136314298841171 +Relative difference = 1.0202225045153655e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.286459e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.570924e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.570924e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.551800 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.214375e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.385646e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.385646e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221172e-01 +- 1.229727e-04 ) GeV^0 +TOTAL : 0.568044 sec INFO: No Floating Point Exceptions have been reported - 1,548,820,259 cycles # 2.783 GHz - 3,686,997,614 instructions # 2.38 insn per cycle - 0.557181964 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1472) (avx2: 0) (512y: 0) (512z: 0) + 1,641,692,574 cycles # 2.868 GHz + 3,971,413,789 instructions # 2.42 insn per cycle + 0.573200191 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1600) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328960439772345 -Relative difference = 1.0389396439618597e-08 +Avg ME (C++/C++) = 4.213631e-01 +Avg ME (F77/C++) = 0.42136313297669403 +Relative difference = 7.826194092961498e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.038982e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.429005e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.429005e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.435959 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.937735e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.175366e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.175366e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221172e-01 +- 1.229727e-04 ) GeV^0 +TOTAL : 0.449120 sec INFO: No Floating Point Exceptions have been reported - 1,208,731,547 cycles # 2.744 GHz - 2,424,737,625 instructions # 2.01 insn per cycle - 0.441028354 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1835) (512y: 0) (512z: 0) + 1,260,069,598 cycles # 2.777 GHz + 2,500,944,665 instructions # 1.98 insn per cycle + 0.454433467 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1934) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328956670826301 -Relative difference = 7.865002347873079e-08 +Avg ME (C++/C++) = 4.213631e-01 +Avg ME (F77/C++) = 0.42136309723719023 +Relative difference = 6.5568384102678676e-09 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.104774e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.649197e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.649197e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.429216 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.028517e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.518592e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.518592e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221172e-01 +- 1.229727e-04 ) GeV^0 +TOTAL : 0.437042 sec INFO: No Floating Point Exceptions have been reported - 1,186,032,707 cycles # 2.731 GHz - 2,375,887,228 instructions # 2.00 insn per cycle - 0.434797682 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1716) (512y: 2) (512z: 0) + 1,229,391,410 cycles # 2.784 GHz + 2,474,796,668 instructions # 2.01 insn per cycle + 0.442158464 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1885) (512y: 1) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328956670826301 -Relative difference = 7.865002347873079e-08 +Avg ME (C++/C++) = 4.213631e-01 +Avg ME (F77/C++) = 0.42136309723719023 +Relative difference = 6.5568384102678676e-09 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.873089e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.905822e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.905822e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.457639 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.850294e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.803761e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.803761e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221172e-01 +- 1.229727e-04 ) GeV^0 +TOTAL : 0.462583 sec INFO: No Floating Point Exceptions have been reported - 1,058,281,507 cycles # 2.289 GHz - 2,045,070,071 instructions # 1.93 insn per cycle - 0.462906421 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1125) (512y: 5) (512z: 1216) + 1,074,893,018 cycles # 2.301 GHz + 2,077,590,091 instructions # 1.93 insn per cycle + 0.467776190 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1161) (512y: 5) (512z: 1289) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328957567224279 -Relative difference = 5.7473080363015266e-08 +Avg ME (C++/C++) = 4.213631e-01 +Avg ME (F77/C++) = 0.42136310806381516 +Relative difference = 1.9137449793670585e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt index ac0cd4f08e..2a4a93a29c 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-06-02_22:04:07 +DATE: 2024-06-03_18:42:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.274861e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.207280e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.264572e+09 ) sec^-1 -MeanMatrixElemValue = ( 4.240325e-01 +- 1.231174e-04 ) GeV^0 -TOTAL : 0.483176 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.359186e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.223148e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.693051e+09 ) sec^-1 +MeanMatrixElemValue = ( 4.221160e-01 +- 1.229724e-04 ) GeV^0 +TOTAL : 0.483764 sec INFO: No Floating Point Exceptions have been reported - 1,992,294,984 cycles # 2.814 GHz - 2,821,532,343 instructions # 1.42 insn per cycle - 0.766539932 seconds time elapsed + 2,020,475,653 cycles # 2.852 GHz + 2,883,715,422 instructions # 1.43 insn per cycle + 0.766821024 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 71 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 93 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/runTest_cuda.exe @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubP ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 4.232893e-01 -Avg ME (F77/GPU) = 0.42328960436861962 -Relative difference = 7.190557844040413e-07 +Avg ME (C++/GPU) = 4.213628e-01 +Avg ME (F77/GPU) = 0.42136314490926452 +Relative difference = 8.185565136220069e-07 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.761895e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.138382e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.138382e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 1.164683 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.967681e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.030786e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.030786e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221172e-01 +- 1.229727e-04 ) GeV^0 +TOTAL : 1.260175 sec INFO: No Floating Point Exceptions have been reported - 3,347,740,263 cycles # 2.864 GHz - 8,536,643,122 instructions # 2.55 insn per cycle - 1.169926925 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 372) (avx2: 0) (512y: 0) (512z: 0) + 3,623,677,233 cycles # 2.866 GHz + 9,468,922,940 instructions # 2.61 insn per cycle + 1.265433979 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 379) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328961598104797 -Relative difference = 3.775440734888737e-08 +Avg ME (C++/C++) = 4.213631e-01 +Avg ME (F77/C++) = 0.42136314298841171 +Relative difference = 1.0202225045153655e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.377967e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.780801e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.780801e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.533084 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.136354e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.208370e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.208370e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221172e-01 +- 1.229727e-04 ) GeV^0 +TOTAL : 0.583400 sec INFO: No Floating Point Exceptions have been reported - 1,538,945,489 cycles # 2.861 GHz - 3,654,064,050 instructions # 2.37 insn per cycle - 0.538506706 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1417) (avx2: 0) (512y: 0) (512z: 0) + 1,648,785,142 cycles # 2.804 GHz + 3,937,247,235 instructions # 2.39 insn per cycle + 0.588593369 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1538) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328960439772345 -Relative difference = 1.0389396439618597e-08 +Avg ME (C++/C++) = 4.213631e-01 +Avg ME (F77/C++) = 0.42136313297669403 +Relative difference = 7.826194092961498e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.061693e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.506473e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.506473e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.433720 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.943449e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.205095e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.205095e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221172e-01 +- 1.229727e-04 ) GeV^0 +TOTAL : 0.447831 sec INFO: No Floating Point Exceptions have been reported - 1,212,102,221 cycles # 2.764 GHz - 2,408,536,618 instructions # 1.99 insn per cycle - 0.439199966 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1739) (512y: 0) (512z: 0) + 1,255,206,933 cycles # 2.775 GHz + 2,485,414,473 instructions # 1.98 insn per cycle + 0.453037322 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1825) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328956670826301 -Relative difference = 7.865002347873079e-08 +Avg ME (C++/C++) = 4.213631e-01 +Avg ME (F77/C++) = 0.42136309723719023 +Relative difference = 6.5568384102678676e-09 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.151109e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.798793e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.798793e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.423301 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.018346e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.485553e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.485553e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221172e-01 +- 1.229727e-04 ) GeV^0 +TOTAL : 0.437859 sec INFO: No Floating Point Exceptions have been reported - 1,183,993,010 cycles # 2.767 GHz - 2,358,621,607 instructions # 1.99 insn per cycle - 0.428753176 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1639) (512y: 2) (512z: 0) + 1,225,395,495 cycles # 2.770 GHz + 2,461,685,965 instructions # 2.01 insn per cycle + 0.442994976 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1794) (512y: 1) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328956670826301 -Relative difference = 7.865002347873079e-08 +Avg ME (C++/C++) = 4.213631e-01 +Avg ME (F77/C++) = 0.42136309723719023 +Relative difference = 6.5568384102678676e-09 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.903396e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.993626e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.993626e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240336e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.452229 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.842475e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.802452e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.802452e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221172e-01 +- 1.229727e-04 ) GeV^0 +TOTAL : 0.460436 sec INFO: No Floating Point Exceptions have been reported - 1,057,097,121 cycles # 2.313 GHz - 2,029,647,637 instructions # 1.92 insn per cycle - 0.457646372 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1038) (512y: 5) (512z: 1206) + 1,070,780,988 cycles # 2.303 GHz + 2,061,995,899 instructions # 1.93 insn per cycle + 0.465732223 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1056) (512y: 5) (512z: 1271) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232896e-01 -Avg ME (F77/C++) = 0.42328957567224279 -Relative difference = 5.7473080363015266e-08 +Avg ME (C++/C++) = 4.213631e-01 +Avg ME (F77/C++) = 0.42136310806381516 +Relative difference = 1.9137449793670585e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index 175afd95a7..19dfb3d4f1 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-06-02_22:04:18 +DATE: 2024-06-03_18:43:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.031843e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.757380e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.369019e+08 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.522857 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.378551e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.622662e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.340178e+08 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.521109 sec INFO: No Floating Point Exceptions have been reported - 2,138,931,604 cycles # 2.823 GHz - 3,065,086,151 instructions # 1.43 insn per cycle - 0.815055642 seconds time elapsed + 2,134,508,813 cycles # 2.827 GHz + 3,062,937,081 instructions # 1.43 insn per cycle + 0.812795274 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 132 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubP ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 4.232897e-01 -Avg ME (F77/GPU) = 0.42328961420809225 -Relative difference = 2.02678940084305e-07 +Avg ME (C++/GPU) = 4.213632e-01 +Avg ME (F77/GPU) = 0.42136314235618794 +Relative difference = 1.368031476336171e-07 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.465539e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.090638e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.090638e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 1.223589 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.749768e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.980431e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.980431e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.221175e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 1.315046 sec INFO: No Floating Point Exceptions have been reported - 3,510,333,340 cycles # 2.858 GHz - 8,780,325,111 instructions # 2.50 insn per cycle - 1.229177242 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) + 3,786,972,515 cycles # 2.869 GHz + 9,739,934,413 instructions # 2.57 insn per cycle + 1.320813753 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 427) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962565639783 -Relative difference = 1.7563291089600324e-07 +Avg ME (C++/C++) = 4.213633e-01 +Avg ME (F77/C++) = 0.42136315374060329 +Relative difference = 3.4710995643185847e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.643831e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.197785e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.197785e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.754390 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.549537e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.026486e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.026486e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221175e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.793412 sec INFO: No Floating Point Exceptions have been reported - 2,172,612,390 cycles # 2.861 GHz - 5,461,989,505 instructions # 2.51 insn per cycle - 0.760002313 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1315) (avx2: 0) (512y: 0) (512z: 0) + 2,284,858,516 cycles # 2.863 GHz + 5,921,938,395 instructions # 2.59 insn per cycle + 0.799063914 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1424) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962565639783 -Relative difference = 1.7563291089600324e-07 +Avg ME (C++/C++) = 4.213633e-01 +Avg ME (F77/C++) = 0.42136315374060329 +Relative difference = 3.4710995643185847e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.213379e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.275644e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.275644e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.591323 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.221387e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.291485e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.291485e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221175e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.589193 sec INFO: No Floating Point Exceptions have been reported - 1,584,687,799 cycles # 2.657 GHz - 3,128,023,138 instructions # 1.97 insn per cycle - 0.597064372 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1508) (512y: 0) (512z: 0) + 1,633,322,367 cycles # 2.750 GHz + 3,259,196,510 instructions # 2.00 insn per cycle + 0.594688637 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1573) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962604218012 -Relative difference = 1.747215201983364e-07 +Avg ME (C++/C++) = 4.213633e-01 +Avg ME (F77/C++) = 0.42136315463570262 +Relative difference = 3.4498566291130026e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.378838e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.665981e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.665981e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.557636 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.273104e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.400962e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.400962e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221175e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.578780 sec INFO: No Floating Point Exceptions have been reported - 1,515,882,226 cycles # 2.694 GHz - 2,979,109,420 instructions # 1.97 insn per cycle - 0.563386258 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1266) (512y: 104) (512z: 0) + 1,601,507,893 cycles # 2.744 GHz + 3,214,454,807 instructions # 2.01 insn per cycle + 0.584254706 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1458) (512y: 101) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962604218012 -Relative difference = 1.747215201983364e-07 +Avg ME (C++/C++) = 4.213633e-01 +Avg ME (F77/C++) = 0.42136315463570262 +Relative difference = 3.4498566291130026e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.142290e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.104661e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.104661e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.607146 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.112157e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.043444e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.043444e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221175e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.615303 sec INFO: No Floating Point Exceptions have been reported - 1,330,909,045 cycles # 2.175 GHz - 2,316,396,076 instructions # 1.74 insn per cycle - 0.612769323 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 708) (512y: 64) (512z: 1000) + 1,353,304,837 cycles # 2.183 GHz + 2,382,215,008 instructions # 1.76 insn per cycle + 0.620880493 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 768) (512y: 64) (512z: 1062) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962604218012 -Relative difference = 1.747215201983364e-07 +Avg ME (C++/C++) = 4.213633e-01 +Avg ME (F77/C++) = 0.42136315463570262 +Relative difference = 3.4498566291130026e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt index c48f15473d..4729f1a754 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-06-02_22:04:30 +DATE: 2024-06-03_18:43:22 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.249284e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.245731e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.567694e+08 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.522953 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.500719e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.111270e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.633026e+08 ) sec^-1 +MeanMatrixElemValue = ( 4.221174e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.522680 sec INFO: No Floating Point Exceptions have been reported - 2,139,960,069 cycles # 2.824 GHz - 3,035,488,341 instructions # 1.42 insn per cycle - 0.816127779 seconds time elapsed + 2,129,724,895 cycles # 2.817 GHz + 3,027,277,951 instructions # 1.42 insn per cycle + 0.814390574 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubP ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 4.232897e-01 -Avg ME (F77/GPU) = 0.42328961420809225 -Relative difference = 2.02678940084305e-07 +Avg ME (C++/GPU) = 4.213632e-01 +Avg ME (F77/GPU) = 0.42136314235618794 +Relative difference = 1.368031476336171e-07 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.565233e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.104572e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.104572e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 1.211740 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.796862e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.004350e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.004350e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221175e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 1.308348 sec INFO: No Floating Point Exceptions have been reported - 3,489,404,912 cycles # 2.868 GHz - 8,691,090,951 instructions # 2.49 insn per cycle - 1.217329641 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 408) (avx2: 0) (512y: 0) (512z: 0) + 3,764,910,824 cycles # 2.867 GHz + 9,641,101,522 instructions # 2.56 insn per cycle + 1.314135370 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 368) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962565639783 -Relative difference = 1.7563291089600324e-07 +Avg ME (C++/C++) = 4.213633e-01 +Avg ME (F77/C++) = 0.42136315374060329 +Relative difference = 3.4710995643185847e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.593140e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.091329e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.091329e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.773508 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.486685e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.920049e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.920049e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221175e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.822453 sec INFO: No Floating Point Exceptions have been reported - 2,171,763,818 cycles # 2.790 GHz - 5,395,529,961 instructions # 2.48 insn per cycle - 0.779263038 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1286) (avx2: 0) (512y: 0) (512z: 0) + 2,311,474,083 cycles # 2.794 GHz + 5,864,868,802 instructions # 2.54 insn per cycle + 0.828103244 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1379) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962565639783 -Relative difference = 1.7563291089600324e-07 +Avg ME (C++/C++) = 4.213633e-01 +Avg ME (F77/C++) = 0.42136315374060329 +Relative difference = 3.4710995643185847e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.359046e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.585537e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.585537e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.560683 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.198203e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.246567e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.246567e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221175e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.593532 sec INFO: No Floating Point Exceptions have been reported - 1,579,967,618 cycles # 2.793 GHz - 3,095,230,267 instructions # 1.96 insn per cycle - 0.566362230 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1403) (512y: 0) (512z: 0) + 1,642,446,088 cycles # 2.744 GHz + 3,222,193,167 instructions # 1.96 insn per cycle + 0.599144349 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1489) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962604218012 -Relative difference = 1.747215201983364e-07 +Avg ME (C++/C++) = 4.213633e-01 +Avg ME (F77/C++) = 0.42136315463570262 +Relative difference = 3.4498566291130026e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.453478e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.809547e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.809547e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.542291 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.278954e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.413566e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.413566e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221175e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.575915 sec INFO: No Floating Point Exceptions have been reported - 1,503,606,383 cycles # 2.747 GHz - 2,961,368,670 instructions # 1.97 insn per cycle - 0.547945591 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1207) (512y: 104) (512z: 0) + 1,590,360,868 cycles # 2.738 GHz + 3,186,450,755 instructions # 2.00 insn per cycle + 0.581550909 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1394) (512y: 101) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962604218012 -Relative difference = 1.747215201983364e-07 +Avg ME (C++/C++) = 4.213633e-01 +Avg ME (F77/C++) = 0.42136315463570262 +Relative difference = 3.4498566291130026e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.154104e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.130242e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.130242e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.240339e-01 +- 1.231178e-04 ) GeV^0 -TOTAL : 0.603301 sec +EvtsPerSec[Rmb+ME] (23) = ( 2.120832e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.060002e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.060002e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.221175e-01 +- 1.229728e-04 ) GeV^0 +TOTAL : 0.609960 sec INFO: No Floating Point Exceptions have been reported - 1,337,530,185 cycles # 2.200 GHz - 2,301,032,773 instructions # 1.72 insn per cycle - 0.608821276 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 669) (512y: 64) (512z: 987) + 1,354,216,533 cycles # 2.203 GHz + 2,366,532,725 instructions # 1.75 insn per cycle + 0.615403806 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 716) (512y: 64) (512z: 1053) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 4.232897e-01 -Avg ME (F77/C++) = 0.42328962604218012 -Relative difference = 1.747215201983364e-07 +Avg ME (C++/C++) = 4.213633e-01 +Avg ME (F77/C++) = 0.42136315463570262 +Relative difference = 3.4498566291130026e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index 279b0d02f4..3e4e2bc254 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_22:01:15 +DATE: 2024-06-03_18:39:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.298622e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.163951e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.277049e+08 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 0.536129 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.560709e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.164481e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.284902e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 0.529405 sec INFO: No Floating Point Exceptions have been reported - 2,173,999,771 cycles # 2.814 GHz - 3,130,541,130 instructions # 1.44 insn per cycle - 0.830459706 seconds time elapsed + 2,182,288,380 cycles # 2.822 GHz + 3,131,212,236 instructions # 1.43 insn per cycle + 0.829981893 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubPro ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 3.234080e+00 -Avg ME (F77/GPU) = 3.2340795799595186 -Relative difference = 1.2987943449389332e-07 +Avg ME (C++/GPU) = 2.015836e+00 +Avg ME (F77/GPU) = 2.0158358666195562 +Relative difference = 6.616631711254798e-08 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.020488e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.079496e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.079496e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 5.301250 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.781240e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.827388e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.827388e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 5.998536 sec INFO: No Floating Point Exceptions have been reported - 15,199,764,023 cycles # 2.865 GHz - 38,382,132,016 instructions # 2.53 insn per cycle - 5.306881853 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 673) (avx2: 0) (512y: 0) (512z: 0) + 17,213,253,642 cycles # 2.868 GHz + 45,933,739,384 instructions # 2.67 insn per cycle + 6.003818659 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 636) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799593964 -Relative difference = 1.2987947225564713e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194407 +Relative difference = 6.616637439061751e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.456929e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.646631e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.646631e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 3.144938 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.100817e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.254980e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.254980e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.494378 sec INFO: No Floating Point Exceptions have been reported - 9,021,281,744 cycles # 2.864 GHz - 24,583,412,308 instructions # 2.73 insn per cycle - 3.150495651 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) + 10,016,228,015 cycles # 2.866 GHz + 27,811,005,547 instructions # 2.78 insn per cycle + 3.499794029 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2549) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799593955 -Relative difference = 1.2987947253027805e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194411 +Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.307946e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.757750e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.757750e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.087521 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.858446e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.237280e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.237280e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.271634 sec INFO: No Floating Point Exceptions have been reported - 5,484,294,083 cycles # 2.622 GHz - 11,256,076,031 instructions # 2.05 insn per cycle - 2.093164240 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2379) (512y: 0) (512z: 0) + 6,108,605,654 cycles # 2.684 GHz + 12,591,544,338 instructions # 2.06 insn per cycle + 2.277349103 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2696) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799594546 -Relative difference = 1.2987945426732077e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194953 +Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.046359e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.624997e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.624997e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 1.845597 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.187418e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.614304e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.614304e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.132263 sec INFO: No Floating Point Exceptions have been reported - 4,960,542,470 cycles # 2.681 GHz - 10,562,896,493 instructions # 2.13 insn per cycle - 1.851112565 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2077) (512y: 144) (512z: 0) + 5,595,388,735 cycles # 2.618 GHz + 12,008,541,965 instructions # 2.15 insn per cycle + 2.137697029 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2444) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799594546 -Relative difference = 1.2987945426732077e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194953 +Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.595965e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.792720e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.792720e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 3.027661 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.355157e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.526988e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.526988e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.239016 sec INFO: No Floating Point Exceptions have been reported - 5,393,320,109 cycles # 1.779 GHz - 7,799,680,893 instructions # 1.45 insn per cycle - 3.033220380 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1545) + 5,757,421,005 cycles # 1.775 GHz + 8,347,756,435 instructions # 1.45 insn per cycle + 3.244654976 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1486) (512y: 122) (512z: 1805) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799594546 -Relative difference = 1.2987945426732077e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194953 +Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt index c0d78783de..8f4087e613 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_22:01:39 +DATE: 2024-06-03_18:40:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.411711e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.168792e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.279006e+08 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 0.531618 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.573649e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.160966e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.281775e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 0.527940 sec INFO: No Floating Point Exceptions have been reported - 2,183,799,831 cycles # 2.822 GHz - 3,094,825,077 instructions # 1.42 insn per cycle - 0.831170247 seconds time elapsed + 2,161,411,588 cycles # 2.822 GHz + 3,105,080,516 instructions # 1.44 insn per cycle + 0.822592967 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubPro ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 3.234080e+00 -Avg ME (F77/GPU) = 3.2340795799595186 -Relative difference = 1.2987943449389332e-07 +Avg ME (C++/GPU) = 2.015836e+00 +Avg ME (F77/GPU) = 2.0158358666195562 +Relative difference = 6.616631711254798e-08 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.045453e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.106247e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.106247e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 5.236847 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.833695e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.882201e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.882201e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 5.828520 sec INFO: No Floating Point Exceptions have been reported - 15,020,999,910 cycles # 2.866 GHz - 40,099,937,559 instructions # 2.67 insn per cycle - 5.242391746 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) + 16,713,198,070 cycles # 2.865 GHz + 44,917,694,732 instructions # 2.69 insn per cycle + 5.833774312 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 580) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799593964 -Relative difference = 1.2987947225564713e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194411 +Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.600352e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.806473e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.806473e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 3.023117 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.255035e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.424858e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.424858e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.332985 sec INFO: No Floating Point Exceptions have been reported - 8,678,762,741 cycles # 2.866 GHz - 23,668,927,694 instructions # 2.73 insn per cycle - 3.028764335 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) + 9,568,995,899 cycles # 2.867 GHz + 26,692,502,057 instructions # 2.79 insn per cycle + 3.338418143 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2343) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799593955 -Relative difference = 1.2987947253027805e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194411 +Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.855816e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.228551e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.228551e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.270383 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.426523e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.735021e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.735021e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.480486 sec INFO: No Floating Point Exceptions have been reported - 6,094,093,478 cycles # 2.679 GHz - 13,059,046,457 instructions # 2.14 insn per cycle - 2.275971704 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2545) (512y: 0) (512z: 0) + 6,611,956,562 cycles # 2.661 GHz + 14,116,423,051 instructions # 2.13 insn per cycle + 2.486181855 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2780) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799594546 -Relative difference = 1.2987945426732077e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194953 +Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.110330e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.520213e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.520213e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.162713 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.682757e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.023291e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.023291e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.348744 sec INFO: No Floating Point Exceptions have been reported - 5,811,211,556 cycles # 2.681 GHz - 12,318,701,301 instructions # 2.12 insn per cycle - 2.168344172 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2092) (512y: 294) (512z: 0) + 6,326,699,067 cycles # 2.688 GHz + 13,709,965,055 instructions # 2.17 insn per cycle + 2.354222232 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2436) (512y: 297) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799594546 -Relative difference = 1.2987945426732077e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194953 +Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.308089e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.473010e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.473010e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 3.279972 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.269508e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.429886e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.429886e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.318234 sec INFO: No Floating Point Exceptions have been reported - 5,822,767,374 cycles # 1.773 GHz - 9,603,130,120 instructions # 1.65 insn per cycle - 3.285517019 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1509) (512y: 209) (512z: 1970) + 5,942,109,401 cycles # 1.788 GHz + 10,106,220,045 instructions # 1.70 insn per cycle + 3.323654210 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1336) (512y: 208) (512z: 1985) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340795799594546 -Relative difference = 1.2987945426732077e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158358666194953 +Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index 00b2a7887f..6d2dfde2c7 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_22:02:03 +DATE: 2024-06-03_18:40:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.781883e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.602042e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.975899e+08 ) sec^-1 -MeanMatrixElemValue = ( 3.294909e+00 +- 3.228140e-03 ) GeV^0 -TOTAL : 0.491268 sec +EvtsPerSec[Rmb+ME] (23) = ( 9.307381e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.106656e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.353255e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.072877e+00 +- 3.361153e-03 ) GeV^0 +TOTAL : 0.487472 sec INFO: No Floating Point Exceptions have been reported - 2,009,507,618 cycles # 2.799 GHz - 2,900,729,651 instructions # 1.44 insn per cycle - 0.775023034 seconds time elapsed + 2,006,518,607 cycles # 2.817 GHz + 2,898,423,990 instructions # 1.44 insn per cycle + 0.769535603 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 149 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubPro ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 3.234085e+00 -Avg ME (F77/GPU) = 3.2341253389604390 -Relative difference = 1.2473067479392238e-05 +Avg ME (C++/GPU) = 2.015841e+00 +Avg ME (F77/GPU) = 2.0158787037944421 +Relative difference = 1.870375413642407e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.165672e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.236261e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.236261e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294973e+00 +- 3.228584e-03 ) GeV^0 -TOTAL : 4.931237 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.881488e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.934328e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934328e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072937e+00 +- 3.361545e-03 ) GeV^0 +TOTAL : 5.662703 sec INFO: No Floating Point Exceptions have been reported - 14,146,731,336 cycles # 2.866 GHz - 38,345,680,249 instructions # 2.71 insn per cycle - 4.936728571 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) + 16,249,524,467 cycles # 2.867 GHz + 45,328,757,089 instructions # 2.79 insn per cycle + 5.667894899 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234094e+00 -Avg ME (F77/C++) = 3.2340941932052374 -Relative difference = 5.974014286114415e-08 +Avg ME (C++/C++) = 2.015849e+00 +Avg ME (F77/C++) = 2.0158491701586172 +Relative difference = 8.441039850630506e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.834710e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.233739e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.233739e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294972e+00 +- 3.228583e-03 ) GeV^0 -TOTAL : 2.259579 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.401859e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.731419e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.731419e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072937e+00 +- 3.361544e-03 ) GeV^0 +TOTAL : 2.473431 sec INFO: No Floating Point Exceptions have been reported - 6,488,445,171 cycles # 2.865 GHz - 15,819,901,990 instructions # 2.44 insn per cycle - 2.265166416 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2693) (avx2: 0) (512y: 0) (512z: 0) + 7,083,637,480 cycles # 2.859 GHz + 17,776,736,480 instructions # 2.51 insn per cycle + 2.478699418 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3154) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234093e+00 -Avg ME (F77/C++) = 3.2340934062376618 -Relative difference = 1.2561100182708985e-07 +Avg ME (C++/C++) = 2.015849e+00 +Avg ME (F77/C++) = 2.0158486895961687 +Relative difference = 1.539816876576819e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.775248e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.005815e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.005815e+06 ) sec^-1 -MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 -TOTAL : 1.284026 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.050029e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.137169e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.137169e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 +TOTAL : 1.392863 sec INFO: No Floating Point Exceptions have been reported - 3,459,365,538 cycles # 2.685 GHz - 7,598,231,538 instructions # 2.20 insn per cycle - 1.289366574 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3054) (512y: 0) (512z: 0) + 3,745,294,052 cycles # 2.680 GHz + 8,268,263,382 instructions # 2.21 insn per cycle + 1.398145455 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3379) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234092e+00 -Avg ME (F77/C++) = 3.2340919882990420 -Relative difference = 3.6180040581126224e-09 +Avg ME (C++/C++) = 2.015847e+00 +Avg ME (F77/C++) = 2.0158474864438176 +Relative difference = 2.4130988992271984e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.437805e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.092647e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.092647e+06 ) sec^-1 -MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 -TOTAL : 1.200609 sec +EvtsPerSec[Rmb+ME] (23) = ( 8.368203e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.560931e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.560931e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 +TOTAL : 1.342420 sec INFO: No Floating Point Exceptions have been reported - 3,247,417,265 cycles # 2.696 GHz - 7,207,177,396 instructions # 2.22 insn per cycle - 1.205866400 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2854) (512y: 23) (512z: 0) + 3,561,581,907 cycles # 2.644 GHz + 7,923,251,903 instructions # 2.22 insn per cycle + 1.347707221 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3231) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234092e+00 -Avg ME (F77/C++) = 3.2340919882990420 -Relative difference = 3.6180040581126224e-09 +Avg ME (C++/C++) = 2.015847e+00 +Avg ME (F77/C++) = 2.0158474864438176 +Relative difference = 2.4130988992271984e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.751112e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.477606e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.477606e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 -TOTAL : 1.642510 sec +EvtsPerSec[Rmb+ME] (23) = ( 6.251505e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.866721e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.866721e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 +TOTAL : 1.767422 sec INFO: No Floating Point Exceptions have been reported - 3,066,183,622 cycles # 1.862 GHz - 5,839,500,735 instructions # 1.90 insn per cycle - 1.647870341 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2375) (512y: 24) (512z: 1889) + 3,256,427,300 cycles # 1.838 GHz + 6,105,183,383 instructions # 1.87 insn per cycle + 1.772647081 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2407) (512y: 24) (512z: 2153) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234092e+00 -Avg ME (F77/C++) = 3.2340921289287508 -Relative difference = 3.986551736519174e-08 +Avg ME (C++/C++) = 2.015848e+00 +Avg ME (F77/C++) = 2.0158476348733529 +Relative difference = 1.8112806478434436e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt index 2e0a99a1cf..ccfedd0706 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_22:02:23 +DATE: 2024-06-03_18:41:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,17 +49,17 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.478321e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.704119e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.049758e+08 ) sec^-1 -MeanMatrixElemValue = ( 3.294909e+00 +- 3.228140e-03 ) GeV^0 -TOTAL : 0.485405 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.045625e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.457120e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.724518e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.072877e+00 +- 3.361153e-03 ) GeV^0 +TOTAL : 0.481413 sec INFO: No Floating Point Exceptions have been reported - 2,012,796,173 cycles # 2.826 GHz - 2,903,006,317 instructions # 1.44 insn per cycle - 0.768724930 seconds time elapsed + 1,998,175,303 cycles # 2.818 GHz + 2,885,226,643 instructions # 1.44 insn per cycle + 0.764831178 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubPro ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 3.234085e+00 -Avg ME (F77/GPU) = 3.2341253389604390 -Relative difference = 1.2473067479392238e-05 +Avg ME (C++/GPU) = 2.015841e+00 +Avg ME (F77/GPU) = 2.0158787037944421 +Relative difference = 1.870375413642407e-05 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.137053e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.205479e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.205479e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294973e+00 +- 3.228584e-03 ) GeV^0 -TOTAL : 4.995257 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.911474e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.965887e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.965887e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072937e+00 +- 3.361545e-03 ) GeV^0 +TOTAL : 5.573857 sec INFO: No Floating Point Exceptions have been reported - 14,321,719,125 cycles # 2.865 GHz - 39,835,690,494 instructions # 2.78 insn per cycle - 5.000617134 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) + 15,991,216,430 cycles # 2.867 GHz + 44,432,971,909 instructions # 2.78 insn per cycle + 5.578945319 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 547) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234094e+00 -Avg ME (F77/C++) = 3.2340941675938666 -Relative difference = 5.182096339328524e-08 +Avg ME (C++/C++) = 2.015849e+00 +Avg ME (F77/C++) = 2.0158491701586172 +Relative difference = 8.441039850630506e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.647900e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.198922e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.198922e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294972e+00 +- 3.228583e-03 ) GeV^0 -TOTAL : 1.945695 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.002910e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.429530e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.429530e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072937e+00 +- 3.361544e-03 ) GeV^0 +TOTAL : 2.186306 sec INFO: No Floating Point Exceptions have been reported - 5,584,746,201 cycles # 2.864 GHz - 15,284,426,800 instructions # 2.74 insn per cycle - 1.951180487 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2473) (avx2: 0) (512y: 0) (512z: 0) + 6,066,277,847 cycles # 2.770 GHz + 17,078,408,150 instructions # 2.82 insn per cycle + 2.191394262 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2881) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234093e+00 -Avg ME (F77/C++) = 3.2340934062376618 -Relative difference = 1.2561100182708985e-07 +Avg ME (C++/C++) = 2.015849e+00 +Avg ME (F77/C++) = 2.0158486895961687 +Relative difference = 1.539816876576819e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.237642e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.855975e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.855975e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 -TOTAL : 1.769454 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.883569e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.447155e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.447155e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 +TOTAL : 1.870818 sec INFO: No Floating Point Exceptions have been reported - 4,749,170,648 cycles # 2.677 GHz - 9,734,022,428 instructions # 2.05 insn per cycle - 1.774782238 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3707) (512y: 0) (512z: 0) + 5,034,265,873 cycles # 2.685 GHz + 10,228,613,656 instructions # 2.03 insn per cycle + 1.876045036 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3916) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234092e+00 -Avg ME (F77/C++) = 3.2340919817797840 -Relative difference = 5.633796441974414e-09 +Avg ME (C++/C++) = 2.015847e+00 +Avg ME (F77/C++) = 2.0158474864438176 +Relative difference = 2.4130988992271984e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.407642e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.062522e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.062522e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 -TOTAL : 1.724353 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.939491e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.517106e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.517106e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 +TOTAL : 1.853794 sec INFO: No Floating Point Exceptions have been reported - 4,623,692,148 cycles # 2.674 GHz - 9,324,388,930 instructions # 2.02 insn per cycle - 1.729730077 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3495) (512y: 0) (512z: 0) + 4,981,257,343 cycles # 2.681 GHz + 9,997,702,736 instructions # 2.01 insn per cycle + 1.858959303 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3823) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234092e+00 -Avg ME (F77/C++) = 3.2340919817797840 -Relative difference = 5.633796441974414e-09 +Avg ME (C++/C++) = 2.015847e+00 +Avg ME (F77/C++) = 2.0158474864438176 +Relative difference = 2.4130988992271984e-07 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.457416e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.921892e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.921892e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.295004e+00 +- 3.229072e-03 ) GeV^0 -TOTAL : 2.009945 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.481243e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.789058e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.789058e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 +TOTAL : 2.429244 sec INFO: No Floating Point Exceptions have been reported - 3,661,033,798 cycles # 1.818 GHz - 7,034,840,971 instructions # 1.92 insn per cycle - 2.015460084 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2608) (512y: 12) (512z: 2220) + 4,364,624,594 cycles # 1.794 GHz + 8,448,218,621 instructions # 1.94 insn per cycle + 2.434426241 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2895) (512y: 4) (512z: 2751) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234092e+00 -Avg ME (F77/C++) = 3.2340921270661056 -Relative difference = 3.928957668408837e-08 +Avg ME (C++/C++) = 2.015848e+00 +Avg ME (F77/C++) = 2.0158476348733529 +Relative difference = 1.8112806478434436e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index ea5a9dfe42..cf858f4377 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_22:02:44 +DATE: 2024-06-03_18:41:32 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.411466e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.167511e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.278097e+08 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 0.531668 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.593015e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.162361e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.282560e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 0.529138 sec INFO: No Floating Point Exceptions have been reported - 2,181,212,654 cycles # 2.815 GHz - 3,113,262,519 instructions # 1.43 insn per cycle - 0.831954647 seconds time elapsed + 2,189,111,228 cycles # 2.820 GHz + 3,106,086,555 instructions # 1.42 insn per cycle + 0.833333259 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubPro ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 3.234080e+00 -Avg ME (F77/GPU) = 3.2340795839181666 -Relative difference = 1.2865539301192385e-07 +Avg ME (C++/GPU) = 2.015836e+00 +Avg ME (F77/GPU) = 2.0158358639104246 +Relative difference = 6.751024171044779e-08 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.011160e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.069301e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.069301e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 5.324881 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.754698e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.799357e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.799357e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 6.087390 sec INFO: No Floating Point Exceptions have been reported - 15,270,464,570 cycles # 2.866 GHz - 38,583,585,562 instructions # 2.53 insn per cycle - 5.330529959 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 677) (avx2: 0) (512y: 0) (512z: 0) + 17,437,223,674 cycles # 2.863 GHz + 46,088,336,844 instructions # 2.64 insn per cycle + 6.092892306 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 636) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796721168488 -Relative difference = 1.0138374786539113e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359218686011 +Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.489143e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.682455e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.682455e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 3.116860 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.122318e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.279676e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.279676e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.470120 sec INFO: No Floating Point Exceptions have been reported - 8,946,095,501 cycles # 2.866 GHz - 24,230,074,231 instructions # 2.71 insn per cycle - 3.122402700 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0) + 9,965,552,209 cycles # 2.868 GHz + 27,601,401,595 instructions # 2.77 insn per cycle + 3.475667653 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2593) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796721168488 -Relative difference = 1.0138374786539113e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359218686011 +Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.511600e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.996636e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.996636e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.016820 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.892219e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.272346e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.272346e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.255972 sec INFO: No Floating Point Exceptions have been reported - 5,398,022,181 cycles # 2.671 GHz - 11,281,135,154 instructions # 2.09 insn per cycle - 2.022588826 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2483) (512y: 0) (512z: 0) + 6,032,808,300 cycles # 2.668 GHz + 12,495,305,658 instructions # 2.07 insn per cycle + 2.261593430 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2783) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796772295590 -Relative difference = 9.980286234148268e-08 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359178371690 +Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.136939e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.737334e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.737334e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 1.820797 sec +EvtsPerSec[Rmb+ME] (23) = ( 5.389880e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.851032e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.851032e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.057011 sec INFO: No Floating Point Exceptions have been reported - 4,868,000,366 cycles # 2.667 GHz - 10,530,833,141 instructions # 2.16 insn per cycle - 1.826342910 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2170) (512y: 148) (512z: 0) + 5,514,973,271 cycles # 2.675 GHz + 11,929,839,957 instructions # 2.16 insn per cycle + 2.062497201 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2534) (512y: 146) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796772295590 -Relative difference = 9.980286234148268e-08 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359178371690 +Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.738166e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.952895e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.952895e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.917099 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.457890e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.639313e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.639313e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.145292 sec INFO: No Floating Point Exceptions have been reported - 5,206,834,374 cycles # 1.782 GHz - 7,607,869,413 instructions # 1.46 insn per cycle - 2.922673764 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1633) (512y: 126) (512z: 1611) + 5,590,550,000 cycles # 1.775 GHz + 8,120,275,403 instructions # 1.45 insn per cycle + 3.151215102 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1668) (512y: 126) (512z: 1865) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796772295590 -Relative difference = 9.980286234148268e-08 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359178371690 +Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt index 611ee95bf5..1ec6d6d579 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt @@ -40,7 +40,7 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-06-02_22:03:07 +DATE: 2024-06-03_18:41:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -49,15 +49,15 @@ INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.372214e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.166063e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.277281e+08 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 0.531410 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.483040e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.148416e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.265278e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 0.530656 sec INFO: No Floating Point Exceptions have been reported - 2,165,210,077 cycles # 2.821 GHz - 3,108,583,253 instructions # 1.44 insn per cycle - 0.824896732 seconds time elapsed + 2,180,510,722 cycles # 2.824 GHz + 3,126,960,490 instructions # 1.43 insn per cycle + 0.829636549 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -67,9 +67,9 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubPro ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 3.234080e+00 -Avg ME (F77/GPU) = 3.2340795839181666 -Relative difference = 1.2865539301192385e-07 +Avg ME (C++/GPU) = 2.015836e+00 +Avg ME (F77/GPU) = 2.0158358639104246 +Relative difference = 6.751024171044779e-08 OK (relative difference <= 5E-3) ========================================================================= Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe @@ -81,25 +81,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.998905e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.057195e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.057195e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 5.355715 sec +EvtsPerSec[Rmb+ME] (23) = ( 1.808177e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.855648e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.855648e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 5.910250 sec INFO: No Floating Point Exceptions have been reported - 15,350,187,703 cycles # 2.864 GHz - 40,368,332,178 instructions # 2.63 insn per cycle - 5.361266689 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) + 16,954,887,617 cycles # 2.866 GHz + 45,103,327,044 instructions # 2.66 insn per cycle + 5.915938477 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 581) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796721168488 -Relative difference = 1.0138374786539113e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359218686011 +Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -109,25 +109,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.655753e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.869389e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.869389e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.979107 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.172628e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.333065e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.333065e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.416788 sec INFO: No Floating Point Exceptions have been reported - 8,538,137,981 cycles # 2.862 GHz - 23,251,495,548 instructions # 2.72 insn per cycle - 2.984645572 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2090) (avx2: 0) (512y: 0) (512z: 0) + 9,502,201,673 cycles # 2.777 GHz + 26,246,195,465 instructions # 2.76 insn per cycle + 3.422433352 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2397) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796721168488 -Relative difference = 1.0138374786539113e-07 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359218686011 +Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -137,25 +137,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.687752e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.034396e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.034396e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.348055 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.374754e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.674975e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.674975e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.508984 sec INFO: No Floating Point Exceptions have been reported - 6,251,107,015 cycles # 2.657 GHz - 12,960,902,963 instructions # 2.07 insn per cycle - 2.353740392 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2668) (512y: 0) (512z: 0) + 6,734,505,509 cycles # 2.680 GHz + 14,036,419,832 instructions # 2.08 insn per cycle + 2.514518815 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2901) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796772295590 -Relative difference = 9.980286234148268e-08 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359178371690 +Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -165,25 +165,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.964987e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.353858e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.353858e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 2.222113 sec +EvtsPerSec[Rmb+ME] (23) = ( 4.620511e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.957323e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.957323e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 2.379950 sec INFO: No Floating Point Exceptions have been reported - 5,918,688,699 cycles # 2.658 GHz - 12,237,201,089 instructions # 2.07 insn per cycle - 2.227737714 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2208) (512y: 296) (512z: 0) + 6,387,248,616 cycles # 2.678 GHz + 13,522,465,773 instructions # 2.12 insn per cycle + 2.385413757 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2543) (512y: 302) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796772295590 -Relative difference = 9.980286234148268e-08 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359178371690 +Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= @@ -193,25 +193,25 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.434413e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.612405e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.612405e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.294877e+00 +- 3.227953e-03 ) GeV^0 -TOTAL : 3.164008 sec +EvtsPerSec[Rmb+ME] (23) = ( 3.449512e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.630105e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.630105e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 +TOTAL : 3.151083 sec INFO: No Floating Point Exceptions have been reported - 5,604,141,468 cycles # 1.769 GHz - 8,744,053,502 instructions # 1.56 insn per cycle - 3.169616891 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1490) (512y: 183) (512z: 1908) + 5,596,390,973 cycles # 1.774 GHz + 9,216,406,251 instructions # 1.65 insn per cycle + 3.156653341 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1453) (512y: 212) (512z: 2058) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 3 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 3.234080e+00 -Avg ME (F77/C++) = 3.2340796772295590 -Relative difference = 9.980286234148268e-08 +Avg ME (C++/C++) = 2.015836e+00 +Avg ME (F77/C++) = 2.0158359178371690 +Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) =========================================================================