From 2cb7e11db3f8b9fcc70166988ede753ed8025138 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 15 Aug 2023 11:07:34 +0200 Subject: [PATCH] [patchMadsh] ** COMPLETE PATCHMADSH ** rerun 78 tput alltees, all ok STARTED AT Tue Aug 15 07:47:04 CEST 2023 ./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean ENDED(1) AT Tue Aug 15 08:14:00 CEST 2023 [Status=0] ./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean ENDED(2) AT Tue Aug 15 08:23:38 CEST 2023 [Status=0] ./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg -ggttggg -flt -bridge -makeclean ENDED(3) AT Tue Aug 15 08:33:37 CEST 2023 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst ENDED(4) AT Tue Aug 15 08:36:51 CEST 2023 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -curhst ENDED(5) AT Tue Aug 15 08:40:02 CEST 2023 [Status=0] --- .../log_eemumu_mad_d_inl0_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_common.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl0_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl1_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_d_inl1_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_common.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl0_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl1_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_f_inl1_hrd1.txt | 86 +++++++-------- .../log_eemumu_mad_m_inl0_hrd0.txt | 86 +++++++-------- .../log_eemumu_mad_m_inl0_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_common.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl0_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl1_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_d_inl1_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl0_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl1_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_f_inl1_hrd1.txt | 86 +++++++-------- .../log_ggtt_mad_m_inl0_hrd0.txt | 86 +++++++-------- .../log_ggtt_mad_m_inl0_hrd1.txt | 86 +++++++-------- .../log_ggttg_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttg_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttg_mad_f_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttg_mad_f_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttg_mad_m_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttg_mad_m_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl1_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_d_inl1_hrd1.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl1_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_f_inl1_hrd1.txt | 100 +++++++++--------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttgg_mad_m_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttggg_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_ggttggg_mad_f_inl0_hrd1.txt | 100 +++++++++--------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 100 +++++++++--------- .../log_ggttggg_mad_m_inl0_hrd1.txt | 100 +++++++++--------- .../log_gqttq_mad_d_inl0_hrd0.txt | 100 +++++++++--------- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_gqttq_mad_d_inl0_hrd1.txt | 100 +++++++++--------- .../log_gqttq_mad_f_inl0_hrd0.txt | 100 +++++++++--------- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 100 +++++++++--------- .../log_gqttq_mad_f_inl0_hrd1.txt | 100 +++++++++--------- .../log_gqttq_mad_m_inl0_hrd0.txt | 100 +++++++++--------- .../log_gqttq_mad_m_inl0_hrd1.txt | 100 +++++++++--------- 78 files changed, 3648 insertions(+), 3648 deletions(-) diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 8b17e477c5..a87822d822 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-08-14_00:54:14 +DATE: 2023-08-15_07:54:35 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.495689e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.812329e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.762974e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.672193e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.550427e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.701539e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.732034 sec - 2,774,237,202 cycles # 2.911 GHz - 4,102,504,064 instructions # 1.48 insn per cycle - 1.233764717 seconds time elapsed +TOTAL : 0.694267 sec + 2,657,264,310 cycles # 2.858 GHz + 3,765,362,313 instructions # 1.42 insn per cycle + 1.008172040 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.206108e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.490086e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.490086e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.201139e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.486063e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.486063e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.616615 sec - 17,384,779,520 cycles # 3.094 GHz - 41,066,766,142 instructions # 2.36 insn per cycle - 5.713018379 seconds time elapsed +TOTAL : 5.637064 sec + 17,374,480,524 cycles # 3.081 GHz + 41,066,855,128 instructions # 2.36 insn per cycle + 5.643507902 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.065807e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.157387e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.157387e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.072551e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.176931e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.176931e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.444557 sec - 10,658,180,770 cycles # 3.091 GHz - 25,327,931,494 instructions # 2.38 insn per cycle - 3.652722729 seconds time elapsed +TOTAL : 3.433727 sec + 10,651,941,944 cycles # 3.099 GHz + 25,327,627,259 instructions # 2.38 insn per cycle + 3.446001784 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1284) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.975641e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.959535e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.959535e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.915398e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.790165e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.790165e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.512311 sec - 7,460,195,426 cycles # 2.963 GHz - 14,323,446,141 instructions # 1.92 insn per cycle - 2.727277780 seconds time elapsed +TOTAL : 2.559896 sec + 7,470,114,400 cycles # 2.913 GHz + 14,323,698,348 instructions # 1.92 insn per cycle + 2.572490985 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1057) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.104953e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.444528e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.444528e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.074566e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.307252e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.307252e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.419554 sec - 7,237,146,576 cycles # 2.985 GHz - 14,030,929,585 instructions # 1.94 insn per cycle - 2.484066380 seconds time elapsed +TOTAL : 2.437591 sec + 7,267,006,379 cycles # 2.975 GHz + 14,030,605,607 instructions # 1.93 insn per cycle + 2.449534359 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1009) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.912640e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.608109e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.608109e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.914292e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.670582e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.670582e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.554581 sec - 6,538,923,646 cycles # 2.555 GHz - 10,813,931,984 instructions # 1.65 insn per cycle - 2.601219023 seconds time elapsed +TOTAL : 2.557249 sec + 6,520,525,266 cycles # 2.545 GHz + 10,814,200,252 instructions # 1.66 insn per cycle + 2.569336783 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 268) (512y: 0) (512z: 683) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index e064f22a8e..791e95c3e1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-08-14_01:38:24 +DATE: 2023-08-15_08:27:14 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -48,14 +48,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.130491e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.777817e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.777817e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.169671e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.772595e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.772595e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.428111 sec - 8,158,918,348 cycles # 3.041 GHz - 13,761,638,820 instructions # 1.69 insn per cycle - 2.739937853 seconds time elapsed +TOTAL : 2.412440 sec + 8,085,879,441 cycles # 3.031 GHz + 13,804,456,818 instructions # 1.71 insn per cycle + 2.727864220 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -78,14 +78,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.155134e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.416982e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.416982e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.145466e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.404196e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.404196e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.061023 sec - 18,731,169,904 cycles # 3.088 GHz - 41,377,634,652 instructions # 2.21 insn per cycle - 6.068616049 seconds time elapsed +TOTAL : 6.106495 sec + 18,655,748,758 cycles # 3.053 GHz + 41,377,014,729 instructions # 2.22 insn per cycle + 6.113887499 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -105,14 +105,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.925158e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.856782e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.856782e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.930164e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.865257e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.865257e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.866930 sec - 11,949,797,882 cycles # 3.086 GHz - 26,175,359,546 instructions # 2.19 insn per cycle - 3.884326205 seconds time elapsed +TOTAL : 3.856414 sec + 11,884,784,674 cycles # 3.077 GHz + 26,175,543,746 instructions # 2.20 insn per cycle + 3.870932619 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1284) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.712296e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.951874e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.951874e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.705471e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.966789e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.966789e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.925953 sec - 8,791,551,938 cycles # 2.998 GHz - 15,688,607,879 instructions # 1.78 insn per cycle - 2.944225027 seconds time elapsed +TOTAL : 2.932639 sec + 8,729,428,772 cycles # 2.970 GHz + 15,688,784,706 instructions # 1.80 insn per cycle + 2.946907461 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1057) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -159,14 +159,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.770475e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.211717e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.211717e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.816691e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.292126e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.292126e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.884799 sec - 8,640,914,666 cycles # 2.988 GHz - 15,396,257,196 instructions # 1.78 insn per cycle - 2.893096567 seconds time elapsed +TOTAL : 2.837926 sec + 8,512,441,591 cycles # 2.996 GHz + 15,397,705,654 instructions # 1.81 insn per cycle + 2.854571954 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1009) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.611128e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.659867e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.659867e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.572890e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.618725e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.618725e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.034694 sec - 7,912,829,662 cycles # 2.603 GHz - 11,964,667,046 instructions # 1.51 insn per cycle - 3.042536678 seconds time elapsed +TOTAL : 3.067289 sec + 7,897,531,929 cycles # 2.570 GHz + 11,965,513,983 instructions # 1.52 insn per cycle + 3.083738006 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 268) (512y: 0) (512z: 683) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index 33861b1df6..f7b9c99682 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-08-14_01:51:30 +DATE: 2023-08-15_08:40:19 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.700628e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.282382e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.620332e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.908839e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.278507e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.626738e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.353340 sec - 4,745,789,010 cycles # 2.984 GHz - 7,085,336,385 instructions # 1.49 insn per cycle - 1.648804961 seconds time elapsed +TOTAL : 1.338283 sec + 4,710,085,675 cycles # 2.990 GHz + 7,055,082,394 instructions # 1.50 insn per cycle + 1.632987098 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.187398e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.467795e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.467795e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.202837e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.487646e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.487646e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.061854 sec - 18,517,455,084 cycles # 3.054 GHz - 41,189,706,157 instructions # 2.22 insn per cycle - 6.068015687 seconds time elapsed +TOTAL : 5.982127 sec + 18,484,607,427 cycles # 3.088 GHz + 41,193,413,905 instructions # 2.23 insn per cycle + 5.988145512 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.966590e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.015415e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.015415e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.062982e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.164129e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.164129e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.982623 sec - 11,768,810,037 cycles # 2.951 GHz - 25,352,563,993 instructions # 2.15 insn per cycle - 3.998738175 seconds time elapsed +TOTAL : 3.793432 sec + 11,774,969,685 cycles # 3.102 GHz + 25,353,299,382 instructions # 2.15 insn per cycle + 3.799461430 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1284) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.827353e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.595944e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.595944e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.964788e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.886297e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.886297e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.003899 sec - 8,606,318,115 cycles # 2.862 GHz - 14,247,752,770 instructions # 1.66 insn per cycle - 3.015908863 seconds time elapsed +TOTAL : 2.869115 sec + 8,608,684,116 cycles # 2.996 GHz + 14,248,463,131 instructions # 1.66 insn per cycle + 2.887780495 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1057) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.948636e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.966383e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.966383e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.096822e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.314817e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.314817e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.916270 sec - 8,374,321,696 cycles # 2.867 GHz - 13,753,554,100 instructions # 1.64 insn per cycle - 2.922736106 seconds time elapsed +TOTAL : 2.784058 sec + 8,355,473,574 cycles # 2.997 GHz + 13,754,388,286 instructions # 1.65 insn per cycle + 2.789826895 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1009) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.717471e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.205645e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.205645e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.901063e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.664325e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.664325e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.116541 sec - 7,610,300,888 cycles # 2.438 GHz - 10,533,102,199 instructions # 1.38 insn per cycle - 3.132314848 seconds time elapsed +TOTAL : 2.921492 sec + 7,710,551,616 cycles # 2.635 GHz + 10,537,417,253 instructions # 1.37 insn per cycle + 2.936780421 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 268) (512y: 0) (512z: 683) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index 68b10dac53..2f4b273270 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-08-14_01:48:19 +DATE: 2023-08-15_08:37:08 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.708124e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.322638e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.700187e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.942692e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.330516e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.705460e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.001409 sec - 3,700,894,343 cycles # 2.959 GHz - 7,052,023,169 instructions # 1.91 insn per cycle - 1.308416734 seconds time elapsed +TOTAL : 0.977802 sec + 3,622,628,001 cycles # 2.987 GHz + 6,887,030,840 instructions # 1.90 insn per cycle + 1.270674651 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.205336e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.492448e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.492448e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.208005e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.495038e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.495038e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.619629 sec - 17,403,734,124 cycles # 3.095 GHz - 41,066,272,078 instructions # 2.36 insn per cycle - 5.626458046 seconds time elapsed +TOTAL : 5.605516 sec + 17,363,648,733 cycles # 3.095 GHz + 41,066,644,485 instructions # 2.37 insn per cycle + 5.612220833 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.061307e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.165333e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.165333e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.056139e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.158689e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.158689e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.452227 sec - 10,709,460,852 cycles # 3.099 GHz - 25,327,345,055 instructions # 2.36 insn per cycle - 3.465312897 seconds time elapsed +TOTAL : 3.459827 sec + 10,651,024,283 cycles # 3.074 GHz + 25,327,710,293 instructions # 2.38 insn per cycle + 3.471243867 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1284) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.960547e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.914568e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.914568e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.979620e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.939152e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.939152e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.525788 sec - 7,494,715,889 cycles # 2.964 GHz - 14,323,138,786 instructions # 1.91 insn per cycle - 2.537704424 seconds time elapsed +TOTAL : 2.502516 sec + 7,522,714,318 cycles # 3.000 GHz + 14,323,295,499 instructions # 1.90 insn per cycle + 2.508864166 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1057) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.136914e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.420463e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.420463e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.103932e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.359476e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.359476e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.397216 sec - 7,235,476,722 cycles # 3.015 GHz - 14,030,168,040 instructions # 1.94 insn per cycle - 2.412052273 seconds time elapsed +TOTAL : 2.417909 sec + 7,244,725,933 cycles # 2.990 GHz + 14,030,681,101 instructions # 1.94 insn per cycle + 2.429759330 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1009) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.929480e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.719912e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.719912e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.921840e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.718079e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.718079e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.544749 sec - 6,568,504,775 cycles # 2.577 GHz - 10,813,016,587 instructions # 1.65 insn per cycle - 2.556585606 seconds time elapsed +TOTAL : 2.547151 sec + 6,577,388,785 cycles # 2.577 GHz + 10,813,767,193 instructions # 1.64 insn per cycle + 2.559250592 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 268) (512y: 0) (512z: 683) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 873ffa1ffb..01ad82e89d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-08-14_01:45:05 +DATE: 2023-08-15_08:33:54 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -45,14 +45,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.250759e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.240029e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.558599e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.246182e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.255428e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.587144e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.049180 sec - 6,919,859,031 cycles # 3.027 GHz - 12,010,993,383 instructions # 1.74 insn per cycle - 2.343088509 seconds time elapsed +TOTAL : 2.052115 sec + 6,878,654,346 cycles # 3.005 GHz + 11,917,245,523 instructions # 1.73 insn per cycle + 2.346890943 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 @@ -71,14 +71,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.206302e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.493497e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.493497e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.199648e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.484757e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.484757e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.615391 sec - 17,394,665,268 cycles # 3.097 GHz - 41,066,656,227 instructions # 2.36 insn per cycle - 5.621916096 seconds time elapsed +TOTAL : 5.644079 sec + 17,374,294,583 cycles # 3.078 GHz + 41,067,072,666 instructions # 2.36 insn per cycle + 5.650187028 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -97,14 +97,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.066210e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.168812e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.168812e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.057242e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.159791e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.159791e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.440802 sec - 10,683,403,609 cycles # 3.103 GHz - 25,327,557,888 instructions # 2.37 insn per cycle - 3.452798021 seconds time elapsed +TOTAL : 3.458639 sec + 10,663,165,741 cycles # 3.080 GHz + 25,328,035,686 instructions # 2.38 insn per cycle + 3.470060797 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1284) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -123,14 +123,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.979505e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.902464e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.902464e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.976299e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.977860e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.977860e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.507273 sec - 7,489,336,917 cycles # 2.981 GHz - 14,323,228,959 instructions # 1.91 insn per cycle - 2.513454364 seconds time elapsed +TOTAL : 2.506470 sec + 7,495,523,027 cycles # 2.985 GHz + 14,323,185,828 instructions # 1.91 insn per cycle + 2.521547342 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1057) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -149,14 +149,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.056104e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.310307e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.310307e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.096324e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.377515e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.377515e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.455192 sec - 7,304,719,648 cycles # 2.970 GHz - 14,030,818,627 instructions # 1.92 insn per cycle - 2.468280277 seconds time elapsed +TOTAL : 2.422127 sec + 7,261,731,842 cycles # 2.995 GHz + 14,029,800,668 instructions # 1.93 insn per cycle + 2.428445118 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1009) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -175,14 +175,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.943920e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.761376e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.761376e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.935302e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.737567e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.737567e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.533473 sec - 6,546,537,810 cycles # 2.579 GHz - 10,813,869,025 instructions # 1.65 insn per cycle - 2.545326188 seconds time elapsed +TOTAL : 2.538004 sec + 6,512,072,673 cycles # 2.561 GHz + 10,813,676,057 instructions # 1.66 insn per cycle + 2.544311743 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 268) (512y: 0) (512z: 683) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index d83e7b4243..0e3156c822 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-08-14_00:54:46 +DATE: 2023-08-15_07:55:07 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.556423e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.331645e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.067060e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.797502e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.304738e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.068661e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.711111 sec - 2,761,261,889 cycles # 2.907 GHz - 4,074,830,290 instructions # 1.48 insn per cycle - 1.162335283 seconds time elapsed +TOTAL : 0.668941 sec + 2,618,550,010 cycles # 2.895 GHz + 3,725,914,034 instructions # 1.42 insn per cycle + 0.964586682 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 118 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.209594e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.495505e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.495505e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.207948e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.493925e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.493925e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.600644 sec - 17,384,086,887 cycles # 3.103 GHz - 41,014,828,710 instructions # 2.36 insn per cycle - 5.662950327 seconds time elapsed +TOTAL : 5.606225 sec + 17,352,611,992 cycles # 3.095 GHz + 41,015,581,318 instructions # 2.36 insn per cycle + 5.612273631 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 362) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.044172e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.121593e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.121593e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.055642e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.147243e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.147243e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.484336 sec - 10,655,563,180 cycles # 3.055 GHz - 25,290,034,473 instructions # 2.37 insn per cycle - 3.720379042 seconds time elapsed +TOTAL : 3.458988 sec + 10,650,896,510 cycles # 3.075 GHz + 25,289,274,202 instructions # 2.37 insn per cycle + 3.470313959 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1271) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.019873e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.026713e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.026713e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.994887e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.987724e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.987724e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.476997 sec - 7,451,472,230 cycles # 3.001 GHz - 14,297,760,721 instructions # 1.92 insn per cycle - 2.643507806 seconds time elapsed +TOTAL : 2.494936 sec + 7,463,237,722 cycles # 2.986 GHz + 14,297,586,390 instructions # 1.92 insn per cycle + 2.501385204 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1037) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.148666e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.427633e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.427633e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.101945e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.402549e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.402549e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.396054 sec - 7,189,557,150 cycles # 2.998 GHz - 14,017,893,821 instructions # 1.95 insn per cycle - 2.530638242 seconds time elapsed +TOTAL : 2.419494 sec + 7,255,354,650 cycles # 2.993 GHz + 14,017,249,829 instructions # 1.93 insn per cycle + 2.435526163 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 989) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.035816e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.101216e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.101216e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.025481e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.091522e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.091522e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.470498 sec - 6,393,043,936 cycles # 2.582 GHz - 10,693,734,070 instructions # 1.67 insn per cycle - 2.648342688 seconds time elapsed +TOTAL : 2.474734 sec + 6,401,014,262 cycles # 2.581 GHz + 10,693,180,264 instructions # 1.67 insn per cycle + 2.491261488 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 246) (512y: 0) (512z: 663) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index ae75e010b7..a36b8c245f 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-08-14_01:27:21 +DATE: 2023-08-15_08:16:26 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.478402e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.182498e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.692411e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.914166e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.327317e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.704399e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.687646 sec - 2,719,894,059 cycles # 2.944 GHz - 3,908,268,264 instructions # 1.44 insn per cycle - 0.986825265 seconds time elapsed +TOTAL : 0.682445 sec + 2,681,111,270 cycles # 2.917 GHz + 3,835,048,245 instructions # 1.43 insn per cycle + 0.985161352 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.567110e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.335632e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.335632e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.537359e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.293434e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.293434e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.851787 sec - 8,801,027,974 cycles # 3.082 GHz - 18,105,650,438 instructions # 2.06 insn per cycle - 2.858516961 seconds time elapsed +TOTAL : 2.888959 sec + 8,793,923,135 cycles # 3.041 GHz + 18,106,828,171 instructions # 2.06 insn per cycle + 2.895668902 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 125) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.406332e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.104352e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.104352e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.326429e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.963979e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.963979e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.245414 sec - 6,943,321,742 cycles # 3.087 GHz - 13,419,858,269 instructions # 1.93 insn per cycle - 2.257886246 seconds time elapsed +TOTAL : 2.299886 sec + 6,945,957,949 cycles # 3.016 GHz + 13,421,286,049 instructions # 1.93 insn per cycle + 2.311641853 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 810) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.223223e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.379559e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.379559e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.146868e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.354064e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.354064e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.880285 sec - 5,737,821,829 cycles # 3.045 GHz - 10,013,745,424 instructions # 1.75 insn per cycle - 1.891956833 seconds time elapsed +TOTAL : 1.914407 sec + 5,714,721,147 cycles # 2.978 GHz + 10,018,011,296 instructions # 1.75 insn per cycle + 1.926003662 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 720) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.269477e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.493538e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.493538e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.305174e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.522058e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.522058e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.867844 sec - 5,653,991,029 cycles # 3.018 GHz - 9,879,034,651 instructions # 1.75 insn per cycle - 1.880411156 seconds time elapsed +TOTAL : 1.852893 sec + 5,605,274,427 cycles # 3.019 GHz + 9,879,619,284 instructions # 1.76 insn per cycle + 1.868960977 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 641) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.798883e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.013493e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.013493e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.706880e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.000723e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.000723e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.051092 sec - 5,623,803,425 cycles # 2.735 GHz - 9,347,298,546 instructions # 1.66 insn per cycle - 2.057473625 seconds time elapsed +TOTAL : 2.095172 sec + 5,643,037,786 cycles # 2.689 GHz + 9,343,938,303 instructions # 1.66 insn per cycle + 2.107089803 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 200) (512y: 0) (512z: 276) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index 53bc73b593..55c6ae7bc3 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-08-14_01:27:45 +DATE: 2023-08-15_08:16:50 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.605389e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.830881e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.072065e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.015497e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.041509e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.076611e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.678628 sec - 2,719,202,943 cycles # 2.961 GHz - 3,872,377,606 instructions # 1.42 insn per cycle - 0.981309653 seconds time elapsed +TOTAL : 0.671217 sec + 2,648,430,808 cycles # 2.914 GHz + 3,872,567,566 instructions # 1.46 insn per cycle + 0.968378046 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 118 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.242635e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.690303e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.690303e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.170088e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.615486e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.615486e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.338859 sec - 7,211,628,331 cycles # 3.076 GHz - 14,875,667,461 instructions # 2.06 insn per cycle - 2.345560415 seconds time elapsed +TOTAL : 2.384664 sec + 7,263,421,980 cycles # 3.040 GHz + 14,876,504,014 instructions # 2.05 insn per cycle + 2.391073812 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 122) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.977052e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.194638e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.194638e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.987019e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.200512e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.200512e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.980062 sec - 6,108,045,240 cycles # 3.077 GHz - 11,420,452,681 instructions # 1.87 insn per cycle - 1.986417759 seconds time elapsed +TOTAL : 1.975075 sec + 6,103,568,200 cycles # 3.084 GHz + 11,421,166,399 instructions # 1.87 insn per cycle + 1.981152289 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 610) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.380783e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.662004e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.662004e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.378307e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.651197e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.651197e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.831098 sec - 5,563,808,976 cycles # 3.029 GHz - 9,374,733,594 instructions # 1.68 insn per cycle - 1.845647676 seconds time elapsed +TOTAL : 1.835051 sec + 5,528,943,122 cycles # 3.004 GHz + 9,375,389,845 instructions # 1.70 insn per cycle + 1.865320631 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.569146e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.895112e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.895112e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.607711e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.890314e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.890314e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.773117 sec - 5,405,969,916 cycles # 3.045 GHz - 9,390,641,590 instructions # 1.74 insn per cycle - 1.785202302 seconds time elapsed +TOTAL : 1.756986 sec + 5,347,368,521 cycles # 3.036 GHz + 9,390,869,751 instructions # 1.76 insn per cycle + 1.768916861 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 519) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.923842e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.114769e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.114769e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.859935e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.101667e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.101667e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.001630 sec - 5,515,126,437 cycles # 2.751 GHz - 9,060,379,577 instructions # 1.64 insn per cycle - 2.014507985 seconds time elapsed +TOTAL : 2.025711 sec + 5,554,713,597 cycles # 2.736 GHz + 9,060,598,111 instructions # 1.63 insn per cycle + 2.037110763 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 168) (512y: 0) (512z: 227) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 1405223fcc..d1fccd5830 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-08-14_00:55:18 +DATE: 2023-08-15_07:55:37 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=2, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.096685e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.176022e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.739531e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.545574e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.429400e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.762898e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371686e-02 +- 3.270219e-06 ) GeV^0 -TOTAL : 0.593391 sec - 2,375,596,210 cycles # 2.887 GHz - 3,391,286,329 instructions # 1.43 insn per cycle - 1.192371424 seconds time elapsed +TOTAL : 0.574402 sec + 2,306,684,617 cycles # 2.879 GHz + 3,317,349,009 instructions # 1.44 insn per cycle + 0.861431911 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 96 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.265152e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.524098e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.524098e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.265149e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.524518e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.524518e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 5.336668 sec - 16,503,247,832 cycles # 3.091 GHz - 40,103,456,452 instructions # 2.43 insn per cycle - 5.374653765 seconds time elapsed +TOTAL : 5.331314 sec + 16,491,409,917 cycles # 3.093 GHz + 40,103,873,366 instructions # 2.43 insn per cycle + 5.337381440 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 364) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.252518e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.141034e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.141034e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.255546e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.158473e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.158473e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.294654 sec - 7,088,206,627 cycles # 3.082 GHz - 16,746,304,162 instructions # 2.36 insn per cycle - 2.485673367 seconds time elapsed +TOTAL : 2.289624 sec + 7,065,393,860 cycles # 3.080 GHz + 16,746,102,273 instructions # 2.37 insn per cycle + 2.295533984 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1360) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.547365e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.216728e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.216728e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.577100e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.247291e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.247291e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.745263 sec - 5,205,861,462 cycles # 2.975 GHz - 10,646,918,201 instructions # 2.05 insn per cycle - 1.847099574 seconds time elapsed +TOTAL : 1.728483 sec + 5,246,476,226 cycles # 3.028 GHz + 10,646,051,029 instructions # 2.03 insn per cycle + 1.744312540 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1122) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.754270e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.328541e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.328541e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.684671e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.304884e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.304884e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.680055 sec - 5,094,156,298 cycles # 3.021 GHz - 10,499,761,311 instructions # 2.06 insn per cycle - 1.770524136 seconds time elapsed +TOTAL : 1.701889 sec + 5,084,915,133 cycles # 2.980 GHz + 10,499,674,438 instructions # 2.06 insn per cycle + 1.716700420 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1074) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.581372e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.181852e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.181852e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.560201e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.172152e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.172152e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.728271 sec - 4,721,031,573 cycles # 2.723 GHz - 8,947,817,353 instructions # 1.90 insn per cycle - 1.794247906 seconds time elapsed +TOTAL : 1.731968 sec + 4,700,187,956 cycles # 2.707 GHz + 8,947,738,240 instructions # 1.90 insn per cycle + 1.744130504 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 343) (512y: 0) (512z: 710) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index d4bffac6ce..777be3dddb 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-08-14_01:38:58 +DATE: 2023-08-15_08:27:48 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -55,14 +55,14 @@ WARNING! flagging abnormal ME for ievt=247522 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=7, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.059336e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.092029e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.092029e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.218476e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.056373e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.056373e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371709e-02 +- 3.270386e-06 ) GeV^0 -TOTAL : 1.714893 sec - 5,860,098,721 cycles # 3.004 GHz - 10,215,071,795 instructions # 1.74 insn per cycle - 2.008855666 seconds time elapsed +TOTAL : 1.698014 sec + 5,764,608,219 cycles # 2.984 GHz + 10,199,099,127 instructions # 1.77 insn per cycle + 1.992360425 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.226683e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.469295e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.469295e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.238322e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.485077e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.485077e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 5.597492 sec - 17,321,976,627 cycles # 3.091 GHz - 40,271,751,376 instructions # 2.32 insn per cycle - 5.605021996 seconds time elapsed +TOTAL : 5.549150 sec + 17,156,844,123 cycles # 3.090 GHz + 40,268,372,978 instructions # 2.35 insn per cycle + 5.556229262 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 364) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -124,14 +124,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.063482e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.530366e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.530366e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.021228e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.444863e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.444863e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.532655 sec - 7,853,784,933 cycles # 3.094 GHz - 18,080,708,576 instructions # 2.30 insn per cycle - 2.548979253 seconds time elapsed +TOTAL : 2.567788 sec + 7,842,409,842 cycles # 3.048 GHz + 18,081,290,969 instructions # 2.31 insn per cycle + 2.581763386 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1360) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -155,14 +155,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.185687e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.012800e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.012800e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.215444e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.001558e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.001558e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.980333 sec - 6,025,409,609 cycles # 3.034 GHz - 11,766,040,547 instructions # 1.95 insn per cycle - 1.994473164 seconds time elapsed +TOTAL : 1.972841 sec + 5,968,906,543 cycles # 3.019 GHz + 11,766,354,244 instructions # 1.97 insn per cycle + 1.989154993 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1122) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.210314e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.047754e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.047754e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.331823e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.057903e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.057903e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.977106 sec - 5,926,567,582 cycles # 2.989 GHz - 11,619,746,666 instructions # 1.96 insn per cycle - 1.989509353 seconds time elapsed +TOTAL : 1.929002 sec + 5,852,668,636 cycles # 3.024 GHz + 11,619,769,746 instructions # 1.99 insn per cycle + 1.946679853 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1074) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -217,14 +217,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.120282e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.298899e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.298899e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.096575e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.324841e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.324841e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 2.005176 sec - 5,533,854,973 cycles # 2.753 GHz - 10,154,836,221 instructions # 1.84 insn per cycle - 2.018100082 seconds time elapsed +TOTAL : 2.014606 sec + 5,539,109,033 cycles # 2.743 GHz + 10,154,997,244 instructions # 1.83 insn per cycle + 2.031462529 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 343) (512y: 0) (512z: 710) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index ad796b281a..a8859eaa23 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-08-14_01:52:03 +DATE: 2023-08-15_08:40:51 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.301791e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.291307e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.688544e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.388130e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.322176e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.699521e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.182142 sec - 4,230,653,334 cycles # 3.002 GHz - 6,489,019,710 instructions # 1.53 insn per cycle - 1.468869463 seconds time elapsed +TOTAL : 1.171138 sec + 4,175,234,437 cycles # 2.980 GHz + 6,374,434,859 instructions # 1.53 insn per cycle + 1.458340956 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 96 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.253184e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.511365e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.511365e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.254138e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.512198e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.512198e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270265e-06 ) GeV^0 -TOTAL : 5.703548 sec - 17,524,629,522 cycles # 3.071 GHz - 40,289,268,049 instructions # 2.30 insn per cycle - 5.710172819 seconds time elapsed +TOTAL : 5.696069 sec + 17,480,760,791 cycles # 3.066 GHz + 40,290,171,415 instructions # 2.30 insn per cycle + 5.702403171 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 364) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.247580e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.168027e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.168027e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.227003e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.133484e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.133484e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270265e-06 ) GeV^0 -TOTAL : 2.605833 sec - 8,102,119,359 cycles # 3.105 GHz - 16,830,466,037 instructions # 2.08 insn per cycle - 2.618728098 seconds time elapsed +TOTAL : 2.620604 sec + 8,082,957,151 cycles # 3.079 GHz + 16,832,196,831 instructions # 2.08 insn per cycle + 2.632153852 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1360) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.563685e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.222484e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.222484e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.550214e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.231450e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.231450e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270110e-06 ) GeV^0 -TOTAL : 2.050263 sec - 6,219,646,033 cycles # 3.026 GHz - 10,560,256,546 instructions # 1.70 insn per cycle - 2.061793191 seconds time elapsed +TOTAL : 2.049014 sec + 6,214,591,019 cycles # 3.026 GHz + 10,561,469,244 instructions # 1.70 insn per cycle + 2.061054298 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1122) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.670646e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.308325e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.308325e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.661615e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.311531e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.311531e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270110e-06 ) GeV^0 -TOTAL : 2.026328 sec - 6,146,302,402 cycles # 3.026 GHz - 10,212,665,140 instructions # 1.66 insn per cycle - 2.038325982 seconds time elapsed +TOTAL : 2.026023 sec + 6,148,544,502 cycles # 3.028 GHz + 10,213,907,636 instructions # 1.66 insn per cycle + 2.031850914 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1074) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.525568e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155476e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.155476e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.520391e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158289e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.158289e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371884e-02 +- 3.270111e-06 ) GeV^0 -TOTAL : 2.066020 sec - 5,744,744,037 cycles # 2.776 GHz - 8,660,586,270 instructions # 1.51 insn per cycle - 2.081648979 seconds time elapsed +TOTAL : 2.066306 sec + 5,722,268,726 cycles # 2.765 GHz + 8,658,387,445 instructions # 1.51 insn per cycle + 2.072030119 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 343) (512y: 0) (512z: 710) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index e5236f0be6..08f9d90beb 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-08-14_01:48:49 +DATE: 2023-08-15_08:37:38 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=2, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.267214e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.303941e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.751606e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.397106e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.350042e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.750966e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371686e-02 +- 3.270219e-06 ) GeV^0 -TOTAL : 0.874046 sec - 3,258,675,300 cycles # 2.955 GHz - 6,332,801,326 instructions # 1.94 insn per cycle - 1.161030917 seconds time elapsed +TOTAL : 0.856969 sec + 3,231,922,783 cycles # 2.982 GHz + 6,241,038,070 instructions # 1.93 insn per cycle + 1.141635259 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 96 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.259572e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.526777e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.526777e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.266454e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.526174e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.526174e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 5.355953 sec - 16,592,002,305 cycles # 3.095 GHz - 40,103,342,636 instructions # 2.42 insn per cycle - 5.362226624 seconds time elapsed +TOTAL : 5.327368 sec + 16,480,103,424 cycles # 3.091 GHz + 40,103,730,146 instructions # 2.43 insn per cycle + 5.333346765 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 364) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.198126e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.052022e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.052022e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.240289e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.142776e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.142776e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.330544 sec - 7,103,035,834 cycles # 3.041 GHz - 16,745,608,009 instructions # 2.36 insn per cycle - 2.342473523 seconds time elapsed +TOTAL : 2.307276 sec + 7,096,546,849 cycles # 3.073 GHz + 16,746,922,642 instructions # 2.36 insn per cycle + 2.319547037 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1360) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.612557e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.240794e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.240794e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.581291e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.238177e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.238177e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.723678 sec - 5,223,745,159 cycles # 3.025 GHz - 10,645,784,037 instructions # 2.04 insn per cycle - 1.735676364 seconds time elapsed +TOTAL : 1.729502 sec + 5,212,976,460 cycles # 3.005 GHz + 10,646,012,351 instructions # 2.04 insn per cycle + 1.735817439 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1122) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.727904e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.321076e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.321076e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.726597e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.323769e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.323769e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.686792 sec - 5,124,131,574 cycles # 3.028 GHz - 10,499,069,650 instructions # 2.05 insn per cycle - 1.699029423 seconds time elapsed +TOTAL : 1.684285 sec + 5,114,786,489 cycles # 3.029 GHz + 10,499,635,148 instructions # 2.05 insn per cycle + 1.695702980 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1074) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.505415e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.170912e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.170912e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.552900e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.166453e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.166453e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.750910 sec - 4,787,552,637 cycles # 2.726 GHz - 8,947,188,258 instructions # 1.87 insn per cycle - 1.757427126 seconds time elapsed +TOTAL : 1.734671 sec + 4,714,487,833 cycles # 2.710 GHz + 8,947,783,662 instructions # 1.90 insn per cycle + 1.740864411 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 343) (512y: 0) (512z: 710) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index cbddd75e70..32be1777db 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-08-14_01:45:36 +DATE: 2023-08-15_08:34:26 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -45,14 +45,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=7, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.020301e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.297757e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.551747e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.059394e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.303614e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.586022e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371709e-02 +- 3.270386e-06 ) GeV^0 -TOTAL : 1.494267 sec - 5,165,206,679 cycles # 2.987 GHz - 9,143,190,999 instructions # 1.77 insn per cycle - 1.786456297 seconds time elapsed +TOTAL : 1.557569 sec + 5,330,102,791 cycles # 2.986 GHz + 9,105,022,833 instructions # 1.71 insn per cycle + 1.843537042 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 96 @@ -71,14 +71,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.267088e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.527097e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.527097e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.246233e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.501484e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.501484e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 5.323491 sec - 16,495,878,801 cycles # 3.096 GHz - 40,104,408,451 instructions # 2.43 insn per cycle - 5.329636006 seconds time elapsed +TOTAL : 5.409994 sec + 16,491,394,473 cycles # 3.046 GHz + 40,104,848,105 instructions # 2.43 insn per cycle + 5.415867559 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 364) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -97,14 +97,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.259466e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.179292e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.179292e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.248425e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.165572e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.165572e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.287874 sec - 7,077,417,031 cycles # 3.086 GHz - 16,746,028,894 instructions # 2.37 insn per cycle - 2.294318739 seconds time elapsed +TOTAL : 2.296217 sec + 7,073,167,410 cycles # 3.077 GHz + 16,744,469,995 instructions # 2.37 insn per cycle + 2.301756332 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1360) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -123,14 +123,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.574442e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.239869e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.239869e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.567663e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.238042e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.238042e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.729504 sec - 5,239,170,279 cycles # 3.020 GHz - 10,645,991,818 instructions # 2.03 insn per cycle - 1.735938049 seconds time elapsed +TOTAL : 1.731450 sec + 5,235,103,499 cycles # 3.017 GHz + 10,646,278,392 instructions # 2.03 insn per cycle + 1.746601624 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1122) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -149,14 +149,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.737015e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.324100e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.324100e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.721614e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.333374e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.333374e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.681423 sec - 5,103,718,712 cycles # 3.026 GHz - 10,499,740,723 instructions # 2.06 insn per cycle - 1.693119213 seconds time elapsed +TOTAL : 1.686175 sec + 5,126,659,727 cycles # 3.034 GHz + 10,499,853,434 instructions # 2.05 insn per cycle + 1.692340474 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1074) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -175,14 +175,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.544376e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.167643e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.167643e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.567673e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.172234e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.172234e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.737867 sec - 4,717,485,261 cycles # 2.707 GHz - 8,947,920,354 instructions # 1.90 insn per cycle - 1.743878082 seconds time elapsed +TOTAL : 1.728805 sec + 4,709,895,789 cycles # 2.717 GHz + 8,947,780,732 instructions # 1.90 insn per cycle + 1.740285725 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 343) (512y: 0) (512z: 710) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index a3196fada6..77eb4b6776 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-08-14_00:55:46 +DATE: 2023-08-15_07:56:03 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=2, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.097059e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.195309e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.841505e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.549024e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.454205e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.867590e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371686e-02 +- 3.270219e-06 ) GeV^0 -TOTAL : 0.592877 sec - 2,390,901,801 cycles # 2.906 GHz - 3,441,053,114 instructions # 1.44 insn per cycle - 0.984342322 seconds time elapsed +TOTAL : 0.573247 sec + 2,306,223,093 cycles # 2.885 GHz + 3,304,353,770 instructions # 1.43 insn per cycle + 0.859595196 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 80 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.262336e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.519986e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.519986e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.261287e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.517761e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.517761e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 5.343212 sec - 16,531,778,355 cycles # 3.091 GHz - 40,052,529,226 instructions # 2.42 insn per cycle - 5.456102383 seconds time elapsed +TOTAL : 5.347587 sec + 16,519,340,323 cycles # 3.087 GHz + 40,052,333,384 instructions # 2.42 insn per cycle + 5.353348824 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 347) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.236599e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.184072e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.184072e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.258683e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.177997e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.177997e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.303712 sec - 7,134,740,189 cycles # 3.091 GHz - 16,670,085,805 instructions # 2.34 insn per cycle - 2.421564349 seconds time elapsed +TOTAL : 2.291784 sec + 7,066,394,987 cycles # 3.077 GHz + 16,669,891,798 instructions # 2.36 insn per cycle + 2.303456803 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1335) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.590149e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.226360e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.226360e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.586500e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.234895e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.234895e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.724569 sec - 5,210,734,644 cycles # 3.011 GHz - 10,632,736,896 instructions # 2.04 insn per cycle - 2.015147648 seconds time elapsed +TOTAL : 1.721421 sec + 5,214,494,623 cycles # 3.021 GHz + 10,632,672,125 instructions # 2.04 insn per cycle + 1.732750457 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1092) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.741796e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.319541e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.319541e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.718916e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.326614e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.326614e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.680562 sec - 5,106,613,124 cycles # 3.030 GHz - 10,492,694,920 instructions # 2.05 insn per cycle - 1.980201219 seconds time elapsed +TOTAL : 1.686615 sec + 5,108,977,210 cycles # 3.020 GHz + 10,490,613,336 instructions # 2.05 insn per cycle + 1.698547089 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1044) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.666949e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.254446e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.254446e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.651051e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.268382e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.268382e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.704689 sec - 4,652,395,767 cycles # 2.720 GHz - 8,877,174,571 instructions # 1.91 insn per cycle - 2.083853924 seconds time elapsed +TOTAL : 1.709054 sec + 4,687,414,523 cycles # 2.736 GHz + 8,876,964,748 instructions # 1.89 insn per cycle + 1.715057097 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 312) (512y: 0) (512z: 678) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index bda7927da3..9bd4d8b42a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-08-14_01:28:08 +DATE: 2023-08-15_08:17:14 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=2, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.201001e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.290924e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.732783e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.386329e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.343893e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.734394e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371686e-02 +- 3.270219e-06 ) GeV^0 -TOTAL : 0.588399 sec - 2,346,105,777 cycles # 2.857 GHz - 3,398,836,583 instructions # 1.45 insn per cycle - 0.878569996 seconds time elapsed +TOTAL : 0.573807 sec + 2,338,736,868 cycles # 2.911 GHz + 3,361,616,652 instructions # 1.44 insn per cycle + 0.860710088 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 96 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.884704e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.721327e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.721327e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.907458e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.789549e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.789549e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.543058 sec - 7,720,122,687 cycles # 3.030 GHz - 17,419,775,645 instructions # 2.26 insn per cycle - 2.549526706 seconds time elapsed +TOTAL : 2.524937 sec + 7,695,908,300 cycles # 3.042 GHz + 17,419,908,512 instructions # 2.26 insn per cycle + 2.530886883 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 141) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.702699e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.465484e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.465484e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.654228e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.447937e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.447937e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 1.697104 sec - 5,259,667,904 cycles # 3.089 GHz - 10,777,916,097 instructions # 2.05 insn per cycle - 1.710690484 seconds time elapsed +TOTAL : 1.715670 sec + 5,250,624,000 cycles # 3.055 GHz + 10,778,731,487 instructions # 2.05 insn per cycle + 1.726909018 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 941) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.091996e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.403223e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.403223e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.003576e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.423971e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.423971e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.383976 sec - 4,264,411,103 cycles # 3.071 GHz - 8,352,714,069 instructions # 1.96 insn per cycle - 1.396242855 seconds time elapsed +TOTAL : 1.401866 sec + 4,289,992,122 cycles # 3.051 GHz + 8,353,467,362 instructions # 1.95 insn per cycle + 1.413100405 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 855) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.216225e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.857453e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.857453e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.193956e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.892438e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.892438e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.368858 sec - 4,201,938,885 cycles # 3.056 GHz - 8,332,214,470 instructions # 1.98 insn per cycle - 1.382778661 seconds time elapsed +TOTAL : 1.370603 sec + 4,199,967,342 cycles # 3.054 GHz + 8,332,687,737 instructions # 1.98 insn per cycle + 1.376565532 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 779) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.596331e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.212343e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.212343e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.581405e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.201035e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.201035e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.479533 sec - 4,246,332,117 cycles # 2.860 GHz - 8,217,249,738 instructions # 1.94 insn per cycle - 1.491297680 seconds time elapsed +TOTAL : 1.480284 sec + 4,226,325,107 cycles # 2.846 GHz + 8,217,989,555 instructions # 1.94 insn per cycle + 1.495814004 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 280) (512y: 0) (512z: 301) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index e405a2b54d..9afa2da28d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-08-14_01:28:30 +DATE: 2023-08-15_08:17:35 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=2, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.206985e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.324659e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.860147e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.385884e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.371282e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.861308e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371686e-02 +- 3.270219e-06 ) GeV^0 -TOTAL : 0.584187 sec - 2,381,095,801 cycles # 2.928 GHz - 3,412,013,527 instructions # 1.43 insn per cycle - 0.870996720 seconds time elapsed +TOTAL : 0.576503 sec + 2,326,678,847 cycles # 2.894 GHz + 3,347,071,515 instructions # 1.44 insn per cycle + 0.861093880 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 80 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.766038e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.730190e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.730190e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.826942e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.871964e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.871964e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.038797 sec - 6,215,597,261 cycles # 3.042 GHz - 14,177,625,170 instructions # 2.28 insn per cycle - 2.045323335 seconds time elapsed +TOTAL : 2.005736 sec + 6,187,454,993 cycles # 3.079 GHz + 14,177,459,076 instructions # 2.29 insn per cycle + 2.011637210 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 133) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.276394e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.230685e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.230685e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.290599e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.243867e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.243867e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 1.551571 sec - 4,793,258,651 cycles # 3.079 GHz - 9,582,592,672 instructions # 2.00 insn per cycle - 1.558188693 seconds time elapsed +TOTAL : 1.550551 sec + 4,789,377,245 cycles # 3.083 GHz + 9,583,428,039 instructions # 2.00 insn per cycle + 1.562945446 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 663) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.246475e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.072528e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.072528e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.222388e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.095648e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.095648e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.360088 sec - 4,184,299,104 cycles # 3.065 GHz - 8,137,097,507 instructions # 1.94 insn per cycle - 1.372410394 seconds time elapsed +TOTAL : 1.363861 sec + 4,178,762,719 cycles # 3.053 GHz + 8,137,629,720 instructions # 1.95 insn per cycle + 1.370140957 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 623) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.365917e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.478442e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.478442e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.352788e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.538240e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.538240e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.342298 sec - 4,130,352,979 cycles # 3.065 GHz - 8,144,868,120 instructions # 1.97 insn per cycle - 1.348555051 seconds time elapsed +TOTAL : 1.344937 sec + 4,128,391,295 cycles # 3.058 GHz + 8,145,545,731 instructions # 1.97 insn per cycle + 1.360573478 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 590) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.779638e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.557684e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.557684e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.792847e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.582026e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.582026e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.444127 sec - 4,183,480,554 cycles # 2.886 GHz - 8,052,766,531 instructions # 1.92 insn per cycle - 1.461076312 seconds time elapsed +TOTAL : 1.440562 sec + 4,167,353,431 cycles # 2.883 GHz + 8,053,376,857 instructions # 1.93 insn per cycle + 1.446842959 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 238) (512y: 0) (512z: 234) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index ade965e407..0a3064edc4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-08-14_00:56:15 +DATE: 2023-08-15_07:56:29 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.488062e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.781399e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.662961e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.669670e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.534549e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.653764e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.756639 sec - 2,823,355,990 cycles # 2.906 GHz - 4,095,786,321 instructions # 1.45 insn per cycle - 1.213547725 seconds time elapsed +TOTAL : 0.663511 sec + 2,639,528,093 cycles # 2.904 GHz + 3,824,856,160 instructions # 1.45 insn per cycle + 0.966243935 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.182672e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.456408e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.456408e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.183307e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.458687e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.458687e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.718860 sec - 17,654,825,308 cycles # 3.084 GHz - 41,243,077,704 instructions # 2.34 insn per cycle - 5.753198928 seconds time elapsed +TOTAL : 5.714648 sec + 17,642,893,641 cycles # 3.087 GHz + 41,243,812,397 instructions # 2.34 insn per cycle + 5.720997255 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 377) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.059314e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.198695e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.198695e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.077069e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.197446e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.197446e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.457251 sec - 10,664,625,235 cycles # 3.081 GHz - 25,488,362,430 instructions # 2.39 insn per cycle - 3.529081819 seconds time elapsed +TOTAL : 3.426962 sec + 10,562,613,964 cycles # 3.078 GHz + 25,488,240,263 instructions # 2.41 insn per cycle + 3.432996829 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1318) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.033439e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.111676e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.111676e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.041178e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.115231e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.115231e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.469621 sec - 7,418,217,278 cycles # 2.999 GHz - 14,281,559,091 instructions # 1.93 insn per cycle - 2.521314713 seconds time elapsed +TOTAL : 2.464007 sec + 7,360,734,505 cycles # 2.983 GHz + 14,281,547,404 instructions # 1.94 insn per cycle + 2.476037508 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1211) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.115500e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.536533e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.536533e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.160027e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.603041e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.603041e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.419253 sec - 7,161,694,940 cycles # 2.953 GHz - 13,977,695,596 instructions # 1.95 insn per cycle - 2.475879431 seconds time elapsed +TOTAL : 2.381011 sec + 7,144,233,987 cycles # 2.993 GHz + 13,977,358,492 instructions # 1.96 insn per cycle + 2.393452159 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1141) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.895554e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.616381e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.616381e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.929943e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.721420e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.721420e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.571576 sec - 6,541,586,554 cycles # 2.539 GHz - 10,866,503,023 instructions # 1.66 insn per cycle - 2.650530299 seconds time elapsed +TOTAL : 2.545022 sec + 6,534,578,577 cycles # 2.563 GHz + 10,866,400,512 instructions # 1.66 insn per cycle + 2.556667131 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 406) (512y: 0) (512z: 707) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 108731b931..e2daa7fa37 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-08-14_00:56:46 +DATE: 2023-08-15_07:56:59 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.551586e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.332640e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.068497e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.792190e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.296331e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.073214e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.716722 sec - 2,751,428,622 cycles # 2.870 GHz - 4,051,447,017 instructions # 1.47 insn per cycle - 1.265618794 seconds time elapsed +TOTAL : 0.658774 sec + 2,632,532,045 cycles # 2.934 GHz + 3,722,497,876 instructions # 1.41 insn per cycle + 0.954702315 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 118 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.184994e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.459336e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.459336e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.186828e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.463171e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.463171e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.709434 sec - 17,626,141,501 cycles # 3.084 GHz - 41,192,015,197 instructions # 2.34 insn per cycle - 5.814829966 seconds time elapsed +TOTAL : 5.706749 sec + 17,621,138,449 cycles # 3.088 GHz + 41,192,591,924 instructions # 2.34 insn per cycle + 5.713150892 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 364) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.047921e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.144841e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.144841e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.066061e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.186765e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.186765e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.475731 sec - 10,603,973,209 cycles # 3.046 GHz - 25,449,897,311 instructions # 2.40 insn per cycle - 3.611807783 seconds time elapsed +TOTAL : 3.443089 sec + 10,636,292,355 cycles # 3.085 GHz + 25,449,644,894 instructions # 2.39 insn per cycle + 3.458603313 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1305) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.010263e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.121716e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.121716e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.025537e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.156439e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.156439e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.486404 sec - 7,438,713,688 cycles # 2.985 GHz - 14,255,709,884 instructions # 1.92 insn per cycle - 2.535752404 seconds time elapsed +TOTAL : 2.471270 sec + 7,412,247,060 cycles # 2.994 GHz + 14,255,493,791 instructions # 1.92 insn per cycle + 2.482605852 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1191) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.161035e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.577112e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.577112e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.180644e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.589038e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.589038e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.390215 sec - 7,135,030,170 cycles # 2.983 GHz - 13,964,473,727 instructions # 1.96 insn per cycle - 2.658850423 seconds time elapsed +TOTAL : 2.367834 sec + 7,129,277,655 cycles # 3.007 GHz + 13,963,683,721 instructions # 1.96 insn per cycle + 2.383982852 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1121) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.990645e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.931913e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.931913e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.998078e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.969254e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.969254e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.499214 sec - 6,455,204,245 cycles # 2.579 GHz - 10,745,996,612 instructions # 1.66 insn per cycle - 2.617632642 seconds time elapsed +TOTAL : 2.494071 sec + 6,425,672,029 cycles # 2.572 GHz + 10,745,918,092 instructions # 1.67 insn per cycle + 2.500013882 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 386) (512y: 0) (512z: 688) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index cf23f176e0..64d227ea2d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-08-14_00:57:19 +DATE: 2023-08-15_07:57:28 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.989738e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.132481e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.266643e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.008846e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.166760e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.264507e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.611585 sec - 2,259,791,119 cycles # 2.840 GHz - 2,895,170,962 instructions # 1.28 insn per cycle - 1.041203432 seconds time elapsed +TOTAL : 0.526553 sec + 2,210,322,248 cycles # 2.904 GHz + 2,874,286,221 instructions # 1.30 insn per cycle + 0.819598652 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.962039e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.025350e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.025350e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.959920e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.022893e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.022893e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.466966 sec - 16,833,902,638 cycles # 3.085 GHz - 45,502,141,592 instructions # 2.70 insn per cycle - 5.504841562 seconds time elapsed +TOTAL : 5.455757 sec + 16,830,029,467 cycles # 3.082 GHz + 45,503,274,966 instructions # 2.70 insn per cycle + 5.462080500 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 624) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.389245e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.595994e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.595994e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.384153e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.591052e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.591052e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.203084 sec - 9,926,848,549 cycles # 3.096 GHz - 27,884,939,794 instructions # 2.81 insn per cycle - 3.468981266 seconds time elapsed +TOTAL : 3.203803 sec + 9,917,768,655 cycles # 3.090 GHz + 27,884,562,243 instructions # 2.81 insn per cycle + 3.210298938 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.207541e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.723681e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.723681e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.261175e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.764281e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.764281e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.124660 sec - 6,108,969,893 cycles # 2.869 GHz - 12,623,113,243 instructions # 2.07 insn per cycle - 2.196070980 seconds time elapsed +TOTAL : 2.098374 sec + 6,126,947,640 cycles # 2.912 GHz + 12,622,456,584 instructions # 2.06 insn per cycle + 2.110538594 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2670) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.744602e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.341484e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.341484e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.687482e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.294742e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.294742e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.930844 sec - 5,648,797,033 cycles # 2.917 GHz - 12,001,028,658 instructions # 2.12 insn per cycle - 2.148342259 seconds time elapsed +TOTAL : 1.950153 sec + 5,678,878,812 cycles # 2.907 GHz + 12,001,198,241 instructions # 2.11 insn per cycle + 1.962021066 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.770955e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.024022e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.024022e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.771194e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.025396e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.025396e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.898245 sec - 5,826,492,836 cycles # 2.014 GHz - 8,396,322,139 instructions # 1.44 insn per cycle - 3.041639435 seconds time elapsed +TOTAL : 2.887803 sec + 5,812,534,828 cycles # 2.009 GHz + 8,395,770,130 instructions # 1.44 insn per cycle + 2.900336232 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1797) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index dd9abbed9a..3015f84fb6 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-08-14_01:39:27 +DATE: 2023-08-15_08:28:16 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -48,14 +48,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.010404e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.990768e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.990768e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.000928e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.975262e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.975262e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.861336 sec - 3,272,072,719 cycles # 2.943 GHz - 4,721,756,060 instructions # 1.44 insn per cycle - 1.169567949 seconds time elapsed +TOTAL : 0.861220 sec + 3,260,802,553 cycles # 2.935 GHz + 4,618,775,963 instructions # 1.42 insn per cycle + 1.169456478 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -78,14 +78,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.947120e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.009252e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.009252e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.944448e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.006332e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.006332e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.574346 sec - 17,207,373,844 cycles # 3.084 GHz - 45,580,405,233 instructions # 2.65 insn per cycle - 5.582006780 seconds time elapsed +TOTAL : 5.578239 sec + 17,180,177,907 cycles # 3.077 GHz + 45,580,640,750 instructions # 2.65 insn per cycle + 5.586327034 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 624) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -105,14 +105,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.348266e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.549142e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.549142e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.334544e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.533842e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.533842e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.316765 sec - 10,270,030,378 cycles # 3.091 GHz - 28,069,615,294 instructions # 2.73 insn per cycle - 3.330972009 seconds time elapsed +TOTAL : 3.328630 sec + 10,265,865,180 cycles # 3.079 GHz + 28,067,447,778 instructions # 2.73 insn per cycle + 3.335847383 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.194289e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.684828e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.684828e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.196651e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.682146e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.682146e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.204295 sec - 6,472,053,092 cycles # 2.928 GHz - 12,910,516,967 instructions # 1.99 insn per cycle - 2.221485069 seconds time elapsed +TOTAL : 2.201402 sec + 6,462,352,884 cycles # 2.927 GHz + 12,911,212,720 instructions # 2.00 insn per cycle + 2.218781232 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2670) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -159,14 +159,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.667915e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.262685e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.262685e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.619696e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.207150e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.207150e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.035685 sec - 6,023,916,551 cycles # 2.952 GHz - 12,289,358,511 instructions # 2.04 insn per cycle - 2.048760853 seconds time elapsed +TOTAL : 2.049198 sec + 5,990,685,994 cycles # 2.915 GHz + 12,289,610,651 instructions # 2.05 insn per cycle + 2.062389699 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.732387e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.980202e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.980202e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.732022e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.981030e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.981030e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.997759 sec - 6,197,071,661 cycles # 2.063 GHz - 8,641,496,172 instructions # 1.39 insn per cycle - 3.005264550 seconds time elapsed +TOTAL : 2.995536 sec + 6,173,358,415 cycles # 2.057 GHz + 8,641,899,643 instructions # 1.40 insn per cycle + 3.012705154 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1797) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index d09bb4b7a6..76ed0eee8f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-08-14_01:52:31 +DATE: 2023-08-15_08:41:20 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.578164e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154713e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.263268e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.644029e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.145205e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.252895e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.639291 sec - 2,607,428,297 cycles # 2.926 GHz - 3,454,638,950 instructions # 1.32 insn per cycle - 0.948479486 seconds time elapsed +TOTAL : 0.635318 sec + 2,561,422,666 cycles # 2.945 GHz + 3,393,296,507 instructions # 1.32 insn per cycle + 0.929392556 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.940434e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.003412e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.003412e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.979808e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.043822e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.043822e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 5.577302 sec - 17,019,858,330 cycles # 3.051 GHz - 45,522,764,588 instructions # 2.67 insn per cycle - 5.584174086 seconds time elapsed +TOTAL : 5.460485 sec + 17,004,589,822 cycles # 3.113 GHz + 45,522,952,314 instructions # 2.68 insn per cycle + 5.466956367 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 624) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.380380e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.586221e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.586221e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.390158e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.596269e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.596269e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.267725 sec - 10,099,460,028 cycles # 3.086 GHz - 27,887,130,789 instructions # 2.76 insn per cycle - 3.280070805 seconds time elapsed +TOTAL : 3.256443 sec + 10,096,422,053 cycles # 3.096 GHz + 27,887,588,220 instructions # 2.76 insn per cycle + 3.262761314 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.224044e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.717619e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.717619e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.186500e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.673124e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.673124e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.173338 sec - 6,331,207,138 cycles # 2.906 GHz - 12,608,260,458 instructions # 1.99 insn per cycle - 2.185504855 seconds time elapsed +TOTAL : 2.186284 sec + 6,324,981,711 cycles # 2.887 GHz + 12,609,506,928 instructions # 1.99 insn per cycle + 2.202007823 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2670) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.691099e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.281425e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.281425e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.757703e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.364269e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.364269e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.010134 sec - 5,837,236,875 cycles # 2.896 GHz - 11,952,982,534 instructions # 2.05 insn per cycle - 2.016939773 seconds time elapsed +TOTAL : 1.989394 sec + 5,802,568,748 cycles # 2.909 GHz + 11,953,910,414 instructions # 2.06 insn per cycle + 2.013447216 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.778351e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.031972e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.031972e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.789206e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.042238e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.042238e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.943008 sec - 6,022,302,987 cycles # 2.043 GHz - 8,348,308,553 instructions # 1.39 insn per cycle - 2.949737015 seconds time elapsed +TOTAL : 2.931825 sec + 6,005,179,047 cycles # 2.045 GHz + 8,349,149,388 instructions # 1.39 insn per cycle + 2.943653883 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1797) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index 683baac6d5..490b24f7f8 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-08-14_01:49:15 +DATE: 2023-08-15_08:38:04 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.575784e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.157081e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.267562e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.668263e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155660e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.265888e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.582939 sec - 2,403,212,234 cycles # 2.932 GHz - 3,389,061,046 instructions # 1.41 insn per cycle - 0.878155162 seconds time elapsed +TOTAL : 0.576348 sec + 2,379,994,863 cycles # 2.928 GHz + 3,380,609,648 instructions # 1.42 insn per cycle + 0.869854185 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.977778e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.042775e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.042775e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.967394e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.031238e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.031238e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.409184 sec - 16,831,155,605 cycles # 3.110 GHz - 45,502,347,982 instructions # 2.70 insn per cycle - 5.416204621 seconds time elapsed +TOTAL : 5.438543 sec + 16,825,741,082 cycles # 3.092 GHz + 45,503,772,977 instructions # 2.70 insn per cycle + 5.444908289 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 624) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.401488e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.608777e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.608777e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.394004e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.599065e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.599065e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.191795 sec - 9,938,765,898 cycles # 3.111 GHz - 27,883,985,957 instructions # 2.81 insn per cycle - 3.203744530 seconds time elapsed +TOTAL : 3.196288 sec + 9,907,973,976 cycles # 3.097 GHz + 27,884,963,666 instructions # 2.81 insn per cycle + 3.217592118 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.248410e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.749914e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.749914e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.302667e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.812822e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.812822e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.104854 sec - 6,113,991,842 cycles # 2.899 GHz - 12,621,835,537 instructions # 2.06 insn per cycle - 2.117202137 seconds time elapsed +TOTAL : 2.081335 sec + 6,127,956,387 cycles # 2.939 GHz + 12,618,745,029 instructions # 2.06 insn per cycle + 2.096904001 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2670) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.665877e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.251565e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.251565e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.775458e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.382901e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.382901e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.957903 sec - 5,659,858,299 cycles # 2.883 GHz - 12,002,125,179 instructions # 2.12 insn per cycle - 1.969758439 seconds time elapsed +TOTAL : 1.920774 sec + 5,660,233,376 cycles # 2.940 GHz + 12,001,022,164 instructions # 2.12 insn per cycle + 1.933401381 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.784706e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.039225e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.039225e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.842171e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.103040e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.103040e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.881277 sec - 5,830,715,504 cycles # 2.021 GHz - 8,395,845,659 instructions # 1.44 insn per cycle - 2.893915465 seconds time elapsed +TOTAL : 2.834457 sec + 5,802,842,720 cycles # 2.045 GHz + 8,396,122,638 instructions # 1.45 insn per cycle + 2.846418059 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1797) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index ae26c38b3c..0d615af3df 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-08-14_01:46:03 +DATE: 2023-08-15_08:34:53 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -45,14 +45,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.057740e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.153779e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.266087e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.019428e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.157075e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.265817e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.758819 sec - 2,908,268,959 cycles # 2.919 GHz - 4,197,442,056 instructions # 1.44 insn per cycle - 1.053462141 seconds time elapsed +TOTAL : 0.759005 sec + 2,920,260,499 cycles # 2.930 GHz + 4,235,878,653 instructions # 1.45 insn per cycle + 1.056436552 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -71,14 +71,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.971704e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.035374e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.035374e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.955683e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.020031e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.020031e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.424085 sec - 16,823,922,089 cycles # 3.099 GHz - 45,502,594,808 instructions # 2.70 insn per cycle - 5.430799449 seconds time elapsed +TOTAL : 5.468306 sec + 16,841,401,584 cycles # 3.078 GHz + 45,503,912,146 instructions # 2.70 insn per cycle + 5.474662171 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 624) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -97,14 +97,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.409990e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.618523e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.618523e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.390321e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.596833e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.596833e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.182355 sec - 9,910,451,666 cycles # 3.113 GHz - 27,884,889,500 instructions # 2.81 insn per cycle - 3.198309579 seconds time elapsed +TOTAL : 3.199564 sec + 9,905,525,381 cycles # 3.092 GHz + 27,884,768,172 instructions # 2.82 insn per cycle + 3.215066669 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -123,14 +123,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.287202e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.807157e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.807157e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.267750e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.776839e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.776839e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.089678 sec - 6,123,065,906 cycles # 2.923 GHz - 12,622,602,800 instructions # 2.06 insn per cycle - 2.101730753 seconds time elapsed +TOTAL : 2.094602 sec + 6,143,328,303 cycles # 2.927 GHz + 12,622,735,883 instructions # 2.05 insn per cycle + 2.110762418 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2670) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -149,14 +149,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.766335e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.371975e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.371975e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.728979e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.335326e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.335326e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.923614 sec - 5,628,916,840 cycles # 2.919 GHz - 12,000,900,382 instructions # 2.13 insn per cycle - 1.935327170 seconds time elapsed +TOTAL : 1.934965 sec + 5,651,469,044 cycles # 2.914 GHz + 12,001,153,045 instructions # 2.12 insn per cycle + 1.946292203 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 144) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -175,14 +175,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.799964e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.056456e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.056456e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.774699e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.028447e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.028447e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.865602 sec - 5,812,925,399 cycles # 2.025 GHz - 8,396,116,234 instructions # 1.44 insn per cycle - 2.871904491 seconds time elapsed +TOTAL : 2.882840 sec + 5,816,807,873 cycles # 2.015 GHz + 8,396,518,147 instructions # 1.44 insn per cycle + 2.894278819 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1446) (512y: 122) (512z: 1797) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 10d1a35cc8..22a0517515 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-08-14_00:57:50 +DATE: 2023-08-15_07:57:57 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.983233e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.128032e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.262896e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.008544e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.166017e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.262922e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.544627 sec - 2,258,067,954 cycles # 2.876 GHz - 2,917,537,016 instructions # 1.29 insn per cycle - 1.035452053 seconds time elapsed +TOTAL : 0.525464 sec + 2,217,740,109 cycles # 2.909 GHz + 2,859,445,292 instructions # 1.29 insn per cycle + 0.819678835 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.013129e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.079208e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.079208e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.012615e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.079882e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.079882e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.314760 sec - 16,484,376,592 cycles # 3.100 GHz - 44,494,477,715 instructions # 2.70 insn per cycle - 5.800424614 seconds time elapsed +TOTAL : 5.316949 sec + 16,451,778,287 cycles # 3.092 GHz + 44,494,546,010 instructions # 2.70 insn per cycle + 5.323396072 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 576) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.553355e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.780251e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.780251e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.525477e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.747634e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.747634e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.057938 sec - 9,484,695,070 cycles # 3.096 GHz - 26,734,245,692 instructions # 2.82 insn per cycle - 3.134160361 seconds time elapsed +TOTAL : 3.089064 sec + 9,483,642,291 cycles # 3.070 GHz + 26,735,070,417 instructions # 2.82 insn per cycle + 3.100712316 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2339) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.814566e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.237891e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.237891e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.814360e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.235072e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.235072e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.282650 sec - 6,662,154,500 cycles # 2.912 GHz - 14,167,820,241 instructions # 2.13 insn per cycle - 2.388899112 seconds time elapsed +TOTAL : 2.284241 sec + 6,672,382,951 cycles # 2.914 GHz + 14,170,881,099 instructions # 2.12 insn per cycle + 2.296317691 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2753) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.000728e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.449733e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.449733e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.075753e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.538831e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.538831e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.210051 sec - 6,409,451,996 cycles # 2.898 GHz - 13,692,654,091 instructions # 2.14 insn per cycle - 2.301538897 seconds time elapsed +TOTAL : 2.170660 sec + 6,388,748,184 cycles # 2.937 GHz + 13,691,137,225 instructions # 2.14 insn per cycle + 2.183487516 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2405) (512y: 296) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.628758e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.862811e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.862811e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.631677e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.866426e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.866426e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.996428 sec - 6,034,648,193 cycles # 2.010 GHz - 10,190,028,146 instructions # 1.69 insn per cycle - 3.058767535 seconds time elapsed +TOTAL : 2.993412 sec + 6,037,646,643 cycles # 2.013 GHz + 10,189,515,870 instructions # 1.69 insn per cycle + 3.005471354 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1324) (512y: 208) (512z: 1980) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index b3c460076b..3f97b3c235 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-08-14_01:28:51 +DATE: 2023-08-15_08:17:56 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.505336e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.149191e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.264052e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.684024e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.156752e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.265433e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.531684 sec - 2,231,295,184 cycles # 2.895 GHz - 2,887,465,321 instructions # 1.29 insn per cycle - 0.828622923 seconds time elapsed +TOTAL : 0.527445 sec + 2,196,196,584 cycles # 2.883 GHz + 2,837,159,945 instructions # 1.29 insn per cycle + 0.820745401 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.536542e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.643772e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.643772e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.556304e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.666245e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.666245e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.241024 sec - 12,877,410,916 cycles # 3.033 GHz - 34,450,584,045 instructions # 2.68 insn per cycle - 4.247840296 seconds time elapsed +TOTAL : 4.208653 sec + 12,868,804,128 cycles # 3.055 GHz + 34,451,363,346 instructions # 2.68 insn per cycle + 4.214976170 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 680) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.063197e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.233833e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.233833e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.168050e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.345967e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.345967e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.541787 sec - 10,661,093,531 cycles # 3.012 GHz - 24,137,629,429 instructions # 2.26 insn per cycle - 3.560275350 seconds time elapsed +TOTAL : 3.416327 sec + 10,630,599,247 cycles # 3.109 GHz + 24,137,030,173 instructions # 2.27 insn per cycle + 3.432016204 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2582) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.763224e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.180717e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.180717e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.809442e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.228508e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.228508e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.308191 sec - 6,725,300,147 cycles # 2.907 GHz - 12,529,817,503 instructions # 1.86 insn per cycle - 2.327404703 seconds time elapsed +TOTAL : 2.285822 sec + 6,681,741,196 cycles # 2.917 GHz + 12,529,897,185 instructions # 1.88 insn per cycle + 2.301347809 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3156) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.954525e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.407787e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.407787e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.063333e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.530304e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.530304e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.223778 sec - 6,376,073,243 cycles # 2.861 GHz - 11,703,972,830 instructions # 1.84 insn per cycle - 2.240275759 seconds time elapsed +TOTAL : 2.175701 sec + 6,360,455,844 cycles # 2.917 GHz + 11,704,234,219 instructions # 1.84 insn per cycle + 2.187563538 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2692) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.047722e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.338757e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.338757e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.982609e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.263683e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.263683e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.698075 sec - 5,475,737,795 cycles # 2.027 GHz - 9,425,521,330 instructions # 1.72 insn per cycle - 2.704616503 seconds time elapsed +TOTAL : 2.738643 sec + 5,460,660,660 cycles # 1.991 GHz + 9,426,098,964 instructions # 1.73 insn per cycle + 2.754227372 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2116) (512y: 282) (512z: 1958) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index e4c216ffec..b296b295a7 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-08-14_01:29:19 +DATE: 2023-08-15_08:18:24 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.502531e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.148505e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.262555e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.675639e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.156622e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.263843e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.534033 sec - 2,214,325,568 cycles # 2.871 GHz - 2,879,800,449 instructions # 1.30 insn per cycle - 0.830897961 seconds time elapsed +TOTAL : 0.526110 sec + 2,203,683,409 cycles # 2.889 GHz + 2,882,164,152 instructions # 1.31 insn per cycle + 0.820053431 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.680039e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.799073e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.799073e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.681420e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.800838e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.800838e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.019765 sec - 12,427,773,310 cycles # 3.088 GHz - 35,153,431,485 instructions # 2.83 insn per cycle - 4.026331156 seconds time elapsed +TOTAL : 4.015999 sec + 12,417,250,293 cycles # 3.088 GHz + 35,154,287,962 instructions # 2.83 insn per cycle + 4.022456422 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 456) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.125367e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.299261e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.299261e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.100804e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.279717e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.279717e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.460202 sec - 10,717,304,165 cycles # 3.093 GHz - 23,209,215,011 instructions # 2.17 insn per cycle - 3.466387248 seconds time elapsed +TOTAL : 3.491921 sec + 10,723,909,681 cycles # 3.069 GHz + 23,214,463,378 instructions # 2.16 insn per cycle + 3.504911524 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.124648e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.605140e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.605140e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.133216e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.615370e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.615370e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.153227 sec - 6,257,937,893 cycles # 2.900 GHz - 12,084,852,257 instructions # 1.93 insn per cycle - 2.169691369 seconds time elapsed +TOTAL : 2.147355 sec + 6,243,347,073 cycles # 2.902 GHz + 12,085,563,659 instructions # 1.94 insn per cycle + 2.162840079 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2511) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.270691e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.769410e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.769410e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.286872e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.794622e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.794622e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.096884 sec - 6,118,706,124 cycles # 2.911 GHz - 11,258,670,812 instructions # 1.84 insn per cycle - 2.123303174 seconds time elapsed +TOTAL : 2.089602 sec + 6,108,057,445 cycles # 2.919 GHz + 11,259,587,097 instructions # 1.84 insn per cycle + 2.105276019 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2128) (512y: 174) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.181193e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.494715e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.494715e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.171723e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.483529e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.483529e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.615436 sec - 5,307,075,581 cycles # 2.025 GHz - 9,147,114,973 instructions # 1.72 insn per cycle - 2.621991753 seconds time elapsed +TOTAL : 2.619031 sec + 5,330,469,490 cycles # 2.031 GHz + 9,148,199,525 instructions # 1.72 insn per cycle + 2.634939963 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1651) (512y: 208) (512z: 1566) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 3fd3afb3ed..34fa02a67a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-08-14_00:58:21 +DATE: 2023-08-15_07:58:26 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.160934e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.565971e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.931854e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.029364e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.652279e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.917363e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.490770 sec - 2,082,118,909 cycles # 2.881 GHz - 2,645,784,933 instructions # 1.27 insn per cycle - 1.042321469 seconds time elapsed +TOTAL : 0.480244 sec + 2,046,925,223 cycles # 2.893 GHz + 2,612,431,160 instructions # 1.28 insn per cycle + 0.764545118 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.047990e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.106266e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.106266e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.043990e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.101952e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.101952e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.206881 sec - 16,081,241,576 cycles # 3.085 GHz - 45,312,850,560 instructions # 2.82 insn per cycle - 5.236018652 seconds time elapsed +TOTAL : 5.215979 sec + 16,072,416,204 cycles # 3.078 GHz + 45,313,248,503 instructions # 2.82 insn per cycle + 5.222636507 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.864201e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.239775e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.239775e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.863735e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.234843e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.234843e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.256813 sec - 6,928,859,301 cycles # 3.080 GHz - 17,681,584,008 instructions # 2.55 insn per cycle - 2.703442708 seconds time elapsed +TOTAL : 2.242573 sec + 6,925,966,298 cycles # 3.081 GHz + 17,681,196,805 instructions # 2.55 insn per cycle + 2.256043920 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.842357e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.005307e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.005307e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.888993e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.014224e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.014224e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.271685 sec - 3,741,885,683 cycles # 2.928 GHz - 8,261,920,511 instructions # 2.21 insn per cycle - 1.337247695 seconds time elapsed +TOTAL : 1.264930 sec + 3,728,910,193 cycles # 2.939 GHz + 8,261,400,317 instructions # 2.22 insn per cycle + 1.276544799 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3361) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.345251e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.071871e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.071871e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.349259e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.071142e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.071142e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.207692 sec - 3,548,701,242 cycles # 2.925 GHz - 7,873,340,317 instructions # 2.22 insn per cycle - 1.264339125 seconds time elapsed +TOTAL : 1.206356 sec + 3,546,231,866 cycles # 2.927 GHz + 7,872,842,225 instructions # 2.22 insn per cycle + 1.212871301 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3193) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.034321e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.775777e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.775777e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.069995e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.813167e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.813167e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.577061 sec - 3,267,469,302 cycles # 2.064 GHz - 6,096,935,037 instructions # 1.87 insn per cycle - 1.702363059 seconds time elapsed +TOTAL : 1.568624 sec + 3,264,339,653 cycles # 2.075 GHz + 6,096,363,429 instructions # 1.87 insn per cycle + 1.580964642 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2367) (512y: 24) (512z: 2156) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index 7dd407e021..699719261c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-08-14_01:39:57 +DATE: 2023-08-15_08:28:46 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -48,14 +48,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.010540e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.912410e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.912410e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.113852e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.906495e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.906495e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086808e+00 +- 3.414090e-03 ) GeV^0 -TOTAL : 0.693456 sec - 2,717,729,667 cycles # 2.922 GHz - 3,878,239,062 instructions # 1.43 insn per cycle - 0.988023566 seconds time elapsed +TOTAL : 0.687407 sec + 2,703,287,943 cycles # 2.930 GHz + 3,880,423,043 instructions # 1.44 insn per cycle + 0.979473385 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -78,14 +78,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.050497e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.110164e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.110164e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.040547e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.098289e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.098289e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.246377 sec - 16,257,060,264 cycles # 3.095 GHz - 45,360,215,481 instructions # 2.79 insn per cycle - 5.253732780 seconds time elapsed +TOTAL : 5.266229 sec + 16,257,367,161 cycles # 3.084 GHz + 45,360,585,545 instructions # 2.79 insn per cycle + 5.273291355 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -105,14 +105,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.829048e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.195267e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.195267e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.841948e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.207039e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.207039e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.302622 sec - 7,131,297,086 cycles # 3.089 GHz - 17,961,809,083 instructions # 2.52 insn per cycle - 2.315455331 seconds time elapsed +TOTAL : 2.298048 sec + 7,125,726,743 cycles # 3.097 GHz + 17,962,120,405 instructions # 2.52 insn per cycle + 2.311237641 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.714376e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.913117e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.913117e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.492655e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.673572e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.673572e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.333746 sec - 3,939,369,974 cycles # 2.939 GHz - 8,499,085,527 instructions # 2.16 insn per cycle - 1.346799096 seconds time elapsed +TOTAL : 1.367552 sec + 3,923,165,806 cycles # 2.856 GHz + 8,499,549,873 instructions # 2.17 insn per cycle + 1.385449516 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3361) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -159,14 +159,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.239249e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.058206e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.058206e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.178718e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.052721e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.052721e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.266811 sec - 3,749,844,444 cycles # 2.946 GHz - 8,110,409,781 instructions # 2.16 insn per cycle - 1.283571500 seconds time elapsed +TOTAL : 1.270952 sec + 3,751,683,787 cycles # 2.940 GHz + 8,110,729,315 instructions # 2.16 insn per cycle + 1.283890460 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3193) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.991082e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.714470e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.714470e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.936240e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.658700e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.658700e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.631584 sec - 3,472,446,198 cycles # 2.120 GHz - 6,351,366,915 instructions # 1.83 insn per cycle - 1.639067181 seconds time elapsed +TOTAL : 1.643742 sec + 3,464,570,040 cycles # 2.100 GHz + 6,351,528,205 instructions # 1.83 insn per cycle + 1.660374822 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2367) (512y: 24) (512z: 2156) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index ba8743b43c..8ab67a0fea 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-08-14_01:53:01 +DATE: 2023-08-15_08:41:49 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.244084e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.619464e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.931950e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.538632e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.606886e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.900636e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.588199 sec - 2,356,687,666 cycles # 2.877 GHz - 3,128,600,593 instructions # 1.33 insn per cycle - 0.877489308 seconds time elapsed +TOTAL : 0.580696 sec + 2,338,833,755 cycles # 2.895 GHz + 3,122,120,979 instructions # 1.33 insn per cycle + 0.865057805 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.058641e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.117455e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.117455e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.041632e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.100080e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.100080e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 5.237638 sec - 16,242,934,153 cycles # 3.100 GHz - 45,342,934,109 instructions # 2.79 insn per cycle - 5.243915805 seconds time elapsed +TOTAL : 5.280524 sec + 16,237,390,286 cycles # 3.076 GHz + 45,344,009,234 instructions # 2.79 insn per cycle + 5.286363951 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.887670e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.264467e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.264467e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.818343e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.185778e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.185778e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 2.284531 sec - 7,099,082,575 cycles # 3.100 GHz - 17,693,728,547 instructions # 2.49 insn per cycle - 2.291110477 seconds time elapsed +TOTAL : 2.314082 sec + 7,088,214,755 cycles # 3.056 GHz + 17,695,395,796 instructions # 2.50 insn per cycle + 2.326567496 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.858691e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.010326e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.010326e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.843902e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.007868e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.007868e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.323527 sec - 3,906,499,539 cycles # 2.938 GHz - 8,245,762,880 instructions # 2.11 insn per cycle - 1.336058516 seconds time elapsed +TOTAL : 1.324203 sec + 3,892,773,802 cycles # 2.929 GHz + 8,246,686,406 instructions # 2.12 insn per cycle + 1.335693323 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3361) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.327926e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.072423e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.072423e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.319473e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.067358e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.067358e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.265117 sec - 3,731,493,336 cycles # 2.938 GHz - 7,823,716,244 instructions # 2.10 insn per cycle - 1.271337850 seconds time elapsed +TOTAL : 1.265326 sec + 3,710,510,965 cycles # 2.921 GHz + 7,824,604,210 instructions # 2.11 insn per cycle + 1.271514391 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3193) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.101982e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.851633e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.851633e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.065974e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.817314e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.817314e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.617072 sec - 3,435,879,758 cycles # 2.118 GHz - 6,047,344,361 instructions # 1.76 insn per cycle - 1.629238619 seconds time elapsed +TOTAL : 1.622939 sec + 3,424,491,515 cycles # 2.104 GHz + 6,048,054,064 instructions # 1.77 insn per cycle + 1.628727410 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2367) (512y: 24) (512z: 2156) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index d42239a574..b810c04a14 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-08-14_01:49:44 +DATE: 2023-08-15_08:38:33 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.002237e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.618906e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.931498e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.598687e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.638860e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.937584e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.535648 sec - 2,204,801,464 cycles # 2.887 GHz - 3,129,835,481 instructions # 1.42 insn per cycle - 0.821032344 seconds time elapsed +TOTAL : 0.532463 sec + 2,194,687,457 cycles # 2.866 GHz + 3,113,635,867 instructions # 1.42 insn per cycle + 0.825483096 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.066093e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.124968e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.124968e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.046386e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.104740e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.104740e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.161995 sec - 16,074,665,789 cycles # 3.113 GHz - 45,312,747,035 instructions # 2.82 insn per cycle - 5.168223881 seconds time elapsed +TOTAL : 5.222255 sec + 16,104,081,465 cycles # 3.082 GHz + 45,313,673,726 instructions # 2.81 insn per cycle + 5.228069360 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.902389e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.277188e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.277188e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.837866e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.208016e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.208016e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.225204 sec - 6,924,375,600 cycles # 3.104 GHz - 17,680,565,169 instructions # 2.55 insn per cycle - 2.241098124 seconds time elapsed +TOTAL : 2.253970 sec + 6,919,832,712 cycles # 3.063 GHz + 17,681,383,889 instructions # 2.56 insn per cycle + 2.265893838 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.813065e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.003999e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.003999e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.842268e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.006130e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.006130e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.276288 sec - 3,741,495,813 cycles # 2.920 GHz - 8,261,125,998 instructions # 2.21 insn per cycle - 1.291811025 seconds time elapsed +TOTAL : 1.270817 sec + 3,727,622,493 cycles # 2.922 GHz + 8,261,948,722 instructions # 2.22 insn per cycle + 1.283347035 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3361) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.090966e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.038383e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.038383e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.342312e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.070330e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.070330e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.239469 sec - 3,543,410,137 cycles # 2.847 GHz - 7,872,674,692 instructions # 2.22 insn per cycle - 1.255239873 seconds time elapsed +TOTAL : 1.206395 sec + 3,540,586,382 cycles # 2.922 GHz + 7,873,268,970 instructions # 2.22 insn per cycle + 1.218817952 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3193) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.065396e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.806178e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.806178e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.961762e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.678036e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.678036e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.570450 sec - 3,259,955,922 cycles # 2.070 GHz - 6,096,147,222 instructions # 1.87 insn per cycle - 1.584218068 seconds time elapsed +TOTAL : 1.591067 sec + 3,266,654,573 cycles # 2.048 GHz + 6,096,994,037 instructions # 1.87 insn per cycle + 1.597459012 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2367) (512y: 24) (512z: 2156) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index a7efeabd7b..d169068d52 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-08-14_01:46:32 +DATE: 2023-08-15_08:35:22 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -45,14 +45,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.701205e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.626024e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.929227e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.823545e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.634071e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.926324e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086808e+00 +- 3.414090e-03 ) GeV^0 -TOTAL : 0.636129 sec - 2,532,842,583 cycles # 2.933 GHz - 3,583,092,222 instructions # 1.41 insn per cycle - 0.922854096 seconds time elapsed +TOTAL : 0.634580 sec + 2,519,256,028 cycles # 2.927 GHz + 3,597,945,906 instructions # 1.43 insn per cycle + 0.919708259 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -71,14 +71,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.055116e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.114401e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.114401e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.047792e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.106404e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.106404e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.187971 sec - 16,068,115,306 cycles # 3.095 GHz - 45,313,503,736 instructions # 2.82 insn per cycle - 5.194876670 seconds time elapsed +TOTAL : 5.205858 sec + 16,074,209,490 cycles # 3.085 GHz + 45,313,436,920 instructions # 2.82 insn per cycle + 5.212589062 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -97,14 +97,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.883543e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.266859e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.266859e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.875281e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.250038e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.250038e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.233711 sec - 6,931,855,014 cycles # 3.098 GHz - 17,681,249,450 instructions # 2.55 insn per cycle - 2.249909528 seconds time elapsed +TOTAL : 2.236319 sec + 6,919,411,058 cycles # 3.088 GHz + 17,681,439,898 instructions # 2.56 insn per cycle + 2.242251935 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -123,14 +123,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.911856e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.016101e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.016101e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.834711e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.005706e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.005706e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.261932 sec - 3,727,000,978 cycles # 2.943 GHz - 8,261,741,243 instructions # 2.22 insn per cycle - 1.274289497 seconds time elapsed +TOTAL : 1.271538 sec + 3,732,044,660 cycles # 2.923 GHz + 8,262,000,987 instructions # 2.21 insn per cycle + 1.283281912 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3361) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -149,14 +149,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.322510e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.069569e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.069569e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.348762e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.071277e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.071277e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.210153 sec - 3,553,215,469 cycles # 2.924 GHz - 7,873,042,149 instructions # 2.22 insn per cycle - 1.216160042 seconds time elapsed +TOTAL : 1.205855 sec + 3,543,052,493 cycles # 2.926 GHz + 7,873,220,879 instructions # 2.22 insn per cycle + 1.217711284 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3193) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -175,14 +175,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.040854e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.795524e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.795524e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.085582e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.832029e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.832029e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.575401 sec - 3,269,165,561 cycles # 2.069 GHz - 6,096,776,633 instructions # 1.86 insn per cycle - 1.587219712 seconds time elapsed +TOTAL : 1.564744 sec + 3,257,258,720 cycles # 2.075 GHz + 6,096,841,191 instructions # 1.87 insn per cycle + 1.580476949 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2367) (512y: 24) (512z: 2156) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index 9feef9b484..6ff4bf4cd5 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-08-14_00:58:49 +DATE: 2023-08-15_07:58:51 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.143860e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.594415e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.974248e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.036556e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.702022e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.979115e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.492804 sec - 2,075,163,081 cycles # 2.871 GHz - 2,634,662,565 instructions # 1.27 insn per cycle - 1.090909734 seconds time elapsed +TOTAL : 0.482274 sec + 2,051,946,623 cycles # 2.890 GHz + 2,621,601,452 instructions # 1.28 insn per cycle + 0.767984472 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.084437e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.144983e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.144983e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.072485e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.132741e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.132741e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.116661 sec - 15,910,056,887 cycles # 3.108 GHz - 44,489,738,058 instructions # 2.80 insn per cycle - 5.199281284 seconds time elapsed +TOTAL : 5.147272 sec + 15,849,140,606 cycles # 3.077 GHz + 44,489,793,003 instructions # 2.81 insn per cycle + 5.153665912 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 576) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.647815e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.171017e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.171017e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.721713e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.243717e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.243717e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.944753 sec - 5,940,873,467 cycles # 3.045 GHz - 16,979,224,556 instructions # 2.86 insn per cycle - 2.035295555 seconds time elapsed +TOTAL : 1.918320 sec + 5,932,795,544 cycles # 3.085 GHz + 16,978,686,078 instructions # 2.86 insn per cycle + 1.930651299 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2881) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.426224e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.054822e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.054822e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.428912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.059732e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.059732e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.717755 sec - 5,027,531,882 cycles # 2.916 GHz - 10,226,017,896 instructions # 2.03 insn per cycle - 1.806927196 seconds time elapsed +TOTAL : 1.715657 sec + 5,020,215,358 cycles # 2.917 GHz + 10,225,262,266 instructions # 2.04 insn per cycle + 1.727545908 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.502012e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.143044e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.143044e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.480209e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.124270e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.124270e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.698439 sec - 4,965,927,578 cycles # 2.913 GHz - 9,949,108,842 instructions # 2.00 insn per cycle - 1.782058504 seconds time elapsed +TOTAL : 1.703473 sec + 4,964,855,233 cycles # 2.906 GHz + 9,948,546,373 instructions # 2.00 insn per cycle + 1.716183447 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3789) (512y: 2) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.071249e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.442783e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.442783e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.068334e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.440911e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.440911e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.165063 sec - 4,376,914,328 cycles # 2.027 GHz - 8,454,256,022 instructions # 1.93 insn per cycle - 2.201448693 seconds time elapsed +TOTAL : 2.161474 sec + 4,375,796,303 cycles # 2.024 GHz + 8,454,458,995 instructions # 1.93 insn per cycle + 2.173311929 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2882) (512y: 4) (512z: 2751) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index 15fc39d339..568b6f41d6 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-08-14_01:29:47 +DATE: 2023-08-15_08:18:52 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.835982e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.614523e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.933300e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.616621e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.635958e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.931720e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.486316 sec - 2,086,040,390 cycles # 2.904 GHz - 2,659,832,399 instructions # 1.28 insn per cycle - 0.776013350 seconds time elapsed +TOTAL : 0.485656 sec + 2,051,660,572 cycles # 2.884 GHz + 2,621,055,870 instructions # 1.28 insn per cycle + 0.770512814 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.694024e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.795030e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.795030e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.632191e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.730309e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.730309e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 3.979003 sec - 12,374,442,707 cycles # 3.108 GHz - 34,382,741,449 instructions # 2.78 insn per cycle - 3.985316588 seconds time elapsed +TOTAL : 4.071252 sec + 12,383,661,598 cycles # 3.038 GHz + 34,383,527,670 instructions # 2.78 insn per cycle + 4.077255403 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 696) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.648771e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.154768e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.154768e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.524261e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.028231e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.028231e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.943294 sec - 6,026,346,378 cycles # 3.092 GHz - 14,884,776,043 instructions # 2.47 insn per cycle - 1.955652485 seconds time elapsed +TOTAL : 1.984266 sec + 6,019,410,390 cycles # 3.025 GHz + 14,885,531,276 instructions # 2.47 insn per cycle + 1.996173107 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3009) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.401998e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.281350e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.281350e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.506887e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.379568e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.379568e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.504962 sec - 4,282,145,103 cycles # 2.835 GHz - 9,052,871,661 instructions # 2.11 insn per cycle - 1.511748022 seconds time elapsed +TOTAL : 1.483333 sec + 4,312,836,331 cycles # 2.899 GHz + 9,052,928,178 instructions # 2.10 insn per cycle + 1.496086705 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4445) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.822836e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.748555e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.748555e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.730445e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.651954e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.651954e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.426805 sec - 4,217,811,035 cycles # 2.947 GHz - 8,687,767,685 instructions # 2.06 insn per cycle - 1.438794580 seconds time elapsed +TOTAL : 1.441275 sec + 4,220,027,025 cycles # 2.918 GHz + 8,688,292,013 instructions # 2.06 insn per cycle + 1.456976050 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4244) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.834137e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.333036e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.333036e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.856381e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.355382e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.355382e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.883313 sec - 3,861,123,651 cycles # 2.045 GHz - 7,831,049,030 instructions # 2.03 insn per cycle - 1.899756239 seconds time elapsed +TOTAL : 1.875924 sec + 3,851,740,176 cycles # 2.048 GHz + 7,831,569,152 instructions # 2.03 insn per cycle + 1.891782027 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4420) (512y: 0) (512z: 2556) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index 7eca0d6aab..fff20efbc0 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-08-14_01:30:11 +DATE: 2023-08-15_08:19:16 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.869620e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.648133e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.970109e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.645282e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.665963e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.984825e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.487292 sec - 2,053,806,819 cycles # 2.863 GHz - 2,627,373,563 instructions # 1.28 insn per cycle - 0.776306461 seconds time elapsed +TOTAL : 0.482528 sec + 2,065,134,611 cycles # 2.914 GHz + 2,629,043,759 instructions # 1.27 insn per cycle + 0.768011802 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.791893e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.900622e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.900622e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.762246e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.873496e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.873496e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 3.841710 sec - 11,675,516,396 cycles # 3.035 GHz - 35,119,248,304 instructions # 3.01 insn per cycle - 3.848151194 seconds time elapsed +TOTAL : 3.883907 sec + 11,714,309,141 cycles # 3.014 GHz + 35,120,310,634 instructions # 3.00 insn per cycle + 3.889825259 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.746193e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.271964e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.271964e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.556370e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.046775e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.046775e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.911160 sec - 5,914,651,050 cycles # 3.087 GHz - 14,494,397,948 instructions # 2.45 insn per cycle - 1.926971498 seconds time elapsed +TOTAL : 1.972772 sec + 6,124,406,949 cycles # 3.099 GHz + 14,495,501,909 instructions # 2.37 insn per cycle + 1.985116410 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2572) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.891492e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.850916e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.850916e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.918220e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.878370e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.878370e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.414190 sec - 4,154,137,756 cycles # 2.926 GHz - 8,870,804,673 instructions # 2.14 insn per cycle - 1.426203777 seconds time elapsed +TOTAL : 1.408181 sec + 4,130,037,548 cycles # 2.922 GHz + 8,871,524,390 instructions # 2.15 insn per cycle + 1.424438220 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3563) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.818798e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.748862e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.748862e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.886901e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.839065e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.839065e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.426013 sec - 4,186,172,836 cycles # 2.925 GHz - 8,435,650,259 instructions # 2.02 insn per cycle - 1.438079536 seconds time elapsed +TOTAL : 1.414127 sec + 4,130,988,668 cycles # 2.911 GHz + 8,436,330,650 instructions # 2.04 insn per cycle + 1.425576395 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3320) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.927493e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.446510e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.446510e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.909459e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.424475e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.424475e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.854590 sec - 3,808,599,712 cycles # 2.048 GHz - 7,723,969,624 instructions # 2.03 insn per cycle - 1.866333874 seconds time elapsed +TOTAL : 1.862780 sec + 3,795,301,905 cycles # 2.035 GHz + 7,724,865,647 instructions # 2.04 insn per cycle + 1.874275599 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3436) (512y: 0) (512z: 2108) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index d5a1671318..8baebbd2f3 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-08-14_00:59:16 +DATE: 2023-08-15_07:59:17 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.980989e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.124053e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.259854e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.009693e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.164196e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.261064e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.541302 sec - 2,261,658,830 cycles # 2.891 GHz - 2,900,999,936 instructions # 1.28 insn per cycle - 1.249993617 seconds time elapsed +TOTAL : 0.527651 sec + 2,207,019,653 cycles # 2.898 GHz + 2,847,015,082 instructions # 1.29 insn per cycle + 0.821000983 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.956428e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.019301e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.019301e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.948815e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.011489e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.011489e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.466375 sec - 16,961,720,728 cycles # 3.100 GHz - 45,670,201,073 instructions # 2.69 insn per cycle - 5.517481147 seconds time elapsed +TOTAL : 5.486964 sec + 16,967,538,345 cycles # 3.091 GHz + 45,670,632,768 instructions # 2.69 insn per cycle + 5.493623126 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 624) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.414778e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.622596e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.622596e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.425735e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.635769e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.635769e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.177295 sec - 9,852,165,521 cycles # 3.096 GHz - 27,643,385,462 instructions # 2.81 insn per cycle - 3.308033691 seconds time elapsed +TOTAL : 3.167024 sec + 9,830,113,960 cycles # 3.099 GHz + 27,642,674,750 instructions # 2.81 insn per cycle + 3.179154372 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2591) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.287239e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.788029e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.788029e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.356184e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.870584e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.870584e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.089486 sec - 6,099,010,448 cycles # 2.911 GHz - 12,536,557,068 instructions # 2.06 insn per cycle - 2.261296960 seconds time elapsed +TOTAL : 2.063479 sec + 6,070,660,065 cycles # 2.936 GHz + 12,536,006,083 instructions # 2.07 insn per cycle + 2.079986594 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2753) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.808395e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.427637e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.427637e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.722108e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.342034e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.342034e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.911598 sec - 5,608,592,386 cycles # 2.925 GHz - 11,921,515,037 instructions # 2.13 insn per cycle - 2.261130407 seconds time elapsed +TOTAL : 1.938671 sec + 5,596,368,925 cycles # 2.879 GHz + 11,921,657,746 instructions # 2.13 insn per cycle + 1.951359996 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2503) (512y: 146) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.859147e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.127272e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.127272e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.872349e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.139324e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.139324e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.826035 sec - 5,726,121,904 cycles # 2.023 GHz - 8,189,921,866 instructions # 1.43 insn per cycle - 2.914211587 seconds time elapsed +TOTAL : 2.814204 sec + 5,702,887,021 cycles # 2.023 GHz + 8,189,649,910 instructions # 1.44 insn per cycle + 2.825947269 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 126) (512z: 1854) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index 28f2075748..4970867e08 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-08-14_00:59:48 +DATE: 2023-08-15_07:59:45 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.990742e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.133865e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270245e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.010623e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.171983e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.271089e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.575854 sec - 2,283,009,141 cycles # 2.902 GHz - 2,905,157,639 instructions # 1.27 insn per cycle - 1.121628077 seconds time elapsed +TOTAL : 0.526957 sec + 2,195,732,629 cycles # 2.890 GHz + 2,848,939,282 instructions # 1.30 insn per cycle + 0.819885244 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -69,14 +69,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.004045e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.070914e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.070914e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.000475e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.066045e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.066045e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.339027 sec - 16,534,965,811 cycles # 3.094 GHz - 44,662,225,463 instructions # 2.70 insn per cycle - 5.403018235 seconds time elapsed +TOTAL : 5.347714 sec + 16,529,515,673 cycles # 3.088 GHz + 44,662,506,175 instructions # 2.70 insn per cycle + 5.354104244 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 574) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe @@ -95,14 +95,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.531223e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.756350e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.756350e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.580475e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.810771e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.810771e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.077772 sec - 9,395,229,445 cycles # 3.048 GHz - 26,288,307,569 instructions # 2.80 insn per cycle - 3.158311115 seconds time elapsed +TOTAL : 3.034928 sec + 9,395,348,615 cycles # 3.090 GHz + 26,286,630,411 instructions # 2.80 insn per cycle + 3.051011696 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2397) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.707805e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.106081e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.106081e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.626501e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.016802e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.016802e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.333429 sec - 6,815,499,414 cycles # 2.914 GHz - 14,083,572,667 instructions # 2.07 insn per cycle - 2.374712294 seconds time elapsed +TOTAL : 2.374125 sec + 6,801,396,202 cycles # 2.859 GHz + 14,083,469,511 instructions # 2.07 insn per cycle + 2.401099228 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2876) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe @@ -147,14 +147,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.948969e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.387231e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.387231e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.955222e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.399396e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.399396e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.224240 sec - 6,500,763,911 cycles # 2.915 GHz - 13,522,297,824 instructions # 2.08 insn per cycle - 2.283652366 seconds time elapsed +TOTAL : 2.221630 sec + 6,484,638,076 cycles # 2.912 GHz + 13,521,792,259 instructions # 2.09 insn per cycle + 2.228210892 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2516) (512y: 302) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe @@ -173,14 +173,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.866867e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.133830e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.133830e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.847830e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.110433e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.110433e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.825858 sec - 5,688,912,887 cycles # 2.013 GHz - 9,297,759,565 instructions # 1.63 insn per cycle - 2.947206128 seconds time elapsed +TOTAL : 2.830298 sec + 5,669,518,548 cycles # 2.000 GHz + 9,292,775,819 instructions # 1.64 insn per cycle + 2.836480185 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1439) (512y: 212) (512z: 2053) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 9c073cd28f..97bc98c2b8 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-08-14_01:00:19 +DATE: 2023-08-15_08:00:15 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.099250e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.000850e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.016247e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.346608e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.920121e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.004289e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.487488 sec - 2,022,246,842 cycles # 2.928 GHz - 2,560,448,488 instructions # 1.27 insn per cycle - 1.053824985 seconds time elapsed +TOTAL : 0.471560 sec + 1,973,950,685 cycles # 2.856 GHz + 2,560,293,237 instructions # 1.30 insn per cycle + 0.748224369 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.036097e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.313538e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.330505e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.109718e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.317244e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.329134e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.617163 sec - 2,523,728,565 cycles # 2.922 GHz - 3,487,370,863 instructions # 1.38 insn per cycle - 0.923238668 seconds time elapsed +TOTAL : 0.606081 sec + 2,471,054,363 cycles # 2.919 GHz + 3,431,530,401 instructions # 1.39 insn per cycle + 0.906910842 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.644173e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.661009e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.661009e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.643786e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.660673e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.660673e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.253418 sec - 19,261,768,893 cycles # 3.094 GHz - 59,004,550,974 instructions # 3.06 insn per cycle - 6.311957472 seconds time elapsed +TOTAL : 6.221919 sec + 19,243,608,253 cycles # 3.091 GHz + 59,004,769,590 instructions # 3.07 insn per cycle + 6.226949298 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1189) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.862799e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.920368e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.920368e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.901079e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.958523e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.958523e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.394958 sec - 10,453,064,474 cycles # 3.076 GHz - 31,037,189,368 instructions # 2.97 insn per cycle - 3.575829119 seconds time elapsed +TOTAL : 3.366236 sec + 10,452,664,686 cycles # 3.103 GHz + 31,036,674,927 instructions # 2.97 insn per cycle + 3.376930277 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5217) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.658170e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.883748e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.883748e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.651217e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.875611e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.875611e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.785897 sec - 4,991,064,291 cycles # 2.893 GHz - 11,349,773,903 instructions # 2.27 insn per cycle - 2.036627819 seconds time elapsed +TOTAL : 1.720655 sec + 4,978,164,910 cycles # 2.887 GHz + 11,347,041,260 instructions # 2.28 insn per cycle + 1.725704266 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.079594e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.107333e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.107333e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.070490e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.098163e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.098163e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.547654 sec - 4,435,837,502 cycles # 2.871 GHz - 10,526,848,673 instructions # 2.37 insn per cycle - 1.897179646 seconds time elapsed +TOTAL : 1.554956 sec + 4,439,057,094 cycles # 2.849 GHz + 10,527,304,462 instructions # 2.37 insn per cycle + 1.569562073 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4296) (512y: 91) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.723766e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.867968e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.867968e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.744665e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.887734e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.887734e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.147517 sec - 4,130,815,941 cycles # 1.919 GHz - 5,948,266,819 instructions # 1.44 insn per cycle - 2.376745402 seconds time elapsed +TOTAL : 2.141679 sec + 4,129,307,754 cycles # 1.925 GHz + 5,947,543,583 instructions # 1.44 insn per cycle + 2.156123063 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1544) (512y: 95) (512z: 3573) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index cb0d4f348f..86dff22d67 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-08-14_01:40:22 +DATE: 2023-08-15_08:29:11 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -48,14 +48,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.284111e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.491100e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.491100e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.321686e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.599127e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.599127e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.523119 sec - 2,131,163,265 cycles # 2.873 GHz - 2,920,538,228 instructions # 1.37 insn per cycle - 0.799431859 seconds time elapsed +TOTAL : 0.519457 sec + 2,163,157,863 cycles # 2.900 GHz + 2,940,679,116 instructions # 1.36 insn per cycle + 0.803109015 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,14 +72,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.397803e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.556496e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.556496e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.396633e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.578542e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.578542e+06 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.871566 sec - 3,367,847,266 cycles # 2.984 GHz - 4,940,448,044 instructions # 1.47 insn per cycle - 1.188710754 seconds time elapsed +TOTAL : 0.871347 sec + 3,319,474,295 cycles # 2.942 GHz + 4,919,963,320 instructions # 1.48 insn per cycle + 1.187456919 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -95,14 +95,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.648484e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.665629e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.665629e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.621127e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.637548e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.637548e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.224236 sec - 19,288,562,312 cycles # 3.099 GHz - 59,012,561,718 instructions # 3.06 insn per cycle - 6.230442828 seconds time elapsed +TOTAL : 6.280218 sec + 19,360,349,430 cycles # 3.081 GHz + 59,009,876,662 instructions # 3.05 insn per cycle + 6.285804933 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1189) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe @@ -122,14 +122,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.853727e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.911660e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.911660e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.863217e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.920233e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.920233e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.408736 sec - 10,489,490,359 cycles # 3.075 GHz - 31,086,984,430 instructions # 2.96 insn per cycle - 3.414471031 seconds time elapsed +TOTAL : 3.399573 sec + 10,479,860,332 cycles # 3.080 GHz + 31,084,657,008 instructions # 2.97 insn per cycle + 3.404850916 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5217) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe @@ -149,14 +149,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.688094e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.911978e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.911978e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.661744e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.884797e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.884797e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.722293 sec - 5,014,732,717 cycles # 2.904 GHz - 11,400,128,158 instructions # 2.27 insn per cycle - 1.727950388 seconds time elapsed +TOTAL : 1.726595 sec + 5,018,022,522 cycles # 2.899 GHz + 11,400,093,423 instructions # 2.27 insn per cycle + 1.732312900 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4552) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe @@ -176,14 +176,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.086704e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.114720e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.114720e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.087002e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.114719e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.114719e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.537636 sec - 4,467,033,054 cycles # 2.898 GHz - 10,577,257,774 instructions # 2.37 insn per cycle - 1.548472351 seconds time elapsed +TOTAL : 1.537550 sec + 4,464,241,884 cycles # 2.896 GHz + 10,576,995,228 instructions # 2.37 insn per cycle + 1.552181935 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4296) (512y: 91) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe @@ -203,14 +203,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.794399e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.940611e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.940611e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.671174e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.815552e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.815552e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.134460 sec - 4,159,083,379 cycles # 1.945 GHz - 5,985,589,412 instructions # 1.44 insn per cycle - 2.145088241 seconds time elapsed +TOTAL : 2.169001 sec + 4,168,553,925 cycles # 1.921 GHz + 5,987,800,885 instructions # 1.44 insn per cycle + 2.179997665 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1544) (512y: 95) (512z: 3573) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index eddd8fad03..287719fb4c 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-08-14_01:00:51 +DATE: 2023-08-15_08:00:44 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.999377e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.849340e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.996028e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.300130e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.841205e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.963795e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.471420 sec - 1,996,361,801 cycles # 2.887 GHz - 2,542,563,280 instructions # 1.27 insn per cycle - 1.036246972 seconds time elapsed +TOTAL : 0.473190 sec + 1,977,903,116 cycles # 2.866 GHz + 2,527,671,450 instructions # 1.28 insn per cycle + 0.749450924 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.029751e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.304937e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.321509e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.104513e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.309505e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.321059e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.615492 sec - 2,506,598,152 cycles # 2.905 GHz - 3,468,048,659 instructions # 1.38 insn per cycle - 0.921580518 seconds time elapsed +TOTAL : 0.601033 sec + 2,446,341,332 cycles # 2.907 GHz + 3,477,720,369 instructions # 1.42 insn per cycle + 0.900988844 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.614094e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.630706e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.630706e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.621645e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.638256e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.638256e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.292095 sec - 19,335,768,948 cycles # 3.071 GHz - 59,271,813,670 instructions # 3.07 insn per cycle - 6.385555998 seconds time elapsed +TOTAL : 6.273973 sec + 19,401,728,760 cycles # 3.091 GHz + 59,271,474,776 instructions # 3.05 insn per cycle + 6.279292583 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1314) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.978424e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.037757e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.037757e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.897061e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.954733e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.954733e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.313975 sec - 10,324,081,139 cycles # 3.112 GHz - 30,745,344,638 instructions # 2.98 insn per cycle - 3.433047318 seconds time elapsed +TOTAL : 3.369303 sec + 10,321,836,606 cycles # 3.060 GHz + 30,745,027,571 instructions # 2.98 insn per cycle + 3.374742155 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5043) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.421432e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.633258e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.633258e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.225882e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.431722e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.431722e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.762876 sec - 5,136,038,904 cycles # 2.906 GHz - 11,828,989,844 instructions # 2.30 insn per cycle - 2.156646029 seconds time elapsed +TOTAL : 1.801331 sec + 5,141,865,663 cycles # 2.849 GHz + 11,828,366,799 instructions # 2.30 insn per cycle + 1.806756237 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4668) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.024594e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.049290e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.049290e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.020691e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.045131e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.045131e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.651315 sec - 4,720,343,271 cycles # 2.893 GHz - 11,077,581,173 instructions # 2.35 insn per cycle - 1.849545287 seconds time elapsed +TOTAL : 1.628258 sec + 4,713,345,183 cycles # 2.888 GHz + 11,074,061,148 instructions # 2.35 insn per cycle + 1.638619467 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4331) (512y: 245) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.815331e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.957259e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.957259e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.733326e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.875000e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.875000e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.121168 sec - 4,150,521,964 cycles # 1.954 GHz - 6,223,997,367 instructions # 1.50 insn per cycle - 2.169900611 seconds time elapsed +TOTAL : 2.143662 sec + 4,150,381,558 cycles # 1.933 GHz + 6,223,232,573 instructions # 1.50 insn per cycle + 2.158094654 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1458) (512y: 139) (512z: 3673) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index e0c59794b7..2201bc5f3b 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-08-14_01:01:23 +DATE: 2023-08-15_08:01:14 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.275683e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.242014e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.350197e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.406431e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.311489e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.406809e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.483022 sec - 1,940,447,874 cycles # 2.894 GHz - 2,416,090,954 instructions # 1.25 insn per cycle - 0.934619931 seconds time elapsed +TOTAL : 0.453050 sec + 1,920,468,364 cycles # 2.866 GHz + 2,412,861,677 instructions # 1.26 insn per cycle + 0.727980038 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.035988e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.397681e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.495727e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.336680e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.395129e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.463483e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.505975 sec - 2,135,477,409 cycles # 2.868 GHz - 2,760,482,426 instructions # 1.29 insn per cycle - 0.802431213 seconds time elapsed +TOTAL : 0.498499 sec + 2,112,948,915 cycles # 2.903 GHz + 2,744,565,868 instructions # 1.30 insn per cycle + 0.786132716 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.700553e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.714824e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.714824e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.696981e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.711531e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.711531e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.088884 sec - 18,839,883,467 cycles # 3.092 GHz - 59,477,327,112 instructions # 3.16 insn per cycle - 6.114158646 seconds time elapsed +TOTAL : 6.097926 sec + 18,842,737,798 cycles # 3.089 GHz + 59,477,078,656 instructions # 3.16 insn per cycle + 6.103001324 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 961) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.507386e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.656018e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.656018e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.464668e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.611770e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.611770e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.963591 sec - 5,828,258,521 cycles # 2.986 GHz - 16,931,583,071 instructions # 2.91 insn per cycle - 2.351020415 seconds time elapsed +TOTAL : 1.956560 sec + 5,827,138,618 cycles # 2.974 GHz + 16,930,455,676 instructions # 2.91 insn per cycle + 1.966949100 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5857) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.864944e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.934699e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.934699e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.860314e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.928538e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.928538e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.899707 sec - 2,624,583,793 cycles # 2.902 GHz - 6,157,832,437 instructions # 2.35 insn per cycle - 0.957836827 seconds time elapsed +TOTAL : 0.902356 sec + 2,622,038,190 cycles # 2.894 GHz + 6,157,146,340 instructions # 2.35 insn per cycle + 0.907479962 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5019) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.053116e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.136979e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.136979e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.060318e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.145170e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.145170e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.819351 sec - 2,381,825,131 cycles # 2.891 GHz - 5,719,379,332 instructions # 2.40 insn per cycle - 0.902027432 seconds time elapsed +TOTAL : 0.816442 sec + 2,380,166,524 cycles # 2.901 GHz + 5,718,523,485 instructions # 2.40 insn per cycle + 0.821587636 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4804) (512y: 36) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.587550e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.637445e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.637445e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.588424e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.639513e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.639513e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.055673 sec - 2,064,467,819 cycles # 1.947 GHz - 3,381,046,521 instructions # 1.64 insn per cycle - 1.127137147 seconds time elapsed +TOTAL : 1.054803 sec + 2,062,209,519 cycles # 1.949 GHz + 3,380,248,799 instructions # 1.64 insn per cycle + 1.059522460 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2130) (512y: 40) (512z: 3776) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index 3c06f7e180..7f43a19b3d 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-08-14_01:40:52 +DATE: 2023-08-15_08:29:41 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -48,14 +48,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.947150e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.106863e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.106863e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.946079e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.201180e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.201180e+07 ) sec^-1 MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.479811 sec - 2,058,327,884 cycles # 2.891 GHz - 2,660,457,214 instructions # 1.29 insn per cycle - 0.769844082 seconds time elapsed +TOTAL : 0.481798 sec + 2,015,187,560 cycles # 2.853 GHz + 2,654,607,280 instructions # 1.32 insn per cycle + 0.765431808 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,14 +72,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.704197e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.677917e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.677917e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.740355e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.667132e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.667132e+07 ) sec^-1 MeanMatrixElemValue = ( 6.737489e+02 +- 4.776370e+02 ) GeV^-2 -TOTAL : 0.659835 sec - 2,658,971,725 cycles # 2.925 GHz - 3,739,744,389 instructions # 1.41 insn per cycle - 0.966449510 seconds time elapsed +TOTAL : 0.659208 sec + 2,647,206,982 cycles # 2.918 GHz + 3,707,484,646 instructions # 1.40 insn per cycle + 0.964897832 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -95,14 +95,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.691338e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.705885e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.705885e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.694248e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.708560e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.708560e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.113376 sec - 18,846,363,204 cycles # 3.081 GHz - 59,481,983,083 instructions # 3.16 insn per cycle - 6.118400925 seconds time elapsed +TOTAL : 6.106197 sec + 18,844,738,268 cycles # 3.084 GHz + 59,481,932,456 instructions # 3.16 insn per cycle + 6.116008012 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 961) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe @@ -122,14 +122,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.562477e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.713069e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.713069e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.506852e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.654162e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.654162e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.938806 sec - 5,847,030,697 cycles # 3.010 GHz - 16,978,868,455 instructions # 2.90 insn per cycle - 1.943739002 seconds time elapsed +TOTAL : 1.951328 sec + 5,846,261,275 cycles # 2.992 GHz + 16,978,874,386 instructions # 2.90 insn per cycle + 1.956229643 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5857) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe @@ -149,14 +149,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.861780e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.932176e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.932176e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.851729e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.920803e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.920803e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.905080 sec - 2,640,071,027 cycles # 2.905 GHz - 6,194,204,139 instructions # 2.35 insn per cycle - 0.909836257 seconds time elapsed +TOTAL : 0.909940 sec + 2,643,321,044 cycles # 2.892 GHz + 6,194,221,153 instructions # 2.34 insn per cycle + 0.915140180 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5019) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe @@ -176,14 +176,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.065975e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.151341e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.151341e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.038921e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.122854e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.122854e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.818668 sec - 2,397,879,941 cycles # 2.917 GHz - 5,755,814,569 instructions # 2.40 insn per cycle - 0.823962776 seconds time elapsed +TOTAL : 0.831632 sec + 2,400,502,783 cycles # 2.878 GHz + 5,756,071,420 instructions # 2.40 insn per cycle + 0.836907229 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4804) (512y: 36) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe @@ -203,14 +203,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.608408e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.659975e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.659975e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.517398e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.565711e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.565711e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.046020 sec - 2,082,283,991 cycles # 1.985 GHz - 3,421,985,952 instructions # 1.64 insn per cycle - 1.051201537 seconds time elapsed +TOTAL : 1.108961 sec + 2,086,070,681 cycles # 1.875 GHz + 3,422,372,371 instructions # 1.64 insn per cycle + 1.113925906 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2130) (512y: 40) (512z: 3776) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index abaa4409ee..1df556d75b 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-08-14_01:01:50 +DATE: 2023-08-15_08:01:39 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.257100e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.203055e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.308741e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.422212e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.288151e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.386780e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.452248 sec - 1,930,296,393 cycles # 2.879 GHz - 2,411,632,497 instructions # 1.25 insn per cycle - 1.198627957 seconds time elapsed +TOTAL : 0.448849 sec + 1,941,942,406 cycles # 2.915 GHz + 2,406,681,347 instructions # 1.24 insn per cycle + 0.723909819 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.037866e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.407682e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.504884e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.352571e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.421119e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.492595e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.504167 sec - 2,169,596,462 cycles # 2.921 GHz - 2,775,792,895 instructions # 1.28 insn per cycle - 0.802374857 seconds time elapsed +TOTAL : 0.502246 sec + 2,107,695,366 cycles # 2.885 GHz + 2,759,759,873 instructions # 1.31 insn per cycle + 0.790343780 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.711183e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.725806e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.725806e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.716507e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.731257e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.731257e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.064626 sec - 18,795,336,069 cycles # 3.098 GHz - 59,224,515,195 instructions # 3.15 insn per cycle - 6.110990355 seconds time elapsed +TOTAL : 6.052780 sec + 18,786,762,925 cycles # 3.104 GHz + 59,224,528,935 instructions # 3.15 insn per cycle + 6.057917316 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1027) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.170351e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.342154e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.342154e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.131862e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.305099e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.305099e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.816391 sec - 5,584,397,323 cycles # 3.082 GHz - 16,725,642,181 instructions # 3.00 insn per cycle - 2.240547427 seconds time elapsed +TOTAL : 1.815057 sec + 5,582,881,371 cycles # 3.069 GHz + 16,724,850,076 instructions # 3.00 insn per cycle + 1.826820128 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5623) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.625152e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.677968e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.677968e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.627559e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.680257e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.680257e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.029099 sec - 3,001,452,731 cycles # 2.903 GHz - 6,825,194,241 instructions # 2.27 insn per cycle - 1.143200913 seconds time elapsed +TOTAL : 1.027563 sec + 2,998,369,464 cycles # 2.909 GHz + 6,824,345,384 instructions # 2.28 insn per cycle + 1.032324331 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5670) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.755913e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.817308e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.817308e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.739116e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.799649e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.799649e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.980002 sec - 2,776,588,686 cycles # 2.897 GHz - 6,372,381,786 instructions # 2.30 insn per cycle - 1.055443529 seconds time elapsed +TOTAL : 0.963274 sec + 2,776,231,077 cycles # 2.872 GHz + 6,371,695,047 instructions # 2.30 insn per cycle + 0.974745543 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5429) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.456751e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.498873e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.498873e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.448866e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.491409e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.491409e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.147780 sec - 2,244,664,687 cycles # 1.948 GHz - 3,753,987,747 instructions # 1.67 insn per cycle - 1.394185157 seconds time elapsed +TOTAL : 1.154218 sec + 2,240,419,741 cycles # 1.935 GHz + 3,753,173,024 instructions # 1.68 insn per cycle + 1.165467448 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2375) (512y: 30) (512z: 4073) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 0040b376c4..bb3e881615 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-08-14_01:02:17 +DATE: 2023-08-15_08:02:04 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.962157e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.817424e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.969407e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.290398e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.845540e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.964726e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.473666 sec - 1,986,204,707 cycles # 2.864 GHz - 2,563,184,558 instructions # 1.29 insn per cycle - 0.999089400 seconds time elapsed +TOTAL : 0.470961 sec + 1,982,748,816 cycles # 2.880 GHz + 2,541,468,026 instructions # 1.28 insn per cycle + 0.747537205 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.031536e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.306905e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.324080e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.107272e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.315172e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.326836e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.618988 sec - 2,501,332,558 cycles # 2.890 GHz - 3,521,956,860 instructions # 1.41 insn per cycle - 0.925249141 seconds time elapsed +TOTAL : 0.609050 sec + 2,450,461,311 cycles # 2.891 GHz + 3,472,533,054 instructions # 1.42 insn per cycle + 0.909431094 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.599055e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.615206e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.615206e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.600949e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.617125e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.617125e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.328009 sec - 19,590,687,969 cycles # 3.094 GHz - 60,091,889,027 instructions # 3.07 insn per cycle - 6.453071261 seconds time elapsed +TOTAL : 6.332094 sec + 19,626,072,012 cycles # 3.102 GHz + 60,092,931,892 instructions # 3.06 insn per cycle + 6.337235404 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1224) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.726696e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.779821e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.779821e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.803362e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.859060e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.859060e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.500566 sec - 10,287,367,151 cycles # 2.945 GHz - 30,781,662,236 instructions # 2.99 insn per cycle - 3.558144813 seconds time elapsed +TOTAL : 3.434987 sec + 10,282,391,139 cycles # 2.990 GHz + 30,780,626,926 instructions # 2.99 insn per cycle + 3.440411022 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5353) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.808934e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.004105e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.004105e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.662251e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.887127e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.887127e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.694070 sec - 4,956,221,693 cycles # 2.919 GHz - 11,307,596,970 instructions # 2.28 insn per cycle - 1.764109065 seconds time elapsed +TOTAL : 1.720090 sec + 4,950,116,741 cycles # 2.872 GHz + 11,306,969,988 instructions # 2.28 insn per cycle + 1.725207077 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4684) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.109336e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.138632e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.138632e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.102470e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.131191e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.131191e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.503970 sec - 4,374,623,154 cycles # 2.907 GHz - 10,479,136,559 instructions # 2.40 insn per cycle - 1.825430329 seconds time elapsed +TOTAL : 1.511102 sec + 4,375,018,658 cycles # 2.889 GHz + 10,478,373,615 instructions # 2.40 insn per cycle + 1.525805694 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4420) (512y: 83) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.342939e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.472646e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.472646e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.504396e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.639957e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.639957e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.298039 sec - 4,246,108,718 cycles # 1.877 GHz - 6,157,862,693 instructions # 1.45 insn per cycle - 2.594404713 seconds time elapsed +TOTAL : 2.209455 sec + 4,242,345,884 cycles # 1.917 GHz + 6,156,808,300 instructions # 1.45 insn per cycle + 2.214700190 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2060) (512y: 117) (512z: 3648) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index 4fcfa50357..d7a6c1c5a3 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-08-14_01:02:50 +DATE: 2023-08-15_08:02:33 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.930845e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.739142e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.883114e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.229649e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.745477e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.863213e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.470887 sec - 1,999,227,745 cycles # 2.894 GHz - 2,545,778,262 instructions # 1.27 insn per cycle - 0.796916587 seconds time elapsed +TOTAL : 0.471873 sec + 1,969,411,167 cycles # 2.855 GHz + 2,527,637,472 instructions # 1.28 insn per cycle + 0.747596794 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.023669e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.294926e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.311092e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.094888e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.298904e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.310235e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.614088 sec - 2,518,069,579 cycles # 2.931 GHz - 3,441,254,464 instructions # 1.37 insn per cycle - 0.919942128 seconds time elapsed +TOTAL : 0.602742 sec + 2,460,497,015 cycles # 2.922 GHz + 3,449,636,442 instructions # 1.40 insn per cycle + 0.902413801 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.584079e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.599811e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.599811e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.587494e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.603624e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.603624e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.365028 sec - 19,635,233,406 cycles # 3.083 GHz - 60,300,920,642 instructions # 3.07 insn per cycle - 6.416497109 seconds time elapsed +TOTAL : 6.355894 sec + 19,662,899,968 cycles # 3.093 GHz + 60,301,262,777 instructions # 3.07 insn per cycle + 6.360918487 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1271) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.778508e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.832485e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.832485e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.814453e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.870401e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.870401e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.451621 sec - 10,153,680,901 cycles # 2.938 GHz - 30,489,429,144 instructions # 3.00 insn per cycle - 3.524789920 seconds time elapsed +TOTAL : 3.426935 sec + 10,157,287,624 cycles # 2.960 GHz + 30,490,441,918 instructions # 3.00 insn per cycle + 3.432394941 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5151) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.488487e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.702032e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.702032e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.381746e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.595958e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.595958e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.765610 sec - 5,120,052,959 cycles # 2.920 GHz - 11,824,296,455 instructions # 2.31 insn per cycle - 1.921244164 seconds time elapsed +TOTAL : 1.770488 sec + 5,122,017,261 cycles # 2.888 GHz + 11,823,506,295 instructions # 2.31 insn per cycle + 1.776023203 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4797) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.034946e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.060684e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.060684e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.032538e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.058039e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.058039e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.679527 sec - 4,679,110,878 cycles # 2.904 GHz - 11,036,779,885 instructions # 2.36 insn per cycle - 2.203467005 seconds time elapsed +TOTAL : 1.610226 sec + 4,674,616,829 cycles # 2.896 GHz + 11,035,878,763 instructions # 2.36 insn per cycle + 1.615179746 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4427) (512y: 236) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.529198e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.664407e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.664407e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.513081e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.648869e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.648869e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.224533 sec - 4,266,734,708 cycles # 1.935 GHz - 6,396,762,185 instructions # 1.50 insn per cycle - 2.685626616 seconds time elapsed +TOTAL : 2.205682 sec + 4,258,221,941 cycles # 1.927 GHz + 6,395,609,321 instructions # 1.50 insn per cycle + 2.220421883 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1957) (512y: 163) (512z: 3727) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 086e60392a..ee99dff53c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-08-14_01:03:23 +DATE: 2023-08-15_08:03:03 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.431322e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.487063e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.490197e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.453892e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.496986e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.499134e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.542971 sec - 2,245,039,009 cycles # 2.903 GHz - 3,197,789,906 instructions # 1.42 insn per cycle - 1.050641791 seconds time elapsed +TOTAL : 0.532264 sec + 2,202,591,124 cycles # 2.880 GHz + 3,145,234,675 instructions # 1.43 insn per cycle + 0.827585575 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.125711e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.167518e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.169230e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.128592e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.157553e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.158726e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.053858 sec - 10,132,762,041 cycles # 3.052 GHz - 22,945,757,801 instructions # 2.26 insn per cycle - 3.376841030 seconds time elapsed +TOTAL : 3.048277 sec + 10,045,436,491 cycles # 3.027 GHz + 21,587,671,766 instructions # 2.15 insn per cycle + 3.375324480 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.006303e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.007503e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.007503e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.975608e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.976863e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.976863e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.187452 sec - 25,300,257,541 cycles # 3.089 GHz - 78,719,314,808 instructions # 3.11 insn per cycle - 8.296737804 seconds time elapsed +TOTAL : 8.320519 sec + 25,296,824,937 cycles # 3.041 GHz + 78,718,706,014 instructions # 3.11 insn per cycle + 8.325864427 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4809) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.654814e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.658954e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.658954e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.681821e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.686349e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.686349e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.530989 sec - 12,973,917,464 cycles # 2.882 GHz - 39,329,674,127 instructions # 3.03 insn per cycle - 4.627684528 seconds time elapsed +TOTAL : 4.465998 sec + 12,962,442,263 cycles # 2.901 GHz + 39,326,857,637 instructions # 3.03 insn per cycle + 4.471243281 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.611897e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.633281e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.633281e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.545695e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.567841e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.567841e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.978404 sec - 5,589,925,942 cycles # 2.914 GHz - 13,927,085,384 instructions # 2.49 insn per cycle - 2.475854442 seconds time elapsed +TOTAL : 1.929314 sec + 5,594,587,140 cycles # 2.894 GHz + 13,926,374,724 instructions # 2.49 insn per cycle + 1.934668446 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11359) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.600500e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.627806e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.627806e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.267998e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.295121e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.295121e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.774071 sec - 4,991,535,401 cycles # 2.896 GHz - 12,569,102,761 instructions # 2.52 insn per cycle - 2.113684373 seconds time elapsed +TOTAL : 1.779909 sec + 5,160,580,380 cycles # 2.893 GHz + 12,568,523,308 instructions # 2.44 insn per cycle + 1.785283591 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10999) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.712499e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.731678e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.731678e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.597716e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.615205e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.615205e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.137869 sec - 4,132,366,170 cycles # 1.930 GHz - 6,454,990,849 instructions # 1.56 insn per cycle - 2.238560133 seconds time elapsed +TOTAL : 2.168878 sec + 4,135,215,705 cycles # 1.903 GHz + 6,452,419,322 instructions # 1.56 insn per cycle + 2.174332343 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1809) (512y: 102) (512z:10109) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index ffef5c18db..0b2de777dc 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-08-14_01:41:52 +DATE: 2023-08-15_08:30:42 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -48,14 +48,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.986704e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.367572e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.367572e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.014063e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.384512e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.384512e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.538452 sec - 2,231,307,127 cycles # 2.882 GHz - 3,187,161,055 instructions # 1.43 insn per cycle - 0.833507885 seconds time elapsed +TOTAL : 0.537620 sec + 2,245,393,273 cycles # 2.907 GHz + 3,255,628,352 instructions # 1.45 insn per cycle + 0.832611719 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,14 +72,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.556719e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.112734e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.112734e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.548113e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.099320e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.099320e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.351583 sec - 11,071,340,164 cycles # 3.049 GHz - 24,218,517,995 instructions # 2.19 insn per cycle - 3.686932501 seconds time elapsed +TOTAL : 3.358261 sec + 11,062,414,783 cycles # 3.042 GHz + 23,490,877,966 instructions # 2.12 insn per cycle + 3.693864709 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -95,14 +95,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.004051e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.005313e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.005313e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.017662e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.019011e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.019011e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.197191 sec - 25,319,667,347 cycles # 3.087 GHz - 78,721,851,078 instructions # 3.11 insn per cycle - 8.202714260 seconds time elapsed +TOTAL : 8.143634 sec + 25,319,073,361 cycles # 3.109 GHz + 78,725,760,258 instructions # 3.11 insn per cycle + 8.148951373 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4809) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -122,14 +122,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.681744e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.685945e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.685945e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.653505e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.658020e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.658020e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.470738 sec - 12,988,345,966 cycles # 2.903 GHz - 39,342,184,052 instructions # 3.03 insn per cycle - 4.476150690 seconds time elapsed +TOTAL : 4.508369 sec + 13,019,069,255 cycles # 2.886 GHz + 39,344,398,700 instructions # 3.02 insn per cycle + 4.513744525 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -149,14 +149,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.582091e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.604092e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.604092e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.544470e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.566776e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.566776e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.924537 sec - 5,601,022,426 cycles # 2.904 GHz - 13,935,138,004 instructions # 2.49 insn per cycle - 1.930030295 seconds time elapsed +TOTAL : 1.935380 sec + 5,614,957,088 cycles # 2.895 GHz + 13,940,977,670 instructions # 2.48 insn per cycle + 1.940728454 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11359) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -176,14 +176,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.665994e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.695427e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.695427e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.572880e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.603073e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.603073e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.710344 sec - 5,001,805,426 cycles # 2.917 GHz - 12,577,102,796 instructions # 2.51 insn per cycle - 1.715981463 seconds time elapsed +TOTAL : 1.727110 sec + 5,000,794,718 cycles # 2.888 GHz + 12,577,122,736 instructions # 2.52 insn per cycle + 1.732450309 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10999) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -203,14 +203,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.502380e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.520645e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.520645e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.594797e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.613505e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.613505e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.202272 sec - 4,154,860,416 cycles # 1.883 GHz - 6,467,799,206 instructions # 1.56 insn per cycle - 2.207882271 seconds time elapsed +TOTAL : 2.175375 sec + 4,166,851,801 cycles # 1.912 GHz + 6,467,787,989 instructions # 1.55 insn per cycle + 2.180731533 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1809) (512y: 102) (512z:10109) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index 57e2272706..77780d410d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-08-14_01:53:26 +DATE: 2023-08-15_08:42:14 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.415832e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.459571e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.461969e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.429732e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.467711e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.469814e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.532960 sec - 2,212,726,044 cycles # 2.890 GHz - 3,151,265,157 instructions # 1.42 insn per cycle - 0.828018633 seconds time elapsed +TOTAL : 0.528918 sec + 2,209,872,003 cycles # 2.894 GHz + 3,176,003,884 instructions # 1.44 insn per cycle + 0.824220410 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.158371e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.191848e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.193184e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.160148e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.193585e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.194939e+05 ) sec^-1 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.141223 sec - 10,423,864,532 cycles # 3.049 GHz - 21,980,500,221 instructions # 2.11 insn per cycle - 3.474630916 seconds time elapsed +TOTAL : 3.139841 sec + 10,329,103,375 cycles # 3.040 GHz + 22,212,942,478 instructions # 2.15 insn per cycle + 3.455656111 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.003289e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.004494e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.004494e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.979419e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.980640e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.980640e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.204066 sec - 25,332,100,981 cycles # 3.088 GHz - 78,717,361,741 instructions # 3.11 insn per cycle - 8.209264001 seconds time elapsed +TOTAL : 8.297424 sec + 25,302,053,764 cycles # 3.050 GHz + 78,718,480,647 instructions # 3.11 insn per cycle + 8.302445273 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4809) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.685476e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.689916e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.689916e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.636065e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.640264e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.640264e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.462358 sec - 12,968,184,176 cycles # 2.904 GHz - 39,326,521,557 instructions # 3.03 insn per cycle - 4.467505686 seconds time elapsed +TOTAL : 4.522525 sec + 12,975,254,269 cycles # 2.867 GHz + 39,327,280,013 instructions # 3.03 insn per cycle + 4.527497502 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.580571e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.603788e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.603788e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.483373e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.505325e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.505325e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.923444 sec - 5,599,083,123 cycles # 2.905 GHz - 13,925,478,466 instructions # 2.49 insn per cycle - 1.928599294 seconds time elapsed +TOTAL : 1.945229 sec + 5,600,197,846 cycles # 2.876 GHz + 13,925,895,358 instructions # 2.49 insn per cycle + 1.950206271 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11359) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.624579e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.654326e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.654326e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.569990e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.598824e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.598824e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.716071 sec - 4,994,544,982 cycles # 2.904 GHz - 12,566,354,806 instructions # 2.52 insn per cycle - 1.721250297 seconds time elapsed +TOTAL : 1.725342 sec + 4,994,452,767 cycles # 2.893 GHz + 12,566,891,411 instructions # 2.52 insn per cycle + 1.730492317 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10999) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.590078e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.608331e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.608331e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.586191e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.604384e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.604384e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.173088 sec - 4,162,709,067 cycles # 1.912 GHz - 6,450,224,623 instructions # 1.55 insn per cycle - 2.178211354 seconds time elapsed +TOTAL : 2.174746 sec + 4,138,656,564 cycles # 1.901 GHz + 6,452,749,279 instructions # 1.56 insn per cycle + 2.179472637 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1809) (512y: 102) (512z:10109) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index cc293e1420..686f1e49a1 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-08-14_01:50:09 +DATE: 2023-08-15_08:38:58 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.385916e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.436245e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.438633e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.408896e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.453347e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.455502e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.528050 sec - 2,221,204,697 cycles # 2.913 GHz - 3,126,159,935 instructions # 1.41 insn per cycle - 0.823128192 seconds time elapsed +TOTAL : 0.527305 sec + 2,220,070,983 cycles # 2.916 GHz + 3,151,259,549 instructions # 1.42 insn per cycle + 0.821841341 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.159983e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.193512e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.194846e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.154789e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.188369e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.189721e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.087016 sec - 10,210,935,859 cycles # 3.051 GHz - 22,434,765,062 instructions # 2.20 insn per cycle - 3.404928758 seconds time elapsed +TOTAL : 3.094429 sec + 10,128,470,165 cycles # 3.020 GHz + 23,253,458,467 instructions # 2.30 insn per cycle + 3.411550143 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.009056e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.010327e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.010327e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.008156e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.009429e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.009429e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.174460 sec - 25,307,284,896 cycles # 3.095 GHz - 78,718,554,464 instructions # 3.11 insn per cycle - 8.179409864 seconds time elapsed +TOTAL : 8.177574 sec + 25,302,901,901 cycles # 3.093 GHz + 78,716,976,187 instructions # 3.11 insn per cycle + 8.182798610 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4809) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.684281e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.688589e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.688589e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.587950e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.592033e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.592033e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.464077 sec - 12,969,767,723 cycles # 2.904 GHz - 39,328,955,189 instructions # 3.03 insn per cycle - 4.469042059 seconds time elapsed +TOTAL : 4.582675 sec + 13,508,007,326 cycles # 2.947 GHz + 39,327,551,179 instructions # 2.91 insn per cycle + 4.587694775 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.450197e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.472764e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.472764e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.486007e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.508549e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.508549e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.952168 sec - 5,584,534,361 cycles # 2.857 GHz - 13,928,298,356 instructions # 2.49 insn per cycle - 1.957225657 seconds time elapsed +TOTAL : 1.943227 sec + 5,595,681,232 cycles # 2.874 GHz + 13,926,558,643 instructions # 2.49 insn per cycle + 1.948271986 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11359) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.587169e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.617441e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.617441e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.596256e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.625091e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.625091e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.721614 sec - 4,995,796,858 cycles # 2.895 GHz - 12,570,194,511 instructions # 2.52 insn per cycle - 1.726607468 seconds time elapsed +TOTAL : 1.719247 sec + 4,990,025,907 cycles # 2.896 GHz + 12,568,338,549 instructions # 2.52 insn per cycle + 1.724314858 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10999) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.652793e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.670180e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.670180e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.657567e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.675488e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.675488e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.157120 sec - 4,129,723,962 cycles # 1.913 GHz - 6,452,608,941 instructions # 1.56 insn per cycle - 2.162282533 seconds time elapsed +TOTAL : 2.151986 sec + 4,134,393,503 cycles # 1.918 GHz + 6,452,537,166 instructions # 1.56 insn per cycle + 2.157197097 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1809) (512y: 102) (512z:10109) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index ee91f4202a..992b9aa27f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-08-14_01:46:57 +DATE: 2023-08-15_08:35:47 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -45,14 +45,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.084030e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.440818e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.443166e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.072022e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.440583e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.442697e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.532431 sec - 2,232,700,383 cycles # 2.911 GHz - 3,204,959,213 instructions # 1.44 insn per cycle - 0.827266803 seconds time elapsed +TOTAL : 0.533318 sec + 2,211,373,511 cycles # 2.880 GHz + 3,228,795,304 instructions # 1.46 insn per cycle + 0.828284741 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -63,14 +63,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.660360e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.179823e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.181193e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.665049e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.181764e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.183102e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.271265 sec - 10,620,790,619 cycles # 3.011 GHz - 22,861,636,990 instructions # 2.15 insn per cycle - 3.589608095 seconds time elapsed +TOTAL : 3.247163 sec + 10,670,290,958 cycles # 3.043 GHz + 21,862,228,593 instructions # 2.05 insn per cycle + 3.563318288 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -85,14 +85,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.012284e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.013529e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.013529e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.004157e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.005423e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.005423e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.163560 sec - 25,325,959,470 cycles # 3.102 GHz - 78,719,551,951 instructions # 3.11 insn per cycle - 8.169802088 seconds time elapsed +TOTAL : 8.193994 sec + 25,318,715,213 cycles # 3.090 GHz + 78,717,630,381 instructions # 3.11 insn per cycle + 8.199107320 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4809) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -111,14 +111,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.690912e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.695296e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.695296e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.685853e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.690241e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.690241e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.454330 sec - 12,935,389,975 cycles # 2.902 GHz - 39,328,728,315 instructions # 3.04 insn per cycle - 4.459291695 seconds time elapsed +TOTAL : 4.460276 sec + 12,967,494,098 cycles # 2.906 GHz + 39,327,269,410 instructions # 3.03 insn per cycle + 4.465144824 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -137,14 +137,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.533795e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.556331e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.556331e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.566997e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.590444e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.590444e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.932326 sec - 5,600,687,687 cycles # 2.892 GHz - 13,926,263,948 instructions # 2.49 insn per cycle - 1.937665950 seconds time elapsed +TOTAL : 1.927198 sec + 5,598,489,142 cycles # 2.901 GHz + 13,926,756,722 instructions # 2.49 insn per cycle + 1.932331133 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11359) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -163,14 +163,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.607016e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.636322e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.636322e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.590912e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.619419e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.619419e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.717571 sec - 4,997,653,299 cycles # 2.904 GHz - 12,568,419,556 instructions # 2.51 insn per cycle - 1.722777430 seconds time elapsed +TOTAL : 1.720802 sec + 4,990,138,053 cycles # 2.894 GHz + 12,570,402,284 instructions # 2.52 insn per cycle + 1.725569173 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10999) (512y: 88) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -189,14 +189,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.413536e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.431117e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.431117e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.667683e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.685389e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.685389e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.223052 sec - 4,137,293,227 cycles # 1.858 GHz - 6,452,545,570 instructions # 1.56 insn per cycle - 2.228297559 seconds time elapsed +TOTAL : 2.150008 sec + 4,128,778,267 cycles # 1.917 GHz + 6,452,453,162 instructions # 1.56 insn per cycle + 2.155042794 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1809) (512y: 102) (512z:10109) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index 90b381aac9..237ca99358 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-08-14_01:04:02 +DATE: 2023-08-15_08:03:40 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.405581e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.456408e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.458875e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.411666e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.457808e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.459974e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.611939 sec - 2,235,978,586 cycles # 2.912 GHz - 3,218,167,747 instructions # 1.44 insn per cycle - 1.094653611 seconds time elapsed +TOTAL : 0.530347 sec + 2,215,627,654 cycles # 2.903 GHz + 3,117,473,452 instructions # 1.41 insn per cycle + 0.825930987 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.121456e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.161766e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.163178e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.144774e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.174090e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.175257e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.057654 sec - 10,137,929,963 cycles # 3.052 GHz - 22,991,929,048 instructions # 2.27 insn per cycle - 3.379272793 seconds time elapsed +TOTAL : 3.043950 sec + 10,077,982,824 cycles # 3.048 GHz + 20,884,643,426 instructions # 2.07 insn per cycle + 3.362779620 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.006956e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.008195e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.008195e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.001628e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.002869e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.002869e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.190029 sec - 25,353,763,516 cycles # 3.096 GHz - 78,464,006,615 instructions # 3.09 insn per cycle - 8.237114127 seconds time elapsed +TOTAL : 8.203624 sec + 25,355,069,104 cycles # 3.089 GHz + 78,461,481,347 instructions # 3.09 insn per cycle + 8.208964551 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4141) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.688084e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.692419e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.692419e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.649553e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.653948e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.653948e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.457421 sec - 12,949,208,267 cycles # 2.902 GHz - 39,279,495,578 instructions # 3.03 insn per cycle - 5.036185726 seconds time elapsed +TOTAL : 4.504807 sec + 12,940,264,818 cycles # 2.872 GHz + 39,278,818,959 instructions # 3.04 insn per cycle + 4.509858133 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12921) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.488080e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.508923e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.508923e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.391541e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.414022e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.414022e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.009105 sec - 5,642,898,054 cycles # 2.898 GHz - 14,043,583,291 instructions # 2.49 insn per cycle - 2.847319866 seconds time elapsed +TOTAL : 1.964481 sec + 5,630,549,019 cycles # 2.860 GHz + 14,042,768,532 instructions # 2.49 insn per cycle + 1.969790831 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11430) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.514614e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.542191e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.542191e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.707847e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.733033e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.733033e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.751531 sec - 5,038,767,629 cycles # 2.899 GHz - 12,696,213,108 instructions # 2.52 insn per cycle - 1.833248732 seconds time elapsed +TOTAL : 1.894730 sec + 5,044,848,529 cycles # 2.657 GHz + 12,697,937,577 instructions # 2.52 insn per cycle + 1.900080529 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10994) (512y: 240) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.579113e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.597024e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.597024e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.667824e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.686951e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.686951e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.299962 sec - 4,127,981,537 cycles # 1.896 GHz - 6,575,302,267 instructions # 1.59 insn per cycle - 2.370553569 seconds time elapsed +TOTAL : 2.149855 sec + 4,131,471,523 cycles # 1.920 GHz + 6,576,312,702 instructions # 1.59 insn per cycle + 2.154822899 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1638) (512y: 192) (512z:10078) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 9e6dde3915..b0cb514025 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-08-14_01:30:34 +DATE: 2023-08-15_08:19:39 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.163529e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.206192e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.208473e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.186111e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.222258e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.224187e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.541774 sec - 2,243,959,392 cycles # 2.852 GHz - 3,195,074,435 instructions # 1.42 insn per cycle - 0.848030629 seconds time elapsed +TOTAL : 0.534685 sec + 2,221,949,437 cycles # 2.892 GHz + 3,229,701,697 instructions # 1.45 insn per cycle + 0.828035424 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.747655e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.776210e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.777350e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.745971e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.772990e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.774074e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.300795 sec - 10,814,858,147 cycles # 3.038 GHz - 23,539,102,873 instructions # 2.18 insn per cycle - 3.617402971 seconds time elapsed +TOTAL : 3.294695 sec + 10,798,249,474 cycles # 3.040 GHz + 25,165,680,445 instructions # 2.33 insn per cycle + 3.610046788 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.464255e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.464863e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.464863e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.440167e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.440795e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.440795e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 36.756565 sec - 113,473,857,207 cycles # 3.087 GHz - 144,976,195,092 instructions # 1.28 insn per cycle - 36.761866540 seconds time elapsed +TOTAL : 36.950676 sec + 113,548,188,280 cycles # 3.073 GHz + 144,974,209,663 instructions # 1.28 insn per cycle + 36.955853236 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:21600) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.329429e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.333010e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.333010e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.329065e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.332654e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.332654e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.937251 sec - 14,710,626,118 cycles # 2.977 GHz - 37,589,728,667 instructions # 2.56 insn per cycle - 4.942405023 seconds time elapsed +TOTAL : 4.936971 sec + 14,692,912,106 cycles # 2.974 GHz + 37,589,294,182 instructions # 2.56 insn per cycle + 4.941927376 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68118) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.758143e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.776407e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.776407e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.700518e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.719059e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.719059e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.125139 sec - 6,165,446,684 cycles # 2.896 GHz - 13,077,164,597 instructions # 2.12 insn per cycle - 2.130180198 seconds time elapsed +TOTAL : 2.143912 sec + 6,137,805,962 cycles # 2.862 GHz + 13,076,339,536 instructions # 2.13 insn per cycle + 2.149352332 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:46960) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.433752e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.463229e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.463229e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.391844e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.419528e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.419528e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.750036 sec - 5,070,845,258 cycles # 2.891 GHz - 11,455,830,862 instructions # 2.26 insn per cycle - 1.755066850 seconds time elapsed +TOTAL : 1.757108 sec + 5,072,480,201 cycles # 2.881 GHz + 11,455,808,544 instructions # 2.26 insn per cycle + 1.762035410 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40433) (512y: 285) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.931587e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.951446e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.951446e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.876076e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.894961e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.894961e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.078843 sec - 3,988,613,609 cycles # 1.915 GHz - 5,956,806,031 instructions # 1.49 insn per cycle - 2.083988262 seconds time elapsed +TOTAL : 2.093367 sec + 3,994,032,080 cycles # 1.905 GHz + 5,957,196,685 instructions # 1.49 insn per cycle + 2.098310322 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2455) (512y: 337) (512z:39409) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 237d4b1f1f..a6581703dc 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-08-14_01:31:43 +DATE: 2023-08-15_08:20:48 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.182162e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.228576e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.230676e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.187705e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.222893e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.224731e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.538710 sec - 2,237,092,402 cycles # 2.901 GHz - 3,239,298,803 instructions # 1.45 insn per cycle - 0.831393972 seconds time elapsed +TOTAL : 0.535653 sec + 2,220,323,389 cycles # 2.893 GHz + 3,156,830,694 instructions # 1.42 insn per cycle + 0.828034854 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.771745e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.800804e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.802002e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.771115e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.798449e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.799565e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.293922 sec - 10,818,904,576 cycles # 3.047 GHz - 25,176,874,120 instructions # 2.33 insn per cycle - 3.610871654 seconds time elapsed +TOTAL : 3.292251 sec + 10,760,515,723 cycles # 3.035 GHz + 25,239,650,000 instructions # 2.35 insn per cycle + 3.607701526 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.430289e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.430931e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.430931e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.415087e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.415720e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.415720e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 37.038776 sec - 114,260,758,210 cycles # 3.085 GHz - 145,573,369,114 instructions # 1.27 insn per cycle - 37.043808553 seconds time elapsed +TOTAL : 37.159775 sec + 114,301,534,784 cycles # 3.076 GHz + 145,574,530,561 instructions # 1.27 insn per cycle + 37.164733277 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:22238) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.266164e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.269348e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.269348e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.261786e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.265194e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.265194e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.031364 sec - 15,188,051,220 cycles # 3.017 GHz - 37,775,079,277 instructions # 2.49 insn per cycle - 5.036828039 seconds time elapsed +TOTAL : 5.044434 sec + 15,176,552,874 cycles # 3.008 GHz + 37,775,533,795 instructions # 2.49 insn per cycle + 5.049599176 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68446) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.956472e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.976804e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.976804e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.943060e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.962497e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.962497e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.072406 sec - 6,007,960,717 cycles # 2.895 GHz - 12,913,377,246 instructions # 2.15 insn per cycle - 2.077554125 seconds time elapsed +TOTAL : 2.074892 sec + 6,010,581,709 cycles # 2.891 GHz + 12,911,568,202 instructions # 2.15 insn per cycle + 2.080135608 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:45936) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.366464e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.393491e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.393491e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.321078e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.349408e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.349408e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.761433 sec - 5,110,418,656 cycles # 2.895 GHz - 11,460,870,333 instructions # 2.24 insn per cycle - 1.766670258 seconds time elapsed +TOTAL : 1.770348 sec + 5,116,510,829 cycles # 2.884 GHz + 11,462,830,925 instructions # 2.24 insn per cycle + 1.775692855 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40124) (512y: 219) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.968882e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.987948e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.987948e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.877630e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.896567e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.896567e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.068575 sec - 3,964,128,637 cycles # 1.913 GHz - 5,909,253,003 instructions # 1.49 insn per cycle - 2.073904542 seconds time elapsed +TOTAL : 2.092406 sec + 3,959,116,760 cycles # 1.889 GHz + 5,909,678,629 instructions # 1.49 insn per cycle + 2.097595629 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1971) (512y: 259) (512z:38938) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 4a1d86025d..87c14a9fc2 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-08-14_01:04:41 +DATE: 2023-08-15_08:04:17 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.145356e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.236099e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.242246e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.113680e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.206120e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.211586e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.489449 sec - 2,039,002,920 cycles # 2.869 GHz - 2,745,150,377 instructions # 1.35 insn per cycle - 0.956174279 seconds time elapsed +TOTAL : 0.489030 sec + 2,018,556,536 cycles # 2.860 GHz + 2,695,229,333 instructions # 1.34 insn per cycle + 0.764433321 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.476965e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.566424e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.570141e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.501612e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.564106e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.566732e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.749416 sec - 5,911,386,429 cycles # 2.970 GHz - 11,874,687,747 instructions # 2.01 insn per cycle - 2.050933296 seconds time elapsed +TOTAL : 1.742005 sec + 5,900,924,146 cycles # 2.991 GHz + 12,364,294,177 instructions # 2.10 insn per cycle + 2.032841021 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.052771e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.053796e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.053796e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.077935e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.079032e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.079032e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.007326 sec - 24,473,428,166 cycles # 3.058 GHz - 78,139,046,428 instructions # 3.19 insn per cycle - 8.030050407 seconds time elapsed +TOTAL : 7.903687 sec + 24,463,617,809 cycles # 3.095 GHz + 78,139,167,105 instructions # 3.19 insn per cycle + 7.908373975 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3554) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.531104e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.545655e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.545655e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.252580e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.266716e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.266716e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.228177 sec - 6,333,012,717 cycles # 2.889 GHz - 20,186,200,946 instructions # 3.19 insn per cycle - 2.348887805 seconds time elapsed +TOTAL : 2.270339 sec + 6,331,371,908 cycles # 2.784 GHz + 20,185,378,375 instructions # 3.19 insn per cycle + 2.275323970 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13749) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.691030e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.698298e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.698298e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.605269e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.612091e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.612091e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.978963 sec - 2,859,749,410 cycles # 2.909 GHz - 7,122,305,734 instructions # 2.49 insn per cycle - 1.343178186 seconds time elapsed +TOTAL : 1.031349 sec + 2,857,704,024 cycles # 2.761 GHz + 7,121,520,002 instructions # 2.49 insn per cycle + 1.036403125 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11880) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.907442e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.916268e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.916268e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.888192e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.897460e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.897460e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.868821 sec - 2,535,616,057 cycles # 2.902 GHz - 6,417,617,935 instructions # 2.53 insn per cycle - 0.926489303 seconds time elapsed +TOTAL : 0.877915 sec + 2,532,938,739 cycles # 2.874 GHz + 6,416,830,601 instructions # 2.53 insn per cycle + 0.882777780 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11552) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.540164e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.546050e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.546050e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.499109e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.505240e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.505240e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.073968 sec - 2,063,634,905 cycles # 1.914 GHz - 3,330,474,156 instructions # 1.61 insn per cycle - 1.194107633 seconds time elapsed +TOTAL : 1.103806 sec + 2,068,951,024 cycles # 1.868 GHz + 3,329,621,366 instructions # 1.61 insn per cycle + 1.109490178 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 47) (512z:10312) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index dd4d0d0e94..8638dd3103 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-08-14_01:42:29 +DATE: 2023-08-15_08:31:19 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -48,14 +48,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.490877e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.173392e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.173392e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.531230e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.195294e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.195294e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.492869 sec - 2,047,656,788 cycles # 2.884 GHz - 2,763,560,810 instructions # 1.35 insn per cycle - 0.770167598 seconds time elapsed +TOTAL : 0.491228 sec + 2,027,497,668 cycles # 2.858 GHz + 2,777,232,069 instructions # 1.37 insn per cycle + 0.767453903 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,14 +72,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.220022e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.476319e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.476319e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.199928e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.478309e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.478309e+05 ) sec^-1 MeanMatrixElemValue = ( 6.641709e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.922513 sec - 6,528,767,062 cycles # 3.018 GHz - 14,035,203,089 instructions # 2.15 insn per cycle - 2.222915371 seconds time elapsed +TOTAL : 1.925284 sec + 6,532,368,683 cycles # 3.019 GHz + 13,853,967,442 instructions # 2.12 insn per cycle + 2.224547208 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -95,14 +95,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.079103e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.080149e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.080149e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.069434e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.070543e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.070543e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 7.901425 sec - 24,471,434,613 cycles # 3.097 GHz - 78,143,425,809 instructions # 3.19 insn per cycle - 7.906546316 seconds time elapsed +TOTAL : 7.937844 sec + 24,489,796,577 cycles # 3.084 GHz + 78,143,313,735 instructions # 3.19 insn per cycle + 7.942733555 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3554) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe @@ -122,14 +122,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.573389e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.587761e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.587761e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.501137e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.515705e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.515705e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.177030 sec - 6,339,651,761 cycles # 2.907 GHz - 20,194,458,846 instructions # 3.19 insn per cycle - 2.182345512 seconds time elapsed +TOTAL : 2.197820 sec + 6,342,440,687 cycles # 2.882 GHz + 20,194,640,120 instructions # 3.18 insn per cycle + 2.202831490 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13749) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -149,14 +149,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.691696e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.699256e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.699256e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.680755e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.688331e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.688331e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.980975 sec - 2,863,872,872 cycles # 2.907 GHz - 7,131,329,274 instructions # 2.49 insn per cycle - 0.986346435 seconds time elapsed +TOTAL : 0.987293 sec + 2,865,332,330 cycles # 2.890 GHz + 7,131,409,419 instructions # 2.49 insn per cycle + 0.992557144 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11880) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -176,14 +176,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.907052e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.916770e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.916770e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.898899e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.908726e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.908726e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.871521 sec - 2,541,741,239 cycles # 2.905 GHz - 6,426,656,602 instructions # 2.53 insn per cycle - 0.876410518 seconds time elapsed +TOTAL : 0.875197 sec + 2,543,327,231 cycles # 2.892 GHz + 6,426,714,975 instructions # 2.53 insn per cycle + 0.880579478 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11552) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -203,14 +203,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.538590e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.544738e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.544738e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.541618e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.547883e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.547883e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.077568 sec - 2,079,170,055 cycles # 1.923 GHz - 3,340,060,936 instructions # 1.61 insn per cycle - 1.082815151 seconds time elapsed +TOTAL : 1.075533 sec + 2,075,747,014 cycles # 1.922 GHz + 3,339,997,983 instructions # 1.61 insn per cycle + 1.080773183 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 47) (512z:10312) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index c40ef2f48c..0ab22d9f59 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-08-14_01:54:02 +DATE: 2023-08-15_08:42:51 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.133585e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.217646e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.223454e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.144326e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.215205e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.220785e+05 ) sec^-1 MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 -TOTAL : 0.486343 sec - 2,014,074,595 cycles # 2.858 GHz - 2,718,911,350 instructions # 1.35 insn per cycle - 0.762273031 seconds time elapsed +TOTAL : 0.488557 sec + 2,011,133,432 cycles # 2.858 GHz + 2,717,062,488 instructions # 1.35 insn per cycle + 0.763355966 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.587045e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.662861e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.665952e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.586236e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.659086e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.662124e+05 ) sec^-1 MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 -TOTAL : 1.824603 sec - 6,204,405,303 cycles # 3.017 GHz - 12,029,883,895 instructions # 1.94 insn per cycle - 2.125412222 seconds time elapsed +TOTAL : 1.821809 sec + 6,199,621,275 cycles # 3.020 GHz + 12,302,218,981 instructions # 1.98 insn per cycle + 2.113042818 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.058314e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.059406e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.059406e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.070465e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.071518e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.071518e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 7.983211 sec - 24,490,191,818 cycles # 3.067 GHz - 78,140,749,573 instructions # 3.19 insn per cycle - 7.987838004 seconds time elapsed +TOTAL : 7.936750 sec + 24,501,268,872 cycles # 3.089 GHz + 78,140,977,505 instructions # 3.19 insn per cycle + 7.941817846 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3554) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.594817e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.610329e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.610329e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.577149e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.592541e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.592541e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 2.169733 sec - 6,334,366,600 cycles # 2.915 GHz - 20,185,347,462 instructions # 3.19 insn per cycle - 2.174729550 seconds time elapsed +TOTAL : 2.174855 sec + 6,336,150,180 cycles # 2.910 GHz + 20,185,517,198 instructions # 3.19 insn per cycle + 2.179654718 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13749) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.676217e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.683874e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.683874e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.679095e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.686422e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.686422e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.988945 sec - 2,861,355,077 cycles # 2.883 GHz - 7,120,772,946 instructions # 2.49 insn per cycle - 0.993999045 seconds time elapsed +TOTAL : 0.987011 sec + 2,858,107,376 cycles # 2.886 GHz + 7,120,975,733 instructions # 2.49 insn per cycle + 0.991535494 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11880) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.903902e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.913919e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.913919e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.895324e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.904535e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.904535e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.872029 sec - 2,537,630,694 cycles # 2.898 GHz - 6,414,974,721 instructions # 2.53 insn per cycle - 0.876653530 seconds time elapsed +TOTAL : 0.875618 sec + 2,534,850,734 cycles # 2.883 GHz + 6,415,151,431 instructions # 2.53 insn per cycle + 0.880122223 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11552) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.549492e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.555983e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.555983e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.534730e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.540899e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.540899e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 1.068992 sec - 2,067,152,085 cycles # 1.927 GHz - 3,327,718,138 instructions # 1.61 insn per cycle - 1.073523555 seconds time elapsed +TOTAL : 1.081909 sec + 2,066,983,805 cycles # 1.907 GHz + 3,328,304,762 instructions # 1.61 insn per cycle + 1.086461140 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 47) (512z:10312) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index 9fd9118329..9f2ef7c120 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-08-14_01:50:46 +DATE: 2023-08-15_08:39:35 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.071717e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.165465e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.171075e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.082660e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.168017e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.173141e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.487226 sec - 2,023,558,777 cycles # 2.877 GHz - 2,703,123,931 instructions # 1.34 insn per cycle - 0.762477701 seconds time elapsed +TOTAL : 0.485959 sec + 2,007,340,793 cycles # 2.856 GHz + 2,718,317,159 instructions # 1.35 insn per cycle + 0.760972781 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.583685e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.658297e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.661256e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.588938e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.660631e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.663590e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.776155 sec - 6,090,706,251 cycles # 3.028 GHz - 12,631,519,102 instructions # 2.07 insn per cycle - 2.068493407 seconds time elapsed +TOTAL : 1.773538 sec + 6,041,038,525 cycles # 3.010 GHz + 13,199,098,490 instructions # 2.18 insn per cycle + 2.063993421 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.080202e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.081239e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.081239e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.068156e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.069195e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.069195e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 7.893925 sec - 24,459,537,360 cycles # 3.097 GHz - 78,138,845,381 instructions # 3.19 insn per cycle - 7.898651157 seconds time elapsed +TOTAL : 7.938597 sec + 24,486,012,837 cycles # 3.083 GHz + 78,138,824,946 instructions # 3.19 insn per cycle + 7.943607441 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3554) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.530130e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.546052e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.546052e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.545303e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.560195e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.560195e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.187440 sec - 6,327,110,881 cycles # 2.887 GHz - 20,184,981,069 instructions # 3.19 insn per cycle - 2.192831639 seconds time elapsed +TOTAL : 2.187527 sec + 6,329,240,263 cycles # 2.893 GHz + 20,185,499,978 instructions # 3.19 insn per cycle + 2.192263277 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13749) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.684556e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.691799e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.691799e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.681389e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.688756e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.688756e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.986600 sec - 2,856,974,548 cycles # 2.892 GHz - 7,121,782,414 instructions # 2.49 insn per cycle - 0.991762581 seconds time elapsed +TOTAL : 0.984275 sec + 2,856,185,163 cycles # 2.891 GHz + 7,121,459,357 instructions # 2.49 insn per cycle + 0.989290512 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11880) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.906147e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.915608e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.915608e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.909484e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.919062e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.919062e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.869355 sec - 2,531,308,258 cycles # 2.900 GHz - 6,416,600,138 instructions # 2.53 insn per cycle - 0.873935392 seconds time elapsed +TOTAL : 0.867556 sec + 2,532,676,049 cycles # 2.911 GHz + 6,416,893,378 instructions # 2.53 insn per cycle + 0.872102079 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11552) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.533942e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.540164e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.540164e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.531203e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.537250e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.537250e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.078607 sec - 2,066,736,625 cycles # 1.910 GHz - 3,329,381,020 instructions # 1.61 insn per cycle - 1.083251454 seconds time elapsed +TOTAL : 1.080158 sec + 2,062,887,429 cycles # 1.904 GHz + 3,329,660,109 instructions # 1.61 insn per cycle + 1.085064073 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 47) (512z:10312) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index 369d81c9e4..bf788b11b5 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-08-14_01:47:34 +DATE: 2023-08-15_08:36:24 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -45,14 +45,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.523405e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.183111e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.190702e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.576733e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.221598e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.226830e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.489128 sec - 2,013,700,317 cycles # 2.835 GHz - 2,714,073,467 instructions # 1.35 insn per cycle - 0.767304374 seconds time elapsed +TOTAL : 0.487901 sec + 2,025,095,734 cycles # 2.872 GHz + 2,738,438,516 instructions # 1.35 insn per cycle + 0.762207032 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -63,14 +63,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.448697e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.646047e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.649139e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.456403e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.644388e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.647370e+05 ) sec^-1 MeanMatrixElemValue = ( 6.641709e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.858817 sec - 6,339,278,158 cycles # 3.032 GHz - 13,501,915,106 instructions # 2.13 insn per cycle - 2.151515631 seconds time elapsed +TOTAL : 1.853771 sec + 6,296,786,374 cycles # 3.016 GHz + 12,875,787,727 instructions # 2.04 insn per cycle + 2.154051650 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -85,14 +85,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.074030e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.075154e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.075154e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.058673e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.059792e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.059792e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 7.916741 sec - 24,468,897,845 cycles # 3.090 GHz - 78,138,684,143 instructions # 3.19 insn per cycle - 7.921685859 seconds time elapsed +TOTAL : 7.975780 sec + 24,487,620,137 cycles # 3.070 GHz + 78,140,745,843 instructions # 3.19 insn per cycle + 7.980846967 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3554) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe @@ -111,14 +111,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.595306e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.610054e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.610054e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.553798e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.569045e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.569045e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.168239 sec - 6,331,280,785 cycles # 2.917 GHz - 20,184,983,066 instructions # 3.19 insn per cycle - 2.172981160 seconds time elapsed +TOTAL : 2.179841 sec + 6,328,389,261 cycles # 2.901 GHz + 20,185,367,490 instructions # 3.19 insn per cycle + 2.184524511 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13749) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -137,14 +137,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.694691e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.702095e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.702095e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.690124e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.697333e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.697333e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.976879 sec - 2,852,905,655 cycles # 2.912 GHz - 7,121,219,441 instructions # 2.50 insn per cycle - 0.981430803 seconds time elapsed +TOTAL : 0.979140 sec + 2,856,842,686 cycles # 2.911 GHz + 7,121,570,376 instructions # 2.49 insn per cycle + 0.983637729 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11880) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -163,14 +163,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.901114e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.910592e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.910592e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.896699e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.906222e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.906222e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.871773 sec - 2,535,172,867 cycles # 2.896 GHz - 6,416,502,924 instructions # 2.53 insn per cycle - 0.876367040 seconds time elapsed +TOTAL : 0.873736 sec + 2,532,762,027 cycles # 2.888 GHz + 6,416,752,526 instructions # 2.53 insn per cycle + 0.878374086 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11552) (512y: 43) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -189,14 +189,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.540411e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.546508e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.546508e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.541857e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.548013e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.548013e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.074285 sec - 2,061,249,728 cycles # 1.913 GHz - 3,329,386,368 instructions # 1.62 insn per cycle - 1.078893170 seconds time elapsed +TOTAL : 1.072693 sec + 2,060,338,474 cycles # 1.914 GHz + 3,329,512,577 instructions # 1.62 insn per cycle + 1.077219821 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 47) (512z:10312) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index c6511dd036..d991aa119f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-08-14_01:05:13 +DATE: 2023-08-15_08:04:47 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.122129e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.211007e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.217194e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.128116e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.217668e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.222594e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.488528 sec - 2,033,436,264 cycles # 2.868 GHz - 2,732,332,049 instructions # 1.34 insn per cycle - 1.264142072 seconds time elapsed +TOTAL : 0.487229 sec + 2,026,991,222 cycles # 2.878 GHz + 2,709,678,447 instructions # 1.34 insn per cycle + 0.762361654 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.556334e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.640609e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.643802e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.575581e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.638965e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.642004e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.735348 sec - 5,928,201,343 cycles # 3.006 GHz - 11,700,630,530 instructions # 1.97 insn per cycle - 2.029407230 seconds time elapsed +TOTAL : 1.739337 sec + 5,714,760,202 cycles # 2.894 GHz + 11,499,716,249 instructions # 2.01 insn per cycle + 2.031805952 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.089763e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.090834e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.090834e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.990978e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.991940e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.991940e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 7.923336 sec - 24,324,560,498 cycles # 3.094 GHz - 77,882,270,959 instructions # 3.20 insn per cycle - 8.162694675 seconds time elapsed +TOTAL : 8.247144 sec + 24,315,749,458 cycles # 2.947 GHz + 77,883,463,554 instructions # 3.20 insn per cycle + 8.251967591 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.623334e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.638012e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.638012e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.446333e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.461104e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.461104e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.201452 sec - 6,288,100,376 cycles # 2.905 GHz - 20,152,816,971 instructions # 3.20 insn per cycle - 2.361368947 seconds time elapsed +TOTAL : 2.211601 sec + 6,289,071,271 cycles # 2.840 GHz + 20,152,105,466 instructions # 3.20 insn per cycle + 2.216825625 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.641834e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.648725e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.648725e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.576783e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.583356e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.583356e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.015950 sec - 2,916,491,014 cycles # 2.881 GHz - 7,261,875,186 instructions # 2.49 insn per cycle - 1.356452434 seconds time elapsed +TOTAL : 1.049646 sec + 2,917,818,703 cycles # 2.769 GHz + 7,260,965,647 instructions # 2.49 insn per cycle + 1.054753836 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:12273) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.848896e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.857118e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.857118e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.848490e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.856955e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.856955e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.895877 sec - 2,613,554,864 cycles # 2.903 GHz - 6,559,099,851 instructions # 2.51 insn per cycle - 1.067111884 seconds time elapsed +TOTAL : 0.895882 sec + 2,613,663,217 cycles # 2.907 GHz + 6,558,326,054 instructions # 2.51 insn per cycle + 0.900676022 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11966) (512y: 26) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.498628e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.504041e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.504041e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.479912e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.485645e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.485645e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.177928 sec - 2,131,733,578 cycles # 1.927 GHz - 3,490,292,434 instructions # 1.64 insn per cycle - 1.355169076 seconds time elapsed +TOTAL : 1.117454 sec + 2,129,795,413 cycles # 1.900 GHz + 3,489,517,961 instructions # 1.64 insn per cycle + 1.122073937 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2901) (512y: 23) (512z:10269) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index b83294289a..313bb42091 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-08-14_01:32:51 +DATE: 2023-08-15_08:21:57 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.357480e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.426786e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.431413e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.395758e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.460925e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.465060e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.495430 sec - 2,082,740,515 cycles # 2.864 GHz - 2,803,369,686 instructions # 1.35 insn per cycle - 0.785194017 seconds time elapsed +TOTAL : 0.491081 sec + 2,078,405,452 cycles # 2.876 GHz + 2,866,034,185 instructions # 1.38 insn per cycle + 0.779842200 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.824107e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.886375e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.889033e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.825853e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.885521e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.888169e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.862687 sec - 6,352,829,585 cycles # 3.022 GHz - 13,878,239,699 instructions # 2.18 insn per cycle - 2.161482921 seconds time elapsed +TOTAL : 1.848365 sec + 6,206,449,631 cycles # 2.970 GHz + 13,392,412,167 instructions # 2.16 insn per cycle + 2.146713266 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.835360e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.836215e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.836215e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.780076e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.780894e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.780894e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 28.113899 sec - 86,676,288,155 cycles # 3.083 GHz - 135,544,232,902 instructions # 1.56 insn per cycle - 28.119286007 seconds time elapsed +TOTAL : 28.383541 sec + 87,166,803,809 cycles # 3.071 GHz + 135,545,164,925 instructions # 1.56 insn per cycle + 28.388103344 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:15458) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.148850e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.162675e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.162675e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.375173e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.389905e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.389905e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.303270 sec - 6,720,322,884 cycles # 2.913 GHz - 19,395,797,209 instructions # 2.89 insn per cycle - 2.308482512 seconds time elapsed +TOTAL : 2.232940 sec + 6,813,994,495 cycles # 3.046 GHz + 19,395,856,674 instructions # 2.85 insn per cycle + 2.238369966 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:69680) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.505707e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.511354e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.511354e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.498442e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.504286e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.504286e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.098614 sec - 3,181,298,583 cycles # 2.886 GHz - 6,817,418,896 instructions # 2.14 insn per cycle - 1.103266719 seconds time elapsed +TOTAL : 1.103395 sec + 3,175,966,194 cycles # 2.870 GHz + 6,817,821,844 instructions # 2.15 insn per cycle + 1.108102716 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:49077) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.768154e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.776267e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.776267e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.808044e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.816407e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.816407e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.936679 sec - 2,655,264,984 cycles # 2.824 GHz - 5,996,165,276 instructions # 2.26 insn per cycle - 0.941868087 seconds time elapsed +TOTAL : 0.922171 sec + 2,651,507,093 cycles # 2.875 GHz + 5,996,888,437 instructions # 2.26 insn per cycle + 0.927031811 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:42677) (512y: 11) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.518561e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.524599e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.524599e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.512862e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.518753e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.518753e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.089341 sec - 2,084,561,179 cycles # 1.907 GHz - 3,510,496,887 instructions # 1.68 insn per cycle - 1.094387993 seconds time elapsed +TOTAL : 1.093278 sec + 2,078,575,504 cycles # 1.895 GHz + 3,510,282,026 instructions # 1.69 insn per cycle + 1.097953673 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5198) (512y: 3) (512z:44822) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index 3c5aca42b7..8832827e81 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-08-14_01:33:43 +DATE: 2023-08-15_08:22:49 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.341794e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.413655e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.418131e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.366921e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.431014e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.435160e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.492091 sec - 2,122,785,924 cycles # 2.942 GHz - 2,834,085,211 instructions # 1.34 insn per cycle - 0.779635047 seconds time elapsed +TOTAL : 0.492320 sec + 2,070,385,138 cycles # 2.865 GHz + 2,831,765,240 instructions # 1.37 insn per cycle + 0.779919428 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.603505e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.663630e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.666132e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.621752e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.677977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.680342e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.872278 sec - 6,389,319,285 cycles # 3.022 GHz - 13,391,699,362 instructions # 2.10 insn per cycle - 2.171441443 seconds time elapsed +TOTAL : 1.879104 sec + 6,357,166,988 cycles # 2.996 GHz + 13,486,685,413 instructions # 2.12 insn per cycle + 2.182208907 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.880632e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.881484e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.881484e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.926941e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.927826e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.927826e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 27.897182 sec - 85,390,502,526 cycles # 3.061 GHz - 136,007,351,125 instructions # 1.59 insn per cycle - 27.902055204 seconds time elapsed +TOTAL : 27.679825 sec + 84,768,166,768 cycles # 3.062 GHz + 136,006,770,863 instructions # 1.60 insn per cycle + 27.684475907 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:15937) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.142778e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.156351e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.156351e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.055087e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.067922e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.067922e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.304996 sec - 6,800,553,717 cycles # 2.947 GHz - 19,447,684,479 instructions # 2.86 insn per cycle - 2.310136835 seconds time elapsed +TOTAL : 2.333434 sec + 6,800,648,023 cycles # 2.910 GHz + 19,447,790,102 instructions # 2.86 insn per cycle + 2.338354760 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:69722) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.546252e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.552785e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.552785e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.542489e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.548518e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.548518e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.069611 sec - 3,123,329,712 cycles # 2.912 GHz - 6,728,538,162 instructions # 2.15 insn per cycle - 1.074493704 seconds time elapsed +TOTAL : 1.071802 sec + 3,111,815,719 cycles # 2.894 GHz + 6,728,747,187 instructions # 2.16 insn per cycle + 1.076475555 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:47667) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.828087e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.836666e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.836666e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.829500e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.838005e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.838005e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.906163 sec - 2,632,051,111 cycles # 2.893 GHz - 5,979,216,778 instructions # 2.27 insn per cycle - 0.910926560 seconds time elapsed +TOTAL : 0.906923 sec + 2,637,944,229 cycles # 2.901 GHz + 5,979,583,857 instructions # 2.27 insn per cycle + 0.911799920 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:41842) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.525716e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.531907e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.531907e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.482750e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.488249e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.488249e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.084185 sec - 2,081,615,272 cycles # 1.914 GHz - 3,503,636,494 instructions # 1.68 insn per cycle - 1.089152487 seconds time elapsed +TOTAL : 1.115491 sec + 2,082,566,778 cycles # 1.862 GHz + 3,504,063,228 instructions # 1.68 insn per cycle + 1.120169830 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4162) (512y: 4) (512z:44465) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 1eb61bb80f..e7c0a23b5c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-08-14_01:05:45 +DATE: 2023-08-15_08:05:17 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.416709e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.469239e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.471726e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.422363e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.468020e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.470345e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.530187 sec - 2,217,354,706 cycles # 2.890 GHz - 3,142,220,219 instructions # 1.42 insn per cycle - 0.937108666 seconds time elapsed +TOTAL : 0.530782 sec + 2,220,044,517 cycles # 2.895 GHz + 3,146,111,901 instructions # 1.42 insn per cycle + 0.827597323 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.137937e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.172941e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.174345e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.136489e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.165443e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.166620e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.064124 sec - 10,144,064,593 cycles # 3.042 GHz - 21,567,616,198 instructions # 2.13 insn per cycle - 3.391924115 seconds time elapsed +TOTAL : 3.042440 sec + 10,003,660,868 cycles # 3.030 GHz + 22,703,917,020 instructions # 2.27 insn per cycle + 3.361771302 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.975669e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.976929e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.976929e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.976585e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.977873e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.977873e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.314637 sec - 25,675,388,948 cycles # 3.087 GHz - 79,184,995,534 instructions # 3.08 insn per cycle - 8.355157420 seconds time elapsed +TOTAL : 8.309361 sec + 25,674,704,286 cycles # 3.089 GHz + 79,183,385,361 instructions # 3.08 insn per cycle + 8.314358382 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4708) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.722873e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.727109e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.727109e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.705238e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.709428e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.709428e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.421867 sec - 12,794,179,126 cycles # 2.894 GHz - 38,589,797,349 instructions # 3.02 insn per cycle - 4.474066948 seconds time elapsed +TOTAL : 4.437753 sec + 12,812,708,621 cycles # 2.886 GHz + 38,591,867,024 instructions # 3.01 insn per cycle + 4.442817559 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13136) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.575519e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.599417e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.599417e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.522545e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.545503e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.545503e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.923531 sec - 5,606,464,367 cycles # 2.909 GHz - 13,721,384,596 instructions # 2.45 insn per cycle - 2.146627177 seconds time elapsed +TOTAL : 1.935510 sec + 5,601,392,957 cycles # 2.889 GHz + 13,721,052,527 instructions # 2.45 insn per cycle + 1.940418002 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11246) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.688651e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.718968e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.718968e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.666679e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.696450e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.696450e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.703431 sec - 4,944,409,629 cycles # 2.896 GHz - 12,362,642,219 instructions # 2.50 insn per cycle - 1.759432802 seconds time elapsed +TOTAL : 1.706777 sec + 4,941,339,381 cycles # 2.888 GHz + 12,361,926,927 instructions # 2.50 insn per cycle + 1.712002744 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10898) (512y: 79) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.580980e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.598893e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.598893e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.527956e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.546398e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.546398e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.182283 sec - 4,171,145,931 cycles # 1.911 GHz - 6,457,188,547 instructions # 1.55 insn per cycle - 2.225839416 seconds time elapsed +TOTAL : 2.189538 sec + 4,169,817,395 cycles # 1.901 GHz + 6,455,711,002 instructions # 1.55 insn per cycle + 2.194538825 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1796) (512y: 93) (512z:10086) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index a9278d2617..62cebd797f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-08-14_01:06:24 +DATE: 2023-08-15_08:05:54 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.434642e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.479208e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.481732e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.450693e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.499337e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.501640e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.533307 sec - 2,207,815,568 cycles # 2.873 GHz - 3,082,371,539 instructions # 1.40 insn per cycle - 1.042674355 seconds time elapsed +TOTAL : 0.527663 sec + 2,207,556,580 cycles # 2.897 GHz + 3,167,926,058 instructions # 1.44 insn per cycle + 0.822538371 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.129691e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.164416e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.165795e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.137091e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.166130e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.167323e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.054049 sec - 10,092,973,581 cycles # 3.044 GHz - 22,716,830,356 instructions # 2.25 insn per cycle - 3.372955627 seconds time elapsed +TOTAL : 3.040251 sec + 10,050,103,857 cycles # 3.047 GHz + 22,708,720,443 instructions # 2.26 insn per cycle + 3.354433991 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.973402e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.974624e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.974624e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.970205e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.971421e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.971421e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.324729 sec - 25,716,275,237 cycles # 3.090 GHz - 79,210,318,499 instructions # 3.08 insn per cycle - 8.365322428 seconds time elapsed +TOTAL : 8.334167 sec + 25,659,809,564 cycles # 3.078 GHz + 79,207,559,063 instructions # 3.09 insn per cycle + 8.339421888 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4383) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.701138e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.705330e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.705330e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.715781e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.720204e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.720204e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.441483 sec - 12,824,910,458 cycles # 2.885 GHz - 38,547,506,749 instructions # 3.01 insn per cycle - 4.520813757 seconds time elapsed +TOTAL : 4.423645 sec + 12,805,908,593 cycles # 2.892 GHz + 38,546,927,376 instructions # 3.01 insn per cycle + 4.428745496 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12902) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.577329e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.599340e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.599340e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.552852e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.577102e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.577102e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.969694 sec - 5,586,335,948 cycles # 2.900 GHz - 13,838,593,845 instructions # 2.48 insn per cycle - 2.097244509 seconds time elapsed +TOTAL : 1.928072 sec + 5,590,516,918 cycles # 2.894 GHz + 13,839,830,857 instructions # 2.48 insn per cycle + 1.932926149 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11349) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.535452e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.565040e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.565040e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.504293e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.533513e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.533513e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.730561 sec - 4,998,577,126 cycles # 2.881 GHz - 12,493,954,637 instructions # 2.50 insn per cycle - 1.794659842 seconds time elapsed +TOTAL : 1.736286 sec + 4,999,124,094 cycles # 2.873 GHz + 12,491,426,446 instructions # 2.50 insn per cycle + 1.741494791 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10894) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.548623e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.565250e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.565250e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.587755e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.605628e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.605628e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.185793 sec - 4,168,129,981 cycles # 1.905 GHz - 6,557,312,578 instructions # 1.57 insn per cycle - 2.342770253 seconds time elapsed +TOTAL : 2.172271 sec + 4,166,068,862 cycles # 1.914 GHz + 6,558,419,576 instructions # 1.57 insn per cycle + 2.177416583 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1626) (512y: 191) (512z:10049) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index c890f7583c..80fba0b18f 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-08-14_01:09:02 +DATE: 2023-08-15_08:08:16 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.059327e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.060559e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.060660e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.060900e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.062148e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.062247e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.603392 sec - 8,363,225,814 cycles # 3.036 GHz - 18,256,945,309 instructions # 2.18 insn per cycle - 3.094023307 seconds time elapsed +TOTAL : 2.389867 sec + 8,215,695,678 cycles # 3.030 GHz + 17,275,019,503 instructions # 2.10 insn per cycle + 2.770513749 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.197775e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.201219e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.201404e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.207928e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.211176e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.211361e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.997766 sec - 13,234,855,194 cycles # 3.059 GHz - 31,720,469,557 instructions # 2.40 insn per cycle - 4.381941370 seconds time elapsed +TOTAL : 3.993094 sec + 13,152,666,523 cycles # 3.046 GHz + 31,702,540,241 instructions # 2.41 insn per cycle + 4.376722174 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.326851e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.327168e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.327168e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.114407e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.114707e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.114707e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.375029 sec - 19,294,392,016 cycles # 3.039 GHz - 54,054,446,819 instructions # 2.80 insn per cycle - 6.408232704 seconds time elapsed +TOTAL : 6.512842 sec + 19,958,191,083 cycles # 3.063 GHz + 54,053,553,080 instructions # 2.71 insn per cycle + 6.517944789 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32344) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.647567e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.647690e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.647690e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.646584e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.646697e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.646697e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.276622 sec - 9,974,852,314 cycles # 3.104 GHz - 27,089,957,031 instructions # 2.72 insn per cycle - 3.602602833 seconds time elapsed +TOTAL : 3.212662 sec + 9,945,345,971 cycles # 3.094 GHz + 27,088,220,434 instructions # 2.72 insn per cycle + 3.217491965 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96405) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.545087e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.545646e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.545646e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.529039e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.529559e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.529559e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.542883 sec - 4,300,574,890 cycles # 2.868 GHz - 9,675,636,640 instructions # 2.25 insn per cycle - 1.599996711 seconds time elapsed +TOTAL : 1.502572 sec + 4,344,096,873 cycles # 2.885 GHz + 9,674,263,200 instructions # 2.23 insn per cycle + 1.507666318 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84384) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.977331e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.978024e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.978024e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.992732e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.993396e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.993396e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.338146 sec - 3,859,071,163 cycles # 2.887 GHz - 8,625,868,560 instructions # 2.24 insn per cycle - 1.434583522 seconds time elapsed +TOTAL : 1.328375 sec + 3,847,060,555 cycles # 2.889 GHz + 8,624,394,596 instructions # 2.24 insn per cycle + 1.333304826 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84025) (512y: 89) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.538142e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.538829e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.538829e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.596524e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.597264e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.597264e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.562404 sec - 2,716,901,219 cycles # 1.809 GHz - 4,346,266,756 instructions # 1.60 insn per cycle - 1.708272067 seconds time elapsed +TOTAL : 1.473376 sec + 2,714,389,029 cycles # 1.837 GHz + 4,343,683,031 instructions # 1.60 insn per cycle + 1.478445198 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2293) (512y: 103) (512z:83066) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index e74944ef41..1be170b683 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-08-14_01:42:59 +DATE: 2023-08-15_08:31:48 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -48,14 +48,14 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.059573e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.060768e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.060768e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.060453e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.061571e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.061571e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.376874 sec - 8,072,484,796 cycles # 2.986 GHz - 16,938,609,233 instructions # 2.10 insn per cycle - 2.760869723 seconds time elapsed +TOTAL : 2.369462 sec + 8,134,002,448 cycles # 3.017 GHz + 18,269,507,455 instructions # 2.25 insn per cycle + 2.752099708 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,14 +72,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.172016e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.212528e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.212528e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.152234e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.192819e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.192819e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.013584 sec - 13,230,749,646 cycles # 3.049 GHz - 31,568,370,369 instructions # 2.39 insn per cycle - 4.398123554 seconds time elapsed +TOTAL : 4.012383 sec + 13,220,105,631 cycles # 3.045 GHz + 31,192,462,945 instructions # 2.36 insn per cycle + 4.397831586 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -95,14 +95,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.509310e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.509630e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.509630e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.311172e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.311488e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.311488e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.217515 sec - 19,184,314,172 cycles # 3.084 GHz - 54,052,981,952 instructions # 2.82 insn per cycle - 6.222432290 seconds time elapsed +TOTAL : 6.356497 sec + 19,159,072,593 cycles # 3.013 GHz + 54,053,597,485 instructions # 2.82 insn per cycle + 6.361488517 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32344) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe @@ -122,14 +122,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.627370e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.627486e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.627486e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.618618e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.618732e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.618732e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.250429 sec - 10,019,712,784 cycles # 3.079 GHz - 27,089,138,978 instructions # 2.70 insn per cycle - 3.255541804 seconds time elapsed +TOTAL : 3.276148 sec + 10,008,076,049 cycles # 3.052 GHz + 27,089,522,265 instructions # 2.71 insn per cycle + 3.281084610 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96405) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe @@ -149,14 +149,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.546980e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.547537e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.547537e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.522177e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.522803e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.522803e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.494720 sec - 4,324,778,455 cycles # 2.887 GHz - 9,675,132,122 instructions # 2.24 insn per cycle - 1.499298440 seconds time elapsed +TOTAL : 1.505343 sec + 4,322,216,596 cycles # 2.865 GHz + 9,675,489,699 instructions # 2.24 insn per cycle + 1.510154474 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84384) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe @@ -176,14 +176,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.930807e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.931467e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.931467e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.959978e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.960721e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.960721e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.349355 sec - 3,906,123,677 cycles # 2.886 GHz - 8,625,426,747 instructions # 2.21 insn per cycle - 1.354396482 seconds time elapsed +TOTAL : 1.339988 sec + 3,869,344,477 cycles # 2.880 GHz + 8,625,428,660 instructions # 2.23 insn per cycle + 1.344873313 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84025) (512y: 89) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe @@ -203,14 +203,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.729735e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.730496e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.730496e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.665923e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.666615e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.666615e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.421944 sec - 2,709,187,359 cycles # 1.900 GHz - 4,344,600,460 instructions # 1.60 insn per cycle - 1.427094546 seconds time elapsed +TOTAL : 1.445605 sec + 2,710,048,805 cycles # 1.872 GHz + 4,344,691,123 instructions # 1.60 insn per cycle + 1.450245895 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2293) (512y: 103) (512z:83066) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index 84751700b2..b48d707ba0 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-08-14_01:10:10 +DATE: 2023-08-15_08:09:20 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.050010e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.051227e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.051320e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.052449e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.053751e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.053867e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.624008 sec - 8,312,521,844 cycles # 3.012 GHz - 18,860,739,925 instructions # 2.27 insn per cycle - 3.543291629 seconds time elapsed +TOTAL : 2.401792 sec + 8,176,355,448 cycles # 3.002 GHz + 18,453,741,110 instructions # 2.26 insn per cycle + 2.783918975 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.226384e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.229945e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.230133e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.211968e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.215119e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.215326e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.995782 sec - 13,214,764,161 cycles # 3.048 GHz - 31,147,325,863 instructions # 2.36 insn per cycle - 4.392618756 seconds time elapsed +TOTAL : 3.995060 sec + 13,201,969,473 cycles # 3.051 GHz + 29,028,921,322 instructions # 2.20 insn per cycle + 4.382926828 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.305982e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.306285e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.306285e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.385858e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.386156e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.386156e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.388170 sec - 19,251,020,153 cycles # 3.026 GHz - 54,076,999,316 instructions # 2.81 insn per cycle - 6.408962792 seconds time elapsed +TOTAL : 6.303840 sec + 19,464,475,657 cycles # 3.088 GHz + 54,075,582,748 instructions # 2.78 insn per cycle + 6.308485796 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32250) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.635978e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.636087e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.636087e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.630524e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.630632e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.630632e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.254285 sec - 9,971,831,230 cycles # 3.078 GHz - 27,084,550,159 instructions # 2.72 insn per cycle - 3.366598291 seconds time elapsed +TOTAL : 3.248802 sec + 9,955,685,609 cycles # 3.062 GHz + 27,083,252,200 instructions # 2.72 insn per cycle + 3.253385767 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96261) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.533836e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.534465e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.534465e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.474200e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.474751e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.474751e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.503056 sec - 4,339,385,797 cycles # 2.885 GHz - 9,687,047,671 instructions # 2.23 insn per cycle - 1.538517934 seconds time elapsed +TOTAL : 1.526083 sec + 4,358,815,644 cycles # 2.850 GHz + 9,685,708,561 instructions # 2.22 insn per cycle + 1.530767974 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84456) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.959338e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.960008e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.960008e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.953611e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.954307e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.954307e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.401841 sec - 3,860,666,699 cycles # 2.872 GHz - 8,635,848,419 instructions # 2.24 insn per cycle - 1.641256445 seconds time elapsed +TOTAL : 1.341317 sec + 3,852,272,040 cycles # 2.864 GHz + 8,634,436,493 instructions # 2.24 insn per cycle + 1.345985072 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83903) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.713030e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.713771e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.713771e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.652417e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.653091e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.653091e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.479855 sec - 2,709,859,297 cycles # 1.892 GHz - 4,354,311,113 instructions # 1.61 insn per cycle - 1.724515909 seconds time elapsed +TOTAL : 1.451809 sec + 2,728,793,957 cycles # 1.876 GHz + 4,352,618,842 instructions # 1.60 insn per cycle + 1.456299369 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2175) (512y: 185) (512z:83037) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index bddb28ca75..c2f69eff61 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-08-14_01:11:19 +DATE: 2023-08-15_08:10:24 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.806434e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.808788e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.809067e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.817389e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.820205e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.820500e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.912078 sec - 5,816,187,845 cycles # 3.010 GHz - 11,609,814,800 instructions # 2.00 insn per cycle - 2.396686309 seconds time elapsed +TOTAL : 1.617573 sec + 5,689,264,894 cycles # 3.003 GHz + 12,093,638,450 instructions # 2.13 insn per cycle + 1.953515172 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.299440e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.300749e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.300829e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.287920e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.289001e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.289084e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.886383 sec - 6,540,170,119 cycles # 3.017 GHz - 13,328,224,584 instructions # 2.04 insn per cycle - 2.224583993 seconds time elapsed +TOTAL : 1.877475 sec + 6,493,452,356 cycles # 3.010 GHz + 13,536,213,574 instructions # 2.08 insn per cycle + 2.214676058 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.905802e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.906081e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.906081e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.933156e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.933445e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.933445e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.942283 sec - 18,209,018,174 cycles # 3.067 GHz - 53,645,414,893 instructions # 2.95 insn per cycle - 5.989924071 seconds time elapsed +TOTAL : 5.930546 sec + 18,238,911,118 cycles # 3.076 GHz + 53,645,677,508 instructions # 2.94 insn per cycle + 5.935454775 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:20320) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.619369e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.619878e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.619878e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.592142e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.592603e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.592603e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.477851 sec - 4,522,404,746 cycles # 3.081 GHz - 13,772,167,964 instructions # 3.05 insn per cycle - 1.557310125 seconds time elapsed +TOTAL : 1.477317 sec + 4,525,666,351 cycles # 3.058 GHz + 13,770,742,105 instructions # 3.04 insn per cycle + 1.481927048 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96921) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.990816e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.992612e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.992612e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.977692e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.979690e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.979690e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.761643 sec - 2,210,264,761 cycles # 2.886 GHz - 4,878,574,961 instructions # 2.21 insn per cycle - 0.986718635 seconds time elapsed +TOTAL : 0.763048 sec + 2,202,514,087 cycles # 2.872 GHz + 4,877,098,573 instructions # 2.21 insn per cycle + 0.767957660 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84898) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.915043e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.917294e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.917294e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.981050e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.983277e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.983277e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.775398 sec - 1,936,593,838 cycles # 2.858 GHz - 4,350,766,942 instructions # 2.25 insn per cycle - 1.064894447 seconds time elapsed +TOTAL : 0.668655 sec + 1,928,988,428 cycles # 2.872 GHz + 4,349,149,408 instructions # 2.25 insn per cycle + 0.673285423 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84581) (512y: 44) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.439909e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.442888e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.442888e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.478929e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.481681e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.481681e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.721557 sec - 1,369,259,588 cycles # 1.899 GHz - 2,201,062,368 instructions # 1.61 insn per cycle - 0.886021023 seconds time elapsed +TOTAL : 0.713031 sec + 1,364,935,338 cycles # 1.905 GHz + 2,199,428,003 instructions # 1.61 insn per cycle + 0.718033552 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2884) (512y: 48) (512z:83271) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 177490ccb5..47c696a06e 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-08-14_01:44:02 +DATE: 2023-08-15_08:32:52 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -48,14 +48,14 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.604120e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.605972e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.605972e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.603720e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.605621e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.605621e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 -TOTAL : 1.629607 sec - 5,747,175,828 cycles # 3.011 GHz - 12,397,004,525 instructions # 2.16 insn per cycle - 1.966672250 seconds time elapsed +TOTAL : 1.622946 sec + 5,724,029,892 cycles # 2.999 GHz + 12,052,755,071 instructions # 2.11 insn per cycle + 1.966452002 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,14 +72,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.329103e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.342942e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.342942e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.303465e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.316648e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.316648e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 -TOTAL : 1.879053 sec - 6,484,462,862 cycles # 3.002 GHz - 13,547,794,276 instructions # 2.09 insn per cycle - 2.217392683 seconds time elapsed +TOTAL : 1.899173 sec + 6,557,417,589 cycles # 3.011 GHz + 13,516,884,926 instructions # 2.06 insn per cycle + 2.237244968 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -95,14 +95,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.942118e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.942394e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.942394e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.907375e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.907662e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.907662e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.914411 sec - 18,228,035,767 cycles # 3.080 GHz - 53,644,537,425 instructions # 2.94 insn per cycle - 5.919455127 seconds time elapsed +TOTAL : 5.936791 sec + 18,252,219,781 cycles # 3.074 GHz + 53,645,575,809 instructions # 2.94 insn per cycle + 5.941222879 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:20320) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe @@ -122,14 +122,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.602430e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.602902e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.602902e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.604346e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.604809e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.604809e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.472193 sec - 4,541,562,905 cycles # 3.077 GHz - 13,771,424,082 instructions # 3.03 insn per cycle - 1.476910344 seconds time elapsed +TOTAL : 1.471022 sec + 4,520,493,698 cycles # 3.066 GHz + 13,771,561,641 instructions # 3.05 insn per cycle + 1.475822140 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96921) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe @@ -149,14 +149,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.981367e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.983288e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.983288e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.900672e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.902327e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.902327e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.762195 sec - 2,197,646,924 cycles # 2.869 GHz - 4,877,932,488 instructions # 2.22 insn per cycle - 0.767027404 seconds time elapsed +TOTAL : 0.771686 sec + 2,223,639,585 cycles # 2.868 GHz + 4,878,036,591 instructions # 2.19 insn per cycle + 0.776555403 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84898) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe @@ -176,14 +176,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.979933e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.982243e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.982243e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.972027e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.974296e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.974296e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.667539 sec - 1,933,856,224 cycles # 2.880 GHz - 4,350,048,984 instructions # 2.25 insn per cycle - 0.672552477 seconds time elapsed +TOTAL : 0.668515 sec + 1,929,712,588 cycles # 2.872 GHz + 4,350,045,769 instructions # 2.25 insn per cycle + 0.673226229 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84581) (512y: 44) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe @@ -203,14 +203,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.385202e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.387538e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.387538e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.310786e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.313011e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.313011e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.722736 sec - 1,367,653,768 cycles # 1.883 GHz - 2,200,386,250 instructions # 1.61 insn per cycle - 0.727430339 seconds time elapsed +TOTAL : 0.729035 sec + 1,369,441,194 cycles # 1.870 GHz + 2,200,529,525 instructions # 1.61 insn per cycle + 0.733767903 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2884) (512y: 48) (512z:83271) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 919cd71a61..9da08c83fd 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-08-14_01:12:11 +DATE: 2023-08-15_08:11:12 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.618209e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.620660e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.620902e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.623177e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.625959e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.626212e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 2.015743 sec - 5,790,092,141 cycles # 3.005 GHz - 11,886,911,142 instructions # 2.05 insn per cycle - 2.511246140 seconds time elapsed +TOTAL : 1.645327 sec + 5,745,791,715 cycles # 2.977 GHz + 11,621,961,179 instructions # 2.02 insn per cycle + 1.988370539 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.305654e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.306873e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.306959e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.261103e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.262286e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.262365e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.893582 sec - 6,597,577,116 cycles # 3.036 GHz - 14,552,946,223 instructions # 2.21 insn per cycle - 2.230880507 seconds time elapsed +TOTAL : 1.900522 sec + 6,625,830,421 cycles # 3.021 GHz + 13,870,125,042 instructions # 2.09 insn per cycle + 2.251246688 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.996827e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.997127e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.997127e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.922448e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.922723e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.922723e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.895379 sec - 18,182,409,029 cycles # 3.090 GHz - 53,663,254,524 instructions # 2.95 insn per cycle - 5.943525615 seconds time elapsed +TOTAL : 5.927760 sec + 18,213,792,071 cycles # 3.072 GHz + 53,662,418,994 instructions # 2.95 insn per cycle + 5.932795623 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:20477) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.612752e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.613204e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.613204e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.621315e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.621811e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.621811e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.577517 sec - 4,540,350,213 cycles # 3.084 GHz - 13,764,975,796 instructions # 3.03 insn per cycle - 1.916738265 seconds time elapsed +TOTAL : 1.463851 sec + 4,517,955,155 cycles # 3.080 GHz + 13,763,574,365 instructions # 3.05 insn per cycle + 1.468420531 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96593) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.180548e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.182356e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.182356e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.102450e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.104369e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.104369e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.769841 sec - 2,162,240,797 cycles # 2.900 GHz - 4,886,709,233 instructions # 2.26 insn per cycle - 0.925659420 seconds time elapsed +TOTAL : 0.749632 sec + 2,159,833,683 cycles # 2.868 GHz + 4,885,205,971 instructions # 2.26 insn per cycle + 0.754130887 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:85271) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.942183e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.944519e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.944519e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.966817e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.969220e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.969220e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.676714 sec - 1,945,087,494 cycles # 2.885 GHz - 4,358,393,924 instructions # 2.24 insn per cycle - 0.732072584 seconds time elapsed +TOTAL : 0.669184 sec + 1,936,191,266 cycles # 2.878 GHz + 4,356,892,283 instructions # 2.25 insn per cycle + 0.673682607 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:85056) (512y: 24) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.411029e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.413369e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.413369e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.196993e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.199545e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.199545e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.742950 sec - 1,372,385,255 cycles # 1.898 GHz - 2,210,331,368 instructions # 1.61 insn per cycle - 0.808838837 seconds time elapsed +TOTAL : 0.740090 sec + 1,384,261,381 cycles # 1.861 GHz + 2,208,765,319 instructions # 1.60 insn per cycle + 0.744974646 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3500) (512y: 33) (512z:83441) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index c6d3d1be8e..5f8ec2e089 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-08-14_01:13:02 +DATE: 2023-08-15_08:11:59 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.666856e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.668793e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.668937e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.670105e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.671739e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.671928e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.155304 sec - 7,482,292,231 cycles # 3.002 GHz - 16,648,711,482 instructions # 2.23 insn per cycle - 2.572235047 seconds time elapsed +TOTAL : 2.129095 sec + 7,309,412,823 cycles # 2.979 GHz + 16,121,474,446 instructions # 2.21 insn per cycle + 2.511223208 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.106939e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.107464e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.107489e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.108729e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.109170e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.109194e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.403955 sec - 11,404,902,518 cycles # 3.058 GHz - 26,798,312,354 instructions # 2.35 insn per cycle - 3.787738720 seconds time elapsed +TOTAL : 3.397563 sec + 11,348,855,392 cycles # 3.040 GHz + 27,320,618,219 instructions # 2.41 insn per cycle + 3.792342967 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.355440e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.355746e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.355746e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.366085e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.366386e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.366386e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.325382 sec - 19,449,810,214 cycles # 3.074 GHz - 54,291,646,619 instructions # 2.79 insn per cycle - 6.329977193 seconds time elapsed +TOTAL : 6.326826 sec + 19,384,652,452 cycles # 3.062 GHz + 54,291,405,977 instructions # 2.80 insn per cycle + 6.331822155 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:31979) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.593850e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.593971e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.593971e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.611293e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.611405e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.611405e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.355062 sec - 9,506,157,029 cycles # 2.860 GHz - 26,122,765,518 instructions # 2.75 insn per cycle - 3.408457282 seconds time elapsed +TOTAL : 3.287257 sec + 9,520,222,893 cycles # 2.895 GHz + 26,121,547,178 instructions # 2.74 insn per cycle + 3.291758740 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:95979) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.693904e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.694546e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.694546e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.681119e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.681692e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.681692e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.451810 sec - 4,147,385,444 cycles # 2.880 GHz - 9,340,347,312 instructions # 2.25 insn per cycle - 1.841854663 seconds time elapsed +TOTAL : 1.440055 sec + 4,148,952,901 cycles # 2.876 GHz + 9,338,763,918 instructions # 2.25 insn per cycle + 1.445107953 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84147) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.175679e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.176481e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.176481e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.218107e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.218862e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.218862e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.270122 sec - 3,656,572,187 cycles # 2.870 GHz - 8,315,329,120 instructions # 2.27 insn per cycle - 1.330414240 seconds time elapsed +TOTAL : 1.257512 sec + 3,633,626,273 cycles # 2.881 GHz + 8,314,696,967 instructions # 2.29 insn per cycle + 1.262536037 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83817) (512y: 79) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.816779e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.817544e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.817544e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.695528e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.696316e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.696316e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.417972 sec - 2,647,975,791 cycles # 1.898 GHz - 4,242,043,812 instructions # 1.60 insn per cycle - 1.510069345 seconds time elapsed +TOTAL : 1.436681 sec + 2,642,631,321 cycles # 1.836 GHz + 4,240,641,282 instructions # 1.60 insn per cycle + 1.441269837 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2285) (512y: 93) (512z:82779) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 09c12499f2..ab539291a5 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-08-14_01:14:06 +DATE: 2023-08-15_08:13:01 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.657116e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.659092e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.659220e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.655737e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.657354e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.657527e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.160124 sec - 7,535,428,848 cycles # 3.021 GHz - 16,641,147,852 instructions # 2.21 insn per cycle - 2.552394390 seconds time elapsed +TOTAL : 2.150810 sec + 7,429,217,381 cycles # 3.003 GHz + 15,053,638,777 instructions # 2.03 insn per cycle + 2.532126361 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.104620e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.105128e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.105155e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.107795e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.108240e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.108265e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.411185 sec - 11,361,440,224 cycles # 3.038 GHz - 26,887,254,715 instructions # 2.37 insn per cycle - 3.795967265 seconds time elapsed +TOTAL : 3.398964 sec + 11,335,112,372 cycles # 3.033 GHz + 26,200,903,178 instructions # 2.31 insn per cycle + 3.794779431 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.292213e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.292510e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.292510e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.205581e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.205881e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.205881e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.373072 sec - 19,456,833,794 cycles # 3.053 GHz - 54,296,575,955 instructions # 2.79 insn per cycle - 6.377946941 seconds time elapsed +TOTAL : 6.441909 sec + 19,418,391,360 cycles # 3.013 GHz + 54,296,934,460 instructions # 2.80 insn per cycle + 6.446485419 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32422) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.614743e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.614856e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.614856e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.605166e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.605275e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.605275e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.276195 sec - 9,475,782,239 cycles # 2.889 GHz - 26,036,173,216 instructions # 2.75 insn per cycle - 3.280979327 seconds time elapsed +TOTAL : 3.298409 sec + 9,439,216,231 cycles # 2.860 GHz + 26,036,756,092 instructions # 2.76 insn per cycle + 3.302974381 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:95858) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.726806e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.727400e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.727400e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.692424e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.693001e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.693001e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.423268 sec - 4,123,365,274 cycles # 2.889 GHz - 9,318,969,631 instructions # 2.26 insn per cycle - 1.428330078 seconds time elapsed +TOTAL : 1.436141 sec + 4,130,114,036 cycles # 2.869 GHz + 9,319,033,220 instructions # 2.26 insn per cycle + 1.440658695 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83787) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.209303e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.210068e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.210068e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.228544e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.229287e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.229287e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.260600 sec - 3,647,543,257 cycles # 2.888 GHz - 8,310,864,508 instructions # 2.28 insn per cycle - 1.265225686 seconds time elapsed +TOTAL : 1.255330 sec + 3,631,851,616 cycles # 2.885 GHz + 8,310,847,527 instructions # 2.29 insn per cycle + 1.260145344 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83306) (512y: 229) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.784623e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.785473e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.785473e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.774847e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.775588e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.775588e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.402909 sec - 2,638,826,314 cycles # 1.877 GHz - 4,239,197,481 instructions # 1.61 insn per cycle - 1.407676897 seconds time elapsed +TOTAL : 1.406738 sec + 2,637,440,345 cycles # 1.872 GHz + 4,239,370,332 instructions # 1.61 insn per cycle + 1.411242699 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1729) (512y: 175) (512z:82792) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 00b6edd6f3..7691ab57c4 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-08-14_01:07:03 +DATE: 2023-08-15_08:06:31 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.592080e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.322709e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.718340e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.756917e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.386054e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.750556e+07 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.496988 sec - 1,935,793,349 cycles # 2.880 GHz - 2,426,101,045 instructions # 1.25 insn per cycle - 0.908770582 seconds time elapsed +TOTAL : 0.449699 sec + 1,944,728,666 cycles # 2.912 GHz + 2,421,761,171 instructions # 1.25 insn per cycle + 0.725114953 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.304219e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.445784e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.915014e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.656769e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.576035e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.998689e+07 ) sec^-1 MeanMatrixElemValue = ( 4.282445e+02 +- 2.530899e+02 ) GeV^-2 -TOTAL : 0.540276 sec - 2,259,824,177 cycles # 2.888 GHz - 2,909,142,598 instructions # 1.29 insn per cycle - 0.839982230 seconds time elapsed +TOTAL : 0.533637 sec + 2,211,814,951 cycles # 2.864 GHz + 2,887,560,241 instructions # 1.31 insn per cycle + 0.830704475 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.164957e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.198004e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.198004e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.170227e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.203378e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.203378e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 1.429734 sec - 4,414,855,703 cycles # 3.080 GHz - 12,855,889,800 instructions # 2.91 insn per cycle - 1.466567893 seconds time elapsed +TOTAL : 1.424484 sec + 4,413,499,773 cycles # 3.094 GHz + 12,854,148,105 instructions # 2.91 insn per cycle + 1.429447827 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 732) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.071243e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.181449e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.181449e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.073346e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.180896e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.180896e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.814204 sec - 2,516,122,884 cycles # 3.072 GHz - 7,238,365,184 instructions # 2.88 insn per cycle - 0.899597536 seconds time elapsed +TOTAL : 0.812922 sec + 2,507,689,277 cycles # 3.072 GHz + 7,235,529,848 instructions # 2.89 insn per cycle + 0.823426737 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3150) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.632519e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.964758e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.964758e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.636144e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.974308e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.974308e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.529393 sec - 1,394,960,287 cycles # 2.911 GHz - 3,007,840,667 instructions # 2.16 insn per cycle - 0.719366162 seconds time elapsed +TOTAL : 0.474060 sec + 1,392,319,534 cycles # 2.912 GHz + 3,007,129,515 instructions # 2.16 insn per cycle + 0.479278999 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3017) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.972926e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.368385e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.368385e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.954912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.358621e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.358621e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.434906 sec - 1,280,481,936 cycles # 2.915 GHz - 2,859,618,669 instructions # 2.23 insn per cycle - 0.704768098 seconds time elapsed +TOTAL : 0.437714 sec + 1,282,899,010 cycles # 2.903 GHz + 2,860,715,054 instructions # 2.23 insn per cycle + 0.443030298 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2780) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.808970e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.000832e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.000832e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.745804e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.939011e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.939011e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.691504 sec - 1,228,120,467 cycles # 2.007 GHz - 1,845,589,250 instructions # 1.50 insn per cycle - 1.138453921 seconds time elapsed +TOTAL : 0.621468 sec + 1,228,938,779 cycles # 1.969 GHz + 1,844,901,377 instructions # 1.50 insn per cycle + 0.626449091 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1375) (512y: 106) (512z: 2270) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index 017f75c00c..2c0323afc3 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-08-14_01:41:17 +DATE: 2023-08-15_08:30:06 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -48,14 +48,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.040613e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.145299e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.145299e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.087095e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.261342e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.261342e+07 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.502047 sec - 2,089,244,952 cycles # 2.906 GHz - 2,755,592,867 instructions # 1.32 insn per cycle - 0.778200374 seconds time elapsed +TOTAL : 0.499760 sec + 2,097,668,356 cycles # 2.894 GHz + 2,791,555,788 instructions # 1.33 insn per cycle + 0.782534486 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,14 +72,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.816685e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.390684e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.390684e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.832332e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.392534e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.392534e+07 ) sec^-1 MeanMatrixElemValue = ( 4.282445e+02 +- 2.530899e+02 ) GeV^-2 -TOTAL : 0.796414 sec - 3,149,684,628 cycles # 2.947 GHz - 4,461,147,544 instructions # 1.42 insn per cycle - 1.129335763 seconds time elapsed +TOTAL : 0.798926 sec + 3,096,775,317 cycles # 2.919 GHz + 4,431,168,284 instructions # 1.43 insn per cycle + 1.123669054 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -95,14 +95,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.146537e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.179578e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.179578e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.165019e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.197795e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.197795e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 1.457776 sec - 4,445,650,179 cycles # 3.042 GHz - 12,861,335,382 instructions # 2.89 insn per cycle - 1.463004122 seconds time elapsed +TOTAL : 1.434384 sec + 4,437,567,993 cycles # 3.085 GHz + 12,859,550,220 instructions # 2.90 insn per cycle + 1.439848105 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 732) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe @@ -122,14 +122,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.087678e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.196453e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.196453e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.075971e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.181885e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.181885e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.813447 sec - 2,540,350,512 cycles # 3.109 GHz - 7,285,520,948 instructions # 2.87 insn per cycle - 0.824014435 seconds time elapsed +TOTAL : 0.818868 sec + 2,544,346,828 cycles # 3.089 GHz + 7,287,580,278 instructions # 2.86 insn per cycle + 0.830252102 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3150) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe @@ -149,14 +149,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.614600e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.948990e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.948990e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.614210e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.950189e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.950189e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.483614 sec - 1,423,294,871 cycles # 2.917 GHz - 3,058,183,936 instructions # 2.15 insn per cycle - 0.489198626 seconds time elapsed +TOTAL : 0.482662 sec + 1,421,601,668 cycles # 2.921 GHz + 3,056,342,411 instructions # 2.15 insn per cycle + 0.497011065 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3017) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe @@ -176,14 +176,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.930452e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.325705e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.325705e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.933148e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.331996e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.331996e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.446892 sec - 1,312,192,667 cycles # 2.909 GHz - 2,909,852,357 instructions # 2.22 insn per cycle - 0.452142917 seconds time elapsed +TOTAL : 0.445752 sec + 1,309,065,815 cycles # 2.907 GHz + 2,907,990,208 instructions # 2.22 insn per cycle + 0.456912060 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2780) (512y: 104) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe @@ -203,14 +203,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.808349e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.009638e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.009638e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.784613e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.977109e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.977109e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.614853 sec - 1,260,293,049 cycles # 2.035 GHz - 1,884,752,431 instructions # 1.50 insn per cycle - 0.620490604 seconds time elapsed +TOTAL : 0.619630 sec + 1,260,312,878 cycles # 2.020 GHz + 1,884,777,232 instructions # 1.50 insn per cycle + 0.634711205 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1375) (512y: 106) (512z: 2270) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index bcc8954296..9b62913c8c 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-08-14_01:07:24 +DATE: 2023-08-15_08:06:48 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.537895e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.228800e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.579817e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.718463e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.224283e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.561996e+07 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.450848 sec - 1,960,304,512 cycles # 2.929 GHz - 2,441,625,263 instructions # 1.25 insn per cycle - 0.973358701 seconds time elapsed +TOTAL : 0.450989 sec + 1,926,591,901 cycles # 2.878 GHz + 2,417,002,909 instructions # 1.25 insn per cycle + 0.726915120 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.273354e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.376753e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.830350e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.610230e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.459724e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.850101e+07 ) sec^-1 MeanMatrixElemValue = ( 4.282445e+02 +- 2.530899e+02 ) GeV^-2 -TOTAL : 0.539336 sec - 2,258,022,835 cycles # 2.886 GHz - 2,905,914,792 instructions # 1.29 insn per cycle - 0.839926576 seconds time elapsed +TOTAL : 0.532076 sec + 2,233,739,257 cycles # 2.906 GHz + 2,874,742,621 instructions # 1.29 insn per cycle + 0.828489500 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.184308e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.218795e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.218795e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.181743e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.216053e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.216053e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 1.445639 sec - 4,366,248,091 cycles # 3.095 GHz - 12,734,529,827 instructions # 2.92 insn per cycle - 1.529222334 seconds time elapsed +TOTAL : 1.408622 sec + 4,361,143,255 cycles # 3.089 GHz + 12,732,406,455 instructions # 2.92 insn per cycle + 1.413340753 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 687) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.124123e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.235249e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.235249e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.123224e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.235875e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.235875e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.800501 sec - 2,460,996,087 cycles # 3.087 GHz - 7,100,311,250 instructions # 2.89 insn per cycle - 1.029477316 seconds time elapsed +TOTAL : 0.794392 sec + 2,460,614,760 cycles # 3.084 GHz + 7,101,608,991 instructions # 2.89 insn per cycle + 0.812765815 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2966) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.322537e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.606892e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.606892e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.368124e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.655189e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.655189e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.541348 sec - 1,490,794,198 cycles # 2.864 GHz - 3,238,669,765 instructions # 2.17 insn per cycle - 0.757925572 seconds time elapsed +TOTAL : 0.508970 sec + 1,486,600,307 cycles # 2.896 GHz + 3,239,541,081 instructions # 2.18 insn per cycle + 0.514300144 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3078) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.526948e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.837509e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.837509e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.515852e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.825133e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.825133e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.489074 sec - 1,431,185,581 cycles # 2.909 GHz - 3,144,239,531 instructions # 2.20 insn per cycle - 0.769431358 seconds time elapsed +TOTAL : 0.489162 sec + 1,429,316,422 cycles # 2.899 GHz + 3,143,556,021 instructions # 2.20 insn per cycle + 0.494066423 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2785) (512y: 257) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.692829e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.874443e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.874443e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.520314e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.687012e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.687012e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.643460 sec - 1,272,261,385 cycles # 1.996 GHz - 2,112,490,218 instructions # 1.66 insn per cycle - 0.693322877 seconds time elapsed +TOTAL : 0.676320 sec + 1,279,958,194 cycles # 1.881 GHz + 2,113,888,708 instructions # 1.65 insn per cycle + 0.681951384 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1197) (512y: 194) (512z: 2426) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 9cd73f88df..2ebb36034e 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-08-14_01:07:45 +DATE: 2023-08-15_08:07:06 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.995970e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.173842e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.308306e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.317870e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.206497e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.327873e+08 ) sec^-1 MeanMatrixElemValue = ( 3.402886e+01 +- 1.677500e+01 ) GeV^-2 -TOTAL : 0.450259 sec - 1,894,005,127 cycles # 2.828 GHz - 2,377,316,207 instructions # 1.26 insn per cycle - 1.125643804 seconds time elapsed +TOTAL : 0.447120 sec + 1,895,627,368 cycles # 2.855 GHz + 2,376,491,406 instructions # 1.25 insn per cycle + 0.721871082 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 168 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.078447e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.797036e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.945520e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.844312e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.817021e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.945412e+08 ) sec^-1 MeanMatrixElemValue = ( 4.166198e+02 +- 2.517590e+02 ) GeV^-2 -TOTAL : 0.488213 sec - 2,078,037,812 cycles # 2.880 GHz - 2,651,521,651 instructions # 1.28 insn per cycle - 0.779788082 seconds time elapsed +TOTAL : 0.482475 sec + 2,050,887,558 cycles # 2.878 GHz + 2,606,594,300 instructions # 1.27 insn per cycle + 0.769817589 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.215390e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.244968e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.244968e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.209994e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.239548e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.239548e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422773e+01 +- 1.683421e+01 ) GeV^-2 -TOTAL : 1.368796 sec - 4,257,078,542 cycles # 3.101 GHz - 12,772,775,953 instructions # 3.00 insn per cycle - 1.434009950 seconds time elapsed +TOTAL : 1.374925 sec + 4,255,805,112 cycles # 3.086 GHz + 12,772,760,715 instructions # 3.00 insn per cycle + 1.380363694 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 693) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.337757e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.576268e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.576268e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.320390e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.557734e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.557734e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422773e+01 +- 1.683421e+01 ) GeV^-2 -TOTAL : 0.568588 sec - 1,587,230,440 cycles # 3.079 GHz - 4,250,195,057 instructions # 2.68 insn per cycle - 0.695524377 seconds time elapsed +TOTAL : 0.513592 sec + 1,585,556,515 cycles # 3.064 GHz + 4,249,378,012 instructions # 2.68 insn per cycle + 0.523909176 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3709) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.560045e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.497777e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.497777e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.542570e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.493905e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.493905e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.296255 sec - 799,920,416 cycles # 2.906 GHz - 1,814,663,595 instructions # 2.27 insn per cycle - 0.345252799 seconds time elapsed +TOTAL : 0.271116 sec + 797,632,544 cycles # 2.903 GHz + 1,813,869,762 instructions # 2.27 insn per cycle + 0.275808482 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3614) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.030835e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.122902e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.122902e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.021575e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.114751e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.114751e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.254036 sec - 752,009,620 cycles # 2.905 GHz - 1,736,159,316 instructions # 2.31 insn per cycle - 0.361432969 seconds time elapsed +TOTAL : 0.254142 sec + 752,335,072 cycles # 2.915 GHz + 1,735,351,851 instructions # 2.31 insn per cycle + 0.259107554 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3443) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.257018e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.855898e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.855898e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.361233e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.978175e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.978175e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.335002 sec - 688,668,898 cycles # 2.026 GHz - 1,223,449,303 instructions # 1.78 insn per cycle - 0.369979089 seconds time elapsed +TOTAL : 0.328715 sec + 686,462,283 cycles # 2.062 GHz + 1,222,689,460 instructions # 1.78 insn per cycle + 0.333921475 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 38) (512z: 2493) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 32b55e9202..41f39a811d 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-08-14_01:41:35 +DATE: 2023-08-15_08:30:25 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -48,14 +48,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.515486e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.017606e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.017606e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.591873e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.223950e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.223950e+07 ) sec^-1 MeanMatrixElemValue = ( 3.419752e+01 +- 1.682900e+01 ) GeV^-2 -TOTAL : 0.474668 sec - 1,993,230,838 cycles # 2.876 GHz - 2,609,365,190 instructions # 1.31 insn per cycle - 0.750368601 seconds time elapsed +TOTAL : 0.473901 sec + 2,006,434,611 cycles # 2.909 GHz + 2,607,870,987 instructions # 1.30 insn per cycle + 0.748330623 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,14 +72,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.148511e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.886160e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.886160e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.143568e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.858291e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.858291e+07 ) sec^-1 MeanMatrixElemValue = ( 4.349381e+02 +- 2.541442e+02 ) GeV^-2 -TOTAL : 0.645340 sec - 2,580,020,582 cycles # 2.915 GHz - 3,581,698,266 instructions # 1.39 insn per cycle - 0.942997501 seconds time elapsed +TOTAL : 0.643398 sec + 2,551,284,606 cycles # 2.899 GHz + 3,561,798,895 instructions # 1.40 insn per cycle + 0.939559641 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -95,14 +95,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.212676e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.242680e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.242680e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.208519e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.237979e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.237979e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422773e+01 +- 1.683421e+01 ) GeV^-2 -TOTAL : 1.375822 sec - 4,271,446,348 cycles # 3.097 GHz - 12,777,385,497 instructions # 2.99 insn per cycle - 1.381116450 seconds time elapsed +TOTAL : 1.380608 sec + 4,272,088,122 cycles # 3.086 GHz + 12,777,586,064 instructions # 2.99 insn per cycle + 1.386039886 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 693) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe @@ -122,14 +122,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.332628e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.573696e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.573696e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.316195e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.558201e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.558201e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422773e+01 +- 1.683421e+01 ) GeV^-2 -TOTAL : 0.516104 sec - 1,606,513,029 cycles # 3.090 GHz - 4,297,739,930 instructions # 2.68 insn per cycle - 0.530239572 seconds time elapsed +TOTAL : 0.518966 sec + 1,607,458,704 cycles # 3.073 GHz + 4,297,884,927 instructions # 2.67 insn per cycle + 0.529676971 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3709) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe @@ -149,14 +149,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.409626e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.344394e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.344394e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.441566e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.357712e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.357712e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.280705 sec - 819,900,701 cycles # 2.882 GHz - 1,850,922,729 instructions # 2.26 insn per cycle - 0.285615729 seconds time elapsed +TOTAL : 0.279512 sec + 817,665,413 cycles # 2.883 GHz + 1,851,099,137 instructions # 2.26 insn per cycle + 0.284756023 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3614) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe @@ -176,14 +176,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.009112e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.093718e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.093718e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.824094e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.872005e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.872005e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.258376 sec - 768,054,188 cycles # 2.932 GHz - 1,772,401,645 instructions # 2.31 insn per cycle - 0.263190139 seconds time elapsed +TOTAL : 0.264878 sec + 768,094,098 cycles # 2.860 GHz + 1,772,432,011 instructions # 2.31 insn per cycle + 0.270038989 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3443) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe @@ -203,14 +203,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.356761e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.978276e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.978276e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.316128e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.941733e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.941733e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.332643 sec - 707,258,674 cycles # 2.102 GHz - 1,264,389,051 instructions # 1.79 insn per cycle - 0.337868157 seconds time elapsed +TOTAL : 0.335239 sec + 706,774,302 cycles # 2.083 GHz + 1,264,350,644 instructions # 1.79 insn per cycle + 0.340454222 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2077) (512y: 38) (512z: 2493) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 76693da2e8..7d66debda8 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-08-14_01:08:05 +DATE: 2023-08-15_08:07:24 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.110930e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.184983e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.317785e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.315866e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.169151e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.283291e+08 ) sec^-1 MeanMatrixElemValue = ( 3.402886e+01 +- 1.677500e+01 ) GeV^-2 -TOTAL : 0.450722 sec - 1,902,030,438 cycles # 2.853 GHz - 2,381,470,826 instructions # 1.25 insn per cycle - 0.892078061 seconds time elapsed +TOTAL : 0.447750 sec + 1,887,621,838 cycles # 2.847 GHz + 2,363,659,721 instructions # 1.25 insn per cycle + 0.722412452 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 162 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.105185e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.788023e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.927871e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.786680e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.782188e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.899575e+08 ) sec^-1 MeanMatrixElemValue = ( 4.166198e+02 +- 2.517590e+02 ) GeV^-2 -TOTAL : 0.487491 sec - 2,103,413,519 cycles # 2.905 GHz - 2,654,417,817 instructions # 1.26 insn per cycle - 0.783960612 seconds time elapsed +TOTAL : 0.481145 sec + 2,053,728,244 cycles # 2.889 GHz + 2,608,696,340 instructions # 1.27 insn per cycle + 0.768584912 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.222997e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.252944e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.252944e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.218792e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.248961e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.248961e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422773e+01 +- 1.683421e+01 ) GeV^-2 -TOTAL : 1.360238 sec - 4,226,375,131 cycles # 3.098 GHz - 12,672,291,040 instructions # 3.00 insn per cycle - 1.383043142 seconds time elapsed +TOTAL : 1.365137 sec + 4,226,666,689 cycles # 3.088 GHz + 12,672,231,770 instructions # 3.00 insn per cycle + 1.370122458 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.659058e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.957654e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.957654e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.664170e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.958366e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.958366e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422773e+01 +- 1.683421e+01 ) GeV^-2 -TOTAL : 0.467868 sec - 1,452,852,675 cycles # 3.077 GHz - 4,138,538,739 instructions # 2.85 insn per cycle - 0.810486361 seconds time elapsed +TOTAL : 0.467403 sec + 1,451,107,170 cycles # 3.079 GHz + 4,137,811,344 instructions # 2.85 insn per cycle + 0.472451500 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3414) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.031981e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.584240e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.584240e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.034343e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.588836e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.588836e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.346491 sec - 1,015,777,117 cycles # 2.895 GHz - 2,142,932,514 instructions # 2.11 insn per cycle - 0.384075889 seconds time elapsed +TOTAL : 0.346626 sec + 1,014,394,653 cycles # 2.893 GHz + 2,142,123,582 instructions # 2.11 insn per cycle + 0.351820806 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4206) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.250914e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.832768e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.832768e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.191113e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.760427e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.760427e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.333207 sec - 978,712,000 cycles # 2.901 GHz - 2,061,974,698 instructions # 2.11 insn per cycle - 0.618694986 seconds time elapsed +TOTAL : 0.336175 sec + 977,059,662 cycles # 2.873 GHz + 2,061,260,048 instructions # 2.11 insn per cycle + 0.340940509 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4013) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.134554e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.490686e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.490686e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.066917e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.412043e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.412043e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.418992 sec - 865,070,742 cycles # 2.043 GHz - 1,591,763,542 instructions # 1.84 insn per cycle - 0.486102749 seconds time elapsed +TOTAL : 0.425660 sec + 865,326,882 cycles # 2.013 GHz + 1,591,030,103 instructions # 1.84 insn per cycle + 0.430935126 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2526) (512y: 22) (512z: 2998) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index ffda577805..1572e8a079 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-08-14_01:08:24 +DATE: 2023-08-15_08:07:40 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.584475e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.362029e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.748367e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.773573e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.383104e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.734022e+07 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.451999 sec - 1,943,447,204 cycles # 2.900 GHz - 2,444,384,403 instructions # 1.26 insn per cycle - 0.783980391 seconds time elapsed +TOTAL : 0.454328 sec + 1,916,621,964 cycles # 2.855 GHz + 2,425,052,266 instructions # 1.27 insn per cycle + 0.730279804 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.296403e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.520827e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.999878e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.650032e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.600937e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.004138e+07 ) sec^-1 MeanMatrixElemValue = ( 4.282445e+02 +- 2.530899e+02 ) GeV^-2 -TOTAL : 0.541277 sec - 2,281,705,857 cycles # 2.909 GHz - 2,910,985,044 instructions # 1.28 insn per cycle - 0.842339112 seconds time elapsed +TOTAL : 0.530540 sec + 2,232,843,689 cycles # 2.896 GHz + 2,905,011,243 instructions # 1.30 insn per cycle + 0.828486188 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.166405e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.199053e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.199053e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.162715e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.196172e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.196172e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 1.427556 sec - 4,430,820,537 cycles # 3.095 GHz - 12,827,128,817 instructions # 2.89 insn per cycle - 1.465258734 seconds time elapsed +TOTAL : 1.434253 sec + 4,436,956,891 cycles # 3.087 GHz + 12,827,279,350 instructions # 2.89 insn per cycle + 1.439255939 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.047955e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.151761e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.151761e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.052091e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.156668e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.156668e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.822728 sec - 2,503,622,148 cycles # 3.027 GHz - 7,160,659,176 instructions # 2.86 insn per cycle - 0.909239155 seconds time elapsed +TOTAL : 0.821769 sec + 2,501,734,616 cycles # 3.029 GHz + 7,159,831,814 instructions # 2.86 insn per cycle + 0.832634068 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3215) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.662959e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.003718e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.003718e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.665751e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.013101e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.013101e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.485579 sec - 1,375,740,734 cycles # 2.898 GHz - 2,981,571,363 instructions # 2.17 insn per cycle - 0.768853470 seconds time elapsed +TOTAL : 0.470297 sec + 1,374,569,459 cycles # 2.900 GHz + 2,980,850,687 instructions # 2.17 insn per cycle + 0.475201900 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3175) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.036780e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.450812e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.450812e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.059483e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.477223e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.477223e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.429322 sec - 1,251,952,370 cycles # 2.886 GHz - 2,835,457,886 instructions # 2.26 insn per cycle - 0.724079409 seconds time elapsed +TOTAL : 0.426148 sec + 1,249,053,156 cycles # 2.907 GHz + 2,832,877,164 instructions # 2.27 insn per cycle + 0.430932894 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2938) (512y: 110) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.696518e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.881096e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.881096e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.712610e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.896335e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.896335e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.633179 sec - 1,273,747,616 cycles # 1.997 GHz - 1,874,707,128 instructions # 1.47 insn per cycle - 0.680477087 seconds time elapsed +TOTAL : 0.629252 sec + 1,271,527,587 cycles # 2.008 GHz + 1,873,961,225 instructions # 1.47 insn per cycle + 0.634389126 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1723) (512y: 114) (512z: 2312) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index 2286f9fc7c..6d2531dde1 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -36,7 +36,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-08-14_01:08:43 +DATE: 2023-08-15_08:07:58 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.568267e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.219887e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.583508e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.789411e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.303398e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.650520e+07 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.452814 sec - 1,916,869,078 cycles # 2.851 GHz - 2,413,501,923 instructions # 1.26 insn per cycle - 1.095722687 seconds time elapsed +TOTAL : 0.451196 sec + 1,919,651,498 cycles # 2.869 GHz + 2,413,802,258 instructions # 1.26 insn per cycle + 0.726545933 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -60,14 +60,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.280116e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.402601e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.863370e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.626179e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.465436e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.858156e+07 ) sec^-1 MeanMatrixElemValue = ( 4.282445e+02 +- 2.530899e+02 ) GeV^-2 -TOTAL : 0.543373 sec - 2,254,973,753 cycles # 2.873 GHz - 2,937,447,691 instructions # 1.30 insn per cycle - 0.844504014 seconds time elapsed +TOTAL : 0.534363 sec + 2,214,066,326 cycles # 2.862 GHz + 2,892,011,520 instructions # 1.31 insn per cycle + 0.831589584 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -82,14 +82,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.184381e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.218117e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.218117e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.176974e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.211086e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.211086e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 1.407898 sec - 4,392,426,171 cycles # 3.114 GHz - 12,711,031,758 instructions # 2.89 insn per cycle - 1.439948284 seconds time elapsed +TOTAL : 1.414731 sec + 4,384,977,798 cycles # 3.091 GHz + 12,710,908,577 instructions # 2.90 insn per cycle + 1.419974932 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 659) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe @@ -108,14 +108,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.113865e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.224321e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.224321e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.092509e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.202126e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.202126e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.797766 sec - 2,456,277,344 cycles # 3.061 GHz - 6,949,916,407 instructions # 2.83 insn per cycle - 0.836149653 seconds time elapsed +TOTAL : 0.805760 sec + 2,454,654,239 cycles # 3.033 GHz + 6,949,292,792 instructions # 2.83 insn per cycle + 0.810702702 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3036) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe @@ -134,14 +134,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.337838e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.618742e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.618742e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.317533e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.604808e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.604808e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.513991 sec - 1,510,130,797 cycles # 2.911 GHz - 3,212,873,113 instructions # 2.13 insn per cycle - 0.860972968 seconds time elapsed +TOTAL : 0.516874 sec + 1,506,955,385 cycles # 2.891 GHz + 3,212,138,114 instructions # 2.13 insn per cycle + 0.522141241 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3285) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.595254e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.919188e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.919188e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.530873e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.849344e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.849344e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.478689 sec - 1,414,758,246 cycles # 2.929 GHz - 3,084,969,617 instructions # 2.18 insn per cycle - 1.018746453 seconds time elapsed +TOTAL : 0.487148 sec + 1,414,973,528 cycles # 2.884 GHz + 3,084,278,864 instructions # 2.18 insn per cycle + 0.492087646 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2936) (512y: 265) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe @@ -186,14 +186,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.661228e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.838511e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.838511e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.646198e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.823436e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.823436e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.640959 sec - 1,283,582,728 cycles # 1.987 GHz - 2,048,939,024 instructions # 1.60 insn per cycle - 0.716136354 seconds time elapsed +TOTAL : 0.644560 sec + 1,282,518,009 cycles # 1.977 GHz + 2,048,181,150 instructions # 1.60 insn per cycle + 0.649746004 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1520) (512y: 202) (512z: 2499) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe