From 64e239c4b6adcf597ac5ada6e7f03cbff126e833 Mon Sep 17 00:00:00 2001
From: jhendersonHDF
Date: Wed, 1 Nov 2023 14:41:46 -0500
Subject: [PATCH] Set NVHPC maximum optimization level to -O1 for now (#3800)

* Set NVHPC maximum optimization level to -O1 for now

Compiling HDF5 with NVHPC 23.5 - 23.9 results in test failures in
4 different test files that need to be resolved. Since those tests
pass with an optimization level of -O1 (and -O0) and it is currently
unclear whether the test failures are due to issues in HDF5 or issues
in the 'nvc' compiler, set the maximum optimization level for NVHPC
to -O1 until the test failures are resolved.

* Disable nvhpc Java testing in CMake and amend known issues

* Re-enable testing of Autotools nvhpc
---
 .github/workflows/nvhpc-auto.yml    | 10 ++++----
 .github/workflows/nvhpc-cmake.yml   | 12 ++++-----
 config/cmake/HDFCompilerFlags.cmake | 38 +++++++++++++++++++++++++++++
 config/nvidia-flags                 |  6 +++--
 release_docs/RELEASE.txt            | 22 +++++++++++++++++
 5 files changed, 75 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/nvhpc-auto.yml b/.github/workflows/nvhpc-auto.yml
index 2a97ba1fd21..3e3a323fe1e 100644
--- a/.github/workflows/nvhpc-auto.yml
+++ b/.github/workflows/nvhpc-auto.yml
@@ -67,11 +67,11 @@ jobs:
 
       # RUN TESTS
       # NORMAL
-#      - name: Autotools Run Tests
-#        run: |
-#          export PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/comm_libs/openmpi4/bin:/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin:$PATH
-#          make check -j
-#        working-directory: ${{ runner.workspace }}/build
+      - name: Autotools Run Tests
+        run: |
+          export PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/comm_libs/openmpi4/bin:/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin:$PATH
+          make check -j
+        working-directory: ${{ runner.workspace }}/build
 
       # INSTALL (note that this runs even when we don't run the tests)
       - name: Autotools Install
diff --git a/.github/workflows/nvhpc-cmake.yml b/.github/workflows/nvhpc-cmake.yml
index 489c0bbf3fb..e4a1454f215 100644
--- a/.github/workflows/nvhpc-cmake.yml
+++ b/.github/workflows/nvhpc-cmake.yml
@@ -56,7 +56,7 @@ jobs:
           -DLIBAEC_USE_LOCALCONTENT=OFF \
           -DZLIB_USE_LOCALCONTENT=OFF \
           -DHDF5_BUILD_FORTRAN:BOOL=ON \
-          -DHDF5_ENABLE_ASSERTS:BOOL=ON \
+          -DHDF5_BUILD_JAVA:BOOL=OFF \
           -DMPIEXEC_MAX_NUMPROCS:STRING="2" \
           $GITHUB_WORKSPACE
           cat src/libhdf5.settings
@@ -69,8 +69,8 @@ jobs:
         working-directory: ${{ runner.workspace }}/build
 
       # RUN TESTS
-#      - name: CMake Run Tests
-#        shell: bash
-#        run: |
-#          ctest . --parallel 2 -C ${{ inputs.build_mode }} -V
-#        working-directory: ${{ runner.workspace }}/build
+      - name: CMake Run Tests
+        shell: bash
+        run: |
+          ctest . --parallel 2 -C ${{ inputs.build_mode }} -V
+        working-directory: ${{ runner.workspace }}/build
diff --git a/config/cmake/HDFCompilerFlags.cmake b/config/cmake/HDFCompilerFlags.cmake
index e7b9337f39c..7bddad0f776 100644
--- a/config/cmake/HDFCompilerFlags.cmake
+++ b/config/cmake/HDFCompilerFlags.cmake
@@ -56,6 +56,44 @@ if (CMAKE_C_COMPILER_ID STREQUAL "NVHPC" )
   else ()
     set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Mbounds -g")
   endif ()
+
+  # With at least NVHPC 23.5 - 23.9, compiling with -O2 or higher and -DNDEBUG
+  # appears to have issues that manifest in the tests as incorrect metadata
+  # checksums being read or memory being corrupted. Compiling without -DNDEBUG
+  # does not appear to have these issues, but is not ideal due to compiling in
+  # asserts and other library debug code. Compiling with -O1 also does not appear
+  # to have these issues, so set maximum optimization level to -O1 for now until
+  # it can be determined whether these issues are compiler-specific or issues
+  # in the library.
+  set (cmake_c_flags_minsizerel_edited "${CMAKE_C_FLAGS_MINSIZEREL}")
+  string (REPLACE "-O2" "" cmake_c_flags_minsizerel_edited "${cmake_c_flags_minsizerel_edited}")
+  string (REPLACE "-O3" "" cmake_c_flags_minsizerel_edited "${cmake_c_flags_minsizerel_edited}")
+  string (REPLACE "-O4" "" cmake_c_flags_minsizerel_edited "${cmake_c_flags_minsizerel_edited}")
+  string (REPLACE "-Ofast" "" cmake_c_flags_minsizerel_edited "${cmake_c_flags_minsizerel_edited}")
+  string (REPLACE "-fast" "" cmake_c_flags_minsizerel_edited "${cmake_c_flags_minsizerel_edited}")
+  string (STRIP "${cmake_c_flags_minsizerel_edited}" cmake_c_flags_minsizerel_edited)
+  string (PREPEND cmake_c_flags_minsizerel_edited "-O1 ")
+  set (CMAKE_C_FLAGS_MINSIZEREL "${cmake_c_flags_minsizerel_edited}")
+
+  set (cmake_c_flags_release_edited "${CMAKE_C_FLAGS_RELEASE}")
+  string (REPLACE "-O2" "" cmake_c_flags_release_edited "${cmake_c_flags_release_edited}")
+  string (REPLACE "-O3" "" cmake_c_flags_release_edited "${cmake_c_flags_release_edited}")
+  string (REPLACE "-O4" "" cmake_c_flags_release_edited "${cmake_c_flags_release_edited}")
+  string (REPLACE "-Ofast" "" cmake_c_flags_release_edited "${cmake_c_flags_release_edited}")
+  string (REPLACE "-fast" "" cmake_c_flags_release_edited "${cmake_c_flags_release_edited}")
+  string (STRIP "${cmake_c_flags_release_edited}" cmake_c_flags_release_edited)
+  string (PREPEND cmake_c_flags_release_edited "-O1 ")
+  set (CMAKE_C_FLAGS_RELEASE "${cmake_c_flags_release_edited}")
+
+  set (cmake_c_flags_relwithdebinfo_edited "${CMAKE_C_FLAGS_RELWITHDEBINFO}")
+  string (REPLACE "-O2" "" cmake_c_flags_relwithdebinfo_edited "${cmake_c_flags_relwithdebinfo_edited}")
+  string (REPLACE "-O3" "" cmake_c_flags_relwithdebinfo_edited "${cmake_c_flags_relwithdebinfo_edited}")
+  string (REPLACE "-O4" "" cmake_c_flags_relwithdebinfo_edited "${cmake_c_flags_relwithdebinfo_edited}")
+  string (REPLACE "-Ofast" "" cmake_c_flags_relwithdebinfo_edited "${cmake_c_flags_relwithdebinfo_edited}")
+  string (REPLACE "-fast" "" cmake_c_flags_relwithdebinfo_edited "${cmake_c_flags_relwithdebinfo_edited}")
+  string (STRIP "${cmake_c_flags_relwithdebinfo_edited}" cmake_c_flags_relwithdebinfo_edited)
+  string (PREPEND cmake_c_flags_relwithdebinfo_edited "-O1 ")
+  set (CMAKE_C_FLAGS_RELWITHDEBINFO "${cmake_c_flags_relwithdebinfo_edited}")
 endif ()
 
 if (CMAKE_COMPILER_IS_GNUCC)
diff --git a/config/nvidia-flags b/config/nvidia-flags
index 864c6444114..c140edd9830 100644
--- a/config/nvidia-flags
+++ b/config/nvidia-flags
@@ -76,7 +76,8 @@ if test "X-nvc" = "X-$cc_vendor" -o "X-nvcc" = "X-$cc_vendor"; then
   ##############
 
   # NDEBUG is handled explicitly by the configure script
-  PROD_CFLAGS="-fast"
+  #PROD_CFLAGS="-fast"
+  PROD_CFLAGS="" # -fast implies -O2 and -O2+ currently has test failures.
 
   #########
   # Debug #
@@ -106,7 +107,8 @@ if test "X-nvc" = "X-$cc_vendor" -o "X-nvcc" = "X-$cc_vendor"; then
   ################
 
   HIGH_OPT_CFLAGS="-O1" # -O2+ currently has test failures.
-  DEBUG_OPT_CFLAGS="-gopt -O2"
+  #DEBUG_OPT_CFLAGS="-gopt -O2"
+  DEBUG_OPT_CFLAGS="-gopt -O1" # -O2+ currently has test failures.
   NO_OPT_CFLAGS="-O0"
 
   #################
diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt
index 0f18f6fa276..51a73ce1121 100644
--- a/release_docs/RELEASE.txt
+++ b/release_docs/RELEASE.txt
@@ -1158,6 +1158,28 @@ Platforms Tested
 Known Problems
 ==============
 
+    When HDF5 is compiled with NVHPC versions 23.5 - 23.9 (additional versions may
+    also be applicable) and with -O2 (or higher) and -DNDEBUG, test failures occur
+    in the following tests:
+
+        H5PLUGIN-filter_plugin
+        H5TEST-flush2
+        H5TEST-testhdf5-base
+        MPI_TEST_t_filters_parallel
+
+    Since these tests pass with an optimization level of -O1 (and -O0) and it is
+    currently unclear whether the test failures are due to issues in HDF5 or issues
+    in the 'nvc' compiler, the maximum optimization level for NVHPC has been set
+    to -O1 until the test failures can be resolved. Note that even at -O1 optimization
+    level, there still appears to be a sporadic test failure in the Java JUnit tests
+    that has occasionally been seen in JUnit-TestH5Pfapl and JUnit-TestH5D. It is also
+    unclear whether this is an issue in HDF5 or with the 'nvc' compiler. Finally, note
+    that NVHPC 23.9 will fail to compile the test/tselect.c test file with a compiler
+    error of 'use of undefined value' when the optimization level is -O2 or higher.
+    Nvidia is aware of this issue and has suggested lowering the optimization level to
+    -O1 for the time being:
+    https://forums.developer.nvidia.com/t/hdf5-no-longer-compiles-with-nv-23-9/269045.
+
     IEEE standard arithmetic enables software to raise exceptions such as overflow,
     division by zero, and other illegal operations without interrupting or halting
     the program flow. The HDF5 C library intentionally performs these exceptions.