diff --git a/.github/workflows/ubuntu-ci-x86_64.yaml b/.github/workflows/ubuntu-ci-x86_64.yaml index 64281eb78..dace76e13 100644 --- a/.github/workflows/ubuntu-ci-x86_64.yaml +++ b/.github/workflows/ubuntu-ci-x86_64.yaml @@ -116,9 +116,6 @@ jobs: # Add additional variants for MET packages, different from config/common/packages.yaml spack config add "packages:met:variants:+python +grib2 +graphics +lidar2nc +modis" - # https://github.com/spack/spack/issues/42137 - spack config add "packages:pflogger:variants:+mpi" - # Concretize and check for duplicates spack concretize 2>&1 | tee log.concretize.intel-2022.1.0 ${SPACK_STACK_DIR}/util/show_duplicate_packages.py -d log.concretize.intel-2022.1.0 -i fms -i crtm -i esmf -i mapl diff --git a/.github/workflows/ubuntu-rnd-x86_64.yaml b/.github/workflows/ubuntu-rnd-x86_64.yaml index e11a0e9ac..6b924468c 100644 --- a/.github/workflows/ubuntu-rnd-x86_64.yaml +++ b/.github/workflows/ubuntu-rnd-x86_64.yaml @@ -64,9 +64,6 @@ jobs: # Add additional variants for MET packages, different from config/common/packages.yaml spack config add "packages:met:variants:+python +grib2 +graphics +lidar2nc +modis" - # https://github.com/spack/spack/issues/42137 - spack config add "packages:pflogger:variants:+mpi" - # Overrides for spack build and staging areas to speed up builds, # separate from the default site config locations to avoid conflicts spack config add "config:build_stage:/home/ubuntu/spack-stack/CI/tmp/build_stage" @@ -133,9 +130,6 @@ jobs: # Add additional variants for MET packages, different from config/common/packages.yaml spack config add "packages:met:variants:+python +grib2 +graphics +lidar2nc +modis" - # https://github.com/spack/spack/issues/42137 - spack config add "packages:pflogger:variants:+mpi" - # Overrides for spack build and staging areas to speed up builds, # separate from the default site config locations to avoid conflicts spack config add "config:build_stage:/home/ubuntu/spack-stack/CI/tmp/build_stage" diff --git 
a/configs/common/packages.yaml b/configs/common/packages.yaml index 09e7b5c00..68dc82980 100644 --- a/configs/common/packages.yaml +++ b/configs/common/packages.yaml @@ -136,11 +136,6 @@ libyaml: version: ['0.2.5'] mapl: - # 2.35.2 goes with esmf@8.4.2, 2.40.3 goes with esmf@8.5.0 - # turn off ~pflogger and extdata2g to avoid compilation - # errors with intel@2021.7.0+, see - # https://github.com/JCSDA/spack-stack/issues/769 - # also: ... extdata2g segfault UFS? version: ['2.40.3'] variants: +shared +pflogger ~f2py # If making changes here, also check the Discover site config and the CI workflows @@ -199,6 +194,9 @@ variants: +pnetcdf parallel-netcdf: version: ['1.12.2'] + pflogger: + version: ['1.12.0'] + variants: +mpi pixman: variants: +pic # Do not build pkgconf - https://github.com/jcsda/spack-stack/issues/123 diff --git a/configs/sites/aws-pcluster/README.md b/configs/sites/aws-pcluster/README.md index 99f5cd50e..8190068fb 100644 --- a/configs/sites/aws-pcluster/README.md +++ b/configs/sites/aws-pcluster/README.md @@ -4,9 +4,9 @@ ### Base instance Choose a basic AMI from the Community AMIs tab that matches your desired OS and parallelcluster version. Select an instance type of the same family that you are planning to use for the head and the compute nodes, and enough storage for a swap file and a spack-stack installation. For example: -- AMI ID: ami-093dab62f7840644b -- Instance hpc6a.48xlarge -- Use 350GB of gp3 storage as / +- AMI ID: ami-07410779598773e7d (aws-parallelcluster-3.8.0-ubuntu-2204-lts-hvm-x86_64-202312160956 2023-12-16T10-00-45.861Z) +- Instance hpc7a.96xlarge +- Use 500GB of gp3 storage as / ### Prerequisites 1. 
As `root`: @@ -47,8 +47,7 @@ tar -xvf Lmod-8.7.tar.bz2 cd Lmod-8.7 # Note the weird prefix, lmod installs in PREFIX/lmod/X.Y automatically ./configure --prefix=/opt/ \ ---with-lmodConfigDir=/opt/lmod/8.7/config \ -2>&1 | tee log.config +--with-lmodConfigDir=/opt/lmod/8.7/config 2>&1 | tee log.config make install 2>&1 | tee log.install ln -sf /opt/lmod/lmod/init/profile /etc/profile.d/z00_lmod.sh ln -sf /opt/lmod/lmod/init/cshrc /etc/profile.d/z00_lmod.csh @@ -57,19 +56,19 @@ ln -sf /opt/lmod/lmod/init/profile.fish /etc/profile.d/z00_lmod.fish # Add custom module locations and fix existing modules # # intelmpi -echo "conflict openmpi" >> /opt/intel/mpi/2021.6.0/modulefiles/intelmpi -echo 'if { [ module-info mode load ] && ![ is-loaded libfabric-aws/1.16.0~amzn4.0 ] } {' >> /opt/intel/mpi/2021.6.0/modulefiles/intelmpi -echo ' module load libfabric-aws/1.16.0~amzn4.0' >> /opt/intel/mpi/2021.6.0/modulefiles/intelmpi -echo '}' >> /opt/intel/mpi/2021.6.0/modulefiles/intelmpi +echo "conflict openmpi" >> /opt/intel/mpi/2021.9.0/modulefiles/intelmpi +echo 'if { [ module-info mode load ] && ![ is-loaded libfabric-aws/1.19.0amzn4.0 ] } {' >> /opt/intel/mpi/2021.9.0/modulefiles/intelmpi +echo ' module load libfabric-aws/1.19.0amzn4.0' >> /opt/intel/mpi/2021.9.0/modulefiles/intelmpi +echo '}' >> /opt/intel/mpi/2021.9.0/modulefiles/intelmpi # openmpi -echo "conflict intelmpi" >> /usr/share/modules/modulefiles/openmpi/4.1.4 -echo 'if { [ module-info mode load ] && ![ is-loaded libfabric-aws/1.16.0~amzn4.0 ] } {' >> /usr/share/modules/modulefiles/openmpi/4.1.4 -echo ' module load libfabric-aws/1.16.0~amzn4.0' >> /usr/share/modules/modulefiles/openmpi/4.1.4 -echo '}' >> /usr/share/modules/modulefiles/openmpi/4.1.4 +echo "conflict intelmpi" >> /usr/share/modules/modulefiles/openmpi/4.1.6 +echo 'if { [ module-info mode load ] && ![ is-loaded libfabric-aws/1.19.0amzn4.0 ] } {' >> /usr/share/modules/modulefiles/openmpi/4.1.6 +echo ' module load libfabric-aws/1.19.0amzn4.0' >> 
/usr/share/modules/modulefiles/openmpi/4.1.6 +echo '}' >> /usr/share/modules/modulefiles/openmpi/4.1.6 # echo "module use /usr/share/modules/modulefiles" >> /etc/profile.d/z01_lmod.sh -echo "module use /opt/intel/mpi/2021.6.0/modulefiles" >> /etc/profile.d/z01_lmod.sh -echo "module use /home/ubuntu/jedi/modulefiles" >> /etc/profile.d/z01_lmod.sh +### No longer needed ### echo "module use /opt/intel/mpi/2021.9.0/modulefiles" >> /etc/profile.d/z01_lmod.sh +### No longer needed ### echo "module use /home/ubuntu/jedi/modulefiles" >> /etc/profile.d/z01_lmod.sh # # Log out completely, ssh back into the instance and check if lua modules work exit @@ -78,10 +77,10 @@ exit ssh ... # Now user ubuntu module av -module load libfabric-aws/1.16.0~amzn4.0 -module load openmpi/4.1.4 +module load libfabric-aws/1.19.0amzn4.0 +module load openmpi/4.1.6 module list -module unload openmpi/4.1.4 +module unload openmpi/4.1.6 module load intelmpi module list module purge @@ -103,29 +102,18 @@ apt install -y unzip apt install -y automake apt install -y xterm apt install -y texlive +apt install -y cmake # This is for ecflow -apt install -y qt5-default +apt install -y qtcreator qtbase5-dev qt5-qmake +apt install -y libqt5widgets5 apt install -y libqt5svg5-dev apt install -y qt5dxcb-plugin -### # Remove AWS openmpi -### apt remove -y openmpi40-aws - -# This is because boost doesn't work with the Intel compiler -apt install -y libboost1.71-dev -apt install -y libboost-chrono1.71-dev -apt install -y libboost-date-time1.71-dev -apt install -y libboost-exception1.71-dev -apt install -y libboost-filesystem1.71-dev -apt install -y libboost-program-options1.71-dev -apt install -y libboost-python1.71-dev -apt install -y libboost-regex1.71-dev -apt install -y libboost-serialization1.71-dev -apt install -y libboost-system1.71-dev -apt install -y libboost-test1.71-dev -apt install -y libboost-thread1.71-dev -apt install -y libboost-timer1.71-dev +# For mysql +apt install -y mysql-server +# Test +mysql -u 
root # Python apt install -y python3-dev python3-pip @@ -134,7 +122,16 @@ apt install -y python3-dev python3-pip wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list apt-get update -apt-get install -y intel-hpckit-2022.2.0/all +apt-get install -y intel-hpckit-2024.0/all +exit + +# As ubuntu +/opt/intel/modulefiles-setup.sh +# Back to root +sudo su +mv /home/ubuntu/modulefiles /opt/intel/modulefiles +echo "module unuse /opt/intel/mpi/2021.9.0/modulefiles" >> /etc/profile.d/z01_lmod.sh +echo "module use /opt/intel/modulefiles" >> /etc/profile.d/z01_lmod.sh # Docker # See https://docs.docker.com/engine/install/ubuntu/ @@ -158,15 +155,7 @@ service sshd restart cd /usr/lib64/ ln -sf /usr/lib/x86_64-linux-gnu/libcrypt.so . cd /usr/include -ln -sf python3.8/pyconfig.h . - -# Create swapfile - 100GB -dd if=/dev/zero of=/swapfile bs=128M count=800 -chmod 600 /swapfile -mkswap /swapfile -swapon /swapfile -swapon -s -echo "/swapfile swap swap defaults 0 0" >> /etc/fstab +ln -sf python3.10/pyconfig.h . # Exit root session exit @@ -177,73 +166,12 @@ git config --global credential.helper cache 2. Log out and back in to enable x11 forwarding -3. Build ecflow outside of spack to be able to link against OS boost +3. Create directory for spack-stack external packages ``` -mkdir -p /home/ubuntu/jedi/ecflow-5.8.4/src -cd /home/ubuntu/jedi/ecflow-5.8.4/src -wget https://confluence.ecmwf.int/download/attachments/8650755/ecFlow-5.8.4-Source.tar.gz?api=v2 -mv ecFlow-5.8.4-Source.tar.gz\?api\=v2 ecFlow-5.8.4-Source.tar.gz -tar -xvzf ecFlow-5.8.4-Source.tar.gz -export WK=/home/ubuntu/jedi/ecflow-5.8.4/src/ecFlow-5.8.4-Source -export BOOST_ROOT=/usr - -# Build ecFlow -cd $WK -mkdir build -cd build -cmake .. 
-DPython3_EXECUTABLE=/usr/bin/python3 -DENABLE_STATIC_BOOST_LIBS=OFF -DCMAKE_INSTALL_PREFIX=/home/ubuntu/jedi/ecflow-5.8.4 2>&1 | tee log.cmake -make -j4 2>&1 | tee log.make -make install 2>&1 | tee log.install - -# Create a modulefiles directory with the following ecflow/5.8.4 module in it (w/o the '%%%%...' lines): -mkdir -p /home/ubuntu/jedi/modulefiles/ecflow -vi /home/ubuntu/jedi/modulefiles/ecflow/5.8.4 -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -#%Module1.0 - -module-whatis "Provides an ecflow-5.8.4 server+ui installation for use with spack." - -conflict ecflow - -proc ModulesHelp { } { -puts stderr "Provides an ecflow-5.8.4 server+ui installation for use with spack." -} - -# Set this value -set ECFLOW_PATH "/home/ubuntu/jedi/ecflow-5.8.4" - -prepend-path PATH "${ECFLOW_PATH}/bin" -prepend-path LD_LIBRARY_PATH "${ECFLOW_PATH}/lib" -prepend-path LIBRARY_PATH "${ECFLOW_PATH}/lib" -prepend-path CPATH "${ECFLOW_PATH}/include" -prepend-path CMAKE_PREFIX_PATH "${ECFLOW_PATH}" -prepend-path PYTHONPATH "${ECFLOW_PATH}/lib/python3.8/site-packages" -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -``` - -4. Install msql community server -``` -cd /home/ubuntu/jedi -mkdir -p mysql-8.0.31/src -cd mysql-8.0.31/src -wget https://dev.mysql.com/get/Downloads/MySQL-8.0/mysql-server_8.0.32-1ubuntu20.04_amd64.deb-bundle.tar -tar -xvf mysql-server_8.0.32-1ubuntu20.04_amd64.deb-bundle.tar -# Switch to root -sudo su -dpkg -i *.deb -apt --fix-broken install -dpkg -i *.deb -# Use an empty password for root, choose legacy authentication method; test connection -mysql -u root -show databases; -# exit mysql -exit -# exit root session -exit -rm *.deb +mkdir -p /home/ubuntu/spack-stack/external ``` -5. Option 1: Testing existing site config in spack-stack (skip steps 5-7 afterwards) +4. 
Option 1: Testing existing site config in spack-stack (skip steps 5-7 afterwards) ``` mkdir -p /home/ubuntu/sandpit cd /home/ubuntu/sandpit @@ -260,7 +188,7 @@ spack module lmod refresh spack stack setup-meta-modules ``` -6. Option 2: Test configuring site from scratch +5. Option 2: Test configuring site from scratch ``` mkdir /home/ubuntu/jedi && cd /home/ubuntu/jedi git clone -b develop --recursive https://github.com/jcsda/spack-stack spack-stack @@ -271,39 +199,41 @@ spack env activate -p envs/unified-env export SPACK_SYSTEM_CONFIG_PATH=/home/ubuntu/jedi/spack-stack/envs/unified-env/site -spack external find --scope system -spack external find --scope system perl -spack external find --scope system python +spack external find --scope system \ + --exclude bison --exclude cmake \ + --exclude curl --exclude openssl \ + --exclude openssh spack external find --scope system wget -spack external find --scope system texlive spack external find --scope system mysql +spack external find --scope system texlive +spack external find --scope system sed # No external find for pre-installed intel-oneapi-mpi (from pcluster AMI), # and no way to add object entry to list using "spack config add". 
echo " intel-oneapi-mpi:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml echo " externals:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml -echo " - spec: intel-oneapi-mpi@2021.6.0%intel@2022.1.0" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml +echo " - spec: intel-oneapi-mpi@2021.9.0%intel@2022.1.0" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml echo " prefix: /opt/intel" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml echo " modules:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml -echo " - libfabric-aws/1.16.0~amzn4.0" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml +echo " - libfabric-aws/1.19.0amzn4.0" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml echo " - intelmpi" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml # Add external openmpi echo " openmpi:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml echo " externals:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml -echo " - spec: openmpi@4.1.4%gcc@9.4.0~cuda~cxx~cxx_exceptions~java~memchecker+pmi~static~wrapper-rpath" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml +echo " - spec: openmpi@4.1.6%gcc@9.4.0~cuda~cxx~cxx_exceptions~java~memchecker+pmi~static~wrapper-rpath" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml echo " fabrics=ofi schedulers=slurm" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml echo " prefix: /opt/amazon/openmpi" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml echo " modules:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml -echo " - libfabric-aws/1.16.0~amzn3.0" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml -echo " - openmpi/4.1.4" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml +echo " - libfabric-aws/1.19.0amzn4.0" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml +echo " - openmpi/4.1.6" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml # Can't find qt5 because qtpluginfo is broken, # and no way to add object entry to list using "spack config add". 
echo " qt:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml echo " buildable: False" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml echo " externals:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml -echo " - spec: qt@5.12.8" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml +echo " - spec: qt@5.15.3" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml echo " prefix: /usr" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml # Add external boost @@ -325,7 +255,7 @@ spack compiler find --scope system export -n SPACK_SYSTEM_CONFIG_PATH spack config add "packages:mpi:buildable:False" -spack config add "packages:all:providers:mpi:[intel-oneapi-mpi@2021.6.0, openmpi@4.1.4]" +spack config add "packages:all:providers:mpi:[intel-oneapi-mpi@2021.9.0, openmpi@4.1.6]" spack config add "packages:all:compiler:[intel@2022.1.0, gcc@9.4.0]" # edit envs/unified-env/site/compilers.yaml and replace the following line in the **Intel** compiler section: @@ -333,7 +263,7 @@ spack config add "packages:all:compiler:[intel@2022.1.0, gcc@9.4.0]" # --> # environment: # prepend_path: -# LD_LIBRARY_PATH: '/opt/intel/oneapi/compiler/2021.6.0/linux/compiler/lib/intel64_lin' +# LD_LIBRARY_PATH: '/opt/intel/oneapi/compiler/2021.9.0/linux/compiler/lib/intel64_lin' # set: # I_MPI_PMI_LIBRARY: '/opt/slurm/lib/libpmi.so' ``` diff --git a/configs/sites/aws-pcluster/packages.yaml b/configs/sites/aws-pcluster/packages.yaml index 698fd724d..dd48686f7 100644 --- a/configs/sites/aws-pcluster/packages.yaml +++ b/configs/sites/aws-pcluster/packages.yaml @@ -33,10 +33,6 @@ packages: ### Modification of common packages - # https://github.com/spack/spack/issues/42137 - pflogger: - variants: +mpi - ### All other external packages listed alphabetically autoconf: externals: