Merge pull request #725 from pmoris/vsc_calcua
Update vsc_calcua config
pmoris authored Aug 8, 2024
2 parents 70c5b57 + a4ed136 commit ad49f85
Showing 2 changed files with 84 additions and 49 deletions.
102 changes: 67 additions & 35 deletions conf/vsc_calcua.config
@@ -4,58 +4,80 @@
// see: https://github.com/nf-core/configs?tab=readme-ov-file#adding-a-new-config
def scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp"

// Specify the work directory.
// Specify the work directory. Can be overwritten via the cli flag `-work-dir`.
workDir = "$scratch_dir/work"
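// Illustrative only: a single run can point elsewhere by overriding this on the command line,
// e.g. `nextflow run <pipeline> -profile vsc_calcua -work-dir "${VSC_SCRATCH}/my_run/work"`
// (the path shown here is an example, not a requirement).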

// Perform work directory cleanup when the run has successfully completed.
cleanup = true
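// Note: with cleanup enabled, intermediate files in the work directory are removed once the
// pipeline completes successfully, which also means such a run can no longer be resumed with `-resume`.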

def host = System.getenv("VSC_INSTITUTE")
// Check if environment variables for singularity/apptainer/nextflow cache and tmp dirs are set:
// - APPTAINER_TMPDIR/SINGULARITY_TMPDIR (warn if missing, apptainer defaults to $TMPDIR or /tmp)
// - APPTAINER_CACHEDIR/SINGULARITY_CACHEDIR (exit with error if missing, apptainer would default to $HOME otherwise)
// - NXF_APPTAINER_CACHEDIR/NXF_SINGULARITY_CACHEDIR (warn and set to $scratch_dir/apptainer/nextflow_cache if missing)
// Note that only the third env var can be set inside of this config file (cacheDir), because
// the env scope only provides env vars to tasks, not to the launch environment.
// See https://www.nextflow.io/docs/latest/config.html#scope-env
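// For reference, a ~/.bashrc (or job script) snippet satisfying all three checks could look as
// follows; the paths are the suggestions used in the messages below, not hard requirements:
//   export APPTAINER_TMPDIR="${VSC_SCRATCH}/apptainer/tmp"
//   export APPTAINER_CACHEDIR="${VSC_SCRATCH}/apptainer/cache"
//   export NXF_APPTAINER_CACHEDIR="${VSC_SCRATCH}/apptainer/nextflow_cache"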

// Check if APPTAINER_TMPDIR/SINGULARITY_TMPDIR environment variables are set.
// If they are available, try to create the tmp directory at the specified location.
// Skip if host is not CalcUA to avoid hindering github actions.
if ( host == "antwerpen" ) {
def apptainer_tmpdir = System.getenv("APPTAINER_TMPDIR") ?: System.getenv("SINGULARITY_TMPDIR") ?: null
if (! apptainer_tmpdir ) {
// Define variables outside of conditional scope to make them usable elsewhere
def apptainer_tmpdir = System.getenv("APPTAINER_TMPDIR") ?: System.getenv("SINGULARITY_TMPDIR") ?: null
def apptainer_cachedir = System.getenv("APPTAINER_CACHEDIR") ?: System.getenv("SINGULARITY_CACHEDIR") ?: null
def nxf_apptainer_cachedir = System.getenv("NXF_APPTAINER_CACHEDIR") ?: System.getenv("NXF_SINGULARITY_CACHEDIR") ?: null

// Skip check if host is not CalcUA, to avoid hindering github actions.
if ( System.getenv("VSC_INSTITUTE") == "antwerpen" ) {
// APPTAINER_TMPDIR/SINGULARITY_TMPDIR environment variable
if ( !apptainer_tmpdir ) {
// Apptainer defaults to $TMPDIR or /tmp (on the SLURM execution node) if this env var is not set.
// See https://apptainer.org/docs/user/main/build_env.html#temporary-folders
def tmp_dir = System.getenv("TMPDIR") ?: "/tmp"
System.err.println("\nWARNING: APPTAINER_TMPDIR/SINGULARITY_TMPDIR environment variable was not found.\nPlease add the line 'export APPTAINER_TMPDIR=\"\${VSC_SCRATCH}/apptainer/tmp\"' to your ~/.bashrc file (or set it with sbatch or in your job script).\nDefaulting to local $tmp_dir on the execution node of the Nextflow head process.\n")
// TODO: check if images stored there can be accessed by slurm jobs on other nodes
} else {
// If set, try to create the tmp directory at the specified location to avoid errors during
// docker image conversion (note that this only happens when no native singularity/apptainer
// images are available):
// FATAL: While making image from oci registry: error fetching image to cache: while
// building SIF from layers: unable to create new build: failed to create build parent dir:
// stat /scratch/antwerpen/203/vsc20380/apptainer/tmp: no such file or directory
apptainer_tmpdir = new File(apptainer_tmpdir)
if (! apptainer_tmpdir.exists() ) {
// mkdirs() signals failure by returning false rather than throwing, so check its return value.
if ( ! apptainer_tmpdir.mkdirs() ) {
System.err.println("\nWARNING: Could not create directory at the location specified by APPTAINER_TMPDIR/SINGULARITY_TMPDIR: $apptainer_tmpdir\nPlease check if this is a valid path to which you have write permission. Exiting...\n")
System.err.println("\nERROR: Could not create directory at the location specified by APPTAINER_TMPDIR/SINGULARITY_TMPDIR: $apptainer_tmpdir\nPlease check if this is a valid path to which you have write permission. Exiting...\n")
}
}
}
// APPTAINER_CACHEDIR/SINGULARITY_CACHEDIR
if ( !apptainer_cachedir ) {
System.err.println("\nERROR: APPTAINER_CACHEDIR/SINGULARITY_CACHEDIR environment variable was not found.\nPlease add the line 'export APPTAINER_CACHEDIR=\"\${VSC_SCRATCH}/apptainer/cache\"' to your ~/.bashrc file (or set it with sbatch or in your job script).\nUsing the default storage location of Singularity/Apptainer ~/.apptainer/cache/. Read more about why this should be avoided in the VSC docs: https://docs.vscentrum.be/software/singularity.html#building-on-vsc-infrastructure\n")
System.exit(1)
}
// NXF_APPTAINER_CACHEDIR/NXF_SINGULARITY_CACHEDIR
if ( !nxf_apptainer_cachedir ) {
nxf_apptainer_cachedir = "$scratch_dir/apptainer/nextflow_cache"
System.err.println("\nWARNING: NXF_APPTAINER_CACHEDIR/NXF_SINGULARITY_CACHEDIR environment variable was not found.\nPlease add the line 'export NXF_APPTAINER_CACHEDIR=\"\${VSC_SCRATCH}/apptainer/nextflow_cache\"' to your ~/.bashrc file (or set it with sbatch or in your job script) to choose the location of the Nextflow container image cache.\nDefaulting to $nxf_apptainer_cachedir (instead of the Nextflow work directory).\n")
}
}

// Function to check if the selected partition profile matches the partition on which the master
// Function to check if the selected partition profile matches the partition on which the head
// nextflow job was launched (either implicitly or via `sbatch --partition=<partition-name>`).
// If the profile type is `*_local` and the partitions do not match, stop the execution and
// warn the user.
// Only used for local execution profiles.
// If no partition is found (as on the login nodes), warn the user, but do not immediately exit
// to still allow debugging and testing.
// If the profile type is `*_local` and the partitions do not match, stop the execution and warn the user.
def partition_checker(String profile) {
// Skip check if host machine is not CalcUA, in order to not hinder github actions.
if ( host != "antwerpen" ) {
// System.err.println("\nWARNING: Skipping comparison of current partition and requested profile because the current machine is not VSC CalcUA.")
if ( System.getenv("VSC_INSTITUTE") != "antwerpen" ) {
return
}

def current_partition = System.getenv("SLURM_JOB_PARTITION")

try {
current_partition
} catch (java.io.IOException e) {
System.err.println("\nWARNING: Current partition could not be found in the expected \$SLURM_JOB_PARTITION environment variable. Please make sure that you submit your pipeline via a Slurm job instead of running the nextflow command directly on a login node.\nExiting...\n")
if (! current_partition) {
System.err.println("\nWARNING: Current partition could not be found in the expected \$SLURM_JOB_PARTITION environment variable. Please make sure that you submit your pipeline via a Slurm job or in an interactive `srun` session, instead of running the nextflow command directly on a login node.\n")
// TODO: optional exit here, but this makes debugging and testing more difficult.
}

try {
current_partition = profile
} catch (java.io.IOException e) {
System.err.println("\nWARNING: Slurm job was launched on the \'$current_partition\' partition, but the selected nextflow profile points to the $profile partition instead ('${profile}_local'). When using one of the local node execution profiles, please launch the job on the corresponding partition in Slurm.\nE.g., Slurm job submission command:\n sbatch --account <project_account> --partition=broadwell script.slurm\nand job script containing a nextflow command with matching profile section:\n nextflow run <pipeline> -profile vsc_calcua,broadwell_local\nExiting...\n")
else if (current_partition != profile) {
System.err.println("\nERROR: Slurm job was launched on the \'$current_partition\' partition, but the selected nextflow profile points to the $profile partition instead ('${profile}_local'). When using one of the local node execution profiles, please launch the job on the corresponding partition in Slurm.\nE.g., Slurm job submission command:\n sbatch --account <project_account> --partition=broadwell script.slurm\nand job script containing a nextflow command with matching profile section:\n nextflow run <pipeline> -profile vsc_calcua,broadwell_local\nAborting run...\n")
System.exit(1)
}
}
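// Hypothetical usage sketch: the actual call sites live in the (collapsed) profiles scope below,
// but a `*_local` profile would typically run the check when selected, along these lines:
//   broadwell_local {
//       partition_checker("broadwell")
//       // ... local executor settings for a single broadwell node ...
//   }
// so that `-profile vsc_calcua,broadwell_local` aborts early when the Slurm job was not
// submitted with `--partition=broadwell`.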

@@ -81,19 +103,18 @@ process {
}

// Specify that apptainer/singularity should be used and where the cache dir will be for the images.
// The singularity directive is used in favour of the apptainer one, because currently the apptainer
// Singularity is used in favour of apptainer, because currently the apptainer
// variant will pull in (and convert) docker images, instead of using pre-built singularity ones.
// To use the pre-built singularity containers instead, the singularity options should be selected
// with apptainer installed on the system, which defines singularity as an alias (as is the case
// on CalcUA).
// On a system where singularity is defined as an alias for apptainer (as is the case on CalcUA),
// this works out fine and results in pre-built singularity containers being downloaded.
// See https://nf-co.re/docs/usage/installation#pipeline-software
// and https://nf-co.re/tools#how-the-singularity-image-downloads-work
// See https://www.nextflow.io/docs/latest/config.html#scope-singularity
singularity {
enabled = true
autoMounts = true
// See https://www.nextflow.io/docs/latest/singularity.html#singularity-docker-hub
cacheDir = "$scratch_dir/apptainer/nextflow_cache" // Equivalent to setting NXF_APPTAINER_CACHEDIR/NXF_SINGULARITY_CACHEDIR environment variable
cacheDir = "$nxf_apptainer_cachedir" // Equivalent to setting NXF_APPTAINER_CACHEDIR/NXF_SINGULARITY_CACHEDIR environment variable
}
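// Illustrative launch command under this configuration (the pipeline name is a placeholder):
//   nextflow run <pipeline> -profile vsc_calcua,broadwell_local
// Pre-built Singularity images are then cached in the directory configured above, rather than in
// the Nextflow work directory, so they can be reused across runs.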

// Define profiles for the following partitions:
@@ -314,7 +335,7 @@ profiles {
}
}

// Define functions to fetch the available CPUs and memory of the current execution node.
// Define functions to fetch the available CPUs and memory of the current execution node.
// Only used when running one of the *_local partition profiles and allows the cpu
// and memory thresholds to be set dynamically based on the available hardware as reported
// by Slurm. Can be supplied with a default return value, which should be set to the
@@ -323,13 +344,24 @@ def get_allocated_cpus(int node_max_cpu) {
def max_cpus = System.getenv("SLURM_CPUS_PER_TASK") ?: System.getenv("SLURM_JOB_CPUS_PER_NODE") ?: node_max_cpu
return max_cpus.toInteger()
}
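// Hypothetical example: a `*_local` profile could cap the usable CPUs with
//   max_cpus = get_allocated_cpus(28)
// where 28 is an assumed full-node core count used as the fallback when Slurm reports no
// allocation (the real per-partition defaults are set in the profiles above).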

def get_allocated_mem(int node_max_mem) {
// default to max memory of node per partition type
int max_mem = node_max_mem

// grab environment variables with memory and cpu info
def mem_per_cpu = System.getenv("SLURM_MEM_PER_CPU")
def mem_per_node = System.getenv("SLURM_MEM_PER_NODE")
def cpus_per_task = System.getenv("SLURM_CPUS_PER_TASK") ?: System.getenv("SLURM_JOB_CPUS_PER_NODE")

// Check if memory is requested per cpu and the number of cpus was also set
if ( mem_per_cpu && cpus_per_task ) {
node_max_mem = mem_per_cpu.toInteger() / 1000 * cpus_per_task.toInteger()
max_mem = mem_per_cpu.toInteger() / 1000 * cpus_per_task.toInteger()
}

return "${node_max_mem}.GB"
// Check if total/node memory was requested instead
else if ( mem_per_node ) {
max_mem = mem_per_node.toInteger() / 1000
}
// return in expected GB string format
return "${max_mem}.GB"
}
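// Worked example (hypothetical values): with SLURM_MEM_PER_CPU=4000 (MB) and SLURM_CPUS_PER_TASK=7,
// max_mem becomes 4000 / 1000 * 7 = 28 and the function returns "28.GB"; with
// SLURM_MEM_PER_NODE=112000 instead, it returns "112.GB".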