Skip to content

Commit

Permalink
Use official SchedMD Debian packages to install Slurm
Browse files Browse the repository at this point in the history
Update config files to accommodate newer version of Slurm.
  • Loading branch information
christopherwharrop-noaa committed Jun 13, 2024
1 parent a00fae4 commit c75695c
Show file tree
Hide file tree
Showing 12 changed files with 200 additions and 81 deletions.
46 changes: 39 additions & 7 deletions frontend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,34 +10,66 @@ RUN apt-get update -y && apt-get install -y \

ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get install -y \
devscripts \
equivs \
fakeroot \
libbpf-dev \
libdbus-1-dev \
libhwloc-dev \
openssh-server \
slurm-client \
sudo


# Build Slurm from the SchedMD source tarball with the Debian packaging that
# ships inside it, then install the client-side packages the frontend needs.
#
# - SLURM_VERSION is the single place to bump the release; SLURM_PKG_VERSION is
#   the "<version>-1" revision that appears in the generated .deb file names.
# - `cd` is used inside the RUN (rather than WORKDIR) so the image's working
#   directory for later instructions is not changed by this step.
# - All packages are handed to one `dpkg --install` invocation so dpkg sees the
#   whole set together instead of one package at a time.
# - The source tree, tarball and every generated slurm-smd* artifact are removed
#   in this same layer; deleting them in a later layer would not shrink the image.
RUN SLURM_VERSION=23.11.7 \
 && SLURM_PKG_VERSION="${SLURM_VERSION}-1" \
 && cd /tmp \
 && wget "https://download.schedmd.com/slurm/slurm-${SLURM_VERSION}.tar.bz2" \
 && tar -xaf "slurm-${SLURM_VERSION}.tar.bz2" \
 && cd "slurm-${SLURM_VERSION}" \
 && mk-build-deps -t "apt-get -o Debug::pkgProblemResolver=yes -y" -i debian/control \
 && debuild -b -uc -us \
 && cd .. \
 && dpkg --install \
      "slurm-smd_${SLURM_PKG_VERSION}_amd64.deb" \
      "slurm-smd-client_${SLURM_PKG_VERSION}_amd64.deb" \
      "slurm-smd-dev_${SLURM_PKG_VERSION}_amd64.deb" \
      "slurm-smd-doc_${SLURM_PKG_VERSION}_all.deb" \
      "slurm-smd-libnss-slurm_${SLURM_PKG_VERSION}_amd64.deb" \
      "slurm-smd-libpam-slurm-adopt_${SLURM_PKG_VERSION}_amd64.deb" \
      "slurm-smd-libpmi0_${SLURM_PKG_VERSION}_amd64.deb" \
      "slurm-smd-libpmi2-0_${SLURM_PKG_VERSION}_amd64.deb" \
      "slurm-smd-libslurm-perl_${SLURM_PKG_VERSION}_amd64.deb" \
      "slurm-smd-sackd_${SLURM_PKG_VERSION}_amd64.deb" \
      "slurm-smd-sview_${SLURM_PKG_VERSION}_amd64.deb" \
 && rm -rf "slurm-${SLURM_VERSION}" "slurm-${SLURM_VERSION}.tar.bz2" slurm-smd*

# Create the "admin" login account (home /home/admin, bash shell), set its
# password, add it to the sudo group and grant it passwordless sudo.
# NOTE(review): the password is hard-coded in the image; acceptable only for a
# throw-away test cluster.
RUN useradd --create-home --home-dir /home/admin --shell /usr/bin/bash admin \
 && printf '%s\n' "admin:admin" | chpasswd \
 && usermod --append --groups sudo admin \
 && printf '%s\n' "admin ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers

# Create the "slurm" account (home /home/slurm, bash shell) and set its password.
# The config files under /etc/slurm are chown'ed to this user further down.
RUN useradd --create-home --home-dir /home/slurm --shell /usr/bin/bash slurm \
 && printf '%s\n' "slurm:slurm" | chpasswd

# Prepare the image for running sshd:
#  - create /var/run/sshd,
#  - downgrade pam_loginuid from "required" to "optional" in the sshd PAM stack,
#  - silence the login banner by dropping /etc/legal and un-executing the
#    update-motd.d scripts.
RUN mkdir /var/run/sshd \
 && sed -i -e 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' /etc/pam.d/sshd \
 && chmod -x /etc/update-motd.d/* \
 && rm -f /etc/legal

# Install the Slurm configuration files and the container entrypoint script
# from the build context into /etc/slurm/.
COPY slurm.conf cgroup.conf docker-entrypoint.sh /etc/slurm/

# Hand /etc/slurm and its *.conf files to the slurm user and open them up
# to mode 775. The glob only matches *.conf, so docker-entrypoint.sh keeps the
# ownership and mode it was copied with.
RUN chown slurm:slurm /etc/slurm /etc/slurm/*.conf \
 && chmod 775 /etc/slurm /etc/slurm/*.conf

# Copy the `ssh` directory from the build context into admin's ~/.ssh.
# NOTE(review): presumably holds the key material / authorized_keys used to
# log in between cluster containers -- confirm against the repository's ssh/ dir.
COPY ssh /home/admin/.ssh

# COPY creates the files as root, so give them to admin and restrict the
# directory and everything inside it to the owner only (mode 700, recursive).
RUN chown -R admin:admin /home/admin/.ssh \
&& chmod -R 700 /home/admin/.ssh

COPY slurm.conf /etc/slurm-llnl/
COPY cgroup.conf /etc/slurm-llnl/
COPY docker-entrypoint.sh /etc/slurm-llnl/

# Default working directory for the container: admin's home.
WORKDIR /home/admin

# Documented ports. 22 is the ssh service started by the entrypoint script.
# NOTE(review): the purpose of 8888 is not visible in this file -- confirm.
EXPOSE 22 8888

# Runtime environment. Uses the key=value form; the space-separated
# `ENV key value` form is deprecated and flagged by Docker build checks.
ENV USER=admin
ENV SHELL=bash

ENTRYPOINT ["/etc/slurm-llnl/docker-entrypoint.sh"]
ENTRYPOINT ["/etc/slurm/docker-entrypoint.sh"]
12 changes: 9 additions & 3 deletions frontend/cgroup.conf
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
CgroupAutomount=yes
CgroupReleaseAgentDir="/etc/slurm/cgroup"
###
#
# Slurm cgroup support configuration file
#
# See man slurm.conf and man cgroup.conf for further
# information on cgroup configuration parameters
#--
CgroupPlugin=cgroup/v1
ConstrainSwapSpace=no
ConstrainCores=yes
ConstrainDevices=yes
ConstrainRAMSpace=yes

2 changes: 1 addition & 1 deletion frontend/docker-entrypoint.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

export SLURM_CPUS_ON_NODE=$(cat /proc/cpuinfo | grep processor | wc -l)
sudo sed -i "s/REPLACE_IT/CPUs=${SLURM_CPUS_ON_NODE}/g" /etc/slurm-llnl/slurm.conf
sudo sed -i "s/REPLACE_IT/CPUs=${SLURM_CPUS_ON_NODE}/g" /etc/slurm/slurm.conf

sudo service munge start
sudo service ssh start
Expand Down
29 changes: 14 additions & 15 deletions frontend/slurm.conf
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ClusterName=cluster
SlurmctldHost=slurmmaster
#
#DisableRootJobs=NO
Expand Down Expand Up @@ -39,20 +40,21 @@ ProctrackType=proctrack/linuxproc
#RebootProgram=
ReturnToService=1
#SallocDefaultCommand=
SlurmdParameters=config_overrides
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=root
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/var/spool
StateSaveLocation=/var/spool/slurmctld
SwitchType=switch/none
#TaskEpilog=
TaskPlugin=task/affinity
TaskPluginParam=Sched
#TaskPlugin=task/affinity
TaskPlugin=task/none
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFS=/tmp
Expand Down Expand Up @@ -87,8 +89,7 @@ Waittime=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/cons_res
SelectTypeParameters=CR_Core
SelectType=select/cons_tres
#
#
# JOB PRIORITY
Expand All @@ -114,7 +115,6 @@ SelectTypeParameters=CR_Core
#AccountingStoragePort=
AccountingStorageType=accounting_storage/none
#AccountingStorageUser=
AccountingStoreJobComment=YES
ClusterName=cluster
#DebugFlags=
#JobCompHost=
Expand All @@ -126,10 +126,10 @@ JobCompType=jobcomp/none
#JobContainerType=job_container/none
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
SlurmctldDebug=error
SlurmctldLogFile=/var/log/slurm-llnl/slurmctld.log
SlurmdDebug=error
SlurmdLogFile=/var/log/slurm-llnl/slurmd.log
SlurmctldDebug=debug2
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=debug2
SlurmdLogFile=/var/log/slurmd.log
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#
Expand All @@ -147,7 +147,6 @@ SlurmdLogFile=/var/log/slurm-llnl/slurmd.log
#
#
# COMPUTE NODES
#
NodeName=slurmnode[1-10] REPLACE_IT State=UNKNOWN
PartitionName=slurmpar Nodes=slurmnode[1-10] Default=YES MaxTime=INFINITE State=UP

NodeName=DEFAULT State=UNKNOWN Sockets=1 ThreadsPerCore=1 CoresPerSocket=8
NodeName=slurmnode[1-3] REPLACE_IT
PartitionName=slurmpar Nodes=ALL Default=YES MaxTime=INFINITE State=UP
53 changes: 44 additions & 9 deletions master/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,34 +10,69 @@ RUN apt-get update -y && apt-get install -y \

ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get install -y \
libpmi2-0-dev \
devscripts \
equivs \
fakeroot \
libbpf-dev \
libdbus-1-dev \
libhwloc-dev \
openssh-server \
slurm-client \
slurmctld \
slurmd \
sudo

# Build Slurm from the SchedMD source tarball with the Debian packaging that
# ships inside it, then install the packages the master node needs (including
# the slurmctld controller package).
#
# - SLURM_VERSION is the single place to bump the release; SLURM_PKG_VERSION is
#   the "<version>-1" revision that appears in the generated .deb file names.
# - `cd` is used inside the RUN (rather than WORKDIR) so the image's working
#   directory for later instructions is not changed by this step.
# - All packages are handed to one `dpkg --install` invocation so dpkg sees the
#   whole set together instead of one package at a time.
# - The source tree, tarball and every generated slurm-smd* artifact are removed
#   in this same layer; deleting them in a later layer would not shrink the image.
RUN SLURM_VERSION=23.11.7 \
 && SLURM_PKG_VERSION="${SLURM_VERSION}-1" \
 && cd /tmp \
 && wget "https://download.schedmd.com/slurm/slurm-${SLURM_VERSION}.tar.bz2" \
 && tar -xaf "slurm-${SLURM_VERSION}.tar.bz2" \
 && cd "slurm-${SLURM_VERSION}" \
 && mk-build-deps -t "apt-get -o Debug::pkgProblemResolver=yes -y" -i debian/control \
 && debuild -b -uc -us \
 && cd .. \
 && dpkg --install \
      "slurm-smd_${SLURM_PKG_VERSION}_amd64.deb" \
      "slurm-smd-client_${SLURM_PKG_VERSION}_amd64.deb" \
      "slurm-smd-slurmctld_${SLURM_PKG_VERSION}_amd64.deb" \
      "slurm-smd-dev_${SLURM_PKG_VERSION}_amd64.deb" \
      "slurm-smd-doc_${SLURM_PKG_VERSION}_all.deb" \
      "slurm-smd-libnss-slurm_${SLURM_PKG_VERSION}_amd64.deb" \
      "slurm-smd-libpam-slurm-adopt_${SLURM_PKG_VERSION}_amd64.deb" \
      "slurm-smd-libpmi0_${SLURM_PKG_VERSION}_amd64.deb" \
      "slurm-smd-libpmi2-0_${SLURM_PKG_VERSION}_amd64.deb" \
      "slurm-smd-libslurm-perl_${SLURM_PKG_VERSION}_amd64.deb" \
      "slurm-smd-sackd_${SLURM_PKG_VERSION}_amd64.deb" \
      "slurm-smd-sview_${SLURM_PKG_VERSION}_amd64.deb" \
 && rm -rf "slurm-${SLURM_VERSION}" "slurm-${SLURM_VERSION}.tar.bz2" slurm-smd*

# Create the "admin" login account (home /home/admin, bash shell), set its
# password, add it to the sudo group and grant it passwordless sudo.
# NOTE(review): the password is hard-coded in the image; acceptable only for a
# throw-away test cluster.
RUN useradd --create-home --home-dir /home/admin --shell /usr/bin/bash admin \
 && printf '%s\n' "admin:admin" | chpasswd \
 && usermod --append --groups sudo admin \
 && printf '%s\n' "admin ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers

# Create the "slurm" account (home /home/slurm, bash shell) and set its password.
# The state directory and config files are chown'ed to this user further down.
RUN useradd --create-home --home-dir /home/slurm --shell /usr/bin/bash slurm \
 && printf '%s\n' "slurm:slurm" | chpasswd

# Prepare the image for running sshd:
#  - create /var/run/sshd,
#  - downgrade pam_loginuid from "required" to "optional" in the sshd PAM stack,
#  - silence the login banner by dropping /etc/legal and un-executing the
#    update-motd.d scripts.
RUN mkdir /var/run/sshd \
 && sed -i -e 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' /etc/pam.d/sshd \
 && chmod -x /etc/update-motd.d/* \
 && rm -f /etc/legal

# Install the Slurm configuration files and the container entrypoint script
# from the build context into /etc/slurm/.
COPY slurm.conf cgroup.conf docker-entrypoint.sh /etc/slurm/

# Create the controller state directory /var/spool/slurmctld (mode 755) and
# give it, /etc/slurm and the *.conf files to the slurm user; the config
# directory and *.conf files are opened up to mode 775.
RUN mkdir /var/spool/slurmctld \
 && chown slurm:slurm /var/spool/slurmctld /etc/slurm /etc/slurm/*.conf \
 && chmod 755 /var/spool/slurmctld \
 && chmod 775 /etc/slurm /etc/slurm/*.conf

# Mark the slurmctld unit as enabled via systemctl at image build time.
# NOTE(review): whether anything in the container actually honours this
# enablement depends on how the container is started -- confirm it is needed.
RUN systemctl enable slurmctld

# Copy the `ssh` directory from the build context into admin's ~/.ssh.
# NOTE(review): presumably holds the key material / authorized_keys used to
# log in between cluster containers -- confirm against the repository's ssh/ dir.
COPY ssh /home/admin/.ssh

# COPY creates the files as root, so give them to admin and restrict the
# directory and everything inside it to the owner only (mode 700, recursive).
RUN chown -R admin:admin /home/admin/.ssh \
&& chmod -R 700 /home/admin/.ssh

COPY slurm.conf /etc/slurm-llnl/
COPY cgroup.conf /etc/slurm-llnl/
COPY docker-entrypoint.sh /etc/slurm-llnl/

# Documented ports (EXPOSE does not publish them by itself).
# NOTE(review): 22 is presumably sshd and 6817/6818 the Slurm controller and
# node daemon ports; 6819 and 3306 look like the conventional slurmdbd and
# MySQL ports -- confirm each against slurm.conf and whether the last two
# are used at all by this image.
EXPOSE 22 6817 6818 6819 3306

# Default working directory for the container: admin's home.
WORKDIR /home/admin

ENTRYPOINT ["/etc/slurm-llnl/docker-entrypoint.sh"]
ENTRYPOINT ["/etc/slurm/docker-entrypoint.sh"]
12 changes: 9 additions & 3 deletions master/cgroup.conf
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
CgroupAutomount=yes
CgroupReleaseAgentDir="/etc/slurm/cgroup"
###
#
# Slurm cgroup support configuration file
#
# See man slurm.conf and man cgroup.conf for further
# information on cgroup configuration parameters
#--
CgroupPlugin=cgroup/v1
ConstrainSwapSpace=no
ConstrainCores=yes
ConstrainDevices=yes
ConstrainRAMSpace=yes

4 changes: 2 additions & 2 deletions master/docker-entrypoint.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#!/bin/bash

export SLURM_CPUS_ON_NODE=$(cat /proc/cpuinfo | grep processor | wc -l)
sudo sed -i "s/REPLACE_IT/CPUs=${SLURM_CPUS_ON_NODE}/g" /etc/slurm-llnl/slurm.conf
sudo sed -i "s/REPLACE_IT/CPUs=${SLURM_CPUS_ON_NODE}/g" /etc/slurm/slurm.conf

sudo service munge start
sudo service slurmctld start
sudo slurmctld
sudo service ssh start

tail -f /dev/null
29 changes: 14 additions & 15 deletions master/slurm.conf
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ClusterName=cluster
SlurmctldHost=slurmmaster
#
#DisableRootJobs=NO
Expand Down Expand Up @@ -39,20 +40,21 @@ ProctrackType=proctrack/linuxproc
#RebootProgram=
ReturnToService=1
#SallocDefaultCommand=
SlurmdParameters=config_overrides
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=root
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/var/spool
StateSaveLocation=/var/spool/slurmctld
SwitchType=switch/none
#TaskEpilog=
TaskPlugin=task/affinity
TaskPluginParam=Sched
#TaskPlugin=task/affinity
TaskPlugin=task/none
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFS=/tmp
Expand Down Expand Up @@ -87,8 +89,7 @@ Waittime=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/cons_res
SelectTypeParameters=CR_Core
SelectType=select/cons_tres
#
#
# JOB PRIORITY
Expand All @@ -114,7 +115,6 @@ SelectTypeParameters=CR_Core
#AccountingStoragePort=
AccountingStorageType=accounting_storage/none
#AccountingStorageUser=
AccountingStoreJobComment=YES
ClusterName=cluster
#DebugFlags=
#JobCompHost=
Expand All @@ -126,10 +126,10 @@ JobCompType=jobcomp/none
#JobContainerType=job_container/none
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
SlurmctldDebug=error
SlurmctldLogFile=/var/log/slurm-llnl/slurmctld.log
SlurmdDebug=error
SlurmdLogFile=/var/log/slurm-llnl/slurmd.log
SlurmctldDebug=debug2
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=debug2
SlurmdLogFile=/var/log/slurmd.log
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#
Expand All @@ -147,7 +147,6 @@ SlurmdLogFile=/var/log/slurm-llnl/slurmd.log
#
#
# COMPUTE NODES
#
NodeName=slurmnode[1-10] REPLACE_IT State=UNKNOWN
PartitionName=slurmpar Nodes=slurmnode[1-10] Default=YES MaxTime=INFINITE State=UP

NodeName=DEFAULT State=UNKNOWN Sockets=1 ThreadsPerCore=1 CoresPerSocket=8
NodeName=slurmnode[1-3] REPLACE_IT
PartitionName=slurmpar Nodes=ALL Default=YES MaxTime=INFINITE State=UP
Loading

0 comments on commit c75695c

Please sign in to comment.