Add LLNL training cluster #449

Open · wants to merge 1 commit into gh-pages
1 change: 1 addition & 0 deletions .github/workflows/test_and_build.yml
@@ -64,6 +64,7 @@ jobs:
- ComputeCanada_Graham_slurm
- EPCC_Cirrus_slurm
- HPCC_MagicCastle_slurm
- LLNL_Pascal_slurm
- Magic_Castle_EESSI_slurm
- NIST_CTCMS_slurm
- Norway_SIGMA2_SAGA_slurm
@@ -0,0 +1,70 @@
#------------------------------------------------------------
# LLNL Pascal Slurm
#------------------------------------------------------------

# Cluster host and scheduler options: the defaults come from
# Graham at Compute Canada, running Slurm. Other options can
# be found in the library of snippets,
# `_includes/snippets_library`. To use one, replace options
# below with those in `_config_options.yml` from the
# library. E.g., to customize for Cirrus at EPCC, running
# Slurm, we could replace the options below with those from
#
# _includes/snippets_library/EPCC_Cirrus_slurm/_config_options.yml
#
# If your cluster is not represented in the library, please
# copy an existing folder, rename it, and customize for your
# installation. Remember to keep the leading slash on the
# `snippets` variable below!
---
snippets: "/snippets_library/LLNL_Pascal_slurm"

local:
  prompt: "[user@laptop ~]$"
  bash_shebang: "#!/usr/bin/bash"

remote:
  name: "pascal83"
  login: "pascal.llnl.gov"
  host: "pascal"
  node: "pascal17"
  location: "Lawrence Livermore National Laboratory"
  homedir: "/g/g0/"
  user: "yourUsername"
  prompt: "yourUsername@pascal83"
  bash_shebang: "#!/bin/bash"

sched:
  name: "Slurm"
  submit:
    name: "sbatch"
    options: "--partition=pvis"
  queue:
    debug: "pdebug"
    testing: "pvis"
  status: "squeue"
  flag:
    user: "-u yourUsername"
    interactive: ""
    histdetail: "--format=JobName,Submit,Start,State,ReqCPUS,Reserved,Elapsed,MaxRSS -j"
    name: "-J"
    time: "-t"
    queue: "-p"
    partition: "-p pdebug"
  del: "scancel"
  interactive: "srun"
  info: "sinfo"
  comment: "#SBATCH"
  hist: "sacct -u yourUsername"
  hist_filter: ""

episode_order:
  - 10-hpc-intro
  - 11-connecting
  - 12-cluster
  - 13-scheduler
  - 14-environment-variables
  - 16-transferring-files
  - 17-parallel
  - 18-resources
  - 19-responsibility
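As the header comment above describes, a new snippet library is normally created by copying an existing folder, renaming it, and customizing it for the new site. A minimal sketch of that workflow, assuming the NIST CTCMS library is used as the starting point (any existing folder under `_includes/snippets_library` would do):

```
# Copy an existing snippet library to use as a template
cp -r _includes/snippets_library/NIST_CTCMS_slurm \
      _includes/snippets_library/LLNL_Pascal_slurm

# Edit the copied configuration so hostnames, queues and scheduler
# flags match the new site, and point the `snippets` variable at
# the new folder
nano _includes/snippets_library/LLNL_Pascal_slurm/_config_options.yml
```
{: .language-bash}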
@@ -0,0 +1,25 @@
```
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
rack1 up 30-00:00:0 12 alloc r[061-072]
rack2 up 30-00:00:0 10 alloc r[003-012]
rack3 up 30-00:00:0 1 mix r036
rack3 up 30-00:00:0 6 alloc r[032-035,037-038]
rack4 up 30-00:00:0 1 drain r048
rack4 up 30-00:00:0 1 mix r047
rack4 up 30-00:00:0 9 alloc r[041-046,049-051]
rack4e up 30-00:00:0 1 mix r073
rack4e up 30-00:00:0 5 alloc r[013-016,074]
rack4e up 30-00:00:0 2 idle r[075-076]
rack5 up 30-00:00:0 3 mix r[021-022,028]
rack5 up 30-00:00:0 5 alloc r[023-027]
rack5 up 30-00:00:0 3 idle r[019-020,029]
rack6i up 30-00:00:0 2 idle r[059-060]
rack6 up 30-00:00:0 1 drain* r057
rack6 up 30-00:00:0 1 down* r053
rack6 up 30-00:00:0 5 alloc r[052,054-056,058]
{{ site.sched.queue.testing }} up 12:00:00 1 idle r001
{{ site.sched.queue.debug }} up 14-00:00:0 1 idle r002
gpu up 7-00:00:00 3 idle rgpu,rgpu[4-5]
gpu up 7-00:00:00 2 down rgpu[2-3]
```
{: .output}
@@ -0,0 +1,6 @@
```
bin etc lib64 proc sbin sys var
boot {{ site.remote.homedir | replace: "/", "" }} mnt root scratch tmp working
dev lib opt run srv usr
```
{: .output}
@@ -0,0 +1,11 @@
> ## Explore a Worker Node
>
> Finally, let's look at the resources available on the worker nodes where your
> jobs will actually run. Try running this command to see the name, CPUs and
> memory available on the worker nodes:
>
> ```
> {{ site.remote.prompt }} sinfo -n {{ site.remote.node }} -o "%n %c %m"
> ```
> {: .language-bash}
{: .challenge}
@@ -0,0 +1 @@
<!-- The modules episode is not included for this cluster -->
@@ -0,0 +1,4 @@
```
No Modulefiles Currently Loaded.
```
{: .output}
@@ -0,0 +1,3 @@
```
```
{: .output}
@@ -0,0 +1,4 @@
```
{{ site.remote.prompt }} which python3
```
{: .language-bash}
@@ -0,0 +1,4 @@
```
/usr/bin/python3
```
{: .output}
@@ -0,0 +1,4 @@
```
{{ site.remote.prompt }} ls /usr/bin/py*
```
{: .language-bash}
@@ -0,0 +1,12 @@
```
py3clean pydoc3.5 python2 python3-config
py3compile pygettext python2.7 python3-futurize
py3versions pygettext2.7 python2.7-config python3m
pybuild pygettext3 python2-config python3m-config
pyclean pygettext3.5 python3 python3-pasteurize
pycompile pygobject-codegen-2.0 python3.5 python-config
pydoc pygtk-codegen-2.0 python3.5-config pyversions
pydoc2.7 pygtk-demo python3.5m
pydoc3 python python3.5m-config
```
{: .output}
@@ -0,0 +1,3 @@
```
```
{: .output}
@@ -0,0 +1 @@
<!-- The modules episode is not included for this cluster -->
@@ -0,0 +1 @@
<!-- The modules episode is not included for this cluster -->
@@ -0,0 +1,11 @@
```
{{ site.remote.bash_shebang }}
{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-job
{{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }}
{{ site.sched.comment }} -N 1
{{ site.sched.comment }} -n 8

# Execute the task
mpiexec amdahl
```
{: .language-bash}
@@ -0,0 +1,11 @@
```
{{ site.remote.bash_shebang }}
{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-job
{{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }}
{{ site.sched.comment }} -N 1
{{ site.sched.comment }} -n 4

# Execute the task
mpiexec amdahl
```
{: .language-bash}
@@ -0,0 +1,11 @@
```
{{ site.remote.bash_shebang }}
{{ site.sched.comment }} {{ site.sched.flag.name }} solo-job
{{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }}
{{ site.sched.comment }} -N 1
{{ site.sched.comment }} -n 1

# Execute the task
amdahl
```
{: .language-bash}
@@ -0,0 +1,16 @@
```
JobID JobName Partition AllocCPUS State Exit
------------ ---------- ---------- ---------- ---------- ----
212339 hostname {{ site.sched.queue.debug }} 2 COMPLETED
212340 hostname {{ site.sched.queue.debug }} 2 COMPLETED
212341 env {{ site.sched.queue.debug }} 2 COMPLETED
212342 mpirun {{ site.sched.queue.testing }} 2 COMPLETED
212343 mpirun {{ site.sched.queue.testing }} 2 COMPLETED
212344 amdahl {{ site.sched.queue.testing }} 2 COMPLETED
212345 amdahl {{ site.sched.queue.testing }} 2 COMPLETED
212346 bash {{ site.sched.queue.testing }} 2 COMPLETED
212346.0 bash 2 COMPLETED
212346.1 amdahl 2 COMPLETED
212347 amdahl {{ site.sched.queue.testing }} 2 FAILED
```
{: .output}
@@ -0,0 +1,6 @@
* **Hostname**: Where did your job run?
* **MaxRSS**: What was the maximum amount of memory used?
* **Elapsed**: How long did the job take?
* **State**: What is the job currently doing/what happened to it?
* **MaxDiskRead**: Amount of data read from disk.
* **MaxDiskWrite**: Amount of data written to disk.
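These fields can be requested explicitly from Slurm's `sacct`. A short sketch, reusing one of the job IDs from the history above and using `NodeList` to report where the job ran:

```
{{ site.remote.prompt }} sacct -j 212346 --format=JobID,JobName,NodeList,Elapsed,State,MaxRSS,MaxDiskRead,MaxDiskWrite
```
{: .language-bash}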
@@ -0,0 +1,19 @@
```
top - 15:47:18 up 21 days, 6:25, 2 users, load average: 0.02, 0.04, 0.04
Tasks: 223 total, 1 running, 222 sleeping, 0 stopped, 0 zombie
%Cpu(s): 0.2 us, 0.1 sy, 0.0 ni, 99.6 id, 0.1 wa, 0.0 hi, 0.0 si, 0.0 st
KiB Mem : 32950812 total, 1594456 free, 502696 used, 30853660 buff/cache
KiB Swap: 64002952 total, 64002952 free, 0 used. 31913980 avail Mem

PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
1693 jeff 20 0 4270580 346944 171372 S 29.8 2.1 9:31.89 gnome-shell
3140 jeff 20 0 3142044 928972 389716 S 27.5 5.7 13:30.29 Web Content
3057 jeff 20 0 3115900 521368 231288 S 18.9 3.2 10:27.71 firefox
6007 jeff 20 0 813992 112336 75592 S 4.3 0.7 0:28.25 tilix
1742 jeff 20 0 975080 164508 130624 S 2.0 1.0 3:29.83 Xwayland
1 root 20 0 230484 11924 7544 S 0.3 0.1 0:06.08 systemd
68 root 20 0 0 0 0 I 0.3 0.0 0:01.25 kworker/4:1
2913 jeff 20 0 965620 47892 37432 S 0.3 0.3 0:11.76 code
2 root 20 0 0 0 0 S 0.0 0.0 0:00.02 kthreadd
```
{: .output}
@@ -0,0 +1,7 @@
```
total used free shared buff/cache available
Mem: 31G 501M 1.5G 64M 29G 30G
Swap: 61G 0B 61G

```
{: .output}
@@ -0,0 +1,4 @@
```
Submitted batch job 36855
```
{: .output}
@@ -0,0 +1,9 @@
```
JOBID PARTITION NAME ST TIME NODES NODELIST(REASON)
212201 {{ site.sched.queue.debug }} example- R 0:05 1 r002
```
{: .output}

We can see all the details of our job, most importantly that it is in the `R`
or `RUNNING` state. Sometimes our jobs might need to wait in a queue
(`PD` or `PENDING`) or have failed (`F` or `FAILED`).
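`{{ site.sched.status }}` can also filter on job state. A small sketch combining the user flag from this configuration with the standard `-t`/`--states` option to list only pending jobs:

```
{{ site.remote.prompt }} {{ site.sched.status }} {{ site.sched.flag.user }} -t PENDING
```
{: .language-bash}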
@@ -0,0 +1,19 @@
> Jobs on an HPC system might run for days or even weeks. We probably have
> better things to do than constantly check on the status of our job with
> `{{ site.sched.status }}`. Looking at the manual page for
> `{{ site.sched.submit.name }}`, can you set up our test job to send you an email
> when it finishes?
>
> > ## Hint
> >
> > You can use the *manual pages* for {{ site.sched.name }} utilities to find
> > more about their capabilities. On the command line, these are accessed
> > through the `man` utility: run `man <program-name>`. You can find the same
> > information online by searching for "man <program-name>".
> >
> > ```
> > {{ site.remote.prompt }} man {{ site.sched.submit.name }}
> > ```
> > {: .language-bash}
> {: .solution}
{: .challenge}
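For reference, one possible sketch of the directives the manual page describes; the address is a placeholder, and `--mail-type=END` asks for mail when the job finishes:

```
{{ site.remote.bash_shebang }}
{{ site.sched.comment }} {{ site.sched.flag.partition }}
{{ site.sched.comment }} --mail-user=you@example.com
{{ site.sched.comment }} --mail-type=END

hostname
```
{: .language-bash}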
@@ -0,0 +1,6 @@
```
JOBID PARTITION NAME ST TIME NODES NODELIST(REASON)
212202 {{ site.sched.queue.debug }} hello-wo R 0:02 1 r002

```
{: .output}
@@ -0,0 +1,15 @@
* `--ntasks=<ntasks>` or `-n <ntasks>`: How many CPU cores does your job need,
in total?

* `--time <days-hours:minutes:seconds>` or `-t <days-hours:minutes:seconds>`:
How much real-world time (walltime) will your job take to run? The `<days>`
part can be omitted.

* `--mem=<megabytes>`: How much memory on a node does your job need in
megabytes? You can also specify gigabytes by adding a little "g"
afterwards (example: `--mem=5g`)

* `--nodes=<nnodes>` or `-N <nnodes>`: How many separate machines does your job
need to run on? Note that if you set `ntasks` to a number greater than what
one machine can offer, {{ site.sched.name }} will set this value
automatically.
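Taken together, a sketch of a job script that sets these options explicitly; the resource amounts are arbitrary examples, not site recommendations:

```
{{ site.remote.bash_shebang }}
{{ site.sched.comment }} {{ site.sched.flag.name }} resource-job
{{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }}
{{ site.sched.comment }} --nodes=1
{{ site.sched.comment }} --ntasks=4
{{ site.sched.comment }} --time=00:10:00
{{ site.sched.comment }} --mem=5g

# Report where the job landed
hostname
```
{: .language-bash}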
@@ -0,0 +1,31 @@
> ## Job environment variables
>
> When {{ site.sched.name }} runs a job, it sets a number of environment
> variables for the job. One of these will let us check what directory our job
> script was submitted from. The `SLURM_SUBMIT_DIR` variable is set to the
> directory from which our job was submitted. Using the `SLURM_SUBMIT_DIR`
> variable, modify your job so that it prints out the location from which the
> job was submitted.
>
> > ## Solution
> >
> > ```
> > {{ site.remote.prompt }} nano example-job.sh
> > {{ site.remote.prompt }} cat example-job.sh
> > ```
> > {: .language-bash}
> >
> > ```
> > {{ site.remote.bash_shebang }}
> > {{ site.sched.comment }} {{ site.sched.flag.partition }}
> > {{ site.sched.comment }} {{ site.sched.flag.time }} 00:00:20
> >
> > echo -n "This script is running on "
> > hostname
> >
> > echo "This job was launched in the following directory:"
> > echo ${SLURM_SUBMIT_DIR}
> > ```
> > {: .output}
> {: .solution}
{: .challenge}
@@ -0,0 +1,4 @@
```
{{ site.remote.prompt }} cat slurm-38193.out
```
{: .language-bash}
@@ -0,0 +1,7 @@
```
This job is running on:
{{ site.remote.node }}
slurmstepd: error: *** JOB 38193 ON {{ site.remote.node }} CANCELLED AT
2017-07-02T16:35:48 DUE TO TIME LIMIT ***
```
{: .output}
@@ -0,0 +1,7 @@
```
Submitted batch job 212203

JOBID PARTITION NAME ST TIME NODES NODELIST(REASON)
212203 {{ site.sched.queue.debug }} hello-wo R 0:03 1 r002
```
{: .output}
@@ -0,0 +1,4 @@
```
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
```
{: .output}