Skip to content

Commit

Permalink
Merge pull request #2 from vub-hpc/job_slots
Browse files Browse the repository at this point in the history
display available resources distributed in job slots
  • Loading branch information
wpoely86 authored May 24, 2023
2 parents 5080b24 + 674969b commit 687e88b
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 28 deletions.
2 changes: 1 addition & 1 deletion jupyterhub_moss/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from .spawner import MOSlurmSpawner
from .utils import local_path as _local_path

version = "6.0.0"
version = "6.2.0"

STATIC_FORM_REGEX = r"/form/(.*)"
STATIC_FORM_PATH = _local_path("form")
Expand Down
60 changes: 37 additions & 23 deletions jupyterhub_moss/spawner.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from pydantic import ValidationError

from .models import (
PartitionAllResources,
PartitionInfo,
PartitionResources,
PartitionsTrait,
Expand Down Expand Up @@ -78,7 +77,7 @@ def _validate_partitions(self, proposal: dict) -> dict[str, dict]:

slurm_info_cmd = traitlets.Unicode(
# Get number of nodes/state, cores/node, cores/state, gpus, total memory for all partitions
r"sinfo -a --noheader -o '%R %F %c %C %G %m %l'",
r"sinfo -N -a --noheader -O 'PartitionName,StateCompact,CPUsState,Gres,GresUsed,Memory,Time'",
help="Command to query cluster information from Slurm. Formatted using req_xyz traits as {xyz}."
"Output will be parsed by ``slurm_info_resources``.",
).tag(config=True)
Expand Down Expand Up @@ -116,26 +115,38 @@ def _slurm_info_resources(
for line in slurm_info_out.splitlines():
(
partition,
nnodes,
ncores_per_node,
node_state,
ncores,
gpus,
gres_total,
gres_used,
memory,
timelimit,
) = line.split()
# node count - allocated/idle/other/total
_, nnodes_idle, _, nnodes_total = nnodes.split("/")

# ignore nodes that are full or down
if node_state not in ["idle", "mix"]:
continue

# core count - allocated/idle/other/total
_, ncores_idle, _, ncores_total = ncores.split("/")
ncores_idle = int(ncores_idle)
ncores_total = int(ncores_total)

# gpu count - gpu:name:total(indexes)
try:
gpus_gres = gpus.replace("(", ":").split(":")
gpus_total = gpus_gres[2]
gpus_gres = gres_total.replace("(", ":").split(":")
gpus_total = int(gpus_gres[2])
gpu = ":".join(gpus_gres[0:2]) + ":{}"
gpus_used = int(gres_used.replace("(", ":").split(":")[2])
except IndexError:
gpus_total = "0"
gpus_total = 0
gpus_used = 0
gpu = ""

# job slots for resource display
# 1 core, 2 cores, 4 cores, 1 GPU
job_slots = [ncores_idle, ncores_idle // 2, ncores_idle // 4, gpus_total-gpus_used]

try:
max_runtime = parse_timelimit(timelimit)
except ValueError:
Expand All @@ -145,24 +156,27 @@ def _slurm_info_resources(
max_runtime = datetime.timedelta(days=1)

try:
resources = PartitionAllResources(
resources = {
# display resource counts
nnodes_total=nnodes_total,
nnodes_idle=nnodes_idle,
ncores_total=ncores_total,
ncores_idle=ncores_idle,
"job_slots": job_slots,
# required resource counts
max_nprocs=ncores_per_node.rstrip("+"),
max_mem=memory.rstrip("+"),
gpu=gpu,
max_ngpus=gpus_total,
max_runtime=max_runtime.total_seconds(),
)
"max_nprocs": ncores_total,
"max_mem": memory.rstrip("+"),
"gpu": gpu,
"max_ngpus": gpus_total,
"max_runtime": max_runtime.total_seconds(),
}
except ValidationError as err:
self.log.error("Error parsing output of slurm_info_cmd: %s", err)
raise

partitions_info[partition] = resources
if partition in partitions_info:
# update display counters of existing partition
slots_counters = zip(partitions_info[partition]["job_slots"], resources["job_slots"])
partitions_info[partition]["job_slots"] = [old + new for old, new in slots_counters]
else:
# add new partition
partitions_info[partition] = resources

return partitions_info

Expand Down Expand Up @@ -228,7 +242,7 @@ async def _get_partitions_info(self) -> dict[str, PartitionInfo]:
partitions_info = {
partition: PartitionInfo.parse_obj(
{
**resources_info[partition].dict(),
**resources_info[partition],
**config_partition_info.dict(exclude_none=True),
}
)
Expand Down
11 changes: 7 additions & 4 deletions jupyterhub_moss/templates/option_form.html
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,18 @@ <h4 class="subheading">Available resources</h4>
<table class="table">
<tr class="active">
<th style="padding-right: 10rem;">Partition</th>
<th style="text-align: center; width: 50%;">Idle CPU cores</th>
<th style="text-align: center; width: 50%;">Idle nodes</th>
<th style="text-align: center; width: 25%;">1 core slots</th>
<th style="text-align: center; width: 25%;">2 cores slots</th>
<th style="text-align: center; width: 25%;">4 cores slots</th>
<th style="text-align: center; width: 25%;">1 GPU slots</th>
</tr>
{% for name, partition in partitions.items() %}
{% if partition.simple or not simple_only %}
<tr>
<th>{{ name }}</th>
<th style="text-align: center">{{ partition['ncores_idle'] }} <small>/ {{ partition['ncores_total'] }}</small></th>
<th style="text-align: center">{{ partition['nnodes_idle'] }} <small>/ {{ partition['nnodes_total'] }}</small></th>
{% for slot in partition['job_slots'] %}
<th style="text-align: center">{{ slot }}</th>
{% endfor %}
</tr>
{% endif %}
{% endfor %}
Expand Down

0 comments on commit 687e88b

Please sign in to comment.