diff --git a/jupyterhub_moss/__init__.py b/jupyterhub_moss/__init__.py index 1e1ffd6..101779f 100644 --- a/jupyterhub_moss/__init__.py +++ b/jupyterhub_moss/__init__.py @@ -3,7 +3,7 @@ from .spawner import MOSlurmSpawner from .utils import local_path as _local_path -version = "6.0.0" +version = "6.2.0" STATIC_FORM_REGEX = r"/form/(.*)" STATIC_FORM_PATH = _local_path("form") diff --git a/jupyterhub_moss/spawner.py b/jupyterhub_moss/spawner.py index 7c8cefc..c187fa9 100644 --- a/jupyterhub_moss/spawner.py +++ b/jupyterhub_moss/spawner.py @@ -13,7 +13,6 @@ from pydantic import ValidationError from .models import ( - PartitionAllResources, PartitionInfo, PartitionResources, PartitionsTrait, @@ -78,7 +77,7 @@ def _validate_partitions(self, proposal: dict) -> dict[str, dict]: slurm_info_cmd = traitlets.Unicode( # Get number of nodes/state, cores/node, cores/state, gpus, total memory for all partitions - r"sinfo -a --noheader -o '%R %F %c %C %G %m %l'", + r"sinfo -N -a --noheader -O 'PartitionName,StateCompact,CPUsState,Gres,GresUsed,Memory,Time'", help="Command to query cluster information from Slurm. Formatted using req_xyz traits as {xyz}." "Output will be parsed by ``slurm_info_resources``.", ).tag(config=True) @@ -116,26 +115,38 @@ def _slurm_info_resources( for line in slurm_info_out.splitlines(): ( partition, - nnodes, - ncores_per_node, + node_state, ncores, - gpus, + gres_total, + gres_used, memory, timelimit, ) = line.split() - # node count - allocated/idle/other/total - _, nnodes_idle, _, nnodes_total = nnodes.split("/") + + # ignore nodes that are full or down + if node_state not in ["idle", "mix"]: + continue + # core count - allocated/idle/other/total _, ncores_idle, _, ncores_total = ncores.split("/") + ncores_idle = int(ncores_idle) + ncores_total = int(ncores_total) + # gpu count - gpu:name:total(indexes) try: - gpus_gres = gpus.replace("(", ":").split(":") - gpus_total = gpus_gres[2] + gpus_gres = gres_total.replace("(", ":").split(":") + gpus_total = int(gpus_gres[2]) gpu = ":".join(gpus_gres[0:2]) + ":{}" + gpus_used = int(gres_used.replace("(", ":").split(":")[2]) except IndexError: - gpus_total = "0" + gpus_total = 0 + gpus_used = 0 gpu = "" + # job slots for resource display + # 1 core, 2 cores, 4 cores, 1 GPU + job_slots = [ncores_idle, ncores_idle // 2, ncores_idle // 4, gpus_total-gpus_used] + try: max_runtime = parse_timelimit(timelimit) except ValueError: @@ -145,24 +156,27 @@ def _slurm_info_resources( max_runtime = datetime.timedelta(days=1) try: - resources = PartitionAllResources( + resources = { # display resource counts - nnodes_total=nnodes_total, - nnodes_idle=nnodes_idle, - ncores_total=ncores_total, - ncores_idle=ncores_idle, + "job_slots": job_slots, # required resource counts - max_nprocs=ncores_per_node.rstrip("+"), - max_mem=memory.rstrip("+"), - gpu=gpu, - max_ngpus=gpus_total, - max_runtime=max_runtime.total_seconds(), - ) + "max_nprocs": ncores_total, + "max_mem": memory.rstrip("+"), + "gpu": gpu, + "max_ngpus": gpus_total, + "max_runtime": max_runtime.total_seconds(), + } except ValidationError as err: self.log.error("Error parsing output of slurm_info_cmd: %s", err) raise - partitions_info[partition] = resources + if partition in partitions_info: + # update display counters of existing partition + slots_counters = zip(partitions_info[partition]["job_slots"], resources["job_slots"]) + partitions_info[partition]["job_slots"] = [old + new for old, new in slots_counters] + else: + # add new partition + partitions_info[partition] = resources return partitions_info @@ -228,7 +242,7 @@ async def _get_partitions_info(self) -> dict[str, PartitionInfo]: partitions_info = { partition: PartitionInfo.parse_obj( { - **resources_info[partition].dict(), + **resources_info[partition], **config_partition_info.dict(exclude_none=True), } ) diff --git a/jupyterhub_moss/templates/option_form.html b/jupyterhub_moss/templates/option_form.html index 979b4c3..8327165 100755 --- a/jupyterhub_moss/templates/option_form.html +++ b/jupyterhub_moss/templates/option_form.html @@ -3,15 +3,18 @@
Partition | -Idle CPU cores | -Idle nodes | +1 core slots | +2 cores slots | +4 cores slots | +1 GPU slots |
---|---|---|---|---|---|---|
{{ name }} | -{{ partition['ncores_idle'] }} / {{ partition['ncores_total'] }} | -{{ partition['nnodes_idle'] }} / {{ partition['nnodes_total'] }} | + {% for slot in partition['job_slots'] %} +{{ slot }} | + {% endfor %}