Merge pull request #1401 from Libensemble/release/v_1.4.2
Release/v 1.4.2
shuds13 authored Aug 14, 2024
2 parents 33118cc + 63314c1 commit 2892c86
Showing 26 changed files with 293 additions and 69 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/basic.yml
@@ -163,4 +163,4 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: crate-ci/[email protected].4
- uses: crate-ci/[email protected].6
8 changes: 7 additions & 1 deletion .github/workflows/extra.yml
@@ -229,6 +229,12 @@ jobs:
rm ./libensemble/tests/unit_tests/test_ufunc_runners.py
rm ./libensemble/tests/unit_tests/test_executor_balsam.py
- name: Start Redis
if: matrix.os == 'ubuntu-latest'
uses: supercharge/[email protected]
with:
redis-version: 7

- name: Run extensive tests, Ubuntu
if: matrix.os == 'ubuntu-latest'
run: |
@@ -254,4 +260,4 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: crate-ci/[email protected].4
- uses: crate-ci/[email protected].6
4 changes: 2 additions & 2 deletions .wci.yml
@@ -16,8 +16,8 @@ description: |
language: Python

release:
version: 1.4.1
date: 2024-07-29
version: 1.4.2
date: 2024-08-14

documentation:
general: https://libensemble.readthedocs.io
27 changes: 26 additions & 1 deletion CHANGELOG.rst
@@ -8,6 +8,32 @@ GitHub issues are referenced, and can be viewed with hyperlinks on the `github r

.. _`github releases page`: https://github.com/Libensemble/libensemble/releases

Release 1.4.2
--------------

:Date: August 14, 2024

* Fix under-utilized resource usage. #1398
* Fixes a bug that caused the executor to wrongly increase processor counts when not all nodes are utilized.
* Fixes a case where setting `num_gpus` to zero was treated as `None` (see the sketch after this release's notes).
* Add missing PerlmutterGPU specs (these were detected anyway). #1393
* Handle the case where Perlmutter detects no Slurm partition. #1391
* Launch environment scripts in a shell. #1392

:Examples:

* Add proxystore example (uses a proxy in history array). #1326

:Note:

* Tests were run on Linux and macOS with Python versions 3.9, 3.10, 3.11, and 3.12.
* Heterogeneous workflows tested on Frontier (OLCF), Polaris (ALCF), and Perlmutter (NERSC).
* Tests have recently been run on Aurora (ALCF), but the system was unavailable at the time of release.

:Known Issues:

* See known issues section in the documentation.
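
A minimal sketch of the `num_gpus` fix noted above, from a user script, assuming the standard `MPIExecutor` register/submit API (the application path and name are placeholders)::

    from libensemble.executors.mpi_executor import MPIExecutor

    exctr = MPIExecutor()
    exctr.register_app(full_path="/path/to/cpu_only_app", app_name="cpu_app")

    # Inside a simulation function: explicitly request zero GPUs.
    # Previously num_gpus=0 was falsy and fell back to any
    # generator-supplied GPU count; it is now honored as "no GPUs".
    task = exctr.submit(app_name="cpu_app", num_procs=4, num_gpus=0)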

Release 1.4.1
--------------

@@ -25,7 +51,6 @@ Release 1.4.1

* See known issues section in the documentation.


Release 1.4.0
--------------

2 changes: 1 addition & 1 deletion docs/requirements.txt
@@ -1,4 +1,4 @@
sphinx<8
sphinx<9
sphinxcontrib-bibtex
sphinxcontrib-spelling
autodoc_pydantic
3 changes: 2 additions & 1 deletion install/misc_feature_requirements.txt
@@ -1 +1,2 @@
globus-compute-sdk==2.24.0
globus-compute-sdk==2.25.0
proxystore==0.7.0
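
The newly pinned `proxystore` package supports the proxy-in-history example noted in the changelog. A minimal, hedged sketch of creating a proxy (connector choice and path are illustrative; API as understood for proxystore 0.7):

    import numpy as np
    from proxystore.connectors.file import FileConnector
    from proxystore.store import Store

    # Store backed by a local file cache (path is illustrative).
    store = Store("libe-example", FileConnector("/tmp/proxystore-cache"))

    # Proxy a large array; the proxy is a lightweight reference that
    # resolves to the underlying data on first access.
    data = np.random.rand(1_000_000)
    proxy = store.proxy(data)
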
4 changes: 2 additions & 2 deletions install/testing_requirements.txt
@@ -1,9 +1,9 @@
flake8==7.1.0
flake8==7.1.1
coverage==7.3.1
pytest==8.3.2
pytest-cov==5.0.0
pytest-timeout==2.3.1
mock==5.1.0
python-dateutil==2.9.0.post0
anyio==4.4.0
matplotlib==3.9.1
matplotlib==3.9.2
2 changes: 1 addition & 1 deletion libensemble/ensemble.py
@@ -12,8 +12,8 @@
from libensemble.specs import AllocSpecs, ExitCriteria, GenSpecs, LibeSpecs, SimSpecs
from libensemble.tools import add_unique_random_streams
from libensemble.tools import parse_args as parse_args_f
from libensemble.tools.parse_args import mpi_init
from libensemble.tools import save_libE_output
from libensemble.tools.parse_args import mpi_init
from libensemble.utils.misc import specs_dump

ATTR_ERR_MSG = 'Unable to load "{}". Is the function or submodule correctly named?'
12 changes: 6 additions & 6 deletions libensemble/executors/mpi_executor.py
@@ -138,7 +138,7 @@ def set_resources(self, resources: Resources) -> None:
self.resources = resources

def _launch_with_retries(
self, task: Task, subgroup_launch: bool, wait_on_start: Union[bool, int], run_cmd: List[str]
self, task: Task, subgroup_launch: bool, wait_on_start: Union[bool, int], run_cmd: List[str], use_shell: bool
) -> None:
"""Launch task with retry mechanism"""
retry_count = 0
@@ -156,6 +156,7 @@ def _launch_with_retries(
stdout=out,
stderr=err,
start_new_session=subgroup_launch,
shell=use_shell,
)
except Exception as e:
logger.warning(f"task {task.name} submit command failed on try {retry_count} with error {e}")
@@ -325,12 +326,9 @@ def submit(
if not num_procs and not match_procs_to_gpus:
num_procs = self.gen_nprocs

if not num_gpus:
if num_gpus is None:
num_gpus = self.gen_ngpus

if not num_nodes and (self.gen_ngpus or self.gen_nprocs):
num_nodes = self.resources.worker_resources.local_node_count

if mpi_runner_type is not None:
if isinstance(mpi_runner_type, str):
mpi_config = {"mpi_runner": mpi_runner_type}
Expand Down Expand Up @@ -367,8 +365,10 @@ def submit(

if env_script is not None:
run_cmd = Executor._process_env_script(task, runline, env_script)
use_shell = True
else:
run_cmd = runline
use_shell = False

if dry_run:
logger.info(f"Test (No submit) Runline: {' '.join(run_cmd)}")
@@ -378,7 +378,7 @@
task._implement_env()

# Launch Task
self._launch_with_retries(task, sglaunch, wait_on_start, run_cmd)
self._launch_with_retries(task, sglaunch, wait_on_start, run_cmd, use_shell)

if not task.timer.timing and not task.finished:
task.timer.start()
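
The `use_shell` flag above is only set when an `env_script` wraps the run line. A simplified sketch of the resulting launch path (not the exact executor code; the wrapper name is hypothetical):

    import subprocess

    # When a wrapper script must source an environment before the MPI
    # run line, the combined command is run through a shell rather than
    # exec'd directly, so shell constructs in the script take effect.
    run_cmd = "./env_wrapper.sh mpirun -n 4 ./my_app"
    proc = subprocess.Popen(run_cmd, shell=True, start_new_session=True)
    proc.wait()
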
41 changes: 36 additions & 5 deletions libensemble/executors/mpi_runner.py
@@ -121,7 +121,7 @@ def _set_gpu_cli_option(self, wresources, extra_args, gpu_setting_name, gpu_valu
def _set_gpu_env_var(self, wresources, task, gpus_per_node, gpus_env):
"""Add GPU environment variable setting to the tasks environment"""
jassert(wresources.matching_slots, f"Cannot assign CPUs/GPUs to non-matching slots per node {wresources.slots}")
slot_list = wresources.get_slots_as_string(multiplier=wresources.gpus_per_rset, limit=gpus_per_node)
slot_list = wresources.get_slots_as_string(multiplier=wresources.gpus_per_rset_per_node, limit=gpus_per_node)
task._add_to_env(gpus_env, slot_list)

def _local_runner_set_gpus(self, task, wresources, extra_args, gpus_per_node, ppn):
@@ -171,7 +171,7 @@ def _assign_gpus(self, task, resources, nprocs, nnodes, ppn, ngpus, extra_args,

# gpus per node for this worker.
if wresources.doihave_gpus():
gpus_avail_per_node = wresources.slot_count * wresources.gpus_per_rset
gpus_avail_per_node = wresources.slot_count * wresources.gpus_per_rset_per_node
else:
gpus_avail_per_node = 0

@@ -224,6 +224,35 @@ def _assign_gpus(self, task, resources, nprocs, nnodes, ppn, ngpus, extra_args,

return nprocs, nnodes, ppn, extra_args

def _get_min_nodes(self, nprocs, ppn, nnodes, ngpus, resources):
"""Get minimum nodes needed to match configuration"""
if nnodes is not None:
return nnodes
if ppn:
return None # nnodes gets processed later.
if resources is not None:
wresources = resources.worker_resources
total_nodes = wresources.local_node_count
procs_on_node = wresources.slot_count * wresources.procs_per_rset_per_node

if not nprocs and ngpus is None:
# Delay node evaluation to GPU assignment code
return None
proc_min_nodes = 1
gpu_min_nodes = 1
if nprocs:
proc_min_nodes = (nprocs + procs_on_node - 1) // procs_on_node
if ngpus:
gpus_on_node = wresources.slot_count * wresources.gpus_per_rset_per_node
gpu_min_nodes = (ngpus + gpus_on_node - 1) // gpus_on_node

min_nodes = max(proc_min_nodes, gpu_min_nodes)
nnodes = min(min_nodes, total_nodes)
# Must have at least one processor per node to use GPUs
if nprocs:
nnodes = min(nnodes, nprocs)
return nnodes

def _adjust_procs(self, nprocs, ppn, nnodes, ngpus, resources):
"""Adjust an invalid config"""

@@ -241,8 +270,8 @@ def adjust_resource(n_units, units_attr, units_name):

if resources is not None:
wresources = resources.worker_resources
ngpus = adjust_resource(ngpus, "gpus_per_rset", "ngpus")
nprocs = adjust_resource(nprocs, "procs_per_rset", "nprocs")
ngpus = adjust_resource(ngpus, "gpus_per_rset_per_node", "ngpus")
nprocs = adjust_resource(nprocs, "procs_per_rset_per_node", "nprocs")
return nprocs, ngpus

def get_mpi_specs(
Expand Down Expand Up @@ -284,6 +313,8 @@ def get_mpi_specs(

if match_procs_to_gpus:
jassert(no_config_set, "match_procs_to_gpus is mutually exclusive with either of nprocs/ppn")

nnodes = self._get_min_nodes(nprocs, ppn, nnodes, ngpus, resources)
nprocs, ngpus = self._adjust_procs(nprocs, ppn, nnodes, ngpus, resources)

if auto_assign_gpus or ngpus is not None:
@@ -294,7 +325,7 @@
task, resources, nprocs, nnodes, ppn, ngpus, extra_args, match_procs_to_gpus
)

rm_rpn = True if self.rm_rpn and ppn is None and nnodes is None else False
rm_rpn = self.rm_rpn and ppn is None and nnodes is None

hostlist = None
if machinefile and not self.mfile_support:
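
To illustrate the ceiling-division logic in the new `_get_min_nodes` helper above, a standalone sketch with made-up per-node resources (not the library function itself):

    def min_nodes_needed(nprocs, ngpus, procs_per_node, gpus_per_node, total_nodes):
        """Take the larger of the proc- and GPU-driven node counts, cap it
        at the nodes available to this worker, and never exceed nprocs."""
        proc_min = (nprocs + procs_per_node - 1) // procs_per_node if nprocs else 1
        gpu_min = (ngpus + gpus_per_node - 1) // gpus_per_node if ngpus else 1
        nnodes = min(max(proc_min, gpu_min), total_nodes)
        return min(nnodes, nprocs) if nprocs else nnodes

    # e.g. 6 procs and 7 GPUs, with 8 procs and 4 GPUs per node, 4 nodes available
    print(min_nodes_needed(6, 7, 8, 4, 4))  # -> 2
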
4 changes: 2 additions & 2 deletions libensemble/gen_funcs/persistent_aposmm.py
@@ -14,12 +14,12 @@
import numpy as np
from mpmath import gamma

# from scipy.spatial.distance import cdist

from libensemble.gen_funcs.aposmm_localopt_support import ConvergedMsg, LocalOptInterfacer, simulate_recv_from_manager
from libensemble.message_numbers import EVAL_GEN_TAG, FINISHED_PERSISTENT_GEN_TAG, PERSIS_STOP, STOP_TAG
from libensemble.tools.persistent_support import PersistentSupport

# from scipy.spatial.distance import cdist


# Due to recursion error in scipy cdist function
def cdist(XA, XB, metric="euclidean"):
2 changes: 1 addition & 1 deletion libensemble/resources/mpi_resources.py
@@ -213,7 +213,7 @@ def get_resources(resources, num_procs=None, num_nodes=None, procs_per_node=None
)

if num_nodes < local_node_count:
logger.warning(
logger.debug(
"User constraints mean fewer nodes being used "
f"than available. {num_nodes} nodes used. {local_node_count} nodes available"
)
19 changes: 16 additions & 3 deletions libensemble/resources/platforms.py
@@ -8,6 +8,7 @@
option or the environment variable ``LIBE_PLATFORM``.
"""

import logging
import os
import subprocess
from typing import Optional
@@ -16,6 +17,10 @@

from libensemble.utils.misc import specs_dump

logger = logging.getLogger(__name__)
# To change logging level for just this module
# logger.setLevel(logging.DEBUG)


class PlatformException(Exception):
"""Platform module exception"""
@@ -178,6 +183,8 @@ class PerlmutterCPU(Perlmutter):


class PerlmutterGPU(Perlmutter):
cores_per_node: int = 64
logical_cores_per_node: int = 128
gpus_per_node: int = 4
gpu_setting_type: str = "runner_default"
gpu_env_fallback: str = "CUDA_VISIBLE_DEVICES"
@@ -269,6 +276,7 @@ class Known_platforms(BaseModel):
generic_rocm: GenericROCm = GenericROCm()
crusher: Crusher = Crusher()
frontier: Frontier = Frontier()
perlmutter: Perlmutter = Perlmutter()
perlmutter_c: PerlmutterCPU = PerlmutterCPU()
perlmutter_g: PerlmutterGPU = PerlmutterGPU()
polaris: Polaris = Polaris()
@@ -292,10 +300,15 @@ def known_envs():
"""Detect system by environment variables"""
name = None
if os.environ.get("NERSC_HOST") == "perlmutter":
if "gpu_" in os.environ.get("SLURM_JOB_PARTITION"):
name = "perlmutter_g"
partition = os.environ.get("SLURM_JOB_PARTITION")
if partition:
if "gpu_" in partition:
name = "perlmutter_g"
else:
name = "perlmutter_c"
else:
name = "perlmutter_c"
name = "perlmutter"
logger.manager_warning("Perlmutter detected, but no compute partition detected. Are you on login nodes?")
return name
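
When detection falls back to the generic `perlmutter` entry (e.g. on a login node with no `SLURM_JOB_PARTITION` set), the platform can still be forced explicitly. A hedged sketch of the two override routes mentioned in the module docstring and the libEnsemble docs (an illustrative fragment, not a complete script):

    import os

    # Option 1: the LIBE_PLATFORM environment variable named in the
    # module docstring above.
    os.environ["LIBE_PLATFORM"] = "perlmutter_g"

    # Option 2: name the platform in libE_specs.
    libE_specs = {"platform": "perlmutter_g"}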


17 changes: 11 additions & 6 deletions libensemble/resources/rset_resources.py
@@ -51,8 +51,9 @@ def __init__(self, num_workers, resources):
self.num_workers = num_workers
self.num_workers_2assign2 = RSetResources.get_workers2assign2(self.num_workers, resources)
self.total_num_rsets = resources.num_resource_sets or self.num_workers_2assign2

self.num_nodes = len(resources.global_nodelist)
self.split_list, self.local_rsets_list = RSetResources.get_partitioned_nodelist(self.total_num_rsets, resources)
self.nodes_in_rset = len(self.split_list[0])

gpus_avail_per_node = resources.gpus_avail_per_node
self.rsets_per_node = RSetResources.get_rsets_on_a_node(self.total_num_rsets, resources)
Expand All @@ -67,16 +68,20 @@ def __init__(self, num_workers, resources):
self.total_num_gpu_rsets = np.count_nonzero(self.all_rsets["gpus"])
self.total_num_nongpu_rsets = np.count_nonzero(~self.all_rsets["gpus"])

self.gpus_per_rset = gpus_avail_per_node // self.gpu_rsets_per_node if self.gpu_rsets_per_node else 0
self.cores_per_rset = resources.physical_cores_avail_per_node // self.rsets_per_node
self.gpus_per_rset_per_node = gpus_avail_per_node // self.gpu_rsets_per_node if self.gpu_rsets_per_node else 0
self.cores_per_rset_per_node = resources.physical_cores_avail_per_node // self.rsets_per_node

# Oversubscribe
if self.cores_per_rset == 0:
if self.cores_per_rset_per_node == 0:
cpn = resources.physical_cores_avail_per_node
procs_per_core = self.rsets_per_node // cpn + (self.rsets_per_node % cpn > 0)
self.procs_per_rset = resources.physical_cores_avail_per_node * procs_per_core
self.procs_per_rset_per_node = resources.physical_cores_avail_per_node * procs_per_core
else:
self.procs_per_rset = self.cores_per_rset
self.procs_per_rset_per_node = self.cores_per_rset_per_node

self.gpus_per_rset = self.gpus_per_rset_per_node * self.nodes_in_rset
self.cores_per_rset = self.cores_per_rset_per_node * self.nodes_in_rset
self.procs_per_rset = self.procs_per_rset_per_node * self.nodes_in_rset

@staticmethod
def get_group_list(split_list, gpus_per_node=0, gpus_per_group=None):
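
The rename above distinguishes per-node quantities (`*_per_rset_per_node`) from whole-resource-set totals (`*_per_rset`). A short arithmetic sketch with made-up numbers:

    nodes_in_rset = 2            # each resource set spans two nodes
    gpus_per_rset_per_node = 4   # e.g. 4 GPUs per node, one GPU resource set per node
    cores_per_rset_per_node = 32
    procs_per_rset_per_node = 32

    # Per-resource-set totals are the per-node counts multiplied by the
    # number of nodes in the resource set.
    gpus_per_rset = gpus_per_rset_per_node * nodes_in_rset    # 8
    cores_per_rset = cores_per_rset_per_node * nodes_in_rset  # 64
    procs_per_rset = procs_per_rset_per_node * nodes_in_rset  # 64
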
2 changes: 1 addition & 1 deletion libensemble/resources/worker_resources.py
@@ -273,7 +273,7 @@ def set_env_to_gpus(self, env_var=None, delimiter=","):
"""
assert self.matching_slots, f"Cannot assign GPUs to non-matching slots per node {self.slots}"
if self.doihave_gpus():
env_value = self.get_slots_as_string(multiplier=self.gpus_per_rset, limit=self.gen_ngpus)
env_value = self.get_slots_as_string(multiplier=self.gpus_per_rset_per_node, limit=self.gen_ngpus)
if env_var is None:
if self.platform_info is not None:
if self.platform_info.get("gpu_setting_type") == "env":
1 change: 1 addition & 0 deletions libensemble/sim_funcs/simple_sim.py
@@ -5,6 +5,7 @@
__all__ = ["norm_eval"]

import numpy as np

from libensemble.specs import input_fields, output_data


2 changes: 1 addition & 1 deletion libensemble/sim_funcs/var_resources.py
@@ -279,7 +279,7 @@ def CUDA_variable_resources(H, _, sim_specs, libE_info):
cores_per_node = resources.slot_count

# Set to detected GPUs
# gpus_per_slot = resources.gpus_per_rset
# gpus_per_slot = resources.gpus_per_rset_per_node
# resources.set_env_to_slots("CUDA_VISIBLE_DEVICES", multiplier=gpus_per_slot)
# cores_per_node = resources.slot_count * gpus_per_slot # One CPU per GPU
