[CNEUR-379] Use /dev/shm as a cache in multinode simulations #15

Draft
wants to merge 13 commits into base: main
2 changes: 1 addition & 1 deletion .gitlab-ci.yml
@@ -51,7 +51,7 @@ set_alt_branches:
blueconfig_tests:
variables:
bb5_build_dir: pipeline
PY_NEURODAMUS_BRANCH: $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME
PY_NEURODAMUS_BRANCH: $CI_COMMIT_BRANCH
PARENT_COMMIT_MESSAGE: $CI_COMMIT_MESSAGE
trigger:
project: hpc/sim/blueconfigs
3 changes: 2 additions & 1 deletion neurodamus/commands.py
@@ -52,7 +52,8 @@ def neurodamus(args=None):
--save-time=<TIME> The simulation time [ms] to save the state. (Default: At the end)
--restore=<PATH> Restore and resume simulation from a save point on disk
--dump-cell-state=<GID> Dump cell state debug files on start, save-restore and at the end
--enable-shm=[ON, OFF] Enables the use of /dev/shm for coreneuron_input [default: ON]
--enable-shm=[ON, OFF, CACHE]
Enables the use of /dev/shm for coreneuron_input [default: ON]
--model-stats Show model stats in CoreNEURON simulations [default: False]
--dry-run Dry-run simulation to estimate memory usage [default: False]
"""
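As a side note on the new CACHE value: below is a minimal sketch of how the three-valued flag could map onto two booleans downstream. The helper name and return shape are illustrative assumptions, not the actual neurodamus API.

```python
def resolve_shm_mode(enable_shm: str = "ON"):
    """Map the --enable-shm value to (use_shm, use_gpfs_cache).

    Hypothetical helper: ON and CACHE both write coreneuron_input to /dev/shm;
    CACHE additionally flushes the files to per-node folders on GPFS.
    """
    mode = enable_shm.upper()
    if mode not in ("ON", "OFF", "CACHE"):
        raise ValueError(f"invalid --enable-shm value: {enable_shm}")
    return mode in ("ON", "CACHE"), mode == "CACHE"
```

With ON the files simply live in /dev/shm; with CACHE they are additionally flushed to per-node folders on GPFS, as implemented in node.py below.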
7 changes: 4 additions & 3 deletions neurodamus/core/_shmutils.py
@@ -6,6 +6,7 @@ class SHMUtil:
"""Helper class for the SHM file transfer mechanism of CoreNEURON.
"""
node_id = -1
local_ranks = []
nnodes = -1

@staticmethod
@@ -23,9 +24,9 @@ def __set_node_info(MPI):  # TODO: Replace with MPI SHM communicator
MPI.barrier()

# Get a filelist sorted by rank ID and store the local node info
listdir = sorted(os.listdir(shmdir), key=int)
rank0_node = int(listdir[0])
nranks_node = len(listdir)
SHMUtil.local_ranks = sorted(os.listdir(shmdir), key=int)
rank0_node = int(SHMUtil.local_ranks[0])
nranks_node = len(SHMUtil.local_ranks)

# Calculate node ID based on the entries that contain a process count
node_info = MPI.py_gather((nranks_node if MPI.rank == rank0_node else 0), 0)
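The TODO above suggests replacing the /dev/shm file listing with an MPI shared-memory communicator. A hedged sketch of that alternative, using mpi4py directly (neurodamus wraps MPI through its own layer, so the names and signature below are assumptions):

```python
from mpi4py import MPI


def node_local_info(comm=MPI.COMM_WORLD):
    """Return (node_id, nnodes, local_ranks) without touching /dev/shm."""
    # Ranks that can share memory (i.e. live on the same node) end up in the
    # same sub-communicator.
    node_comm = comm.Split_type(MPI.COMM_TYPE_SHARED)
    # Global ranks of the processes on this node; the lowest one is the
    # node-local "rank 0" used elsewhere in SHMUtil.
    local_ranks = sorted(node_comm.allgather(comm.Get_rank()))
    # Every rank announces its node leader; the sorted unique leaders give a
    # stable node numbering.
    leaders = sorted(set(comm.allgather(local_ranks[0])))
    return leaders.index(local_ranks[0]), len(leaders), local_ranks
```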
7 changes: 7 additions & 0 deletions neurodamus/core/configuration.py
@@ -504,6 +504,13 @@ def _loadbal_mode(config: _SimConfig, run_conf):
config.loadbal_mode = LoadBalanceMode.parse(lb_mode_str)


@SimConfig.validator
def _enable_shm(config: _SimConfig, run_conf):
cli_args = config.cli_options
dev_shm_str = cli_args.enable_shm
config.enable_shm = dev_shm_str in ("CACHE", "ON")


@SimConfig.validator
def _projection_params(config: _SimConfig, run_conf):
required_fields = ("Path",)
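For context, @SimConfig.validator appears to follow a decorator-registry pattern. A generic sketch of that pattern; the class name, _validators list, and run_all are illustrative assumptions, not SimConfig internals:

```python
class ConfigRegistry:
    _validators = []

    @classmethod
    def validator(cls, func):
        cls._validators.append(func)  # remember the function, return it unchanged
        return func

    @classmethod
    def run_all(cls, config, run_conf):
        for func in cls._validators:  # invoked once the config is assembled
            func(config, run_conf)
```

Registering _enable_shm this way keeps the CLI-to-boolean mapping next to the other config validators.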
31 changes: 31 additions & 0 deletions neurodamus/node.py
@@ -10,6 +10,7 @@
import os
import subprocess
from os import path as ospath
from pathlib import Path
from collections import Counter, namedtuple, defaultdict
from contextlib import contextmanager
from shutil import copyfileobj, move
@@ -1346,6 +1347,36 @@ def _sim_corenrn_write_config(self, corenrn_restore=False):
self._pc.nrnbbcore_write(corenrn_data)
MPI.barrier() # wait for all ranks to finish corenrn data generation

if self._shm_enabled and SimConfig.cli_options.enable_shm == "CACHE":
    # This improvement is only enabled when /dev/shm is available.
    # With /dev/shm enabled, the <*_{1,2,3}.dat> files are written first to
    # /dev/shm, which acts as a buffer to accelerate the writes and avoid
    # GPFS IO.
    # Rank 0 of every node (already known from SHMUtil) then moves the
    # <*_{1,2,3}.dat> files to the coreneuron_datadir on GPFS
    # (coreneuron_input_gpfs). Each node moves them into its own subfolder
    # of coreneuron_datadir, named after the node ID, to increase GPFS
    # performance, and symlinks are created in coreneuron_datadir for each
    # file in the subfolders.
    local_node_rank0 = int(SHMUtil.local_ranks[0])
    if MPI.rank == local_node_rank0:
        import shutil

        group_id = int(SHMUtil.node_id / 20)
        node_specific_corenrn_output_in_storage = (
            Path(SimConfig.coreneuron_datadir)
            / f"cycle_{self._cycle_i}/group_{group_id}/node_{SHMUtil.node_id}"
        )
        allfiles = glob.glob(
            os.path.join(corenrn_data, "*_[1-3].dat"), recursive=False
        )
        os.makedirs(node_specific_corenrn_output_in_storage, exist_ok=True)
        for f in allfiles:  # f is the full path; keep only the filename for the link
            if not os.path.islink(f):
                filename = os.path.basename(f)
                shutil.move(f, node_specific_corenrn_output_in_storage)
                os.symlink(node_specific_corenrn_output_in_storage / filename, f)
SimConfig.coreneuron.write_sim_config(
corenrn_output,
corenrn_data,
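To summarize the hunk above outside of the Node internals, here is a self-contained sketch of the per-node "move to GPFS and symlink back" step. The function name and signature are illustrative; the *_[1-3].dat pattern, the 20-nodes-per-group layout, and the symlinking follow the diff.

```python
import glob
import os
import shutil
from pathlib import Path


def flush_node_cache(shm_datadir, gpfs_datadir, node_id, cycle_i=0):
    """Move CoreNEURON *_{1,2,3}.dat files from /dev/shm to a per-node GPFS
    folder and leave symlinks behind so the original paths stay valid."""
    dest = Path(gpfs_datadir) / f"cycle_{cycle_i}" / f"group_{node_id // 20}" / f"node_{node_id}"
    dest.mkdir(parents=True, exist_ok=True)
    for src in glob.glob(os.path.join(shm_datadir, "*_[1-3].dat")):
        if os.path.islink(src):  # already flushed on a previous pass
            continue
        target = dest / os.path.basename(src)
        shutil.move(src, target)  # single writer per node keeps GPFS traffic low
        os.symlink(target, src)   # readers keep using the /dev/shm path
```

Because the original paths in corenrn_data remain valid through the symlinks, the sim_config written just below should not need to know about the per-node subfolders.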