From 0b24f40b0fc7728e57a66622bbadc63d88ae181a Mon Sep 17 00:00:00 2001 From: jhendersonHDF Date: Fri, 1 Sep 2023 11:59:28 -0500 Subject: [PATCH] Fix Subfiling VFD IOC assignment bug (#3456) --- release_docs/RELEASE.txt | 13 +++++ src/H5FDsubfiling/H5FDsubfiling_priv.h | 5 -- src/H5FDsubfiling/H5subfiling_common.c | 66 ++++++++++++++++++++------ src/H5FDsubfiling/H5subfiling_common.h | 19 ++++++++ 4 files changed, 84 insertions(+), 19 deletions(-) diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt index 9f8d517bccb..4e32cc7ef13 100644 --- a/release_docs/RELEASE.txt +++ b/release_docs/RELEASE.txt @@ -292,6 +292,19 @@ Bug Fixes since HDF5-1.14.0 release =================================== Library ------- + - Fixed a bug with the way the Subfiling VFD assigns I/O concentrators + + During a file open operation, the Subfiling VFD determines the topology + of the application and uses that to select a subset of MPI ranks that + I/O will be forwarded to, called I/O concentrators. The code for this + had previously assumed that the parallel job launcher application (e.g., + mpirun, srun, etc.) would distribute MPI ranks sequentially among a node + until all processors on that node have been assigned before going on to + the next node. When the launcher application mapped MPI ranks to nodes + in a different fashion, such as round-robin, this could cause the Subfiling + VFD to incorrectly map MPI ranks as I/O concentrators, leading to missing + subfiles. + - Fixed performance regression with some compound type conversions In-place type conversion was introduced for most use cases in 1.14.2. diff --git a/src/H5FDsubfiling/H5FDsubfiling_priv.h b/src/H5FDsubfiling/H5FDsubfiling_priv.h index 96a73225621..08fef7d1a01 100644 --- a/src/H5FDsubfiling/H5FDsubfiling_priv.h +++ b/src/H5FDsubfiling/H5FDsubfiling_priv.h @@ -40,11 +40,6 @@ #include "H5subfiling_common.h" #include "H5subfiling_err.h" -/* - * Some definitions for debugging the Subfiling VFD - */ -/* #define H5FD_SUBFILING_DEBUG */ - #define DRIVER_INFO_MESSAGE_MAX_INFO 65536 #define DRIVER_INFO_MESSAGE_MAX_LENGTH 65552 /* MAX_INFO + sizeof(info_header_t) */ diff --git a/src/H5FDsubfiling/H5subfiling_common.c b/src/H5FDsubfiling/H5subfiling_common.c index 63791c14c99..34aaa93f4ab 100644 --- a/src/H5FDsubfiling/H5subfiling_common.c +++ b/src/H5FDsubfiling/H5subfiling_common.c @@ -55,8 +55,8 @@ static herr_t H5_free_subfiling_topology(sf_topology_t *topology); static herr_t init_subfiling(const char *base_filename, uint64_t file_id, H5FD_subfiling_params_t *subfiling_config, int file_acc_flags, MPI_Comm comm, int64_t *context_id_out); -static herr_t init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_Comm node_comm, - sf_topology_t **app_topology_out); +static herr_t init_app_topology(int64_t sf_context_id, H5FD_subfiling_params_t *subfiling_config, + MPI_Comm comm, MPI_Comm node_comm, sf_topology_t **app_topology_out); static herr_t get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_type, char **ioc_sel_info_str); static herr_t find_cached_topology_info(MPI_Comm comm, H5FD_subfiling_params_t *subf_config, @@ -64,7 +64,7 @@ static herr_t find_cached_topology_info(MPI_Comm comm, H5FD_subfiling_params_t * static herr_t init_app_layout(sf_topology_t *app_topology, MPI_Comm comm, MPI_Comm node_comm); static herr_t gather_topology_info(app_layout_t *app_layout, MPI_Comm comm, MPI_Comm intra_comm); static int compare_layout_nodelocal(const void *layout1, const void *layout2); -static herr_t identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride); +static herr_t identify_ioc_ranks(int64_t sf_context_id, sf_topology_t *app_topology, int rank_stride); static herr_t init_subfiling_context(subfiling_context_t *sf_context, const char *base_filename, uint64_t file_id, H5FD_subfiling_params_t *subfiling_config, sf_topology_t *app_topology, MPI_Comm file_comm); @@ -886,7 +886,7 @@ init_subfiling(const char *base_filename, uint64_t file_id, H5FD_subfiling_param * Setup the application topology information, including the computed * number and distribution map of the set of I/O concentrators */ - if (init_app_topology(subfiling_config, comm, node_comm, &app_topology) < 0) + if (init_app_topology(context_id, subfiling_config, comm, node_comm, &app_topology) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't initialize application topology"); new_context->sf_context_id = context_id; @@ -938,8 +938,8 @@ init_subfiling(const char *base_filename, uint64_t file_id, H5FD_subfiling_param *------------------------------------------------------------------------- */ static herr_t -init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_Comm node_comm, - sf_topology_t **app_topology_out) +init_app_topology(int64_t sf_context_id, H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, + MPI_Comm node_comm, sf_topology_t **app_topology_out) { H5FD_subfiling_ioc_select_t ioc_selection_type; sf_topology_t *app_topology = NULL; @@ -1142,7 +1142,7 @@ init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_ * Determine which ranks are I/O concentrator ranks, based on the * given IOC selection strategy and MPI information. */ - if (identify_ioc_ranks(app_topology, rank_multiple) < 0) + if (identify_ioc_ranks(sf_context_id, app_topology, rank_multiple) < 0) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't determine which MPI ranks are I/O concentrators"); } @@ -1600,7 +1600,7 @@ compare_layout_nodelocal(const void *layout1, const void *layout2) *------------------------------------------------------------------------- */ static herr_t -identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride) +identify_ioc_ranks(int64_t sf_context_id, sf_topology_t *app_topology, int rank_stride) { app_layout_t *app_layout = NULL; int *io_concentrators = NULL; @@ -1613,6 +1613,11 @@ identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride) assert(app_topology->app_layout); assert(app_topology->app_layout->layout); assert(app_topology->app_layout->node_count > 0); + assert(app_topology->app_layout->node_count <= app_topology->app_layout->world_size); + +#ifndef H5_SUBFILING_DEBUG + (void)sf_context_id; +#endif app_layout = app_topology->app_layout; @@ -1629,36 +1634,52 @@ identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride) case SELECT_IOC_ONE_PER_NODE: { int total_ioc_count = 0; int iocs_per_node = 1; + int last_lead_rank; if (app_topology->n_io_concentrators > app_layout->node_count) iocs_per_node = app_topology->n_io_concentrators / app_layout->node_count; - assert(app_layout->node_ranks); + /* + * NOTE: The below code assumes that the app_layout->layout + * array was sorted according to the node_lead_rank field, + * such that entries for MPI ranks on the same node will occur + * together in the array. + */ - for (size_t i = 0; i < (size_t)app_layout->node_count; i++) { - int node_index = app_layout->node_ranks[i]; - int local_size = app_layout->layout[node_index].node_local_size; + last_lead_rank = app_layout->layout[0].node_lead_rank; + for (size_t i = 0, layout_idx = 0; i < (size_t)app_layout->node_count; i++) { + int local_size = app_layout->layout[layout_idx].node_local_size; + /* Assign first I/O concentrator from this node */ assert(total_ioc_count < app_topology->n_io_concentrators); - io_concentrators[total_ioc_count] = app_layout->layout[node_index++].rank; + io_concentrators[total_ioc_count] = app_layout->layout[layout_idx++].rank; if (app_layout->world_rank == io_concentrators[total_ioc_count]) { + assert(!app_topology->rank_is_ioc); + app_topology->ioc_idx = total_ioc_count; app_topology->rank_is_ioc = TRUE; } total_ioc_count++; + /* Assign any additional I/O concentrators from this node */ for (size_t j = 1; j < (size_t)iocs_per_node; j++) { if (total_ioc_count >= max_iocs) break; if (j >= (size_t)local_size) break; + /* Sanity check to make sure this rank is on the same node */ + assert(app_layout->layout[layout_idx].node_lead_rank == + app_layout->layout[layout_idx - 1].node_lead_rank); + assert(total_ioc_count < app_topology->n_io_concentrators); - io_concentrators[total_ioc_count] = app_layout->layout[node_index++].rank; + io_concentrators[total_ioc_count] = app_layout->layout[layout_idx++].rank; if (app_layout->world_rank == io_concentrators[total_ioc_count]) { + assert(!app_topology->rank_is_ioc); + app_topology->ioc_idx = total_ioc_count; app_topology->rank_is_ioc = TRUE; } @@ -1668,8 +1689,25 @@ identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride) if (total_ioc_count >= max_iocs) break; + + /* Find the block of layout structures for the next node */ + while ((layout_idx < (size_t)app_layout->world_size) && + (last_lead_rank == app_layout->layout[layout_idx].node_lead_rank)) + layout_idx++; + + if (layout_idx >= (size_t)app_layout->world_size) + break; + + last_lead_rank = app_layout->layout[layout_idx].node_lead_rank; } +#ifdef H5_SUBFILING_DEBUG + if (app_topology->n_io_concentrators != total_ioc_count) + H5_subfiling_log(sf_context_id, + "%s: **WARN** Number of I/O concentrators adjusted from %d to %d", __func__, + app_topology->n_io_concentrators, total_ioc_count); +#endif + /* Set final number of I/O concentrators after adjustments */ app_topology->n_io_concentrators = total_ioc_count; diff --git a/src/H5FDsubfiling/H5subfiling_common.h b/src/H5FDsubfiling/H5subfiling_common.h index 51d8b2289aa..395183a7f50 100644 --- a/src/H5FDsubfiling/H5subfiling_common.h +++ b/src/H5FDsubfiling/H5subfiling_common.h @@ -155,6 +155,25 @@ typedef enum io_ops { * will be broadcast to all MPI ranks and will * provide a basis for determining which MPI ranks * will host an I/O concentrator. + * + * - rank + * The MPI rank value for this processor + * + * - node_local_rank + * The MPI rank value for this processor in an MPI + * communicator that only involves MPI ranks on the + * same node as this processor + * + * - node_local_size + * The number of MPI ranks on the same node as this + * processor, including this processor itself + * + * - node_lead_rank + * The lowest MPI rank value for processors on the + * same node as this processor (possibly the MPI + * rank value for this processor); Denotes a "lead" + * MPI rank for certain operations. + * */ typedef struct { int rank;