Skip to content

Commit

Permalink
Utilize PMIx type checking for PML check
Browse files Browse the repository at this point in the history
It appears that sometimes we get a bogus value back
for the PML selected by a remote proc. Unfortunately,
the current code for exchanging that info removes all
type information from the exchange.

Replace the use of those macros with native PMIx
calls so we can check that the data type being
returned is the one we expected. True, it should not
have changed - but this will help detect and debug
any errors.

Signed-off-by: Ralph Castain <[email protected]>
  • Loading branch information
rhc54 committed Apr 19, 2024
1 parent bb7ecde commit 303b734
Showing 1 changed file with 44 additions and 24 deletions.
68 changes: 44 additions & 24 deletions ompi/mca/pml/base/pml_base_select.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
* Copyright (c) 2020-2022 Amazon.com, Inc. or its affiliates. All Rights
* Copyright (c) 2018-2020 Triad National Security, LLC. All rights
* reserved.
* Copyright (c) 2024 Nanook Consulting All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -239,17 +240,6 @@ int mca_pml_base_select(bool enable_progress_threads,
return ret;
}

/* need a "commonly" named PML structure so everything ends up in the
same modex field */
static mca_base_component_t pml_base_component = {
OMPI_MCA_BASE_VERSION_2_1_0("pml", 2, 0, 0),
.mca_component_name = "base",
.mca_component_major_version = 2,
.mca_component_minor_version = 0,
.mca_component_release_version = 0,
};


/*
* If direct modex, then publish PML for all procs. If full modex then
* publish PML for rank 0 only. This information is used during add_procs
Expand All @@ -268,11 +258,19 @@ static mca_base_component_t pml_base_component = {
int
mca_pml_base_pml_selected(const char *name)
{
int rc = 0;
int rc = OMPI_SUCCESS;
pmix_value_t val;

/* if we were told not to perform the check, then no point
* in unnecessarily adding to the modex payload
*/
if (!ompi_pml_base_check_pml) {
return OMPI_SUCCESS;
}
if (!opal_pmix_collect_all_data || 0 == OMPI_PROC_MY_NAME->vpid) {
OPAL_MODEX_SEND(rc, PMIX_GLOBAL, &pml_base_component, name,
strlen(name) + 1);
PMIX_VALUE_LOAD(&val, name, PMIX_STRING);
rc = PMIx_Put(PMIX_GLOBAL, "OMPI_PML_BASE", &val);
PMIX_VALUE_DESTRUCT(&val);
}
return rc;
}
Expand All @@ -281,19 +279,25 @@ static int
mca_pml_base_pml_check_selected_impl(const char *my_pml,
opal_process_name_t proc_name)
{
size_t size;
int ret = 0;
char *remote_pml;
pmix_proc_t proc;
pmix_value_t *val = NULL;
pmix_info_t optional;

/* if we are proc_name=OMPI_PROC_MY_NAME, then we can also assume success */
if (0 == opal_compare_proc(ompi_proc_local()->super.proc_name, proc_name)) {
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
"check:select: PML check not necessary on self");
return OMPI_SUCCESS;
}
OPAL_MODEX_RECV_STRING(ret,
mca_base_component_to_string(&pml_base_component),
&proc_name, (void**) &remote_pml, &size);

/* Note that we cannot tell PMIx to just check our local modex info
* because we might have requested a direct modex
*/
OPAL_PMIX_CONVERT_NAME(&proc, &proc_name);
ret = PMIx_Get(&proc, "OMPI_PML_BASE", NULL, 0, &val);

if (PMIX_ERR_NOT_FOUND == ret) {
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
"check:select: PML modex for process %s not found",
Expand All @@ -305,31 +309,47 @@ mca_pml_base_pml_check_selected_impl(const char *my_pml,
* wasn't returned, but just to be safe, and since the check
* is fast...let's be sure
*/
if (NULL == remote_pml) {
if (NULL == val) {
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
"check:select: got a NULL pml from process %s",
OMPI_NAME_PRINT(&proc_name));
return OMPI_ERR_UNREACH;
}
if (PMIX_STRING != val->type) {
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
"check:select: got a non-string (%s) pml from process %s",
PMIx_Data_type_string(val->type), OMPI_NAME_PRINT(&proc_name));
PMIX_VALUE_RELEASE(val);
return OMPI_ERR_UNREACH;
}
remote_pml = val->data.string;
val->data.string = NULL; // protect data
PMIX_VALUE_RELEASE(val);

opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
"check:select: checking my pml %s against process %s"
" pml %s", my_pml, OMPI_NAME_PRINT(&proc_name),
remote_pml);

/* if that module doesn't match my own, return an error */
if ((size != strlen(my_pml) + 1) ||
(0 != strcmp(my_pml, remote_pml))) {
if (0 != strcmp(my_pml, remote_pml)) {
char *errhost = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &proc_name,
&(errhost), PMIX_STRING);
PMIX_INFO_LOAD(&optional, PMIX_OPTIONAL, NULL, PMIX_BOOL);
ret = PMIx_Get(&proc, PMIX_HOSTNAME, &optional, 1, &val);
if (PMIX_SUCCESS == ret && NULL != val && PMIX_STRING == val->type) {
errhost = val->data.string;
val->data.string = NULL; // protect data
PMIX_VALUE_RELEASE(val);
}
opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s",
OMPI_NAME_PRINT(&ompi_proc_local()->super.proc_name),
my_pml, OMPI_NAME_PRINT(&proc_name),
(NULL == errhost) ? "unknown" : errhost,
remote_pml);
free(remote_pml);
free(errhost);
if (NULL != errhost) {
free(errhost);
}
/* cleanup before returning */
return OMPI_ERR_UNREACH;
}
Expand Down

0 comments on commit 303b734

Please sign in to comment.