Skip to content

Commit

Permalink
Merge branch 'main' into add-logical-n
Browse files Browse the repository at this point in the history
  • Loading branch information
dalcinl authored Mar 25, 2024
2 parents e6e515a + 58331e6 commit f566c5e
Show file tree
Hide file tree
Showing 3 changed files with 278 additions and 19 deletions.
23 changes: 23 additions & 0 deletions src/include/mpir_hwtopo.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ typedef enum {
MPIR_HWTOPO_TYPE__NODE,
MPIR_HWTOPO_TYPE__PACKAGE,
MPIR_HWTOPO_TYPE__SOCKET,
MPIR_HWTOPO_TYPE__GROUP,
MPIR_HWTOPO_TYPE__CPU,
MPIR_HWTOPO_TYPE__CORE,
MPIR_HWTOPO_TYPE__HWTHREAD,
Expand Down Expand Up @@ -146,4 +147,26 @@ bool MPIR_hwtopo_is_dev_close_by_pci(int domain, int bus, int dev, int func);
* Return the global id of the first non-io object above the PCI device
*/
MPIR_hwtopo_gid_t MPIR_hwtopo_get_dev_parent_by_pci(int domain, int bus, int dev, int func);

/*
 * Return the number of numa nodes.
 *
 * Counts the hwloc NUMA node objects whose OS index is set in the nodeset of
 * the topology object named "node". Returns 0 when built without hwloc.
 *
 * This function is used to determine if a node is in SPR SNC4 mode (the OFI
 * netmod treats a count of 8 or 16 as a candidate for that mode).
 */
int MPIR_hwtopo_get_num_numa_nodes(void);

/*
 * Return the global id of the group ancestor of the first bound PU.
 *
 * The CPU binding of the calling process is queried, the first PU in that
 * binding is looked up, and its ancestors are walked until a group object is
 * found. If no group ancestor exists, the global id of the topmost object
 * reached by the walk is returned instead. Returns MPIR_HWTOPO_GID_ROOT when
 * built without hwloc.
 *
 * This function is used for nic binding in SPR SNC4 mode
 */
MPIR_hwtopo_gid_t MPIR_hwtopo_get_first_pu_group(void);

/*
 * Return the global id of the socket ancestor of the passed gid.
 *
 * The object identified by gid is walked upwards until an hwloc package
 * object is reached (the object itself qualifies if it already is a package).
 * Returns MPIR_HWTOPO_GID_ROOT if no package is found on the way up, or when
 * built without hwloc.
 */
MPIR_hwtopo_gid_t MPIR_hwtopo_get_parent_socket(MPIR_hwtopo_gid_t gid);

/*
 * Return the local index of my nic in my first non io ancestor.
 *
 * The PCI device is identified by its domain/bus/dev/func address. The index
 * is the number of network devices found below the previous siblings and
 * previous cousins of that device which share its first non-io ancestor.
 * Returns 0 when built without hwloc.
 */
int MPIR_hwtopo_get_pci_network_lid(int domain, int bus, int dev, int func);
#endif /* MPIR_HWTOPO_H_INCLUDED */
144 changes: 125 additions & 19 deletions src/mpid/ch4/netmod/ofi/ofi_nic.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,50 @@ static bool is_nic_close(struct fi_info *info)
return MPIR_hwtopo_is_dev_close_by_name(info->domain_attr->name);
}

/* Return true if the NIC is close to the group of the calling process.
 *
 * nic_info    - NIC descriptor; its `parent` topology gid and its `nic`
 *               (a struct fi_info pointer) are read
 * num_parents - number of distinct NIC parent topology objects collected by
 *               the caller
 *
 * A NIC on a different socket than the calling process is never close. When
 * num_parents is anything other than 4, sharing the socket is sufficient.
 *
 * In SNC4 mode, when there are 4 groups that have nics, it means that there
 * are 4 other adjacent groups with no nics. This leads to each set of 2 groups
 * having 2 nics such that the first group has no nics and the second group has
 * 2 nics. The correct assignment strategy is such that the 2 nics of the
 * second group are considered close to the ranks on both groups: the first
 * nic of the group is mapped to the previous group, the second nic to the
 * group that hosts it. */
static bool is_nic_close_snc4(const MPIDI_OFI_nic_info_t * nic_info, int num_parents)
{
    /* Look up the group of the first bound PU once and reuse it; the lookup
     * queries the process binding and walks the topology. */
    int rank_group_gid = MPIR_hwtopo_get_first_pu_group();
    int nic_socket_gid = MPIR_hwtopo_get_parent_socket(nic_info->parent);
    int rank_socket_gid = MPIR_hwtopo_get_parent_socket(rank_group_gid);

    /* The rank and the nic must live under the same socket in every case */
    if (nic_socket_gid != rank_socket_gid)
        return false;

    /* On using a different configuration than having 4 num_parents, the
     * socket comparison alone decides closeness */
    if (num_parents != 4)
        return true;

    int nic_group_lid = MPIR_hwtopo_get_lid(nic_info->parent);
    int rank_group_lid = MPIR_hwtopo_get_lid(rank_group_gid);
    int group_distance = nic_group_lid - rank_group_lid;

    /* Only the nic's own group and the group right before it are candidates */
    if (group_distance != 0 && group_distance != 1)
        return false;

    /* The nic attributes are optional in libfabric; without PCI information
     * the local nic index cannot be determined, so treat the nic as not close */
    struct fi_info *info = (struct fi_info *) (nic_info->nic);
    if (info == NULL || info->nic == NULL || info->nic->bus_attr == NULL)
        return false;
    if (info->nic->bus_attr->bus_type != FI_BUS_PCI)
        return false;

    struct fi_pci_attr pci = info->nic->bus_attr->attr.pci;
    int nic_lid = MPIR_hwtopo_get_pci_network_lid(pci.domain_id,
                                                  pci.bus_id,
                                                  pci.device_id,
                                                  pci.function_id);

    /* Map 1st nic of the group to the previous group */
    if (nic_lid == 0)
        return group_distance == 1;
    /* Map 2nd nic of the group to the current group */
    if (nic_lid == 1)
        return group_distance == 0;

    return false;
}

/* Comparison function for NIC names. Used in qsort() */
static int compare_nic_names(const void *info1, const void *info2)
{
Expand Down Expand Up @@ -170,6 +214,19 @@ static int setup_single_nic(void)
}

#ifdef HAVE_LIBFABRIC_NIC
/* qsort() comparator for NICs in SPR SNC4 mode.
 * Close NICs are ordered before non-close NICs; NICs with equal closeness are
 * ordered by compare_nic_names(). */
static int compare_nics_snc4(const void *nic1, const void *nic2)
{
    const MPIDI_OFI_nic_info_t *lhs = (const MPIDI_OFI_nic_info_t *) nic1;
    const MPIDI_OFI_nic_info_t *rhs = (const MPIDI_OFI_nic_info_t *) nic2;
    const int lhs_close = (lhs->close != 0);
    const int rhs_close = (rhs->close != 0);

    /* Closeness differs: the close NIC sorts first */
    if (lhs_close != rhs_close)
        return lhs_close ? -1 : 1;

    /* Same closeness: fall back to the name ordering */
    return compare_nic_names(&(lhs->nic), &(rhs->nic));
}

/* TODO: Now that multiple NICs are detected, sort them based on preferred-ness,
* closeness and count of other processes using the NIC. */
static int setup_multi_nic(int nic_count)
Expand All @@ -195,24 +252,26 @@ static int setup_multi_nic(int nic_count)
MPIDI_OFI_global.num_nics = MPIR_CVAR_CH4_OFI_MAX_NICS;
}

/* Now go through every NIC and set initial information
* from current process's perspective */
for (int i = 0; i < MPIDI_OFI_global.num_nics; ++i) {
nics[i].nic = MPIDI_OFI_global.prov_use[i];
nics[i].id = i;
/* Determine NIC's "closeness" to current process */
nics[i].close = is_nic_close(nics[i].nic);
if (nics[i].close)
MPIDI_OFI_global.num_close_nics++;
/* Set the preference of all NICs to least preferable (lower is more preferable) */
nics[i].prefer = MPIDI_OFI_global.num_nics + 1;
nics[i].count = 0;
nics[i].num_close_ranks = 0;
/* Determine NIC's first normal parent topology
* item (e.g., typically the socket parent) */
nics[i].parent = get_nic_parent(nics[i].nic);
/* Expand list of close NIC-parent topology items or increment */
if (nics[i].close) {
int num_numa_nodes = MPIR_hwtopo_get_num_numa_nodes();
bool is_snc4_with_cxi_nics = false;

if ((num_numa_nodes == 8 || num_numa_nodes == 16))
if (MPIDI_OFI_global.num_nics > 1)
if (strstr(MPIDI_OFI_global.prov_use[0]->domain_attr->name, "cxi"))
is_snc4_with_cxi_nics = true;

/* Special case of nic assignment for SPR in SNC4 mode */
if (is_snc4_with_cxi_nics) {
for (int i = 0; i < MPIDI_OFI_global.num_nics; ++i) {
nics[i].nic = MPIDI_OFI_global.prov_use[i];
nics[i].id = i;
/* Set the preference of all NICs to least preferable (lower is more preferable) */
nics[i].prefer = MPIDI_OFI_global.num_nics + 1;
nics[i].count = 0;
nics[i].num_close_ranks = 0;

nics[i].parent = get_nic_parent(nics[i].nic);

int found = 0;
for (int j = 0; j < num_parents; ++j) {
if (parents[j] == nics[i].parent) {
Expand All @@ -225,6 +284,47 @@ static int setup_multi_nic(int nic_count)
num_parents++;
}
}
/* Use num_parents to determine nic closeness */
for (int i = 0; i < MPIDI_OFI_global.num_nics; ++i) {
nics[i].close = is_nic_close_snc4(&nics[i], num_parents);
if (nics[i].close)
MPIDI_OFI_global.num_close_nics++;
}

} else {
/* General case of nic assignment */

/* Now go through every NIC and set initial information
* from current process's perspective */
for (int i = 0; i < MPIDI_OFI_global.num_nics; ++i) {
nics[i].nic = MPIDI_OFI_global.prov_use[i];
nics[i].id = i;
/* Determine NIC's "closeness" to current process */
nics[i].close = is_nic_close(nics[i].nic);
if (nics[i].close)
MPIDI_OFI_global.num_close_nics++;
/* Set the preference of all NICs to least preferable (lower is more preferable) */
nics[i].prefer = MPIDI_OFI_global.num_nics + 1;
nics[i].count = 0;
nics[i].num_close_ranks = 0;
/* Determine NIC's first normal parent topology
* item (e.g., typically the socket parent) */
nics[i].parent = get_nic_parent(nics[i].nic);
/* Expand list of close NIC-parent topology items or increment */
if (nics[i].close) {
int found = 0;
for (int j = 0; j < num_parents; ++j) {
if (parents[j] == nics[i].parent) {
found = 1;
break;
}
}
if (!found) {
parents[num_parents] = nics[i].parent;
num_parents++;
}
}
}
}

/* If there were zero NICs on my socket, then just consider every NIC close
Expand All @@ -237,7 +337,13 @@ static int setup_multi_nic(int nic_count)

/* Sort the NICs array based on closeness first. This way all the close
* NICs are at the beginning of the array */
qsort(nics, MPIDI_OFI_global.num_nics, sizeof(nics[0]), compare_nics);
if (is_snc4_with_cxi_nics) {
/* Use a separate sorting function for snc4 nics in order to just compare
* closeness followed by nic name */
qsort(nics, MPIDI_OFI_global.num_nics, sizeof(nics[0]), compare_nics_snc4);
} else {
qsort(nics, MPIDI_OFI_global.num_nics, sizeof(nics[0]), compare_nics);
}

/* Because we cannot communicate with the other local processes to avoid collisions with the
* same NICs, just shift NICs that have multiple close NICs around according to their local
Expand Down
130 changes: 130 additions & 0 deletions src/util/mpir_hwtopo.c
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ static hwloc_obj_type_t get_hwloc_obj_type(MPIR_hwtopo_type_e type)
case MPIR_HWTOPO_TYPE__CPU:
hwloc_obj_type = HWLOC_OBJ_PACKAGE;
break;
case MPIR_HWTOPO_TYPE__GROUP:
hwloc_obj_type = HWLOC_OBJ_GROUP;
break;
case MPIR_HWTOPO_TYPE__CORE:
hwloc_obj_type = HWLOC_OBJ_CORE;
break;
Expand Down Expand Up @@ -320,6 +323,7 @@ MPIR_hwtopo_type_e MPIR_hwtopo_get_type_id(const char *name)
{"machine", MPIR_HWTOPO_TYPE__NODE},
{"socket", MPIR_HWTOPO_TYPE__SOCKET},
{"package", MPIR_HWTOPO_TYPE__PACKAGE},
{"group", MPIR_HWTOPO_TYPE__GROUP},
{"cpu", MPIR_HWTOPO_TYPE__CPU},
{"core", MPIR_HWTOPO_TYPE__CORE},
{"hwthread", MPIR_HWTOPO_TYPE__HWTHREAD},
Expand Down Expand Up @@ -628,3 +632,129 @@ MPIR_hwtopo_gid_t MPIR_hwtopo_get_dev_parent_by_pci(int domain, int bus, int dev
#endif
return gid;
}

/* Return the number of NUMA nodes covered by the topology object named "node".
 *
 * Every hwloc NUMA node object whose OS index is set in that object's nodeset
 * is counted. Returns 0 when built without hwloc, or when the "node" object
 * (or its nodeset) cannot be obtained from the topology. */
int MPIR_hwtopo_get_num_numa_nodes(void)
{
    int num_numa_nodes = 0;

#ifdef HAVE_HWLOC
    MPIR_hwtopo_gid_t gid = MPIR_hwtopo_get_obj_by_name("node");
    hwloc_obj_t obj =
        hwloc_get_obj_by_depth(hwloc_topology, HWTOPO_GET_DEPTH(gid), HWTOPO_GET_INDEX(gid));

    /* hwloc_get_obj_by_depth() returns NULL when no such object exists;
     * report zero NUMA nodes instead of dereferencing it */
    if (obj != NULL && obj->nodeset != NULL) {
        hwloc_obj_t numa = NULL;

        while ((numa = hwloc_get_next_obj_by_type(hwloc_topology, HWLOC_OBJ_NUMANODE,
                                                  numa)) != NULL) {
            if (hwloc_bitmap_isset(obj->nodeset, numa->os_index)) {
                num_numa_nodes++;
            }
        }
    }
#endif
    return num_numa_nodes;
}

/* Return the global id of the group ancestor of the first PU the calling
 * process is bound to.
 *
 * If the PU has no group ancestor, the global id of the topmost object reached
 * while walking up is returned. MPIR_HWTOPO_GID_ROOT is returned when built
 * without hwloc, or when the binding or the PU object cannot be obtained. */
MPIR_hwtopo_gid_t MPIR_hwtopo_get_first_pu_group(void)
{
    MPIR_hwtopo_gid_t gid = MPIR_HWTOPO_GID_ROOT;
#ifdef HAVE_HWLOC
    hwloc_obj_t obj = NULL;
    hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();

    if (cpuset == NULL)
        return gid;

    if (hwloc_get_proc_cpubind(hwloc_topology, getpid(), cpuset, HWLOC_CPUBIND_PROCESS) == 0) {
        /* hwloc_bitmap_first() yields -1 for an empty bitmap */
        int first_pu = hwloc_bitmap_first(cpuset);
        if (first_pu >= 0)
            obj = hwloc_get_pu_obj_by_os_index(hwloc_topology, (unsigned) first_pu);
    }
    /* The bitmap is only needed to find the first PU; release it */
    hwloc_bitmap_free(cpuset);

    if (obj == NULL)
        return gid;

    /* Traverse up the PU object until a group object is reached */
    while (obj->type != HWLOC_OBJ_GROUP && obj->parent)
        obj = obj->parent;
    gid = HWTOPO_GET_GID(get_type_class(obj->type), obj->depth, obj->logical_index);
#endif
    return gid;
}

/* Return the global id of the socket (hwloc package) ancestor of gid.
 *
 * The object identified by gid is walked upwards until a package object is
 * reached; the object itself qualifies if it already is a package. Returns
 * MPIR_HWTOPO_GID_ROOT when no package is found, when gid does not resolve to
 * a topology object, or when built without hwloc. */
MPIR_hwtopo_gid_t MPIR_hwtopo_get_parent_socket(MPIR_hwtopo_gid_t gid)
{
    MPIR_hwtopo_gid_t parent_gid = MPIR_HWTOPO_GID_ROOT;
#ifdef HAVE_HWLOC
    hwloc_obj_t obj =
        hwloc_get_obj_by_depth(hwloc_topology, HWTOPO_GET_DEPTH(gid), HWTOPO_GET_INDEX(gid));

    while (obj && obj->parent && obj->type != HWLOC_OBJ_PACKAGE)
        obj = obj->parent;

    /* obj is NULL when gid did not resolve; keep the root fallback then */
    if (obj && obj->type == HWLOC_OBJ_PACKAGE)
        parent_gid = HWTOPO_GET_GID(get_type_class(obj->type), obj->depth, obj->logical_index);
#endif
    return parent_gid;
}

#ifdef HAVE_HWLOC
/* Build the global id of an hwloc object from its type class, depth and
 * logical index. obj must not be NULL. */
static MPIR_hwtopo_gid_t obj_to_gid(hwloc_obj_t obj)
{
    return HWTOPO_GET_GID(get_type_class(obj->type), obj->depth, obj->logical_index);
}

/* Count the network OS devices in the subtree rooted at obj, obj included.
 * Both the regular children and the io children are visited recursively.
 * Returns 0 for a NULL obj. */
static int get_number_of_nics_below_me(hwloc_obj_t obj)
{
    int num = 0;

    if (obj == NULL)
        return 0;

    /* Found a network device, increment by 1. obj->attr is a union, so the
     * osdev member is only meaningful for OS device objects; check the object
     * type first to avoid misreading e.g. PCI or group attributes. */
    if (obj->type == HWLOC_OBJ_OS_DEVICE && obj->attr &&
        obj->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK)
        num++;

    /* Find network devices among all my 'regular' children */
    for (unsigned i = 0; i < obj->arity; i++) {
        num += get_number_of_nics_below_me(obj->children[i]);
    }

    /* Find network devices among all my io children */
    for (hwloc_obj_t io_child = obj->io_first_child; io_child != NULL;
         io_child = io_child->next_sibling) {
        num += get_number_of_nics_below_me(io_child);
    }
    return num;
}
#endif

/* Return the local index of the network device at the given PCI address
 * within its first non-io ancestor.
 *
 * The index is the number of network devices found below the previous
 * siblings and previous cousins of the PCI device that share the same first
 * non-io ancestor. Returns 0 when built without hwloc. */
int MPIR_hwtopo_get_pci_network_lid(int domain, int bus, int dev, int func)
{
    int myIndex = 0;
#ifdef HAVE_HWLOC
    hwloc_obj_t my_io_device = hwloc_get_pcidev_by_busid(hwloc_topology, domain, bus, dev, func);
    MPIR_Assert(my_io_device);
    hwloc_obj_t my_first_non_io = hwloc_get_non_io_ancestor_obj(hwloc_topology, my_io_device);
    MPIR_Assert(my_first_non_io);

    const MPIR_hwtopo_gid_t my_parent_gid = obj_to_gid(my_first_non_io);

    /* Determine the number of network devices before me in my first non io
     * ancestor; this is my local nic index, which is used for nic mapping.
     * First pass: previous siblings, stopping at the first one that hangs off
     * a different non-io ancestor. */
    for (hwloc_obj_t sibling = my_io_device->prev_sibling; sibling;
         sibling = sibling->prev_sibling) {
        MPIR_hwtopo_gid_t sibling_parent_gid =
            obj_to_gid(hwloc_get_non_io_ancestor_obj(hwloc_topology, sibling));

        if (sibling_parent_gid != my_parent_gid)
            break;

        myIndex += get_number_of_nics_below_me(sibling);
    }

    /* Second pass: previous cousins, with the same stop condition */
    for (hwloc_obj_t cousin = my_io_device->prev_cousin; cousin; cousin = cousin->prev_cousin) {
        MPIR_hwtopo_gid_t cousin_parent_gid =
            obj_to_gid(hwloc_get_non_io_ancestor_obj(hwloc_topology, cousin));

        if (cousin_parent_gid != my_parent_gid)
            break;

        myIndex += get_number_of_nics_below_me(cousin);
    }
#endif
    return myIndex;
}

0 comments on commit f566c5e

Please sign in to comment.