From 0a7215ee64c068f86718fe799dc7427d8a0606b8 Mon Sep 17 00:00:00 2001 From: Cristian-Vasile Achim <66278390+csccva@users.noreply.github.com> Date: Tue, 30 Apr 2024 11:56:17 +0300 Subject: [PATCH] Update 10-further-mpi-topics.md --- mpi/docs/10-further-mpi-topics.md | 799 ++++++++++++++++++++++++++++++ 1 file changed, 799 insertions(+) diff --git a/mpi/docs/10-further-mpi-topics.md b/mpi/docs/10-further-mpi-topics.md index 07a6add90..f73c5915a 100644 --- a/mpi/docs/10-further-mpi-topics.md +++ b/mpi/docs/10-further-mpi-topics.md @@ -69,3 +69,802 @@ MPI_Request_free (&recv_req); MPI_Request_free (&send_req); - All the parameters for the communication are set up in the initialization phase - Communication is started and finalized in separate steps - Persistent communication provides optimization opportunities for MPI library + +# One-sided communication {.section} + + +# Process topologies {.section} + +# Communicators + +
+- Communicators are dynamic +- A task can belong simultaneously to several communicators + - Unique rank in each communicator +
+
+![](img/communicator.png){.center width=80%} +
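
As a hedged illustration of the second point: with `MPI_Comm_split` a task obtains an additional communicator and has its own rank in each of them (the even/odd split below is only an example, not part of the topology interface itself).

```c
#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    int world_rank, sub_rank;
    MPI_Comm sub_comm;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    /* Split the tasks by even/odd rank; every task now belongs
       both to MPI_COMM_WORLD and to sub_comm */
    MPI_Comm_split(MPI_COMM_WORLD, world_rank % 2, 0, &sub_comm);
    MPI_Comm_rank(sub_comm, &sub_rank);

    printf("World rank %d has rank %d in sub_comm\n", world_rank, sub_rank);

    MPI_Comm_free(&sub_comm);
    MPI_Finalize();
    return 0;
}
```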

# Process topologies

- MPI topology mechanism adds additional information about the
  communication pattern to a communicator
- MPI topology can provide a convenient naming scheme for processes
- MPI topology may assist the library and the runtime system in
  optimizations
    - In most implementations the main advantage is, however, better programmability
- Topologies are defined by creating special user-defined
  communicators

# Virtual topologies

- MPI topologies are virtual, *i.e.* they do not necessarily have any relation
  to the physical structure of the supercomputer
- The assignment of processes to physical CPU cores typically happens
  outside MPI (and before MPI is initialized)
- The physical structure can in principle be taken into account when
  creating topologies; however, MPI implementations may not
  implement that in practice

# Virtual topologies

- A communication pattern can be represented by a graph: nodes represent
  processes and edges connect processes that communicate with each other
- We discuss here only the Cartesian topology, which represents a regular
  multidimensional grid.

# Two dimensional Cartesian grid

+![](img/two-d-cartesian-grid.png){width=90%} +
+
+- Row major numbering +- Topology places no restrictions on communication + - any process can communicate with any other process +- Any dimension can be finite or periodic +
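
As a sketch of the row-major numbering (the actual mapping is obtained with the `MPI_Cart_rank` and `MPI_Cart_coords` routines introduced below; `cart_rank_2d` is only an illustrative helper, not an MPI routine):

```c
/* Illustrative sketch of row-major numbering in a 2D process grid;
   MPI_Cart_rank performs this mapping in a Cartesian communicator */
int cart_rank_2d(int row, int col, const int dims[2])
{
    return row * dims[1] + col;   /* e.g. in a 4x4 grid, (1, 2) -> rank 6 */
}
```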
+ +# Communicator in Cartesian grid: MPI_Cart_create {.split-definition} + +MPI_Cart_create(`oldcomm`{.input}, `ndims`{.input}, `dims`{.input}, `periods`{.input}, `reorder`{.input}, `newcomm`{.output}) + : `oldcomm`{.input} + : communicator + + `ndims`{.input} + : number of dimensions + + `dims`{.input} + : integer array (size ndims) that defines the number of processes in each + dimension + + `periods`{.input} + : array that defines the periodicity of each dimension + + `reorder`{.input} + : is MPI allowed to renumber the ranks + + `newcomm`{.output} + : new Cartesian communicator + +# Determining division: MPI_Dims_create + +- Decompose a given number of processes into balanced distribution + +MPI_Dims_create(`ntasks`{.input}, `ndims`{.input}, `di`{.input}`ms`{.output}) + : `ntasks`{.input} + : number of tasks in a grid + : `ndims`{.input} + : number of dimensions + : `di`{.input}`ms`{.output} + : integer array (size ndims). A value of 0 means that MPI fills in + suitable value + + + +# Translating rank to coordinates + +- Checking the Cartesian communication topology coordinates for a + specific rank + +`MPI_Cart_coords(comm, rank, maxdim, coords)` + : `comm`{.input} + : Cartesian communicator + : `rank`{.input} + : rank to convert + : `maxdim`{.input} + : length of the coords vector + : `coords`{.output} + : coordinates in Cartesian topology that corresponds to rank + + +# Translating coordinates to rank + +- Checking the rank of the process at specific Cartesian communication + topology coordinates + +`MPI_Cart_rank(comm, coords, rank)` + : `comm`{.input} + : Cartesian communicator + : `coords`{.input} + : array of coordinates + : `rank`{.output} + : a rank corresponding to coords + + +# Creating a Cartesian communication topology + +```fortranfree +dims = 0 +period=(/ .true., .false. /) + +call mpi_dims_create(ntasks, 2, dims, rc) +call mpi_cart_create(mpi_comm_world, 2, dims, period, .true., comm2d, rc) +call mpi_comm_rank(comm2d, rank, rc) +call mpi_cart_coords(comm2d, rank, 2, coords, rc) +``` + + +# How to communicate in a Cartesian topology + +`MPI_Cart_shift(comm, direction, displ, source, dest)` + : `comm`{.input} + : Cartesian communicator + : `direction`{.input} + : shift direction (0 or 1 in 2D) + : `displ`{.input} + : shift displacement (1 for next cell etc, < 0 for source from "down"/"right" directions) + : `source`{.output} + : rank of source process + : `dest`{.output} + : rank of destination process + +# How to communicate in a Cartesian topology + +- Note! *Both* `source` and `dest` are *output* parameters. The + coordinates of the calling task is implicit input. +- `source` and `dest` are defined as for a shift like operation: + receive from source, send to destination + $$ + \text{displ = 1} \Longrightarrow + \begin{cases} + \text{source = mycoord - 1} \\ + \text{dest = mycoord + 1} + \end{cases} + $$ +- With a non-periodic grid, source or dest can land outside of the grid + - `MPI_PROC_NULL` is then returned + +# How to communicate in a Cartesian topology + +![](img/two-d-cartesian-shift.png){.center width=60%} + + +# Halo exchange + + +```fortranfree +call mpi_cart_shift(comm2d, 0, 1, nbr_up, nbr_down, rc) +call mpi_cart_shift(comm2d, 1, 1, nbr_left, nbr_right, rc) +... + +! left boundaries: send to left, receive from right +call mpi_sendrecv(buf(1,1), 1, coltype, nbr_left, tag_left, & + buf(1,n+1), 1, coltype, nbr_right, tag_left, & + comm2d, mpi_status_ignore, rc) + +! right boundaries: send to right, receive from left +... +! 
top boundaries: send to above, receive from below
call mpi_sendrecv(buf(1,1), 1, rowtype, nbr_up, tag_up, &
                  buf(n+1,1), 1, rowtype, nbr_down, tag_up, &
                  comm2d, mpi_status_ignore, rc)

! bottom boundaries: send to below, receive from above
...
```


# Summary

- Process topologies provide a convenient referencing scheme for grid-like
  decompositions
- Usage pattern
    - Define a process grid with `MPI_Cart_create`
    - Use the obtained new communicator as the `comm` argument in communication
      routines
    - For getting the ranks of the neighboring processes, use `MPI_Cart_shift`
      or wrangle with `MPI_Cart_coords` and `MPI_Cart_rank`
- MPI also provides more general graph topologies


# Neighborhood collectives {.section}

# Neighborhood collectives

- Neighborhood collectives build on top of process topologies
- Provide optimization possibilities for the MPI library for communication
  patterns involving neighbours
    - Nearest neighbors in a Cartesian topology
    - Processes connected by an edge in a general graph
- Similar to ordinary collectives, all tasks within a communicator
  need to call the routine
- Possible to have a multidimensional halo exchange with a single MPI call

# Neighborhood collectives in Cartesian grid

- Only nearest neighbors, *i.e.* those corresponding to
  `MPI_Cart_shift` with displacement=1.
- Boundaries in finite dimensions are treated as if communicating with `MPI_PROC_NULL`
+ +
+![](img/cartesian-neighborhood.png){width=90%} +
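
For example, a single value can be exchanged with all four nearest neighbors of a 2D Cartesian communicator in one call. A hedged sketch (`comm2d` created with `MPI_Cart_create` as in the earlier example; for Cartesian communicators the neighbor order is the negative and then the positive direction of each dimension in turn):

```c
/* Sketch: one double to/from each of the four nearest neighbors.
   Entries corresponding to MPI_PROC_NULL neighbors (at non-periodic
   boundaries) are simply not sent or received. */
double sendbuf[4], recvbuf[4];
/* ... fill sendbuf in neighbor order: -dim0, +dim0, -dim1, +dim1 ... */
MPI_Neighbor_alltoall(sendbuf, 1, MPI_DOUBLE,
                      recvbuf, 1, MPI_DOUBLE, comm2d);
```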

# Neighborhood collectives

- Two main neighborhood operations
    - `MPI_Neighbor_allgather` : send same data to all neighbors, receive different
      data from neighbors
    - `MPI_Neighbor_alltoall` : send and receive different data
      between all the neighbors
- Also variants where a different number or type of elements is
  communicated
- Non-blocking versions with similar semantics as non-blocking
  collectives
    - Request parameter at the end of the list of arguments

# Summary

- Neighborhood collectives enable communication between neighbours in a process topology
  with a single MPI call
- Neighborhood communication provides optimization opportunities for the MPI library

# User-defined datatypes {.section}

# MPI datatypes

- MPI datatypes are used for communication purposes
    - Datatype tells MPI where to take the data when sending or where
      to put data when receiving
- Elementary datatypes (`MPI_INT`, `MPI_REAL`, ...)
    - Different types in Fortran and C, corresponding to the languages' basic
      types
    - Enable communication using a contiguous memory sequence of
      identical elements (e.g. vector or matrix)

# Sending a matrix row (Fortran)

- A row of a matrix is not contiguous in memory in Fortran


+ +![](img/fortran-array-layout.png){.center width=50%} + +

+ +- Several options for sending a row: + - Use several send commands for each element of a row + - Copy data to temporary buffer and send that with one send + command + - Create a matching datatype and send all data with one send + command + +# User-defined datatypes + +- Use elementary datatypes as building blocks +- Enable communication of + - Non-contiguous data with a single MPI call, e.g. rows or columns + of a matrix + - Heterogeneous data (structs in C, types in Fortran) + - Larger messages, count is `int` (32 bits) in C +- Provide higher level of programming + - Code is more compact and maintainable +- Needed for getting the most out of MPI I/O + +# User-defined datatypes + +- User-defined datatypes can be used both in point-to-point + communication and collective communication +- The datatype instructs where to take the data when sending or where + to put data when receiving + - Non-contiguous data in sending process can be received as + contiguous or vice versa + +# Using user-defined datatypes + +- A new datatype is created from existing ones with a datatype constructor + - Several routines for different special cases +- A new datatype must be committed before using it in communication + - **`MPI_Type_commit`(`newtype`{.input})** +- A type should be freed after it is no longer needed + - **`MPI_Type_free`(`newtype`{.input})** + +# Datatype constructors + +| Datatype | Usage | +|----------------------------|-------------------------------------------| +| `MPI_Type_contiguous` | contiguous datatypes | +| `MPI_Type_vector` | regularly spaced datatype | +| `MPI_Type_indexed` | variably spaced datatype | +| `MPI_Type_create_subarray` | subarray within a multi-dimensional array | +| `MPI_Type_create_hvector` | like vector, but uses bytes for spacings | +| `MPI_Type_create_hindexed` | like index, but uses bytes for spacings | +| `MPI_Type_create_struct` | fully general datatype | + + + +# User-defined datatypes: regularly spaced data (vector) {.section} + +# MPI_TYPE_VECTOR + +- Creates a new type from equally spaced identical blocks + +

+MPI_Type_vector(`count`{.input}, `blocklen`{.input}, `stride`{.input}, `oldtype`{.input}, `newtype`{.output}) + : `count`{.input} + : number of blocks + : `blocklen`{.input} + : number of elements in each block + : `stride`{.input} + : displacement between the blocks +
+
+

+![](img/type_vector.png){.center width=100%} +

+ +# Example: sending rows of matrix in Fortran + +```fortranfree +integer, parameter :: n=2, m=3 +real, dimension(n,m) :: a +type(mpi_datatype) :: rowtype +! create a derived type +call mpi_type_vector(m, 1, n, mpi_real, rowtype, ierr) +call mpi_type_commit(rowtype, ierr) +! send a row +call mpi_send(a, 1, rowtype, dest, tag, comm, ierr) +! free the type after it is not needed +call mpi_type_free(rowtype, ierr) +``` + +

+![](img/fortran-array-layout.png){.center width=50%} + + + +# Sending multiple elements: Extent + +- When communicating multiple elements, MPI uses the concept of + extent + - next element is read or write *extent* bytes apart from the + previous one in the buffer +- Extent is determined from the displacements and sizes of the basic + types + - The lower bound (LB) = min(displacement) + - Extent = max(displacement + size) - LB + padding +- Communicating multiple user-defined types at once may not behave as + expected if there are gaps in the beginning or end of the derived type + + +# Multiple MPI_TYPE_VECTORs + +![](img/vector-extent.png) + + +# Getting extent and lower bound + +`MPI_Type_get_extent(type, lb, extent)` + : `type`{.input} + : Datatype + + `lb`{.output} + : Lower bound of type (in bytes) + + `extent`{.output} + : Extent of type (in bytes) + + +# Setting extent and lower bound + +`MPI_Type_create_resized(type, lb, extent, newtype)` + : `type`{.input} + : Old datatype + + `lb`{.input} + : New lower bound (in bytes) + + `extent`{.input} + : New extent (in bytes) + + `newtype`{.output} + : New datatype, commit before use + +# Multiple MPI_TYPE_VECTORs + +![](img/vector-resize.png) + + + +# From non-contiguous to contiguous data + +

+![](img/contiguous.png){.center width=100%} +
+
+```c +if (rank == 0) + MPI_Type_vector(n, 1, 2, MPI_FLOAT, + &newtype) + ... + MPI_Send(A, 1, newtype, 1, ...) +else + MPI_Recv(B, n, MPI_FLOAT, 0, ...) + +``` + +```c +if (rank == 0) + MPI_Send(A, n, MPI_FLOAT, 1, ...) +else + MPI_Type_vector(n, 1, 2, MPI_FLOAT, + &newtype) + ... + MPI_Recv(B, 1, newtype, 0, ...) + +``` +
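
Filling in the parts elided above, the first variant might look like the following sketch (`A` with at least `2*n-1` floats on rank 0, `B` with `n` floats on rank 1, and tag 0 are assumptions for the example):

```c
/* Sketch: rank 0 sends every second element of A as one derived type,
   rank 1 receives them as n contiguous floats */
MPI_Datatype newtype;

if (rank == 0) {
    MPI_Type_vector(n, 1, 2, MPI_FLOAT, &newtype);
    MPI_Type_commit(&newtype);
    MPI_Send(A, 1, newtype, 1, 0, MPI_COMM_WORLD);
    MPI_Type_free(&newtype);
} else if (rank == 1) {
    MPI_Recv(B, n, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
```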
+ + + +# User-defined datatypes: heterogeneous data (struct) {.section} + + +# Example: sending an array of structs + +```c +struct ParticleStruct { + int charge; /* particle charge */ + double coords[3]; /* particle coordinates */ + double velocity[3]; /* particle velocity vector */ +}; + +struct ParticleStruct particle[1000]; + +/* How to define a new Particletype? */ +... + +MPI_Send(particle, 1000, Particletype, dest, tag, MPI_COMM_WORLD); +``` + + +# MPI_TYPE_CREATE_STRUCT {.split-definition} + +- Creates a new type from heterogeneous blocks + - e.g. Fortran types and C structures + +`MPI_Type_create_struct(count, blocklens, displs, types, newtype)` + : `count`{.input} + : number of blocks + + `blocklens`{.input} + : lengths of blocks (array) + + `displs`{.input} + : displacements of blocks in bytes (array) + + `types`{.input} + : types of blocks (array) + + `newtype`{.output} + : new datatype + + `-`{.ghost} + : `-`{.ghost} + +![](img/type-struct.png) + + +# Determining displacements + +- The displacements of blocks should be determined by using the function + +`MPI_Get_address(pointer, address)` + : `pointer`{.input} + : pointer to the variable of interest + + `address`{.output} + : address of the variable, type is + + - `MPI_Aint` (C) + - `integer(mpi_address_kind)` (Fortran) + + +# Gaps between structs: Extent + +- When sending an array of the structures, the extent needs to be checked as for other user-defined datatypes + - It's implicit assumed that the **extent** of the datatype would be the same as the + size of the C struct + - This is not necessarily the case, and if there are gaps in memory between the successive structures, sending + does not work correctly with the default extent + + +# Example: sending an array of structs + +```c +struct ParticleStruct { + int charge; /* particle charge */ + double coords[3]; /* particle coordinates */ + double velocity[3]; /* particle velocity vector */ +}; + +struct ParticleStruct particle[1000]; + +/* Define a new type */ +MPI_Datatype Particletype; +MPI_Datatype type[3] = {MPI_INT, MPI_DOUBLE, MPI_DOUBLE}; +int blocklen[3] = {1, 3, 3}; + +... +``` + +# Example: sending an array of structs + +```c +... +/* Determine displacements */ +MPI_Aint disp[3]; +MPI_Get_address(&particle[0].charge, &disp[0]); +MPI_Get_address(&particle[0].coords, &disp[1]); +MPI_Get_address(&particle[0].velocity, &disp[2]); + +/* Make displacements relative */ +disp[2] -= disp[0]; +disp[1] -= disp[0]; +disp[0] = 0; + +/* Create the new type */ +MPI_Type_create_struct(3, blocklen, disp, type, &Particletype); +MPI_Type_commit(&Particletype); + +... +``` + +# Example: sending an array of structs + +```c +... 
+/* Check extent */ +MPI_Datatype oldtype; +MPI_Aint lb, extent; +MPI_Type_get_extent(Particletype, &lb, &extent); +if ( extent != sizeof(particle[0]) ) { + oldtype = Particletype; + MPI_Type_create_resized(oldtype, 0, sizeof(particle[0]), &Particletype); + MPI_Type_commit(&Particletype); + MPI_Type_free(&oldtype); +} + +/* Now we are ready to use the new type */ +MPI_Send(particle, 1000, Particletype, dest, tag, MPI_COMM_WORLD); + +/* Free the type after not needed anymore */ +MPI_Type_free(&Particletype); +``` + + +# Other ways of communicating non-uniform data + +- Structures and types as continuous stream of bytes: Communicate + everything using `MPI_BYTE` + - Portability can be an issue - be careful (*cf.* extent) + +```c +struct ParticleStruct particle[1000]; + +MPI_Send(particle, 1000*sizeof(particle[0]), MPI_BYTE, ...); +``` + + +# Other ways of communicating non-uniform data + +- Non-contiguous data by manual packing + - Copy data into or out from temporary buffer +- Use `MPI_Pack` and `MPI_Unpack` functions + - Performance will likely be an issue + + +# User-defined datatypes: general remarks {.section} + +# Performance + +- Main motivation for using datatypes is not necessarily performance – manual + packing can be faster +- Performance depends on the datatype – more general datatypes are + often slower +- Overhead is potentially reduced by: + - Sending one long message instead of many small messages + - Avoiding the need to pack data in temporary buffers +- Performance should be tested on target platforms + - Performance is currently poor with GPU-aware MPI + + +# Summary + +- User-defined types enable communication of non-contiguous or + heterogeneous data with single MPI communication operations + - Improves code readability & portability + - Allows optimizations by the MPI runtime +- Life cycle of derived type: create, commit, free +- Ensuring the correct extent of the derived data type is important +- MPI provides constructors for several specific types + + +# User-defined datatypes: other datatype constructors {.section} + +# MPI_TYPE_CONTIGUOUS + +MPI_Type_contiguous(`count`{.input}, `oldtype`{.input}, `newtype`{.output}) + : `count`{.input} + : number of oldtypes + : `oldtype`{.input} + : old type + : `newtype`{.output} + : new datatype + +- Usage mainly for programming convenience + - derived types in all communication calls + + +
+```fortranfree +! Using derived type +call mpi_send(buf, 1, conttype, ...) +``` +
+
+```fortranfree +! Equivalent call with count and basic type +call mpi_send(buf, count, MPI_REAL, ...) +``` +
+
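
The derived type used above could be created, for example, as in this sketch (C shown here; `buf`, `count`, `dest` and `tag` are assumed to be defined):

```c
/* Sketch: pack count contiguous elements into one derived type */
MPI_Datatype conttype;
MPI_Type_contiguous(count, MPI_FLOAT, &conttype);
MPI_Type_commit(&conttype);

/* one element of conttype carries the same data as count MPI_FLOATs */
MPI_Send(buf, 1, conttype, dest, tag, MPI_COMM_WORLD);

MPI_Type_free(&conttype);
```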
+ + +# MPI_TYPE_INDEXED {.split-def-3} + +- Creates a new type from blocks comprising identical elements + - The size and displacements of the blocks may vary + +MPI_Type_indexed(`count`{.input}, `blocklens`{.input}, `displs`{.input}, `oldtype`{.input}, `newtype`{.output}) + : `count`{.input} + : number of blocks + + `blocklens`{.input} + : lengths of the blocks (array) + + `displs`{.input} + : displacements (array) in extent of oldtypes + + `oldtype`{.input} + : original type + + `newtype`{.output} + : new type + + `-`{.ghost} + : `-`{.ghost} + +

+![](img/type_indexed.png){.center width=100%} + +# Example: an upper triangular matrix + +

```c
/* Upper triangular matrix */
double a[100][100];
int disp[100], blocklen[100], i;
MPI_Datatype upper;
/* compute start and size of rows */
for (i=0; i<100; i++) {
    disp[i] = 100*i+i;
    blocklen[i] = 100-i;
}
/* create a datatype for upper tr matrix */
MPI_Type_indexed(100, blocklen, disp,
                 MPI_DOUBLE, &upper);
MPI_Type_commit(&upper);
/* ... send it ... */
MPI_Send(a, 1, upper, dest, tag, MPI_COMM_WORLD);
MPI_Type_free(&upper);
```
+ +
+![](img/triangle.png){.center width=65%} +
+ +# Subarray + +
- Subarray datatype describes an N-dimensional subarray within an
  N-dimensional array
- Array can have either C (row major) or Fortran (column major)
  ordering in memory
+ +
+![](img/subarray.png){.center width=60%} +
+ + +# MPI_TYPE_CREATE_SUBARRAY {.split-def-3} + + +MPI_Type_create_subarray(`ndims`{.input}, `sizes`{.input}, `subsizes`{.input}, `offsets`{.input}, `order`{.input}, `oldtype`{.input}, `newtype`{.output}) + : `ndims`{.input} + : number of array dimensions + + `sizes`{.input} + : number of array elements in each dimension (array) + + `subsizes`{.input} + : number of subarray elements in each dimension (array) + + `offsets`{.input} + : starting point of subarray in each dimension (array) + + `order`{.input} + : storage order of the array. Either `MPI_ORDER_C` or + `MPI_ORDER_FORTRAN` + + `oldtype`{.input} + : oldtype + + `newtype`{.output} + : resulting type + + `-`{.ghost} + : `-`{.ghost} + +# Example: subarray + +
+ +```c +int a_size[2] = {5,5}; +int sub_size[2] = {2,3}; +int sub_start[2] = {1,2}; +MPI_Datatype sub_type; +double array[5][5]; + +for(i = 0; i < a_size[0]; i++) + for(j = 0; j < a_size[1]; j++) + array[i][j] = rank; + +MPI_Type_create_subarray(2, a_size, sub_size, + sub_start, MPI_ORDER_C, MPI_DOUBLE, &sub_type); + +MPI_Type_commit(&sub_type); + +if (rank==0) + MPI_Recv(array[0], 1, sub_type, 1, 123, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); +if (rank==1) + MPI_Send(array[0], 1, sub_type, 0, 123, + MPI_COMM_WORLD); + +MPI_Type_free(&sub_type); + +``` + +
+
+![](img/type_array.png){.center width=100%} +
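
Continuing the example above, rank 0 could verify the result: only the 2 x 3 block starting at row 1, column 2 has been overwritten with values from rank 1 (a hedged sketch; `printf` from `<stdio.h>`):

```c
/* Sketch: print the array on rank 0 after the receive; elements inside
   the subarray now hold 1 (the sender's rank), the rest remain 0 */
if (rank == 0) {
    for (int i = 0; i < a_size[0]; i++) {
        for (int j = 0; j < a_size[1]; j++)
            printf("%4.1f", array[i][j]);
        printf("\n");
    }
}
```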