/*
 * ompi_coll_tuned_allgatherv_intra_neighborexchange
 *
 * Function:     allgatherv using N/2 steps (O(N))
 * Accepts:      Same arguments as MPI_Allgatherv
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Neighbor Exchange algorithm for allgather, adapted for
 *               allgatherv.
 *               Described by Chen et al. in
 *               "Performance Evaluation of Allgather Algorithms on
 *               Terascale Linux Cluster with Fast Ethernet",
 *               Proceedings of the Eighth International Conference on
 *               High-Performance Computing in Asia-Pacific Region.
 *
 *               Rank r exchanges a message with one of its neighbors and
 *               forwards the data further in the next step.
 *
 *               No additional memory requirements.
 *
 * Limitations:  The algorithm works only for an even number of processes.
 *               For an odd number of processes we switch to the ring
 *               algorithm.
 *
 * Example on 6 nodes:
 *   Initial state
 *    #   0   1   2   3   4   5
 *       [0] [ ] [ ] [ ] [ ] [ ]
 *       [ ] [1] [ ] [ ] [ ] [ ]
 *       [ ] [ ] [2] [ ] [ ] [ ]
 *       [ ] [ ] [ ] [3] [ ] [ ]
 *       [ ] [ ] [ ] [ ] [4] [ ]
 *       [ ] [ ] [ ] [ ] [ ] [5]
 *   Step 0:
 *    #   0   1   2   3   4   5
 *       [0] [0] [ ] [ ] [ ] [ ]
 *       [1] [1] [ ] [ ] [ ] [ ]
 *       [ ] [ ] [2] [2] [ ] [ ]
 *       [ ] [ ] [3] [3] [ ] [ ]
 *       [ ] [ ] [ ] [ ] [4] [4]
 *       [ ] [ ] [ ] [ ] [5] [5]
 *   Step 1:
 *    #   0   1   2   3   4   5
 *       [0] [0] [0] [ ] [ ] [0]
 *       [1] [1] [1] [ ] [ ] [1]
 *       [ ] [2] [2] [2] [2] [ ]
 *       [ ] [3] [3] [3] [3] [ ]
 *       [4] [ ] [ ] [4] [4] [4]
 *       [5] [ ] [ ] [5] [5] [5]
 *   Step 2:
 *    #   0   1   2   3   4   5
 *       [0] [0] [0] [0] [0] [0]
 *       [1] [1] [1] [1] [1] [1]
 *       [2] [2] [2] [2] [2] [2]
 *       [3] [3] [3] [3] [3] [3]
 *       [4] [4] [4] [4] [4] [4]
 *       [5] [5] [5] [5] [5] [5]
 */
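
/*
 * With p processes the exchange completes in p/2 steps: step 0 moves a
 * single block per process and every later step moves two, so each
 * process ends up with 2 + 2*(p/2 - 1) = p blocks, as the 6-node example
 * above illustrates.
 */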

#include "colls_private.h"

int
smpi_coll_tuned_allgatherv_ompi_neighborexchange(void *sbuf, int scount,
                                                 MPI_Datatype sdtype,
                                                 void* rbuf, int *rcounts,
                                                 int *rdispls,
                                                 MPI_Datatype rdtype,
                                                 MPI_Comm comm)
{
    int line = -1;
    int rank, size;
    int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
    int i, even_rank;
    int err = 0;
    ptrdiff_t slb, rlb, sext, rext;
    char *tmpsend = NULL, *tmprecv = NULL;

    size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);
85 "coll:tuned:allgatherv_ompi_neighborexchange WARNING: odd size %d, switching to ring algorithm",
87 return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
94 "coll:tuned:allgatherv_ompi_neighborexchange rank %d", rank);

    err = smpi_datatype_extent(sdtype, &slb, &sext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    err = smpi_datatype_extent(rdtype, &rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
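
    /* rext turns the element displacements in rdispls[] into byte
       offsets within rbuf below. */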

    /* Initialization step:
       - if the send buffer is not MPI_IN_PLACE, copy it into the
         appropriate block of the receive buffer.
    */
    tmprecv = (char*) rbuf + rdispls[rank] * rext;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = smpi_datatype_copy(tmpsend, scount, sdtype,
                                 tmprecv, rcounts[rank], rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    /* Determine neighbors, the order in which blocks will arrive, etc. */
    even_rank = !(rank % 2);
    if (even_rank) {
        neighbor[0] = (rank + 1) % size;
        neighbor[1] = (rank - 1 + size) % size;
        recv_data_from[0] = rank;
        recv_data_from[1] = rank;
        offset_at_step[0] = (+2);
        offset_at_step[1] = (-2);
    } else {
        neighbor[0] = (rank - 1 + size) % size;
        neighbor[1] = (rank + 1) % size;
        recv_data_from[0] = neighbor[0];
        recv_data_from[1] = neighbor[0];
        offset_at_step[0] = (-2);
        offset_at_step[1] = (+2);
    }
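
    /* Concrete trace for size = 6, rank = 2 (even): neighbor[0] = 3 and
       neighbor[1] = 1; step 0 trades block 2 for block 3 with rank 3,
       step 1 trades blocks (2,3) for (0,1) with rank 1, and step 2
       trades blocks (0,1) for (4,5) with rank 3, completing the
       gather. */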

    /* Communication loop:
       - First step is special: exchange a single block with neighbor[0].
       - Rest of the steps:
         update recv_data_from according to offset, and
         exchange two blocks with the appropriate neighbor;
         the send location becomes the previous receive location.
       Note that we need to create indexed datatypes to send and receive
       these blocks properly.
    */
    tmprecv = (char*)rbuf + rdispls[neighbor[0]] * rext;
    tmpsend = (char*)rbuf + rdispls[rank] * rext;
    smpi_mpi_sendrecv(tmpsend, rcounts[rank], rdtype,
                      neighbor[0], COLL_TAG_ALLGATHERV,
                      tmprecv, rcounts[neighbor[0]], rdtype,
                      neighbor[0], COLL_TAG_ALLGATHERV,
                      comm, MPI_STATUS_IGNORE);
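
    /* After step 0 every process holds two consecutive blocks: its own
       and the one received from neighbor[0] (see "Step 0" above). */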

    /* Determine initial sending counts and displacements */
    if (even_rank) {
        send_data_from = rank;
    } else {
        send_data_from = recv_data_from[0];
    }
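
    /* Each rank starts sending from the consecutive pair it assembled in
       step 0: blocks (r, r+1) for an even rank r, blocks (r-1, r) for an
       odd one. */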

    for (i = 1; i < (size / 2); i++) {
        MPI_Datatype new_rdtype, new_sdtype;
        int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
        const int i_parity = i % 2;
        recv_data_from[i_parity] =
            (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;
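
        /* recv_data_from[] stays even (it starts even and moves by +/-2),
           so the pair (index, index + 1) below never wraps around rank
           size - 1. */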

        /* Create new indexed types for sending and receiving.
           We are sending data from ranks (send_data_from) and
           (send_data_from+1); we are receiving data from ranks
           (recv_data_from[i_parity]) and (recv_data_from[i_parity]+1).
        */
        new_scounts[0] = rcounts[send_data_from];
        new_scounts[1] = rcounts[(send_data_from + 1)];
        new_sdispls[0] = rdispls[send_data_from];
        new_sdispls[1] = rdispls[(send_data_from + 1)];
        err = smpi_datatype_indexed(2, new_scounts, new_sdispls, rdtype,
                                    &new_sdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        smpi_datatype_commit(&new_sdtype);

        new_rcounts[0] = rcounts[recv_data_from[i_parity]];
        new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
        new_rdispls[0] = rdispls[recv_data_from[i_parity]];
        new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
        err = smpi_datatype_indexed(2, new_rcounts, new_rdispls, rdtype,
                                    &new_rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        smpi_datatype_commit(&new_rdtype);
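
        /* The displacements are baked into the indexed datatypes, so both
           the send and the receive pointer start at the base of rbuf. */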
        tmprecv = (char*)rbuf;
        tmpsend = (char*)rbuf;

        /* Sendreceive */
        smpi_mpi_sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
                          COLL_TAG_ALLGATHERV,
                          tmprecv, 1, new_rdtype, neighbor[i_parity],
                          COLL_TAG_ALLGATHERV,
                          comm, MPI_STATUS_IGNORE);

        send_data_from = recv_data_from[i_parity];

        smpi_datatype_free(&new_sdtype);
        smpi_datatype_free(&new_rdtype);
    }

    return MPI_SUCCESS;

 err_hndl:
    XBT_DEBUG("%s:%4d\tError occurred %d, rank %2d",
              __FILE__, line, err, rank);
    return err;
}