/*
 * ompi_coll_tuned_allgatherv_intra_neighborexchange
 *
 * Function:     allgatherv using N/2 steps (O(N))
 * Accepts:      Same arguments as MPI_Allgatherv
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Neighbor Exchange algorithm for allgather, adapted for
 *               allgatherv.
 *               Described by Chen et al. in
 *               "Performance Evaluation of Allgather Algorithms on
 *               Terascale Linux Cluster with Fast Ethernet",
 *               Proceedings of the Eighth International Conference on
 *               High-Performance Computing in Asia-Pacific Region.
 *
 *               Rank r exchanges a message with one of its neighbors and
 *               forwards the data further in the next step.
 *
 *               No additional memory requirements.
 *
 * Limitations:  The algorithm works only for an even number of processes.
 *               For an odd number of processes we switch to the ring algorithm.
 *
 * Example on 6 nodes:
 *   Initial state
 *     #    0    1    2    3    4    5
 *         [0]  [ ]  [ ]  [ ]  [ ]  [ ]
 *         [ ]  [1]  [ ]  [ ]  [ ]  [ ]
 *         [ ]  [ ]  [2]  [ ]  [ ]  [ ]
 *         [ ]  [ ]  [ ]  [3]  [ ]  [ ]
 *         [ ]  [ ]  [ ]  [ ]  [4]  [ ]
 *         [ ]  [ ]  [ ]  [ ]  [ ]  [5]
 *   Step 0: exchange the local block with the first neighbor
 *         [0]  [0]  [ ]  [ ]  [ ]  [ ]
 *         [1]  [1]  [ ]  [ ]  [ ]  [ ]
 *         [ ]  [ ]  [2]  [2]  [ ]  [ ]
 *         [ ]  [ ]  [3]  [3]  [ ]  [ ]
 *         [ ]  [ ]  [ ]  [ ]  [4]  [4]
 *         [ ]  [ ]  [ ]  [ ]  [5]  [5]
 *   Step 1: forward the newly obtained pair of blocks to the other neighbor
 *         [0]  [0]  [0]  [ ]  [ ]  [0]
 *         [1]  [1]  [1]  [ ]  [ ]  [1]
 *         [ ]  [2]  [2]  [2]  [2]  [ ]
 *         [ ]  [3]  [3]  [3]  [3]  [ ]
 *         [4]  [ ]  [ ]  [4]  [4]  [4]
 *         [5]  [ ]  [ ]  [5]  [5]  [5]
 *   Step 2: forward the pair received in step 1 back to the first neighbor
 *         [0]  [0]  [0]  [0]  [0]  [0]
 *         [1]  [1]  [1]  [1]  [1]  [1]
 *         [2]  [2]  [2]  [2]  [2]  [2]
 *         [3]  [3]  [3]  [3]  [3]  [3]
 *         [4]  [4]  [4]  [4]  [4]  [4]
 *         [5]  [5]  [5]  [5]  [5]  [5]
 */
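
/* Cost sketch: with P processes (P even) the algorithm takes P/2 steps.
   Step 0 moves one block per rank and every later step moves two, so each
   rank receives 1 + 2*(P/2 - 1) = P - 1 remote blocks in total, consistent
   with the N/2-step, O(N) behavior stated above. */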
#include "colls_private.h"
#define MCA_COLL_BASE_TAG_ALLGATHERV 444
int
smpi_coll_tuned_allgatherv_ompi_neighborexchange(void *sbuf, int scount,
                                                 MPI_Datatype sdtype,
                                                 void* rbuf, int *rcounts, int *rdispls,
                                                 MPI_Datatype rdtype,
                                                 MPI_Comm comm)
{
    int line = -1, rank, size, i, even_rank, err = 0;
    int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
    ptrdiff_t slb, rlb, sext, rext;
    char *tmpsend = NULL, *tmprecv = NULL;

    size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);
86 "coll:tuned:allgatherv_ompi_neighborexchange WARNING: odd size %d, switching to ring algorithm",
88 return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
95 "coll:tuned:allgatherv_ompi_neighborexchange rank %d", rank);
    err = smpi_datatype_extent(sdtype, &slb, &sext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    err = smpi_datatype_extent(rdtype, &rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    /* Initialization step:
       - if the send buffer is not MPI_IN_PLACE, copy it into
         the appropriate block of the receive buffer.
    */
    tmprecv = (char*) rbuf + rdispls[rank] * rext;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = smpi_datatype_copy(tmpsend, scount, sdtype,
                                 tmprecv, rcounts[rank], rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }
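    /* With MPI_IN_PLACE, the caller's own block is assumed to already sit at
       rdispls[rank] in rbuf, so no local copy is needed. */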
    /* Determine neighbors, the order in which blocks will arrive, etc. */
    even_rank = !(rank % 2);
    if (even_rank) {
        neighbor[0] = (rank + 1) % size;
        neighbor[1] = (rank - 1 + size) % size;
        recv_data_from[0] = rank;
        recv_data_from[1] = rank;
        offset_at_step[0] = (+2);
        offset_at_step[1] = (-2);
    } else {
        neighbor[0] = (rank - 1 + size) % size;
        neighbor[1] = (rank + 1) % size;
        recv_data_from[0] = neighbor[0];
        recv_data_from[1] = neighbor[0];
        offset_at_step[0] = (-2);
        offset_at_step[1] = (+2);
    }
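    /* Worked example for the 6-process case in the header: rank 2 (even) pairs
       with neighbor[0] = 3 in even steps and neighbor[1] = 1 in odd steps,
       while rank 3 (odd) pairs with 2 first and with 4 in the next step. */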
    /* Communication loop:
       - First step is special: exchange a single block with neighbor[0].
       - Rest of the steps:
         update recv_data_from according to the offset, and
         exchange two blocks with the appropriate neighbor;
         the send location becomes the previous receive location.
       Note: we need to create indexed datatypes to send and receive these
       blocks properly.
    */
    tmprecv = (char*)rbuf + rdispls[neighbor[0]] * rext;
    tmpsend = (char*)rbuf + rdispls[rank] * rext;
    smpi_mpi_sendrecv(tmpsend, rcounts[rank], rdtype,
                      neighbor[0], MCA_COLL_BASE_TAG_ALLGATHERV,
                      tmprecv, rcounts[neighbor[0]], rdtype,
                      neighbor[0], MCA_COLL_BASE_TAG_ALLGATHERV,
                      comm, MPI_STATUS_IGNORE);
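    /* This matches Step 0 of the diagram above: every pair of adjacent ranks
       now holds the same two consecutive blocks of the result. */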
    /* Determine the initial sending counts and displacements */
    if (even_rank) {
        send_data_from = rank;
    } else {
        send_data_from = recv_data_from[0];
    }
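    /* In both cases the next step forwards two consecutive blocks: the rank's
       own block and the one obtained in the first exchange; only the starting
       index (send_data_from) differs between even and odd ranks. */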
    for (i = 1; i < (size / 2); i++) {
        MPI_Datatype new_rdtype, new_sdtype;
        int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
        const int i_parity = i % 2;
        recv_data_from[i_parity] =
            (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

        /* Create new indexed types for sending and receiving.
           We are sending data from ranks (send_data_from) and (send_data_from+1).
           We are receiving data from ranks (recv_data_from[i_parity]) and
           (recv_data_from[i_parity]+1).
        */
        new_scounts[0] = rcounts[send_data_from];
        new_scounts[1] = rcounts[(send_data_from + 1)];
        new_sdispls[0] = rdispls[send_data_from];
        new_sdispls[1] = rdispls[(send_data_from + 1)];
        err = smpi_datatype_indexed(2, new_scounts, new_sdispls, rdtype,
                                    &new_sdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        smpi_datatype_commit(&new_sdtype);
        new_rcounts[0] = rcounts[recv_data_from[i_parity]];
        new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
        new_rdispls[0] = rdispls[recv_data_from[i_parity]];
        new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
        err = smpi_datatype_indexed(2, new_rcounts, new_rdispls, rdtype,
                                    &new_rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        smpi_datatype_commit(&new_rdtype);
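        /* The two blocks exchanged per step are generally not contiguous in
           rbuf (rdispls may be arbitrary in allgatherv), hence the temporary
           2-entry indexed datatypes instead of a plain count. */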
        tmprecv = (char*)rbuf;
        tmpsend = (char*)rbuf;

        /* Sendreceive */
        smpi_mpi_sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
                          MCA_COLL_BASE_TAG_ALLGATHERV,
                          tmprecv, 1, new_rdtype, neighbor[i_parity],
                          MCA_COLL_BASE_TAG_ALLGATHERV,
                          comm, MPI_STATUS_IGNORE);

        send_data_from = recv_data_from[i_parity];

        smpi_datatype_free(&new_sdtype);
        smpi_datatype_free(&new_rdtype);
    }

    return MPI_SUCCESS;
 err_hndl:
    XBT_DEBUG("%s:%4d\tError occurred %d, rank %2d",
              __FILE__, line, err, rank);
    return err;
}
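
/* Usage sketch: a minimal call, assuming the caller has already filled the
   per-rank receive counts and displacements (rcounts/rdispls, one entry per
   rank in comm); sendbuf, mycount, recvbuf and comm are likewise assumed to
   exist on the caller's side.

       smpi_coll_tuned_allgatherv_ompi_neighborexchange(sendbuf, mycount, MPI_INT,
                                                        recvbuf, rcounts, rdispls,
                                                        MPI_INT, comm);

   Communicators of odd size are handled transparently by the fallback to
   smpi_coll_tuned_allgatherv_ring at the top of the function. */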