/* Copyright (c) 2013-2019. The SimGrid Team.
 * All rights reserved. */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

/*
 * ompi_coll_tuned_allgatherv_intra_neighborexchange
 *
 * Function:     allgatherv using N/2 steps (O(N))
 * Accepts:      Same arguments as MPI_Allgatherv
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Neighbor Exchange algorithm for allgather adapted for
 *               allgatherv.
 *               Described by Chen et al. in
 *               "Performance Evaluation of Allgather Algorithms on
 *                Terascale Linux Cluster with Fast Ethernet",
 *               Proceedings of the Eighth International Conference on
 *               High-Performance Computing in Asia-Pacific Region
 *               (HPCASIA'05), 2005
 *
 *               Rank r exchanges message with one of its neighbors and
 *               forwards the data further in the next step.
 *
 *               No additional memory requirements.
 *
 * Limitations:  The algorithm works only on an even number of processes.
 *               For an odd number of processes we switch to the ring algorithm.
 *
 * Example on 6 nodes (rows are data blocks, columns are ranks 0-5):
 *  Initial state:
 *    [0] [ ] [ ] [ ] [ ] [ ]
 *    [ ] [1] [ ] [ ] [ ] [ ]
 *    [ ] [ ] [2] [ ] [ ] [ ]
 *    [ ] [ ] [ ] [3] [ ] [ ]
 *    [ ] [ ] [ ] [ ] [4] [ ]
 *    [ ] [ ] [ ] [ ] [ ] [5]
 *  Step 0 (each rank exchanges one block with its first neighbor):
 *    [0] [0] [ ] [ ] [ ] [ ]
 *    [1] [1] [ ] [ ] [ ] [ ]
 *    [ ] [ ] [2] [2] [ ] [ ]
 *    [ ] [ ] [3] [3] [ ] [ ]
 *    [ ] [ ] [ ] [ ] [4] [4]
 *    [ ] [ ] [ ] [ ] [5] [5]
 *  Step 1 (each rank exchanges two blocks with its second neighbor):
 *    [0] [0] [0] [ ] [ ] [0]
 *    [1] [1] [1] [ ] [ ] [1]
 *    [ ] [2] [2] [2] [2] [ ]
 *    [ ] [3] [3] [3] [3] [ ]
 *    [4] [ ] [ ] [4] [4] [4]
 *    [5] [ ] [ ] [5] [5] [5]
 *  Step 2 (each rank exchanges two blocks with its first neighbor; final state):
 *    [0] [0] [0] [0] [0] [0]
 *    [1] [1] [1] [1] [1] [1]
 *    [2] [2] [2] [2] [2] [2]
 *    [3] [3] [3] [3] [3] [3]
 *    [4] [4] [4] [4] [4] [4]
 *    [5] [5] [5] [5] [5] [5]
 */
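
/* Note (added for clarity): the exchange completes in size/2 steps (the special
   first step plus size/2 - 1 loop iterations below), versus size - 1 steps for
   the ring algorithm; the price is building temporary indexed datatypes at
   each step. */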
#include "../colls_private.hpp"

namespace simgrid {
namespace smpi {
int Coll_allgatherv_ompi_neighborexchange::allgatherv(const void *sbuf, int scount,
                                                      MPI_Datatype sdtype,
                                                      void* rbuf, const int *rcounts, const int *rdispls,
                                                      MPI_Datatype rdtype,
                                                      MPI_Comm comm)
{
  int line = -1;
  int rank, size;
  int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
  int i, even_rank;
  int err = 0;
  ptrdiff_t slb, rlb, sext, rext;
  char *tmpsend = NULL, *tmprecv = NULL;
93 "coll:tuned:allgatherv_ompi_neighborexchange WARNING: odd size %d, switching to ring algorithm",
95 return Coll_allgatherv_ring::allgatherv(sbuf, scount, sdtype,
102 "coll:tuned:allgatherv_ompi_neighborexchange rank %d", rank);
  err = sdtype->extent(&slb, &sext);
  if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

  err = rdtype->extent(&rlb, &rext);
  if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
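
  /* rcounts/rdispls are expressed in rdtype elements; multiplying a
     displacement by the receive extent (rext) yields the byte offset of the
     corresponding block inside rbuf. */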
  /* Initialization step:
     - if send buffer is not MPI_IN_PLACE, copy send buffer to
       the appropriate block of receive buffer
  */
  tmprecv = (char*) rbuf + rdispls[rank] * rext;
  if (MPI_IN_PLACE != sbuf) {
    tmpsend = (char*) sbuf;
    err = Datatype::copy(tmpsend, scount, sdtype,
                         tmprecv, rcounts[rank], rdtype);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
  }
  /* Determine neighbors, order in which blocks will arrive, etc. */
  even_rank = !(rank % 2);
  if (even_rank) {
    neighbor[0] = (rank + 1) % size;
    neighbor[1] = (rank - 1 + size) % size;
    recv_data_from[0] = rank;
    recv_data_from[1] = rank;
    offset_at_step[0] = (+2);
    offset_at_step[1] = (-2);
  } else {
    neighbor[0] = (rank - 1 + size) % size;
    neighbor[1] = (rank + 1) % size;
    recv_data_from[0] = neighbor[0];
    recv_data_from[1] = neighbor[0];
    offset_at_step[0] = (-2);
    offset_at_step[1] = (+2);
  }
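
  /* The schedule alternates between the two neighbors: each time a direction
     is reused, the pair of blocks fetched from it advances by the matching
     offset_at_step (+2 one way, -2 the other), so the two directions together
     cover all size blocks in size/2 steps. */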

  /* Communication loop:
     - First step is special: exchange a single block with neighbor[0].
     - Rest of the steps:
       update recv_data_from according to offset, and
       exchange two blocks with the appropriate neighbor;
       the send location becomes the previous receive location.
     Note: we need to create an indexed datatype to send and receive these
     blocks properly.
  */
  tmprecv = (char*)rbuf + rdispls[neighbor[0]] * rext;
  tmpsend = (char*)rbuf + rdispls[rank] * rext;
  Request::sendrecv(tmpsend, rcounts[rank], rdtype,
                    neighbor[0], COLL_TAG_ALLGATHERV,
                    tmprecv, rcounts[neighbor[0]], rdtype,
                    neighbor[0], COLL_TAG_ALLGATHERV,
                    comm, MPI_STATUS_IGNORE);
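
  /* After this first exchange every rank holds two adjacent blocks: its own
     and that of neighbor[0]. All subsequent steps forward pairs of blocks. */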

  /* Determine initial sending counts and displacements */
  if (even_rank) {
    send_data_from = rank;
  } else {
    send_data_from = recv_data_from[0];
  }

  for (i = 1; i < (size / 2); i++) {
    MPI_Datatype new_rdtype, new_sdtype;
    int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
    const int i_parity = i % 2;
    recv_data_from[i_parity] =
        (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

    /* Create new indexed types for sending and receiving.
       We are sending data from ranks (send_data_from) and (send_data_from + 1);
       we are receiving data from ranks (recv_data_from[i_parity]) and
       (recv_data_from[i_parity] + 1).
    */
    new_scounts[0] = rcounts[send_data_from];
    new_scounts[1] = rcounts[(send_data_from + 1)];
    new_sdispls[0] = rdispls[send_data_from];
    new_sdispls[1] = rdispls[(send_data_from + 1)];
    err = Datatype::create_indexed(2, new_scounts, new_sdispls, rdtype,
                                   &new_sdtype);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    new_sdtype->commit();

    new_rcounts[0] = rcounts[recv_data_from[i_parity]];
    new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
    new_rdispls[0] = rdispls[recv_data_from[i_parity]];
    new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
    err = Datatype::create_indexed(2, new_rcounts, new_rdispls, rdtype,
                                   &new_rdtype);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    new_rdtype->commit();
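
    /* The indexed datatypes already encode the block displacements, so both
       send and receive operate from the base address of rbuf. */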
    tmprecv = (char*)rbuf;
    tmpsend = (char*)rbuf;

    Request::sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
                      COLL_TAG_ALLGATHERV,
                      tmprecv, 1, new_rdtype, neighbor[i_parity],
                      COLL_TAG_ALLGATHERV,
                      comm, MPI_STATUS_IGNORE);
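
    /* The blocks just received become the blocks forwarded in the next step */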
    send_data_from = recv_data_from[i_parity];

    Datatype::unref(new_sdtype);
    Datatype::unref(new_rdtype);
  }

  return MPI_SUCCESS;

err_hndl:
  XBT_DEBUG("%s:%4d\tError occurred %d, rank %2d",
            __FILE__, line, err, rank);
  return err;
}

} // namespace smpi
} // namespace simgrid
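
/* Usage note (illustrative addition, not part of the upstream OpenMPI code):
 * within SimGrid/SMPI this routine is reached through the collective selector
 * rather than called directly. A minimal sketch, assuming the selector still
 * registers this implementation under the name "ompi_neighborexchange" for the
 * smpi/allgatherv option (the exact value may differ across SimGrid versions),
 * and with platform.xml, hostfile and my_mpi_app as placeholders:
 *
 *   smpirun --cfg=smpi/allgatherv:ompi_neighborexchange -np 6 \
 *           -platform platform.xml -hostfile hostfile ./my_mpi_app
 *
 * Every MPI_Allgatherv in my_mpi_app is then served by this algorithm when the
 * communicator size is even, and by the ring fallback otherwise. */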