/* Copyright (c) 2013-2019. The SimGrid Team.
 * All rights reserved. */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */
/*
 * ompi_coll_tuned_allgatherv_intra_neighborexchange
 *
 * Function:     allgatherv using N/2 steps (O(N))
 * Accepts:      Same arguments as MPI_Allgatherv
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Neighbor Exchange algorithm for allgather, adapted for
 *               allgatherv.
 *               Described by Chen et al. in
 *               "Performance Evaluation of Allgather Algorithms on
 *                Terascale Linux Cluster with Fast Ethernet",
 *               Proceedings of the Eighth International Conference on
 *               High-Performance Computing in Asia-Pacific Region
 *               (HPCASIA'05), 2005.
 *
 *               Rank r exchanges a message with one of its neighbors and
 *               forwards the data further in the next step.
 *
 *               No additional memory requirements.
 *
 * Limitations:  The algorithm works only for an even number of processes.
 *               For an odd number of processes we switch to the ring
 *               algorithm.
 *
 * Example on 6 nodes:
 *  Initial state
 *    #     0      1      2      3      4      5
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]
 *   Step 0:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [ ]    [ ]    [ ]    [ ]
 *         [1]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [2]    [ ]    [ ]
 *         [ ]    [ ]    [3]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [4]
 *         [ ]    [ ]    [ ]    [ ]    [5]    [5]
 *   Step 1:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [ ]    [ ]    [0]
 *         [1]    [1]    [1]    [ ]    [ ]    [1]
 *         [ ]    [2]    [2]    [2]    [2]    [ ]
 *         [ ]    [3]    [3]    [3]    [3]    [ ]
 *         [4]    [ ]    [ ]    [4]    [4]    [4]
 *         [5]    [ ]    [ ]    [5]    [5]    [5]
 *   Step 2:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]    [3]    [3]
 *         [4]    [4]    [4]    [4]    [4]    [4]
 *         [5]    [5]    [5]    [5]    [5]    [5]
 */
#include "../colls_private.hpp"
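
/* colls_private.hpp pulls in the SMPI internals used below (the Datatype and
   Request helpers, the XBT_DEBUG logging macro, and COLL_TAG_ALLGATHERV). */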
namespace simgrid {
namespace smpi {

int allgatherv__ompi_neighborexchange(const void *sbuf, int scount,
                                      MPI_Datatype sdtype,
                                      void* rbuf, const int *rcounts, const int *rdispls,
                                      MPI_Datatype rdtype,
                                      MPI_Comm comm)
{
    int line = -1;
    int rank, size;
    int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
    int i, even_rank;
    int err = 0;
    ptrdiff_t slb, rlb, sext, rext;
    char *tmpsend = NULL, *tmprecv = NULL;

    size = comm->size();
    rank = comm->rank();
    if (size % 2) {
        XBT_DEBUG("allgatherv__ompi_neighborexchange WARNING: odd size %d, switching to ring algorithm",
                  size);
        return allgatherv__ring(sbuf, scount, sdtype,
                                rbuf, rcounts, rdispls, rdtype,
                                comm);
    }
101 "coll:tuned:allgatherv_ompi_neighborexchange rank %d", rank);
    err = sdtype->extent(&slb, &sext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    err = rdtype->extent(&rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    /* Initialization step:
       - if send buffer is not MPI_IN_PLACE, copy send buffer to
         the appropriate block of receive buffer
    */
    tmprecv = (char*) rbuf + rdispls[rank] * rext;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = Datatype::copy(tmpsend, scount, sdtype,
                             tmprecv, rcounts[rank], rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }
    /* Determine neighbors, order in which blocks will arrive, etc. */
    even_rank = !(rank % 2);
    if (even_rank) {
        neighbor[0] = (rank + 1) % size;
        neighbor[1] = (rank - 1 + size) % size;
        recv_data_from[0] = rank;
        recv_data_from[1] = rank;
        offset_at_step[0] = (+2);
        offset_at_step[1] = (-2);
    } else {
        neighbor[0] = (rank - 1 + size) % size;
        neighbor[1] = (rank + 1) % size;
        recv_data_from[0] = neighbor[0];
        recv_data_from[1] = neighbor[0];
        offset_at_step[0] = (-2);
        offset_at_step[1] = (+2);
    }
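
    /* With this setup, step 0 pairs even rank 2k with odd rank 2k+1. Later
       steps alternate between the two neighbors, and each exchange moves a
       pair of blocks, which is why the block index advances by +/-2. */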
    /* Communication loop:
       - First step is special: exchange a single block with neighbor[0].
       - Rest of the steps:
         update recv_data_from according to offset, and
         exchange two blocks with the appropriate neighbor;
         the send location becomes the previous receive location.
       Note: we need to create an indexed datatype to send and receive these
       blocks properly.
    */
    tmprecv = (char*)rbuf + rdispls[neighbor[0]] * rext;
    tmpsend = (char*)rbuf + rdispls[rank] * rext;
    Request::sendrecv(tmpsend, rcounts[rank], rdtype,
                      neighbor[0], COLL_TAG_ALLGATHERV,
                      tmprecv, rcounts[neighbor[0]], rdtype,
                      neighbor[0], COLL_TAG_ALLGATHERV,
                      comm, MPI_STATUS_IGNORE);
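
    /* After this first exchange every rank holds its own block and its
       neighbor[0]'s block, both at their final displacements in rbuf. */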
    /* Determine initial sending counts and displacements */
    if (even_rank) {
        send_data_from = rank;
    } else {
        send_data_from = recv_data_from[0];
    }
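
    /* send_data_from names the first of the two consecutive blocks this rank
       forwards in the next step, i.e. the pair completed in the previous one. */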
    for (i = 1; i < (size / 2); i++) {
        MPI_Datatype new_rdtype, new_sdtype;
        int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
        const int i_parity = i % 2;
        recv_data_from[i_parity] =
            (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;
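
        /* recv_data_from[i_parity] is now the first of the two consecutive
           blocks arriving from neighbor[i_parity] in this step. */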
        /* Create new indexed types for sending and receiving.
           We are sending data from ranks (send_data_from) and
           (send_data_from + 1); we are receiving data from ranks
           (recv_data_from[i_parity]) and (recv_data_from[i_parity] + 1).
        */
        new_scounts[0] = rcounts[send_data_from];
        new_scounts[1] = rcounts[(send_data_from + 1)];
        new_sdispls[0] = rdispls[send_data_from];
        new_sdispls[1] = rdispls[(send_data_from + 1)];
        err = Datatype::create_indexed(2, new_scounts, new_sdispls, rdtype,
                                       &new_sdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        new_sdtype->commit();
        new_rcounts[0] = rcounts[recv_data_from[i_parity]];
        new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
        new_rdispls[0] = rdispls[recv_data_from[i_parity]];
        new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
        err = Datatype::create_indexed(2, new_rcounts, new_rdispls, rdtype,
                                       &new_rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        new_rdtype->commit();
        tmprecv = (char*)rbuf;
        tmpsend = (char*)rbuf;

        /* Sendreceive */
        Request::sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
                          COLL_TAG_ALLGATHERV,
                          tmprecv, 1, new_rdtype, neighbor[i_parity],
                          COLL_TAG_ALLGATHERV,
                          comm, MPI_STATUS_IGNORE);
        send_data_from = recv_data_from[i_parity];

        Datatype::unref(new_sdtype);
        Datatype::unref(new_rdtype);
    }

    return MPI_SUCCESS;

 err_hndl:
    XBT_DEBUG("%s:%4d\tError occurred %d, rank %2d",
              __FILE__, line, err, rank);
    return err;
}

} // namespace smpi
} // namespace simgrid