1 /* Copyright (c) 2013-2022. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
8 * ompi_coll_tuned_allgatherv_intra_neighborexchange
10 * Function: allgatherv using N/2 steps (O(N))
11 * Accepts: Same arguments as MPI_Allgatherv
12 * Returns: MPI_SUCCESS or error code
14 * Description: Neighbor Exchange algorithm for allgather adapted for
16 * Described by Chen et al. in
17 * "Performance Evaluation of Allgather Algorithms on
18 * Terascale Linux Cluster with Fast Ethernet",
19 * Proceedings of the Eighth International Conference on
20 * High-Performance Computing in Asia-Pacific Region
23 * Rank r exchanges message with one of its neighbors and
24 * forwards the data further in the next step.
26 * No additional memory requirements.
28 * Limitations: Algorithm works only on even number of processes.
29 * For odd number of processes we switch to ring algorithm.
34 * [0] [ ] [ ] [ ] [ ] [ ]
35 * [ ] [1] [ ] [ ] [ ] [ ]
36 * [ ] [ ] [2] [ ] [ ] [ ]
37 * [ ] [ ] [ ] [3] [ ] [ ]
38 * [ ] [ ] [ ] [ ] [4] [ ]
39 * [ ] [ ] [ ] [ ] [ ] [5]
42 * [0] [0] [ ] [ ] [ ] [ ]
43 * [1] [1] [ ] [ ] [ ] [ ]
44 * [ ] [ ] [2] [2] [ ] [ ]
45 * [ ] [ ] [3] [3] [ ] [ ]
46 * [ ] [ ] [ ] [ ] [4] [4]
47 * [ ] [ ] [ ] [ ] [5] [5]
50 * [0] [0] [0] [ ] [ ] [0]
51 * [1] [1] [1] [ ] [ ] [1]
52 * [ ] [2] [2] [2] [2] [ ]
53 * [ ] [3] [3] [3] [3] [ ]
54 * [4] [ ] [ ] [4] [4] [4]
55 * [5] [ ] [ ] [5] [5] [5]
58 * [0] [0] [0] [0] [0] [0]
59 * [1] [1] [1] [1] [1] [1]
60 * [2] [2] [2] [2] [2] [2]
61 * [3] [3] [3] [3] [3] [3]
62 * [4] [4] [4] [4] [4] [4]
63 * [5] [5] [5] [5] [5] [5]
66 #include "../colls_private.hpp"
72 allgatherv__ompi_neighborexchange(const void *sbuf, int scount,
74 void* rbuf, const int *rcounts, const int *rdispls,
80 int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
84 ptrdiff_t slb, rlb, sext, rext;
85 char *tmpsend = nullptr, *tmprecv = nullptr;
91 XBT_INFO("allgatherv__ompi_neighborexchange: odd size %d, switching to ring algorithm",
93 return allgatherv__ring(sbuf, scount, sdtype,
100 "coll:tuned:allgatherv_ompi_neighborexchange rank %d", rank);
102 err = sdtype->extent(&slb, &sext);
103 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
105 err = rdtype->extent(&rlb, &rext);
106 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
108 /* Initialization step:
109 - if send buffer is not MPI_IN_PLACE, copy send buffer to
110 the appropriate block of receive buffer
112 tmprecv = (char*) rbuf + rdispls[rank] * rext;
113 if (MPI_IN_PLACE != sbuf) {
114 tmpsend = (char*) sbuf;
115 err = Datatype::copy(tmpsend, scount, sdtype,
116 tmprecv, rcounts[rank], rdtype);
117 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
120 /* Determine neighbors, order in which blocks will arrive, etc. */
121 even_rank = !(rank % 2);
123 neighbor[0] = (rank + 1) % size;
124 neighbor[1] = (rank - 1 + size) % size;
125 recv_data_from[0] = rank;
126 recv_data_from[1] = rank;
127 offset_at_step[0] = (+2);
128 offset_at_step[1] = (-2);
130 neighbor[0] = (rank - 1 + size) % size;
131 neighbor[1] = (rank + 1) % size;
132 recv_data_from[0] = neighbor[0];
133 recv_data_from[1] = neighbor[0];
134 offset_at_step[0] = (-2);
135 offset_at_step[1] = (+2);
138 /* Communication loop:
139 - First step is special: exchange a single block with neighbor[0].
141 update recv_data_from according to offset, and
142 exchange two blocks with appropriate neighbor.
143 the send location becomes previous receive location.
144 Note, we need to create indexed datatype to send and receive these
147 tmprecv = (char*)rbuf + rdispls[neighbor[0]] * rext;
148 tmpsend = (char*)rbuf + rdispls[rank] * rext;
149 Request::sendrecv(tmpsend, rcounts[rank], rdtype,
150 neighbor[0], COLL_TAG_ALLGATHERV,
151 tmprecv, rcounts[neighbor[0]], rdtype,
152 neighbor[0], COLL_TAG_ALLGATHERV,
153 comm, MPI_STATUS_IGNORE);
159 /* Determine initial sending counts and displacements*/
161 send_data_from = rank;
163 send_data_from = recv_data_from[0];
166 for (i = 1; i < (size / 2); i++) {
167 MPI_Datatype new_rdtype, new_sdtype;
168 int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
169 const int i_parity = i % 2;
170 recv_data_from[i_parity] =
171 (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;
173 /* Create new indexed types for sending and receiving.
174 We are sending data from ranks (send_data_from) and (send_data_from+1)
175 We are receiving data from ranks (recv_data_from[i_parity]) and
176 (recv_data_from[i_parity]+1).
179 new_scounts[0] = rcounts[send_data_from];
180 new_scounts[1] = rcounts[(send_data_from + 1)];
181 new_sdispls[0] = rdispls[send_data_from];
182 new_sdispls[1] = rdispls[(send_data_from + 1)];
183 err = Datatype::create_indexed(2, new_scounts, new_sdispls, rdtype,
185 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
186 new_sdtype->commit();
188 new_rcounts[0] = rcounts[recv_data_from[i_parity]];
189 new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
190 new_rdispls[0] = rdispls[recv_data_from[i_parity]];
191 new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
192 err = Datatype::create_indexed(2, new_rcounts, new_rdispls, rdtype,
194 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
195 new_rdtype->commit();
197 tmprecv = (char*)rbuf;
198 tmpsend = (char*)rbuf;
201 Request::sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
203 tmprecv, 1, new_rdtype, neighbor[i_parity],
205 comm, MPI_STATUS_IGNORE);
207 send_data_from = recv_data_from[i_parity];
209 Datatype::unref(new_sdtype);
210 Datatype::unref(new_rdtype);
216 XBT_WARN( "%s:%4d\tError occurred %d, rank %2d",
217 __FILE__, line, err, rank);