1 /* Copyright (c) 2013-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
8 * ompi_coll_tuned_allgatherv_intra_neighborexchange
10 * Function: allgatherv using N/2 steps (O(N))
11 * Accepts: Same arguments as MPI_Allgatherv
12 * Returns: MPI_SUCCESS or error code
14 * Description: Neighbor Exchange algorithm for allgather adapted for
16 * Described by Chen et al. in
17 * "Performance Evaluation of Allgather Algorithms on
18 * Terascale Linux Cluster with Fast Ethernet",
19 * Proceedings of the Eighth International Conference on
20 * High-Performance Computing in Asia-Pacific Region
23 * Rank r exchanges message with one of its neighbors and
24 * forwards the data further in the next step.
26 * No additional memory requirements.
28 * Limitations: Algorithm works only on even number of processes.
29 * For odd number of processes we switch to ring algorithm.
34 * [0] [ ] [ ] [ ] [ ] [ ]
35 * [ ] [1] [ ] [ ] [ ] [ ]
36 * [ ] [ ] [2] [ ] [ ] [ ]
37 * [ ] [ ] [ ] [3] [ ] [ ]
38 * [ ] [ ] [ ] [ ] [4] [ ]
39 * [ ] [ ] [ ] [ ] [ ] [5]
42 * [0] [0] [ ] [ ] [ ] [ ]
43 * [1] [1] [ ] [ ] [ ] [ ]
44 * [ ] [ ] [2] [2] [ ] [ ]
45 * [ ] [ ] [3] [3] [ ] [ ]
46 * [ ] [ ] [ ] [ ] [4] [4]
47 * [ ] [ ] [ ] [ ] [5] [5]
50 * [0] [0] [0] [ ] [ ] [0]
51 * [1] [1] [1] [ ] [ ] [1]
52 * [ ] [2] [2] [2] [2] [ ]
53 * [ ] [3] [3] [3] [3] [ ]
54 * [4] [ ] [ ] [4] [4] [4]
55 * [5] [ ] [ ] [5] [5] [5]
58 * [0] [0] [0] [0] [0] [0]
59 * [1] [1] [1] [1] [1] [1]
60 * [2] [2] [2] [2] [2] [2]
61 * [3] [3] [3] [3] [3] [3]
62 * [4] [4] [4] [4] [4] [4]
63 * [5] [5] [5] [5] [5] [5]
66 #include "colls_private.h"
69 smpi_coll_tuned_allgatherv_ompi_neighborexchange(void *sbuf, int scount,
71 void* rbuf, int *rcounts, int *rdispls,
77 int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
81 ptrdiff_t slb, rlb, sext, rext;
82 char *tmpsend = NULL, *tmprecv = NULL;
85 size = smpi_comm_size(comm);
86 rank = smpi_comm_rank(comm);
90 "coll:tuned:allgatherv_ompi_neighborexchange WARNING: odd size %d, switching to ring algorithm",
92 return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
99 "coll:tuned:allgatherv_ompi_neighborexchange rank %d", rank);
101 err = smpi_datatype_extent (sdtype, &slb, &sext);
102 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
104 err = smpi_datatype_extent (rdtype, &rlb, &rext);
105 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
107 /* Initialization step:
108 - if send buffer is not MPI_IN_PLACE, copy send buffer to
109 the appropriate block of receive buffer
111 tmprecv = (char*) rbuf + rdispls[rank] * rext;
112 if (MPI_IN_PLACE != sbuf) {
113 tmpsend = (char*) sbuf;
114 err = smpi_datatype_copy(tmpsend, scount, sdtype,
115 tmprecv, rcounts[rank], rdtype);
116 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
119 /* Determine neighbors, order in which blocks will arrive, etc. */
120 even_rank = !(rank % 2);
122 neighbor[0] = (rank + 1) % size;
123 neighbor[1] = (rank - 1 + size) % size;
124 recv_data_from[0] = rank;
125 recv_data_from[1] = rank;
126 offset_at_step[0] = (+2);
127 offset_at_step[1] = (-2);
129 neighbor[0] = (rank - 1 + size) % size;
130 neighbor[1] = (rank + 1) % size;
131 recv_data_from[0] = neighbor[0];
132 recv_data_from[1] = neighbor[0];
133 offset_at_step[0] = (-2);
134 offset_at_step[1] = (+2);
137 /* Communication loop:
138 - First step is special: exchange a single block with neighbor[0].
140 update recv_data_from according to offset, and
141 exchange two blocks with appropriate neighbor.
142 * the send location becomes the previous receive location.
143 Note, we need to create indexed datatype to send and receive these
146 tmprecv = (char*)rbuf + rdispls[neighbor[0]] * rext;
147 tmpsend = (char*)rbuf + rdispls[rank] * rext;
148 smpi_mpi_sendrecv(tmpsend, rcounts[rank], rdtype,
149 neighbor[0], COLL_TAG_ALLGATHERV,
150 tmprecv, rcounts[neighbor[0]], rdtype,
151 neighbor[0], COLL_TAG_ALLGATHERV,
152 comm, MPI_STATUS_IGNORE);
158 /* Determine initial sending counts and displacements*/
160 send_data_from = rank;
162 send_data_from = recv_data_from[0];
165 for (i = 1; i < (size / 2); i++) {
166 MPI_Datatype new_rdtype, new_sdtype;
167 int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
168 const int i_parity = i % 2;
169 recv_data_from[i_parity] =
170 (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;
172 /* Create new indexed types for sending and receiving.
173 We are sending data from ranks (send_data_from) and (send_data_from+1)
174 We are receiving data from ranks (recv_data_from[i_parity]) and
175 (recv_data_from[i_parity]+1).
178 new_scounts[0] = rcounts[send_data_from];
179 new_scounts[1] = rcounts[(send_data_from + 1)];
180 new_sdispls[0] = rdispls[send_data_from];
181 new_sdispls[1] = rdispls[(send_data_from + 1)];
182 err = smpi_datatype_indexed(2, new_scounts, new_sdispls, rdtype,
184 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
185 smpi_datatype_commit(&new_sdtype);
187 new_rcounts[0] = rcounts[recv_data_from[i_parity]];
188 new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
189 new_rdispls[0] = rdispls[recv_data_from[i_parity]];
190 new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
191 err = smpi_datatype_indexed(2, new_rcounts, new_rdispls, rdtype,
193 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
194 smpi_datatype_commit(&new_rdtype);
196 tmprecv = (char*)rbuf;
197 tmpsend = (char*)rbuf;
200 smpi_mpi_sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
202 tmprecv, 1, new_rdtype, neighbor[i_parity],
204 comm, MPI_STATUS_IGNORE);
206 send_data_from = recv_data_from[i_parity];
208 smpi_datatype_free(&new_sdtype);
209 smpi_datatype_free(&new_rdtype);
215 XBT_DEBUG( "%s:%4d\tError occurred %d, rank %2d",
216 __FILE__, line, err, rank);