2 * ompi_coll_tuned_allgather_intra_neighborexchange
4 * Function: allgather using N/2 steps (O(N))
5 * Accepts: Same arguments as MPI_Allgather
6 * Returns: MPI_SUCCESS or error code
8 * Description: Neighbor Exchange algorithm for allgather.
9 * Described by Chen et al. in
10 * "Performance Evaluation of Allgather Algorithms on
11 * Terascale Linux Cluster with Fast Ethernet",
12 * Proceedings of the Eighth International Conference on
13 * High-Performance Computing in Asia-Pacific Region
16 * Rank r exchanges message with one of its neighbors and
17 * forwards the data further in the next step.
19 * No additional memory requirements.
21 * Limitations: Algorithm works only on an even number of processes.
22 * For an odd number of processes we switch to the ring algorithm.
27 * [0] [ ] [ ] [ ] [ ] [ ]
28 * [ ] [1] [ ] [ ] [ ] [ ]
29 * [ ] [ ] [2] [ ] [ ] [ ]
30 * [ ] [ ] [ ] [3] [ ] [ ]
31 * [ ] [ ] [ ] [ ] [4] [ ]
32 * [ ] [ ] [ ] [ ] [ ] [5]
35 * [0] [0] [ ] [ ] [ ] [ ]
36 * [1] [1] [ ] [ ] [ ] [ ]
37 * [ ] [ ] [2] [2] [ ] [ ]
38 * [ ] [ ] [3] [3] [ ] [ ]
39 * [ ] [ ] [ ] [ ] [4] [4]
40 * [ ] [ ] [ ] [ ] [5] [5]
43 * [0] [0] [0] [ ] [ ] [0]
44 * [1] [1] [1] [ ] [ ] [1]
45 * [ ] [2] [2] [2] [2] [ ]
46 * [ ] [3] [3] [3] [3] [ ]
47 * [4] [ ] [ ] [4] [4] [4]
48 * [5] [ ] [ ] [5] [5] [5]
51 * [0] [0] [0] [0] [0] [0]
52 * [1] [1] [1] [1] [1] [1]
53 * [2] [2] [2] [2] [2] [2]
54 * [3] [3] [3] [3] [3] [3]
55 * [4] [4] [4] [4] [4] [4]
56 * [5] [5] [5] [5] [5] [5]
59 #include "colls_private.h"
/* Message tag passed as both the send tag and the receive tag of the
 * smpi_mpi_sendrecv exchanges in the neighbor-exchange allgather below. */
60 #define MCA_COLL_BASE_TAG_ALLGATHER 555
62 smpi_coll_tuned_allgather_ompi_neighborexchange(void *sbuf, int scount,
64 void* rbuf, int rcount,
71 int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
74 ptrdiff_t slb, rlb, sext, rext;
75 char *tmpsend = NULL, *tmprecv = NULL;
77 size = smpi_comm_size(comm);
78 rank = smpi_comm_rank(comm);
82 "coll:tuned:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
84 return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
90 "coll:tuned:allgather_intra_neighborexchange rank %d", rank);
92 err = smpi_datatype_extent (sdtype, &slb, &sext);
93 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
95 err = smpi_datatype_extent (rdtype, &rlb, &rext);
96 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
98 /* Initialization step:
99 - if send buffer is not MPI_IN_PLACE, copy send buffer to appropriate block
102 tmprecv = (char*) rbuf + rank * rcount * rext;
103 if (MPI_IN_PLACE != sbuf) {
104 tmpsend = (char*) sbuf;
105 smpi_datatype_copy (tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
108 /* Determine neighbors, order in which blocks will arrive, etc. */
109 even_rank = !(rank % 2);
111 neighbor[0] = (rank + 1) % size;
112 neighbor[1] = (rank - 1 + size) % size;
113 recv_data_from[0] = rank;
114 recv_data_from[1] = rank;
115 offset_at_step[0] = (+2);
116 offset_at_step[1] = (-2);
118 neighbor[0] = (rank - 1 + size) % size;
119 neighbor[1] = (rank + 1) % size;
120 recv_data_from[0] = neighbor[0];
121 recv_data_from[1] = neighbor[0];
122 offset_at_step[0] = (-2);
123 offset_at_step[1] = (+2);
126 /* Communication loop:
127 - First step is special: exchange a single block with neighbor[0].
129 update recv_data_from according to offset, and
130 exchange two blocks with appropriate neighbor.
131 the send location becomes the previous receive location.
133 tmprecv = (char*)rbuf + neighbor[0] * rcount * rext;
134 tmpsend = (char*)rbuf + rank * rcount * rext;
136 smpi_mpi_sendrecv(tmpsend, rcount, rdtype, neighbor[0],
137 MCA_COLL_BASE_TAG_ALLGATHER,
138 tmprecv, rcount, rdtype, neighbor[0],
139 MCA_COLL_BASE_TAG_ALLGATHER,
140 comm, MPI_STATUS_IGNORE);
142 /* Determine initial sending location */
144 send_data_from = rank;
146 send_data_from = recv_data_from[0];
149 for (i = 1; i < (size / 2); i++) {
150 const int i_parity = i % 2;
151 recv_data_from[i_parity] =
152 (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;
154 tmprecv = (char*)rbuf + recv_data_from[i_parity] * rcount * rext;
155 tmpsend = (char*)rbuf + send_data_from * rcount * rext;
158 smpi_mpi_sendrecv(tmpsend, 2 * rcount, rdtype,
160 MCA_COLL_BASE_TAG_ALLGATHER,
161 tmprecv, 2 * rcount, rdtype,
163 MCA_COLL_BASE_TAG_ALLGATHER,
164 comm, MPI_STATUS_IGNORE);
166 send_data_from = recv_data_from[i_parity];
172 XBT_DEBUG( "%s:%4d\tError occurred %d, rank %2d",
173 __FILE__, line, err, rank);