1 /* Copyright (c) 2013-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
8 * ompi_coll_tuned_allgather_intra_neighborexchange
10 * Function: allgather using N/2 steps (O(N))
11 * Accepts: Same arguments as MPI_Allgather
12 * Returns: MPI_SUCCESS or error code
14 * Description: Neighbor Exchange algorithm for allgather.
15 * Described by Chen et al. in
16 * "Performance Evaluation of Allgather Algorithms on
17 * Terascale Linux Cluster with Fast Ethernet",
18 * Proceedings of the Eighth International Conference on
19 * High-Performance Computing in Asia-Pacific Region
22 * Rank r exchanges message with one of its neighbors and
23 * forwards the data further in the next step.
25 * No additional memory requirements.
27 * Limitations: Algorithm works only on even number of processes.
28 * For odd number of processes we switch to ring algorithm.
33 * [0] [ ] [ ] [ ] [ ] [ ]
34 * [ ] [1] [ ] [ ] [ ] [ ]
35 * [ ] [ ] [2] [ ] [ ] [ ]
36 * [ ] [ ] [ ] [3] [ ] [ ]
37 * [ ] [ ] [ ] [ ] [4] [ ]
38 * [ ] [ ] [ ] [ ] [ ] [5]
41 * [0] [0] [ ] [ ] [ ] [ ]
42 * [1] [1] [ ] [ ] [ ] [ ]
43 * [ ] [ ] [2] [2] [ ] [ ]
44 * [ ] [ ] [3] [3] [ ] [ ]
45 * [ ] [ ] [ ] [ ] [4] [4]
46 * [ ] [ ] [ ] [ ] [5] [5]
49 * [0] [0] [0] [ ] [ ] [0]
50 * [1] [1] [1] [ ] [ ] [1]
51 * [ ] [2] [2] [2] [2] [ ]
52 * [ ] [3] [3] [3] [3] [ ]
53 * [4] [ ] [ ] [4] [4] [4]
54 * [5] [ ] [ ] [5] [5] [5]
57 * [0] [0] [0] [0] [0] [0]
58 * [1] [1] [1] [1] [1] [1]
59 * [2] [2] [2] [2] [2] [2]
60 * [3] [3] [3] [3] [3] [3]
61 * [4] [4] [4] [4] [4] [4]
62 * [5] [5] [5] [5] [5] [5]
65 #include "colls_private.h"
67 smpi_coll_tuned_allgather_ompi_neighborexchange(void *sbuf, int scount,
69 void* rbuf, int rcount,
76 int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
79 ptrdiff_t slb, rlb, sext, rext;
80 char *tmpsend = NULL, *tmprecv = NULL;
82 size = smpi_comm_size(comm);
83 rank = smpi_comm_rank(comm);
87 "coll:tuned:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
89 return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
95 "coll:tuned:allgather_intra_neighborexchange rank %d", rank);
97 err = smpi_datatype_extent (sdtype, &slb, &sext);
98 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
100 err = smpi_datatype_extent (rdtype, &rlb, &rext);
101 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
103 /* Initialization step:
104 - if send buffer is not MPI_IN_PLACE, copy send buffer to appropriate block
107 tmprecv = (char*) rbuf + rank * rcount * rext;
108 if (MPI_IN_PLACE != sbuf) {
109 tmpsend = (char*) sbuf;
110 smpi_datatype_copy (tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
113 /* Determine neighbors, order in which blocks will arrive, etc. */
114 even_rank = !(rank % 2);
116 neighbor[0] = (rank + 1) % size;
117 neighbor[1] = (rank - 1 + size) % size;
118 recv_data_from[0] = rank;
119 recv_data_from[1] = rank;
120 offset_at_step[0] = (+2);
121 offset_at_step[1] = (-2);
123 neighbor[0] = (rank - 1 + size) % size;
124 neighbor[1] = (rank + 1) % size;
125 recv_data_from[0] = neighbor[0];
126 recv_data_from[1] = neighbor[0];
127 offset_at_step[0] = (-2);
128 offset_at_step[1] = (+2);
131 /* Communication loop:
132 - First step is special: exchange a single block with neighbor[0].
134 update recv_data_from according to offset, and
135 exchange two blocks with appropriate neighbor.
136 * the send location becomes the previous receive location.
138 tmprecv = (char*)rbuf + neighbor[0] * rcount * rext;
139 tmpsend = (char*)rbuf + rank * rcount * rext;
141 smpi_mpi_sendrecv(tmpsend, rcount, rdtype, neighbor[0],
143 tmprecv, rcount, rdtype, neighbor[0],
145 comm, MPI_STATUS_IGNORE);
147 /* Determine initial sending location */
149 send_data_from = rank;
151 send_data_from = recv_data_from[0];
154 for (i = 1; i < (size / 2); i++) {
155 const int i_parity = i % 2;
156 recv_data_from[i_parity] =
157 (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;
159 tmprecv = (char*)rbuf + recv_data_from[i_parity] * rcount * rext;
160 tmpsend = (char*)rbuf + send_data_from * rcount * rext;
163 smpi_mpi_sendrecv(tmpsend, 2 * rcount, rdtype,
166 tmprecv, 2 * rcount, rdtype,
169 comm, MPI_STATUS_IGNORE);
171 send_data_from = recv_data_from[i_parity];
177 XBT_DEBUG( "%s:%4d\tError occurred %d, rank %2d",
178 __FILE__, line, err, rank);