1 /* Copyright (c) 2013-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
8 * (C) 2001 by Argonne National Laboratory.
9 * See COPYRIGHT in top-level directory.
12 /* Copyright (c) 2001-2014, The Ohio State University. All rights
15 * This file is part of the MVAPICH2 software package developed by the
16 * team members of The Ohio State University's Network-Based Computing
17 * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
19 * For detailed copyright and licensing information, please refer to the
20 * copyright file COPYRIGHT in the top level MVAPICH2 directory.
24 #include "colls_private.h"
26 int smpi_coll_tuned_allreduce_mvapich2_rs(void *sendbuf,
29 MPI_Datatype datatype,
30 MPI_Op op, MPI_Comm comm)
33 int mpi_errno = MPI_SUCCESS;
34 int mask, dst, is_commutative, pof2, newrank = 0, rem, newdst, i,
35 send_idx, recv_idx, last_idx, send_cnt, recv_cnt, *cnts, *disps;
36 MPI_Aint true_lb, true_extent, extent;
37 void *tmp_buf, *tmp_buf_free;
45 comm_size = smpi_comm_size(comm);
46 rank = smpi_comm_rank(comm);
48 is_commutative = smpi_op_is_commute(op);
50 /* need to allocate temporary buffer to store incoming data */
51 smpi_datatype_extent(datatype, &true_lb, &true_extent);
52 extent = smpi_datatype_get_extent(datatype);
54 tmp_buf_free= smpi_get_tmp_recvbuffer(count * (MAX(extent, true_extent)));
56 /* adjust for potential negative lower bound in datatype */
57 tmp_buf = (void *) ((char *) tmp_buf_free - true_lb);
59 /* copy local data into recvbuf */
60 if (sendbuf != MPI_IN_PLACE) {
62 smpi_datatype_copy(sendbuf, count, datatype, recvbuf, count,
66 /* find nearest power-of-two less than or equal to comm_size */
67 for( pof2 = 1; pof2 <= comm_size; pof2 <<= 1 );
70 rem = comm_size - pof2;
72 /* In the non-power-of-two case, all even-numbered
73 processes of rank < 2*rem send their data to
74 (rank+1). These even-numbered processes no longer
75 participate in the algorithm until the very end. The
76 remaining processes form a nice power-of-two. */
81 smpi_mpi_send(recvbuf, count, datatype, rank + 1,
82 COLL_TAG_ALLREDUCE, comm);
84 /* temporarily set the rank to -1 so that this
85 process does not pariticipate in recursive
90 smpi_mpi_recv(tmp_buf, count, datatype, rank - 1,
91 COLL_TAG_ALLREDUCE, comm,
93 /* do the reduction on received data. since the
94 ordering is right, it doesn't matter whether
95 the operation is commutative or not. */
96 smpi_op_apply(op, tmp_buf, recvbuf, &count, &datatype);
100 } else { /* rank >= 2*rem */
101 newrank = rank - rem;
104 /* If op is user-defined or count is less than pof2, use
105 recursive doubling algorithm. Otherwise do a reduce-scatter
106 followed by allgather. (If op is user-defined,
107 derived datatypes are allowed and the user could pass basic
108 datatypes on one process and derived on another as long as
109 the type maps are the same. Breaking up derived
110 datatypes to do the reduce-scatter is tricky, therefore
111 using recursive doubling in that case.) */
114 if (/*(HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN) ||*/ (count < pof2)) { /* use recursive doubling */
116 while (mask < pof2) {
117 newdst = newrank ^ mask;
118 /* find real rank of dest */
119 dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;
121 /* Send the most current data, which is in recvbuf. Recv
123 smpi_mpi_sendrecv(recvbuf, count, datatype,
124 dst, COLL_TAG_ALLREDUCE,
125 tmp_buf, count, datatype, dst,
126 COLL_TAG_ALLREDUCE, comm,
129 /* tmp_buf contains data received in this step.
130 recvbuf contains data accumulated so far */
132 if (is_commutative || (dst < rank)) {
133 /* op is commutative OR the order is already right */
134 smpi_op_apply(op, tmp_buf, recvbuf, &count, &datatype);
136 /* op is noncommutative and the order is not right */
137 smpi_op_apply(op, recvbuf, tmp_buf, &count, &datatype);
138 /* copy result back into recvbuf */
139 mpi_errno = smpi_datatype_copy(tmp_buf, count, datatype,
140 recvbuf, count, datatype);
146 /* do a reduce-scatter followed by allgather */
148 /* for the reduce-scatter, calculate the count that
149 each process receives and the displacement within
151 cnts = (int *)xbt_malloc(pof2 * sizeof (int));
152 disps = (int *)xbt_malloc(pof2 * sizeof (int));
154 for (i = 0; i < (pof2 - 1); i++) {
155 cnts[i] = count / pof2;
157 cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1);
160 for (i = 1; i < pof2; i++) {
161 disps[i] = disps[i - 1] + cnts[i - 1];
165 send_idx = recv_idx = 0;
167 while (mask < pof2) {
168 newdst = newrank ^ mask;
169 /* find real rank of dest */
170 dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;
172 send_cnt = recv_cnt = 0;
173 if (newrank < newdst) {
174 send_idx = recv_idx + pof2 / (mask * 2);
175 for (i = send_idx; i < last_idx; i++)
177 for (i = recv_idx; i < send_idx; i++)
180 recv_idx = send_idx + pof2 / (mask * 2);
181 for (i = send_idx; i < recv_idx; i++)
183 for (i = recv_idx; i < last_idx; i++)
187 /* Send data from recvbuf. Recv into tmp_buf */
188 smpi_mpi_sendrecv((char *) recvbuf +
189 disps[send_idx] * extent,
191 dst, COLL_TAG_ALLREDUCE,
193 disps[recv_idx] * extent,
194 recv_cnt, datatype, dst,
195 COLL_TAG_ALLREDUCE, comm,
198 /* tmp_buf contains data received in this step.
199 recvbuf contains data accumulated so far */
201 /* This algorithm is used only for predefined ops
202 and predefined ops are always commutative. */
204 smpi_op_apply(op, (char *) tmp_buf + disps[recv_idx] * extent,
205 (char *) recvbuf + disps[recv_idx] * extent,
206 &recv_cnt, &datatype);
208 /* update send_idx for next iteration */
212 /* update last_idx, but not in last iteration
213 because the value is needed in the allgather
216 last_idx = recv_idx + pof2 / mask;
219 /* now do the allgather */
223 newdst = newrank ^ mask;
224 /* find real rank of dest */
225 dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;
227 send_cnt = recv_cnt = 0;
228 if (newrank < newdst) {
229 /* update last_idx except on first iteration */
230 if (mask != pof2 / 2) {
231 last_idx = last_idx + pof2 / (mask * 2);
234 recv_idx = send_idx + pof2 / (mask * 2);
235 for (i = send_idx; i < recv_idx; i++) {
238 for (i = recv_idx; i < last_idx; i++) {
242 recv_idx = send_idx - pof2 / (mask * 2);
243 for (i = send_idx; i < last_idx; i++) {
246 for (i = recv_idx; i < send_idx; i++) {
251 smpi_mpi_sendrecv((char *) recvbuf +
252 disps[send_idx] * extent,
254 dst, COLL_TAG_ALLREDUCE,
256 disps[recv_idx] * extent,
257 recv_cnt, datatype, dst,
258 COLL_TAG_ALLREDUCE, comm,
260 if (newrank > newdst) {
269 /* In the non-power-of-two case, all odd-numbered
270 processes of rank < 2*rem send the result to
271 (rank-1), the ranks who didn't participate above. */
272 if (rank < 2 * rem) {
273 if (rank % 2) { /* odd */
274 smpi_mpi_send(recvbuf, count,
276 COLL_TAG_ALLREDUCE, comm);
278 smpi_mpi_recv(recvbuf, count,
280 COLL_TAG_ALLREDUCE, comm,
284 smpi_free_tmp_buffer(tmp_buf_free);