1 /* Copyright (c) 2013-2017. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
8 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
9 * University Research and Technology
10 * Corporation. All rights reserved.
11 * Copyright (c) 2004-2009 The University of Tennessee and The University
12 * of Tennessee Research Foundation. All rights
14 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
15 * University of Stuttgart. All rights reserved.
16 * Copyright (c) 2004-2005 The Regents of the University of California.
17 * All rights reserved.
18 * Copyright (c) 2009 University of Houston. All rights reserved.
20 * Additional copyrights may follow
23 #include "../colls_private.hpp"
25 * ompi_coll_tuned_allgatherv_intra_bruck
27 * Function: allgather using O(log(N)) steps.
28 * Accepts: Same arguments as MPI_Allgather
29 * Returns: MPI_SUCCESS or error code
31 * Description: Variation to All-to-all algorithm described by Bruck et al. in
32 * "Efficient Algorithms for All-to-all Communications
33 * in Multiport Message-Passing Systems"
34 * Note: Unlike in case of allgather implementation, we rely on
35 * indexed datatype to select buffers appropriately.
36 * The only additional memory requirement is for creation of
37 * temporary datatypes.
38 * Example on 7 nodes (memory layout need not be in-order)
41 * [0] [ ] [ ] [ ] [ ] [ ] [ ]
42 * [ ] [1] [ ] [ ] [ ] [ ] [ ]
43 * [ ] [ ] [2] [ ] [ ] [ ] [ ]
44 * [ ] [ ] [ ] [3] [ ] [ ] [ ]
45 * [ ] [ ] [ ] [ ] [4] [ ] [ ]
46 * [ ] [ ] [ ] [ ] [ ] [5] [ ]
47 * [ ] [ ] [ ] [ ] [ ] [ ] [6]
48 * Step 0: send message to (rank - 2^0), receive message from (rank + 2^0)
50 * [0] [ ] [ ] [ ] [ ] [ ] [0]
51 * [1] [1] [ ] [ ] [ ] [ ] [ ]
52 * [ ] [2] [2] [ ] [ ] [ ] [ ]
53 * [ ] [ ] [3] [3] [ ] [ ] [ ]
54 * [ ] [ ] [ ] [4] [4] [ ] [ ]
55 * [ ] [ ] [ ] [ ] [5] [5] [ ]
56 * [ ] [ ] [ ] [ ] [ ] [6] [6]
57 * Step 1: send message to (rank - 2^1), receive message from (rank + 2^1).
58 * message contains all blocks from (rank) .. (rank + 2^2) with
61 * [0] [ ] [ ] [ ] [0] [0] [0]
62 * [1] [1] [ ] [ ] [ ] [1] [1]
63 * [2] [2] [2] [ ] [ ] [ ] [2]
64 * [3] [3] [3] [3] [ ] [ ] [ ]
65 * [ ] [4] [4] [4] [4] [ ] [ ]
66 * [ ] [ ] [5] [5] [5] [5] [ ]
67 * [ ] [ ] [ ] [6] [6] [6] [6]
68 * Step 2: send message to (rank - 2^2), receive message from (rank + 2^2).
69 * message size is "all remaining blocks"
71 * [0] [0] [0] [0] [0] [0] [0]
72 * [1] [1] [1] [1] [1] [1] [1]
73 * [2] [2] [2] [2] [2] [2] [2]
74 * [3] [3] [3] [3] [3] [3] [3]
75 * [4] [4] [4] [4] [4] [4] [4]
76 * [5] [5] [5] [5] [5] [5] [5]
77 * [6] [6] [6] [6] [6] [6] [6]
83 int Coll_allgatherv_ompi_bruck::allgatherv(void *sbuf, int scount,
85                                            void *rbuf, int *rcounts,
/* NOTE(review): the rest of the parameter list (send datatype `sdtype`,
 * receive displacements `rdispls`, receive datatype `rdtype`, communicator
 * `comm`) is on lines elided from this view — confirm against the full file. */
/* Per-step peer ranks, number of blocks exchanged this step, loop index. */
90    int sendto, recvfrom, blockcount, i;
91    unsigned int distance;
/* Lower bound and extent of the send/receive datatypes, used for byte-level
 * pointer arithmetic into the user buffers. */
92    ptrdiff_t slb, rlb, sext, rext;
93    char *tmpsend = NULL, *tmprecv = NULL;
/* Temporary indexed datatypes rebuilt (and released) on every step. */
94    MPI_Datatype new_rdtype = MPI_DATATYPE_NULL, new_sdtype = MPI_DATATYPE_NULL;
96    unsigned int size = comm->size();
97    unsigned int rank = comm->rank();
99    XBT_DEBUG("coll:tuned:allgather_ompi_bruck rank %u", rank);
101    sdtype->extent(&slb, &sext);
103    rdtype->extent(&rlb, &rext);
105    /* Initialization step:
106       - if send buffer is not MPI_IN_PLACE, copy send buffer to block rank of
/* tmprecv points at this rank's own block inside rbuf (displacement is
 * expressed in units of the receive-datatype extent). */
109    tmprecv = (char*) rbuf + rdispls[rank] * rext;
110    if (MPI_IN_PLACE != sbuf) {
111       tmpsend = (char*) sbuf;
/* Local copy: seed rbuf[rank] with our own contribution before exchanging. */
112       Datatype::copy(tmpsend, scount, sdtype,
113                             tmprecv, rcounts[rank], rdtype);
116    /* Communication step:
117       At every step i, rank r:
118       - doubles the distance
119       - sends message with blockcount blocks, (rbuf[rank] .. rbuf[rank + 2^i])
120         to rank (r - distance)
121       - receives message of blockcount blocks,
122         (rbuf[r + distance] ... rbuf[(r+distance) + 2^i]) from
124       - blockcount doubles until the last step when only the remaining data is
/* Single allocation carved into four arrays of `size` ints; only the head
 * pointer (new_rcounts) is ever passed to delete[] below. */
127    int* new_rcounts = new int[4 * size];
128    int* new_rdispls = new_rcounts + size;
129    int* new_scounts = new_rdispls + size;
130    int* new_sdispls = new_scounts + size;
/* O(log(size)) rounds: distance = 1, 2, 4, ... */
132    for (distance = 1; distance < size; distance<<=1) {
/* Ring arithmetic: receive from the right, send to the left. */
134       recvfrom = (rank + distance) % size;
135       sendto = (rank - distance + size) % size;
/* Full `distance` blocks while doubling still fits; the final round sends
 * only the `size - distance` blocks that remain. */
137       if (distance <= (size >> 1)) {
138          blockcount = distance;
140          blockcount = size - distance;
143       /* create send and receive datatypes */
144       for (i = 0; i < blockcount; i++) {
/* Blocks forwarded start at our own rank; blocks received start at the
 * peer's rank — both wrap around the ring. */
145           const int tmp_srank = (rank + i) % size;
146           const int tmp_rrank = (recvfrom + i) % size;
147           new_scounts[i] = rcounts[tmp_srank];
148           new_sdispls[i] = rdispls[tmp_srank];
149           new_rcounts[i] = rcounts[tmp_rrank];
150           new_rdispls[i] = rdispls[tmp_rrank];
/* Both indexed types are built over rdtype: the data being forwarded
 * already lives in rbuf, laid out by the receive datatype. */
152       Datatype::create_indexed(blockcount, new_scounts, new_sdispls,
153                                                     rdtype, &new_sdtype);
154       Datatype::create_indexed(blockcount, new_rcounts, new_rdispls,
155                                                     rdtype, &new_rdtype);
157       new_sdtype->commit();
158       new_rdtype->commit();
/* One combined send+receive per step; both sides address rbuf directly
 * through the indexed datatypes, so count is 1. */
161       Request::sendrecv(rbuf, 1, new_sdtype, sendto,
163                                      rbuf, 1, new_rdtype, recvfrom,
165                                      comm, MPI_STATUS_IGNORE);
/* Release this step's temporary datatypes before building the next pair. */
166       Datatype::unref(new_sdtype);
167       Datatype::unref(new_rdtype);
/* Frees all four scratch arrays (one allocation).  NOTE(review): the
 * function's return statement and closing brace are on lines past this view. */
171    delete[] new_rcounts;