1 /* Short or medium size messages and a power-of-two number of processes.
2 * Uses the recursive doubling algorithm. */
3 #include "colls_private.h"
4 int smpi_coll_tuned_allgatherv_mpich_rdb (
/* NOTE(review): this excerpt is heavily elided -- most of the parameter
 * list, several initializations (total_count, position, mask, i, dst)
 * and some closing braces are not visible here.  The comments below
 * describe only what the visible lines establish; everything else is
 * hedged and should be confirmed against the full source. */
11 MPI_Datatype recvtype,
14 int comm_size, rank, j, i;
16 MPI_Aint recvtype_extent, recvtype_true_extent, recvtype_true_lb;
17 int curr_cnt, dst, total_count;
19 int mask, dst_tree_root, my_tree_root, is_homogeneous, position,
20 send_offset, recv_offset, last_recv_cnt=0, nprocs_completed, k,
21 offset, tmp_mask, tree_root;
/* Query the communicator once; comm_size and rank drive all the
 * tree-root arithmetic below. */
23 comm_size = smpi_comm_size(comm);
24 rank = smpi_comm_rank(comm);
/* total_count = number of elements gathered across all ranks; it sizes
 * the contiguous scratch buffer.  NOTE(review): its zero initialization
 * is on a line elided from this excerpt -- confirm. */
27 for (i=0; i<comm_size; i++)
28 total_count += recvcounts[i];
/* Nothing to gather at all: report a count error to the caller. */
30 if (total_count == 0) return MPI_ERR_COUNT;
32 recvtype_extent=smpi_datatype_get_extent( recvtype);
37 /* need to receive contiguously into tmp_buf because
38 displs could make the recvbuf noncontiguous */
40 smpi_datatype_extent(recvtype, &recvtype_true_lb, &recvtype_true_extent);
/* Size the scratch buffer with the larger of extent / true extent so
 * any datatype padding still fits. */
42 tmp_buf= (void*)xbt_malloc(total_count*(max(recvtype_true_extent,recvtype_extent)));
44 /* adjust for potential negative lower bound in datatype */
45 tmp_buf = (void *)((char*)tmp_buf - recvtype_true_lb);
47 /* copy local data into right location in tmp_buf */
/* position = sum of recvcounts of all lower ranks, i.e. this rank's
 * slot in the packed tmp_buf.  NOTE(review): position's zero
 * initialization is not visible in this excerpt -- confirm. */
49 for (i=0; i<rank; i++) position += recvcounts[i];
50 if (sendbuf != MPI_IN_PLACE)
/* Pack our own contribution from sendbuf into our slot of tmp_buf. */
52 smpi_datatype_copy(sendbuf, sendcount, sendtype,
53 ((char *)tmp_buf + position*
55 recvcounts[rank], recvtype);
59 /* if in_place specified, local data is found in recvbuf */
60 smpi_datatype_copy(((char *)recvbuf +
61 displs[rank]*recvtype_extent),
62 recvcounts[rank], recvtype,
63 ((char *)tmp_buf + position*
65 recvcounts[rank], recvtype);
/* So far we hold exactly our own block. */
67 curr_cnt = recvcounts[rank];
/* Recursive doubling: each iteration exchanges everything gathered so
 * far with a partner, doubling the owned data until all of tmp_buf is
 * filled.  NOTE(review): the initializations of mask/i and the
 * computation of dst (presumably rank ^ mask) are on elided lines --
 * confirm against the full source. */
71 while (mask < comm_size) {
74 /* find offset into send and recv buffers. zero out
75 the least significant "i" bits of rank and dst to
76 find root of src and dst subtrees. Use ranks of
77 roots as index to send from and recv into buffer */
79 dst_tree_root = dst >> i;
82 my_tree_root = rank >> i;
85 if (dst < comm_size) {
/* Element offsets are sums of per-rank counts up to each subtree root.
 * NOTE(review): the zeroing of send_offset/recv_offset before these
 * loops is on elided lines -- confirm. */
87 for (j=0; j<my_tree_root; j++)
88 send_offset += recvcounts[j];
91 for (j=0; j<dst_tree_root; j++)
92 recv_offset += recvcounts[j];
/* Swap accumulated blocks with the partner.  The receive is posted for
 * everything that could still arrive (total_count - recv_offset); the
 * true received amount is read back from the status below. */
94 smpi_mpi_sendrecv(((char *)tmp_buf + send_offset * recvtype_extent),
95 curr_cnt, recvtype, dst,
97 ((char *)tmp_buf + recv_offset * recvtype_extent),
98 total_count - recv_offset, recvtype, dst,
101 /* for convenience, recv is posted for a bigger amount
103 last_recv_cnt=smpi_mpi_get_count(&status, recvtype);
104 curr_cnt += last_recv_cnt;
107 /* if some processes in this process's subtree in this step
108 did not have any destination process to communicate with
109 because of non-power-of-two, we need to send them the
110 data that they would normally have received from those
111 processes. That is, the haves in this subtree must send to
112 the havenots. We use a logarithmic
113 recursive-halving algorithm for this. */
115 /* This part of the code will not currently be
116 executed because we are not using recursive
117 doubling for non power of two. Mark it as experimental
118 so that it doesn't show up as red in the coverage
121 /* --BEGIN EXPERIMENTAL-- */
122 if (dst_tree_root + mask > comm_size) {
123 nprocs_completed = comm_size - my_tree_root - mask;
124 /* nprocs_completed is the number of processes in this
125 subtree that have all the data. Send data to others
126 in a tree fashion. First find root of current tree
127 that is being divided into two. k is the number of
128 least-significant bits in this process's rank that
129 must be zeroed out to find the rank of the root */
/* NOTE(review): the loop that computes k is elided from this excerpt. */
138 tmp_mask = mask >> 1;
141 dst = rank ^ tmp_mask;
143 tree_root = rank >> k;
146 /* send only if this proc has data and destination
147 doesn't have data. at any step, multiple processes
148 can send if they have the data */
/* NOTE(review): the leading clause of this condition (likely
 * dst > rank) is on an elided line -- confirm. */
150 (rank < tree_root + nprocs_completed)
151 && (dst >= tree_root + nprocs_completed)) {
/* Offset of the partner subtree's data within tmp_buf; converted from
 * elements to bytes on the line below.  NOTE(review): offset's zeroing
 * is not visible here -- confirm. */
154 for (j=0; j<(my_tree_root+mask); j++)
155 offset += recvcounts[j];
156 offset *= recvtype_extent;
/* Forward the chunk received in the previous step (last_recv_cnt
 * elements) to a process that missed it. */
158 smpi_mpi_send(((char *)tmp_buf + offset),
161 COLL_TAG_ALLGATHERV, comm);
162 /* last_recv_cnt was set in the previous
163 receive. that's the amount of data to be
166 /* recv only if this proc. doesn't have data and sender
168 else if ((dst < rank) &&
169 (dst < tree_root + nprocs_completed) &&
170 (rank >= tree_root + nprocs_completed)) {
173 for (j=0; j<(my_tree_root+mask); j++)
174 offset += recvcounts[j];
/* Receive the missing chunk; again posted for the maximum possible
 * amount, with the actual count taken from the status afterwards. */
176 smpi_mpi_recv(((char *)tmp_buf + offset * recvtype_extent),
177 total_count - offset, recvtype,
178 dst, COLL_TAG_ALLGATHERV,
180 /* for convenience, recv is posted for a
181 bigger amount than will be sent */
182 last_recv_cnt=smpi_mpi_get_count(&status, recvtype);
183 curr_cnt += last_recv_cnt;
189 /* --END EXPERIMENTAL-- */
/* Unpack: scatter each rank's block from the packed tmp_buf to its
 * displs[j] position in recvbuf.  NOTE(review): position's reset to 0
 * before this loop is on an elided line -- confirm. */
195 /* copy data from tmp_buf to recvbuf */
197 for (j=0; j<comm_size; j++) {
198 if ((sendbuf != MPI_IN_PLACE) || (j != rank)) {
199 /* not necessary to copy if in_place and
200 j==rank. otherwise copy. */
201 smpi_datatype_copy(((char *)tmp_buf + position*recvtype_extent),
202 recvcounts[j], recvtype,
203 ((char *)recvbuf + displs[j]*recvtype_extent),
204 recvcounts[j], recvtype);
206 position += recvcounts[j];