1 /* Short or medium size message and power-of-two no. of processes. Use
2 * recursive doubling algorithm */
3 #include "colls_private.h"
4 #define MPIR_ALLGATHERV_TAG 222
/* NOTE(review): this chunk is a line-numbered excerpt of the original file;
 * the embedded numbers (5, 12, 15, ...) belong to the pasted listing, and the
 * gaps in them show that many original lines are MISSING here (e.g. the
 * remaining parameters on lines 6-11, the initializations of total_count /
 * position / mask / i, parts of several call argument lists, the loop-advance
 * statements, and all closing braces / cleanup after line 207). The code is
 * annotated as-is; it is not compilable in this form. */
/* Allgatherv via recursive doubling: every rank ends up with the
 * concatenation of all ranks' contributions (per-rank counts in recvcounts[],
 * placed into recvbuf at the caller-supplied displs[]). Per the file header,
 * intended for short/medium messages and a power-of-two process count. */
5 int smpi_coll_tuned_allgatherv_mpich_rdb (
/* parameters sendbuf / sendcount / sendtype / recvbuf / recvcounts / displs
 * are on the omitted lines 6-11 -- inferred from their uses below; confirm
 * against the full source. */
12 MPI_Datatype recvtype,
15 int comm_size, rank, j, i;
17 MPI_Aint recvtype_extent, recvtype_true_extent, recvtype_true_lb;
18 int curr_cnt, dst, total_count;
20 int mask, dst_tree_root, my_tree_root, is_homogeneous, position,
21 send_offset, recv_offset, last_recv_cnt=0, nprocs_completed, k,
22 offset, tmp_mask, tree_root;
/* communicator geometry */
24 comm_size = smpi_comm_size(comm);
25 rank = smpi_comm_rank(comm);
/* total number of elements gathered across all ranks
 * (total_count is presumably zeroed on an omitted line -- confirm) */
28 for (i=0; i<comm_size; i++)
29 total_count += recvcounts[i];
/* nothing to gather at all: bail out with an error code */
31 if (total_count == 0) return MPI_ERR_COUNT;
33 recvtype_extent=smpi_datatype_get_extent( recvtype);
38 /* need to receive contiguously into tmp_buf because
39 displs could make the recvbuf noncontiguous */
41 smpi_datatype_extent(recvtype, &recvtype_true_lb, &recvtype_true_extent);
/* per-element size uses the larger of extent and true extent so the staging
 * buffer is big enough for either layout */
43 tmp_buf= (void*)xbt_malloc(total_count*(max(recvtype_true_extent,recvtype_extent)));
45 /* adjust for potential negative lower bound in datatype */
46 tmp_buf = (void *)((char*)tmp_buf - recvtype_true_lb);
48 /* copy local data into right location in tmp_buf */
/* position = element offset of this rank's segment in gather order
 * (position is presumably zeroed on an omitted line -- confirm) */
50 for (i=0; i<rank; i++) position += recvcounts[i];
51 if (sendbuf != MPI_IN_PLACE)
53 smpi_datatype_copy(sendbuf, sendcount, sendtype,
54 ((char *)tmp_buf + position*
/* NOTE(review): the "recvtype_extent)," part of this argument is on an
 * omitted line (55) */
56 recvcounts[rank], recvtype);
60 /* if in_place specified, local data is found in recvbuf */
61 smpi_datatype_copy(((char *)recvbuf +
62 displs[rank]*recvtype_extent),
63 recvcounts[rank], recvtype,
64 ((char *)tmp_buf + position*
66 recvcounts[rank], recvtype);
/* number of elements this rank currently holds in tmp_buf */
68 curr_cnt = recvcounts[rank];
/* recursive-doubling exchange; step counter i and mask are initialized on
 * omitted lines (presumably mask = 0x1, i = 0), and the partner computation
 * dst = rank ^ mask is also omitted -- confirm against the full source */
72 while (mask < comm_size) {
75 /* find offset into send and recv buffers. zero out
76 the least significant "i" bits of rank and dst to
77 find root of src and dst subtrees. Use ranks of
78 roots as index to send from and recv into buffer */
80 dst_tree_root = dst >> i;
83 my_tree_root = rank >> i;
86 if (dst < comm_size) {
/* send_offset / recv_offset are element offsets of the two subtree roots
 * (presumably zeroed on omitted lines) */
88 for (j=0; j<my_tree_root; j++)
89 send_offset += recvcounts[j];
92 for (j=0; j<dst_tree_root; j++)
93 recv_offset += recvcounts[j];
/* exchange everything we currently hold with the partner; the receive count
 * is an upper bound, the actual amount is read from the status below.
 * NOTE(review): the tag and status arguments of this call are on omitted
 * lines (97, 100-101). */
95 smpi_mpi_sendrecv(((char *)tmp_buf + send_offset * recvtype_extent),
96 curr_cnt, recvtype, dst,
98 ((char *)tmp_buf + recv_offset * recvtype_extent),
99 total_count - recv_offset, recvtype, dst,
102 /* for convenience, recv is posted for a bigger amount
than will actually be sent (continuation on omitted line 103) */
104 last_recv_cnt=smpi_mpi_get_count(&status, recvtype);
105 curr_cnt += last_recv_cnt;
108 /* if some processes in this process's subtree in this step
109 did not have any destination process to communicate with
110 because of non-power-of-two, we need to send them the
111 data that they would normally have received from those
112 processes. That is, the haves in this subtree must send to
113 the havenots. We use a logarithmic
114 recursive-halving algorithm for this. */
116 /* This part of the code will not currently be
117 executed because we are not using recursive
118 doubling for non power of two. Mark it as experimental
119 so that it doesn't show up as red in the coverage
tests (comment ends on an omitted line). */
122 /* --BEGIN EXPERIMENTAL-- */
123 if (dst_tree_root + mask > comm_size) {
124 nprocs_completed = comm_size - my_tree_root - mask;
125 /* nprocs_completed is the number of processes in this
126 subtree that have all the data. Send data to others
127 in a tree fashion. First find root of current tree
128 that is being divided into two. k is the number of
129 least-significant bits in this process's rank that
130 must be zeroed out to find the rank of the root */
/* NOTE(review): the computation of k and the inner while-loop header are on
 * omitted lines 131-138 -- confirm against the full source */
139 tmp_mask = mask >> 1;
142 dst = rank ^ tmp_mask;
144 tree_root = rank >> k;
/* tree_root presumably shifted back (<< k) on an omitted line -- confirm */
147 /* send only if this proc has data and destination
148 doesn't have data. at any step, multiple processes
149 can send if they have the data */
/* NOTE(review): the opening "if ((dst > rank) &&" of this condition is on an
 * omitted line (150) */
151 (rank < tree_root + nprocs_completed)
152 && (dst >= tree_root + nprocs_completed)) {
/* offset (in elements, then bytes) of the data block to forward
 * (presumably zeroed on an omitted line) */
155 for (j=0; j<(my_tree_root+mask); j++)
156 offset += recvcounts[j];
157 offset *= recvtype_extent;
/* NOTE(review): the count argument (last_recv_cnt), datatype and dst are on
 * omitted lines 160-161 */
159 smpi_mpi_send(((char *)tmp_buf + offset),
162 MPIR_ALLGATHERV_TAG, comm);
163 /* last_recv_cnt was set in the previous
164 receive. that's the amount of data to be
sent now (comment ends on an omitted line). */
167 /* recv only if this proc. doesn't have data and sender
has data (comment continuation on an omitted line) */
169 else if ((dst < rank) &&
170 (dst < tree_root + nprocs_completed) &&
171 (rank >= tree_root + nprocs_completed)) {
/* offset (in elements) where the forwarded data lands
 * (presumably zeroed on an omitted line) */
174 for (j=0; j<(my_tree_root+mask); j++)
175 offset += recvcounts[j];
/* NOTE(review): the status argument of this call is on an omitted line (180) */
177 smpi_mpi_recv(((char *)tmp_buf + offset * recvtype_extent),
178 total_count - offset, recvtype,
179 dst, MPIR_ALLGATHERV_TAG,
181 /* for convenience, recv is posted for a
182 bigger amount than will be sent */
183 last_recv_cnt=smpi_mpi_get_count(&status, recvtype);
184 curr_cnt += last_recv_cnt;
/* NOTE(review): the closing braces of the else-if, the inner while loop
 * (with k--, tmp_mask >>= 1) and the experimental block are on omitted
 * lines 185-189 */
190 /* --END EXPERIMENTAL-- */
/* NOTE(review): the outer-loop advance (mask <<= 1; i++;) and its closing
 * brace are on omitted lines 191-195 */
196 /* copy data from tmp_buf to recvbuf */
/* scatter the contiguous staging buffer back to recvbuf at the caller's
 * displacements (position presumably reset to 0 on an omitted line) */
198 for (j=0; j<comm_size; j++) {
199 if ((sendbuf != MPI_IN_PLACE) || (j != rank)) {
200 /* not necessary to copy if in_place and
201 j==rank. otherwise copy. */
202 smpi_datatype_copy(((char *)tmp_buf + position*recvtype_extent),
203 recvcounts[j], recvtype,
204 ((char *)recvbuf + displs[j]*recvtype_extent),
205 recvcounts[j], recvtype);
207 position += recvcounts[j];
/* NOTE(review): the loop closers, the release of tmp_buf (re-adjusted by
 * +recvtype_true_lb before freeing) and the final return are past the end of
 * this excerpt */