Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
83918079cb58d27ae5671eadc7a766b6d0fbe374
[simgrid.git] / src / smpi / colls / allgatherv-mpich-rdb.c
1         /* Short or medium size message and power-of-two no. of processes. Use
2          * recursive doubling algorithm */   
3 #include "colls_private.h"
4 int smpi_coll_tuned_allgatherv_mpich_rdb ( 
5     void *sendbuf,
6     int sendcount,
7     MPI_Datatype sendtype,
8     void *recvbuf,
9     int *recvcounts,
10     int *displs,
11     MPI_Datatype recvtype,
12     MPI_Comm comm)
13 {
14     int        comm_size, rank, j, i;
15     MPI_Status status;
16     MPI_Aint  recvtype_extent, recvtype_true_extent, recvtype_true_lb;
17     int curr_cnt, dst, total_count; 
18     void *tmp_buf;
19     int mask, dst_tree_root, my_tree_root, is_homogeneous, position,  
20         send_offset, recv_offset, last_recv_cnt=0, nprocs_completed, k,
21         offset, tmp_mask, tree_root;
22
23     comm_size = smpi_comm_size(comm);
24     rank = smpi_comm_rank(comm);
25     
26     total_count = 0;
27     for (i=0; i<comm_size; i++)
28         total_count += recvcounts[i];
29
30     if (total_count == 0) return MPI_ERR_COUNT;
31     
32     recvtype_extent=smpi_datatype_get_extent( recvtype);
33
34         is_homogeneous = 1;
35         
36         if (is_homogeneous) {
37             /* need to receive contiguously into tmp_buf because
38                displs could make the recvbuf noncontiguous */
39
40             smpi_datatype_extent(recvtype, &recvtype_true_lb, &recvtype_true_extent);
41
42             tmp_buf= (void*)xbt_malloc(total_count*(max(recvtype_true_extent,recvtype_extent)));
43
44             /* adjust for potential negative lower bound in datatype */
45             tmp_buf = (void *)((char*)tmp_buf - recvtype_true_lb);
46
47             /* copy local data into right location in tmp_buf */ 
48             position = 0;
49             for (i=0; i<rank; i++) position += recvcounts[i];
50             if (sendbuf != MPI_IN_PLACE)
51             {
52                 smpi_datatype_copy(sendbuf, sendcount, sendtype,
53                                            ((char *)tmp_buf + position*
54                                             recvtype_extent), 
55                                            recvcounts[rank], recvtype);
56             }
57             else
58             {
59                 /* if in_place specified, local data is found in recvbuf */ 
60                 smpi_datatype_copy(((char *)recvbuf +
61                                             displs[rank]*recvtype_extent), 
62                                            recvcounts[rank], recvtype,
63                                            ((char *)tmp_buf + position*
64                                             recvtype_extent), 
65                                            recvcounts[rank], recvtype);
66     }
67             curr_cnt = recvcounts[rank];
68             
69             mask = 0x1;
70             i = 0;
71             while (mask < comm_size) {
72                 dst = rank ^ mask;
73                 
74                 /* find offset into send and recv buffers. zero out 
75                    the least significant "i" bits of rank and dst to 
76                    find root of src and dst subtrees. Use ranks of 
77                    roots as index to send from and recv into buffer */ 
78                 
79                 dst_tree_root = dst >> i;
80                 dst_tree_root <<= i;
81                 
82                 my_tree_root = rank >> i;
83                 my_tree_root <<= i;
84                 
85                 if (dst < comm_size) {
86                     send_offset = 0;
87                     for (j=0; j<my_tree_root; j++)
88                         send_offset += recvcounts[j];
89                     
90                     recv_offset = 0;
91                     for (j=0; j<dst_tree_root; j++)
92                         recv_offset += recvcounts[j];
93
94                     smpi_mpi_sendrecv(((char *)tmp_buf + send_offset * recvtype_extent),
95                                                  curr_cnt, recvtype, dst,
96                                                  COLL_TAG_ALLGATHERV,
97                                                  ((char *)tmp_buf + recv_offset * recvtype_extent),
98                                                  total_count - recv_offset, recvtype, dst,
99                                                  COLL_TAG_ALLGATHERV,
100                                                  comm, &status);
101                         /* for convenience, recv is posted for a bigger amount
102                            than will be sent */
103                         last_recv_cnt=smpi_mpi_get_count(&status, recvtype);
104                     curr_cnt += last_recv_cnt;
105                 }
106                 
107                 /* if some processes in this process's subtree in this step
108                    did not have any destination process to communicate with
109                    because of non-power-of-two, we need to send them the
110                    data that they would normally have received from those
111                    processes. That is, the haves in this subtree must send to
112                    the havenots. We use a logarithmic
113                    recursive-halfing algorithm for this. */
114                 
115                 /* This part of the code will not currently be
116                  executed because we are not using recursive
117                  doubling for non power of two. Mark it as experimental
118                  so that it doesn't show up as red in the coverage
119                  tests. */  
120
121                 /* --BEGIN EXPERIMENTAL-- */
122                 if (dst_tree_root + mask > comm_size) {
123                     nprocs_completed = comm_size - my_tree_root - mask;
124                     /* nprocs_completed is the number of processes in this
125                        subtree that have all the data. Send data to others
126                        in a tree fashion. First find root of current tree
127                        that is being divided into two. k is the number of
128                        least-significant bits in this process's rank that
129                        must be zeroed out to find the rank of the root */ 
130                     j = mask;
131                     k = 0;
132                     while (j) {
133                         j >>= 1;
134                         k++;
135                     }
136                     k--;
137                     
138                     tmp_mask = mask >> 1;
139                     
140                     while (tmp_mask) {
141                         dst = rank ^ tmp_mask;
142                         
143                         tree_root = rank >> k;
144                         tree_root <<= k;
145                         
146                         /* send only if this proc has data and destination
147                            doesn't have data. at any step, multiple processes
148                            can send if they have the data */
149                         if ((dst > rank) && 
150                             (rank < tree_root + nprocs_completed)
151                             && (dst >= tree_root + nprocs_completed)) {
152
153                             offset = 0;
154                             for (j=0; j<(my_tree_root+mask); j++)
155                                 offset += recvcounts[j];
156                             offset *= recvtype_extent;
157
158                             smpi_mpi_send(((char *)tmp_buf + offset),
159                                                      last_recv_cnt,
160                                                      recvtype, dst,
161                                                      COLL_TAG_ALLGATHERV, comm);
162                             /* last_recv_cnt was set in the previous
163                                receive. that's the amount of data to be
164                                sent now. */
165                         }
166                         /* recv only if this proc. doesn't have data and sender
167                            has data */
168                         else if ((dst < rank) && 
169                                  (dst < tree_root + nprocs_completed) &&
170                                  (rank >= tree_root + nprocs_completed)) {
171
172                             offset = 0;
173                             for (j=0; j<(my_tree_root+mask); j++)
174                                 offset += recvcounts[j];
175
176                             smpi_mpi_recv(((char *)tmp_buf + offset * recvtype_extent),
177                                                      total_count - offset, recvtype,
178                                                      dst, COLL_TAG_ALLGATHERV,
179                                                      comm, &status);
180                                 /* for convenience, recv is posted for a
181                                    bigger amount than will be sent */
182                                 last_recv_cnt=smpi_mpi_get_count(&status, recvtype);
183                             curr_cnt += last_recv_cnt;
184                         }
185                         tmp_mask >>= 1;
186                         k--;
187                     }
188                 }
189                 /* --END EXPERIMENTAL-- */
190                 
191                 mask <<= 1;
192                 i++;
193             }
194
195             /* copy data from tmp_buf to recvbuf */
196             position = 0;
197             for (j=0; j<comm_size; j++) {
198                 if ((sendbuf != MPI_IN_PLACE) || (j != rank)) {
199                     /* not necessary to copy if in_place and
200                        j==rank. otherwise copy. */
201                     smpi_datatype_copy(((char *)tmp_buf + position*recvtype_extent),
202                                                recvcounts[j], recvtype,
203                                                ((char *)recvbuf + displs[j]*recvtype_extent),
204                                                recvcounts[j], recvtype);
205                 }
206                 position += recvcounts[j];
207             }
208         }
209   free(tmp_buf);
210   return MPI_SUCCESS;
211 }