src/smpi/colls/allgatherv/allgatherv-ompi-bruck.cpp
/* Copyright (c) 2013-2017. The SimGrid Team.
 * All rights reserved.                                                     */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2009 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2009      University of Houston. All rights reserved.
 *
 * Additional copyrights may follow
 */
#include "../colls_private.h"
/*
 * ompi_coll_tuned_allgatherv_intra_bruck
 *
 * Function:     allgatherv using O(log(N)) steps.
 * Accepts:      Same arguments as MPI_Allgatherv
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Variation of the all-to-all algorithm described by Bruck et al.
 *               in "Efficient Algorithms for All-to-all Communications
 *               in Multiport Message-Passing Systems"
 * Note:         Unlike the allgather implementation, we rely on indexed
 *               datatypes to select the buffers appropriately.
 *               The only additional memory requirement is for the creation of
 *               temporary datatypes.
 * Example on 7 nodes (memory layout need not be in order)
 *   Initial set up:
 *    #     0      1      2      3      4      5      6
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [ ]    [6]
 *   Step 0: send message to (rank - 2^0), receive message from (rank + 2^0)
 *    #     0      1      2      3      4      5      6
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]    [0]
 *         [1]    [1]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [2]    [2]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [3]    [3]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [4]    [4]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [5]    [5]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [6]    [6]
 *   Step 1: send message to (rank - 2^1), receive message from (rank + 2^1).
 *           message contains all blocks from (rank) .. (rank + 2^1 - 1) with
 *           wrap around.
 *    #     0      1      2      3      4      5      6
 *         [0]    [ ]    [ ]    [ ]    [0]    [0]    [0]
 *         [1]    [1]    [ ]    [ ]    [ ]    [1]    [1]
 *         [2]    [2]    [2]    [ ]    [ ]    [ ]    [2]
 *         [3]    [3]    [3]    [3]    [ ]    [ ]    [ ]
 *         [ ]    [4]    [4]    [4]    [4]    [ ]    [ ]
 *         [ ]    [ ]    [5]    [5]    [5]    [5]    [ ]
 *         [ ]    [ ]    [ ]    [6]    [6]    [6]    [6]
 *   Step 2: send message to (rank - 2^2), receive message from (rank + 2^2).
 *           message size is "all remaining blocks"
 *    #     0      1      2      3      4      5      6
 *         [0]    [0]    [0]    [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]    [3]    [3]    [3]
 *         [4]    [4]    [4]    [4]    [4]    [4]    [4]
 *         [5]    [5]    [5]    [5]    [5]    [5]    [5]
 *         [6]    [6]    [6]    [6]    [6]    [6]    [6]
 */
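/* For P ranks the communication loop below performs ceil(log2(P)) rounds:
 * the 7-node example above needs 3 rounds (distance = 1, 2, 4), exchanging
 * 1, 2, and finally 7 - 4 = 3 blocks per rank and round. */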

namespace simgrid{
namespace smpi{

int Coll_allgatherv_ompi_bruck::allgatherv(void *sbuf, int scount,
                                           MPI_Datatype sdtype,
                                           void *rbuf, int *rcounts,
                                           int *rdispls,
                                           MPI_Datatype rdtype,
                                           MPI_Comm comm)
{
   int sendto, recvfrom, blockcount, i;
   unsigned int distance;
   int *new_rcounts = NULL, *new_rdispls = NULL;
   int *new_scounts = NULL, *new_sdispls = NULL;
   ptrdiff_t slb, rlb, sext, rext;
   char *tmpsend = NULL, *tmprecv = NULL;
   MPI_Datatype new_rdtype = MPI_DATATYPE_NULL, new_sdtype = MPI_DATATYPE_NULL;

   unsigned int size = comm->size();
   unsigned int rank = comm->rank();

   XBT_DEBUG("coll:tuned:allgatherv_ompi_bruck rank %u", rank);

   sdtype->extent(&slb, &sext);
   rdtype->extent(&rlb, &rext);

   /* Initialization step:
      - if the send buffer is not MPI_IN_PLACE, copy the send buffer to
        block 'rank' of the receive buffer; with MPI_IN_PLACE the caller's
        own block is already in place at rbuf + rdispls[rank] * rext.
   */
   tmprecv = (char*) rbuf + rdispls[rank] * rext;
   if (MPI_IN_PLACE != sbuf) {
      tmpsend = (char*) sbuf;
      Datatype::copy(tmpsend, scount, sdtype,
                     tmprecv, rcounts[rank], rdtype);
   }

   /* Communication step:
      At every step i, rank r:
      - doubles the distance (distance = 2^i)
      - sends a message of blockcount blocks, (rbuf[r] .. rbuf[r + 2^i - 1]),
        to rank (r - distance)
      - receives a message of blockcount blocks,
        (rbuf[r + distance] .. rbuf[r + distance + 2^i - 1]), from
        rank (r + distance)
      - blockcount doubles until the last step, when only the remaining data
        is exchanged.
   */
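   /* A single calloc provides all four per-step arrays, carved up below as
    * [new_rcounts | new_rdispls | new_scounts | new_sdispls], each 'size'
    * entries long; only new_rcounts is passed to free() at the end. */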
   new_rcounts = (int*) calloc(4*size, sizeof(int));
   new_rdispls = new_rcounts + size;
   new_scounts = new_rdispls + size;
   new_sdispls = new_scounts + size;

   for (distance = 1; distance < size; distance<<=1) {

      recvfrom = (rank + distance) % size;
      sendto = (rank - distance + size) % size;

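      /* Each round doubles the number of blocks exchanged; once past the
       * halfway point, only the (size - distance) missing blocks remain. */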
      if (distance <= (size >> 1)) {
         blockcount = distance;
      } else {
         blockcount = size - distance;
      }

      /* create send and receive datatypes */
      for (i = 0; i < blockcount; i++) {
          const int tmp_srank = (rank + i) % size;
          const int tmp_rrank = (recvfrom + i) % size;
          new_scounts[i] = rcounts[tmp_srank];
          new_sdispls[i] = rdispls[tmp_srank];
          new_rcounts[i] = rcounts[tmp_rrank];
          new_rdispls[i] = rdispls[tmp_rrank];
      }
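      /* The indexed datatypes (counts and displacements in units of rdtype)
       * address the non-contiguous blocks directly in rbuf, so no staging
       * buffer or final local reshuffle is needed. */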
      Datatype::create_indexed(blockcount, new_scounts, new_sdispls,
                               rdtype, &new_sdtype);
      Datatype::create_indexed(blockcount, new_rcounts, new_rdispls,
                               rdtype, &new_rdtype);

      new_sdtype->commit();
      new_rdtype->commit();

      /* Send/receive one element of each indexed type: the send type picks
         out the blocks this rank already holds, the receive type scatters
         the incoming blocks to their final displacements in rbuf. */
      Request::sendrecv(rbuf, 1, new_sdtype, sendto,
                                     COLL_TAG_ALLGATHERV,
                                     rbuf, 1, new_rdtype, recvfrom,
                                     COLL_TAG_ALLGATHERV,
                                     comm, MPI_STATUS_IGNORE);
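      /* The block maps change every round, so the temporary datatypes are
       * released here and rebuilt on the next iteration. */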
      Datatype::unref(new_sdtype);
      Datatype::unref(new_rdtype);
   }

   free(new_rcounts);

   return MPI_SUCCESS;
}

} // namespace smpi
} // namespace simgrid
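
/* Usage note (an assumption, not part of this file): SMPI lets users pick
 * the allgatherv algorithm at run time through SimGrid's collective
 * selection options, e.g. something along the lines of
 *   smpirun --cfg=smpi/allgatherv:ompi_bruck ...
 */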