[simgrid.git] src/smpi/colls/allgatherv-ompi-bruck.c
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2009 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2009      University of Houston. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "colls_private.h"
/*
 * ompi_coll_tuned_allgatherv_intra_bruck
 *
 * Function:     allgatherv using O(log(N)) steps.
 * Accepts:      Same arguments as MPI_Allgatherv
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Variation of the all-to-all algorithm described by Bruck et al.
 *               in "Efficient Algorithms for All-to-all Communications
 *               in Multiport Message-Passing Systems"
 * Note:         Unlike the allgather implementation, we rely on an indexed
 *               datatype to select buffers appropriately.
 *               The only additional memory requirement is for the creation of
 *               temporary datatypes.
 * Example on 7 nodes (memory layout need not be in order)
 *   Initial set up:
 *    #     0      1      2      3      4      5      6
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [ ]    [6]
 *   Step 0: send message to (rank - 2^0), receive message from (rank + 2^0)
 *    #     0      1      2      3      4      5      6
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]    [0]
 *         [1]    [1]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [2]    [2]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [3]    [3]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [4]    [4]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [5]    [5]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [6]    [6]
 *   Step 1: send message to (rank - 2^1), receive message from (rank + 2^1).
 *           message contains all blocks from (rank) .. (rank + 2^1 - 1) with
 *           wrap around.
 *    #     0      1      2      3      4      5      6
 *         [0]    [ ]    [ ]    [ ]    [0]    [0]    [0]
 *         [1]    [1]    [ ]    [ ]    [ ]    [1]    [1]
 *         [2]    [2]    [2]    [ ]    [ ]    [ ]    [2]
 *         [3]    [3]    [3]    [3]    [ ]    [ ]    [ ]
 *         [ ]    [4]    [4]    [4]    [4]    [ ]    [ ]
 *         [ ]    [ ]    [5]    [5]    [5]    [5]    [ ]
 *         [ ]    [ ]    [ ]    [6]    [6]    [6]    [6]
 *   Step 2: send message to (rank - 2^2), receive message from (rank + 2^2).
 *           message size is "all remaining blocks"
 *    #     0      1      2      3      4      5      6
 *         [0]    [0]    [0]    [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]    [3]    [3]    [3]
 *         [4]    [4]    [4]    [4]    [4]    [4]    [4]
 *         [5]    [5]    [5]    [5]    [5]    [5]    [5]
 *         [6]    [6]    [6]    [6]    [6]    [6]    [6]
 */
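
/*
 * For instance, with the 7 nodes above, the communication loop below runs
 * three steps: distance = 1, 2, 4 with blockcount = 1, 2, 3; each step
 * forwards 2^i blocks until only (size - distance) blocks remain.
 */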
int smpi_coll_tuned_allgatherv_ompi_bruck(void *sbuf, int scount,
                                          MPI_Datatype sdtype,
                                          void *rbuf, int *rcounts,
                                          int *rdispls,
                                          MPI_Datatype rdtype,
                                          MPI_Comm comm)
{
   int rank, size;
   int sendto, recvfrom, distance, blockcount, i;
   int *new_rcounts = NULL, *new_rdispls = NULL;
   int *new_scounts = NULL, *new_sdispls = NULL;
   ptrdiff_t slb, rlb, sext, rext;
   char *tmpsend = NULL, *tmprecv = NULL;
   MPI_Datatype new_rdtype, new_sdtype;

   size = smpi_comm_size(comm);
   rank = smpi_comm_rank(comm);

   XBT_DEBUG("coll:tuned:allgather_ompi_bruck rank %d", rank);

   smpi_datatype_extent(sdtype, &slb, &sext);
   smpi_datatype_extent(rdtype, &rlb, &rext);

   /* Initialization step:
      - if send buffer is not MPI_IN_PLACE, copy send buffer to block rank of
        the receive buffer.
   */
   tmprecv = (char*) rbuf + rdispls[rank] * rext;
   if (MPI_IN_PLACE != sbuf) {
      tmpsend = (char*) sbuf;
      smpi_datatype_copy(tmpsend, scount, sdtype,
                         tmprecv, rcounts[rank], rdtype);
   }
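   /* (With MPI_IN_PLACE, the caller's own contribution is already stored in
      block 'rank' of the receive buffer, so no local copy is needed.) */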

   /* Communication step:
      At every step i, rank r:
      - doubles the distance
      - sends a message of blockcount blocks (rbuf[r] .. rbuf[r + 2^i - 1])
        to rank (r - distance)
      - receives a message of blockcount blocks
        (rbuf[r + distance] .. rbuf[r + distance + 2^i - 1]) from
        rank (r + distance)
      - blockcount doubles until the last step, when only the remaining
        data is exchanged.
   */
   blockcount = 1;
   tmpsend = (char*) rbuf;

   new_rcounts = (int*) calloc(4*size, sizeof(int));
   new_rdispls = new_rcounts + size;
   new_scounts = new_rdispls + size;
   new_sdispls = new_scounts + size;
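   /* The four scratch arrays share a single calloc'd block of 4*size ints;
      the one free(new_rcounts) at the end releases them all. */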

   for (distance = 1; distance < size; distance<<=1) {

      recvfrom = (rank + distance) % size;
      sendto = (rank - distance + size) % size;

      if (distance <= (size >> 1)) {
         blockcount = distance;
      } else {
         blockcount = size - distance;
      }
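      /* i.e. forward 'distance' blocks per step, except in the final step,
         which carries only the (size - distance) blocks still missing. */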

      /* create send and receive datatypes */
      for (i = 0; i < blockcount; i++) {
          const int tmp_srank = (rank + i) % size;
          const int tmp_rrank = (recvfrom + i) % size;
          new_scounts[i] = rcounts[tmp_srank];
          new_sdispls[i] = rdispls[tmp_srank];
          new_rcounts[i] = rcounts[tmp_rrank];
          new_rdispls[i] = rdispls[tmp_rrank];
      }
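      /* Both datatypes are built over rdtype: the blocks being forwarded
         already sit in the receive buffer at the rdispls offsets. */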
      smpi_datatype_indexed(blockcount, new_scounts, new_sdispls,
                            rdtype, &new_sdtype);
      smpi_datatype_indexed(blockcount, new_rcounts, new_rdispls,
                            rdtype, &new_rdtype);

      smpi_datatype_commit(&new_sdtype);
      smpi_datatype_commit(&new_rdtype);

      /* Sendreceive */
      smpi_mpi_sendrecv(rbuf, 1, new_sdtype, sendto,
                        COLL_TAG_ALLGATHERV,
                        rbuf, 1, new_rdtype, recvfrom,
                        COLL_TAG_ALLGATHERV,
                        comm, MPI_STATUS_IGNORE);
      smpi_datatype_free(&new_sdtype);
      smpi_datatype_free(&new_rdtype);
   }

   free(new_rcounts);

   return MPI_SUCCESS;
}
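
/*
 * Usage sketch (illustrative only, not part of the original file): under
 * SMPI this routine is one possible backend for MPI_Allgatherv, which a
 * user program would call as, e.g.:
 *
 *   int me, p;
 *   MPI_Comm_rank(MPI_COMM_WORLD, &me);
 *   MPI_Comm_size(MPI_COMM_WORLD, &p);
 *   int *counts = malloc(p * sizeof(int));   // block i holds i+1 ints
 *   int *displs = malloc(p * sizeof(int));   // cumulative offsets
 *   for (int i = 0, off = 0; i < p; off += i + 1, i++) {
 *      counts[i] = i + 1;
 *      displs[i] = off;
 *   }
 *   // 'mydata' and 'gathered' are caller-provided buffers (assumed here)
 *   MPI_Allgatherv(mydata, counts[me], MPI_INT,
 *                  gathered, counts, displs, MPI_INT, MPI_COMM_WORLD);
 */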