simgrid.git: src/smpi/colls/allgatherv/allgatherv-ompi-neighborexchange.cpp
/* Copyright (c) 2013-2020. The SimGrid Team.
 * All rights reserved.                                                     */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

/*
 * ompi_coll_tuned_allgatherv_intra_neighborexchange
 *
 * Function:     allgatherv using N/2 steps (O(N))
 * Accepts:      Same arguments as MPI_Allgatherv
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Neighbor Exchange algorithm for allgather adapted for
 *               allgatherv.
 *               Described by Chen et al. in
 *               "Performance Evaluation of Allgather Algorithms on
 *                Terascale Linux Cluster with Fast Ethernet",
 *               Proceedings of the Eighth International Conference on
 *               High-Performance Computing in Asia-Pacific Region
 *               (HPCASIA'05), 2005
 *
 *               Rank r exchanges a message with one of its neighbors and
 *               forwards the data further in the next step.
 *
 *               No additional memory requirements.
 *
 * Limitations:  The algorithm works only on an even number of processes.
 *               For an odd number of processes we switch to the ring algorithm.
 *
 * Example on 6 nodes:
 *  Initial state
 *    #     0      1      2      3      4      5
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]
 *   Step 0:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [ ]    [ ]    [ ]    [ ]
 *         [1]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [2]    [ ]    [ ]
 *         [ ]    [ ]    [3]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [4]
 *         [ ]    [ ]    [ ]    [ ]    [5]    [5]
 *   Step 1:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [ ]    [ ]    [0]
 *         [1]    [1]    [1]    [ ]    [ ]    [1]
 *         [ ]    [2]    [2]    [2]    [2]    [ ]
 *         [ ]    [3]    [3]    [3]    [3]    [ ]
 *         [4]    [ ]    [ ]    [4]    [4]    [4]
 *         [5]    [ ]    [ ]    [5]    [5]    [5]
 *   Step 2:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]    [3]    [3]
 *         [4]    [4]    [4]    [4]    [4]    [4]
 *         [5]    [5]    [5]    [5]    [5]    [5]
 */

#include "../colls_private.hpp"

namespace simgrid {
namespace smpi {

int
allgatherv__ompi_neighborexchange(const void *sbuf, int scount,
                                  MPI_Datatype sdtype,
                                  void* rbuf, const int *rcounts, const int *rdispls,
                                  MPI_Datatype rdtype,
                                  MPI_Comm comm)
{
    int line = -1;
    int rank, size;
    int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;

    int i, even_rank;
    int err = 0;
    ptrdiff_t slb, rlb, sext, rext;
    char *tmpsend = NULL, *tmprecv = NULL;

    size = comm->size();
    rank = comm->rank();

    if (size % 2) {
        XBT_DEBUG("allgatherv__ompi_neighborexchange WARNING: odd size %d, switching to ring algorithm",
                  size);
        return allgatherv__ring(sbuf, scount, sdtype,
                                rbuf, rcounts,
                                rdispls, rdtype,
                                comm);
    }

    XBT_DEBUG("coll:tuned:allgatherv_ompi_neighborexchange rank %d", rank);

    err = sdtype->extent(&slb, &sext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    err = rdtype->extent(&rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Initialization step:
       - if send buffer is not MPI_IN_PLACE, copy send buffer to
         the appropriate block of the receive buffer
    */
    tmprecv = (char*) rbuf + rdispls[rank] * rext;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = Datatype::copy(tmpsend, scount, sdtype,
                             tmprecv, rcounts[rank], rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    /* Determine neighbors, order in which blocks will arrive, etc. */
    even_rank = !(rank % 2);
    if (even_rank) {
        neighbor[0] = (rank + 1) % size;
        neighbor[1] = (rank - 1 + size) % size;
        recv_data_from[0] = rank;
        recv_data_from[1] = rank;
        offset_at_step[0] = (+2);
        offset_at_step[1] = (-2);
    } else {
        neighbor[0] = (rank - 1 + size) % size;
        neighbor[1] = (rank + 1) % size;
        recv_data_from[0] = neighbor[0];
        recv_data_from[1] = neighbor[0];
        offset_at_step[0] = (-2);
        offset_at_step[1] = (+2);
    }
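    /* With these settings, step i exchanges data with neighbor[i % 2].
       In the loop below, recv_data_from[i % 2] (advanced by offset_at_step[i % 2]
       every other step) is the first of the two consecutive blocks received at
       step i; the values set here are the seeds for that recurrence. */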

    /* Communication loop:
       - First step is special: exchange a single block with neighbor[0].
       - Rest of the steps:
         update recv_data_from according to offset, and
         exchange two blocks with the appropriate neighbor;
         the send location becomes the previous receive location.
       Note, we need to create indexed datatypes to send and receive these
       blocks properly.
    */
    tmprecv = (char*)rbuf + rdispls[neighbor[0]] * rext;
    tmpsend = (char*)rbuf + rdispls[rank] * rext;
    Request::sendrecv(tmpsend, rcounts[rank], rdtype,
                      neighbor[0], COLL_TAG_ALLGATHERV,
                      tmprecv, rcounts[neighbor[0]], rdtype,
                      neighbor[0], COLL_TAG_ALLGATHERV,
                      comm, MPI_STATUS_IGNORE);

    /* Determine initial sending counts and displacements */
    if (even_rank) {
        send_data_from = rank;
    } else {
        send_data_from = recv_data_from[0];
    }
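    /* send_data_from is the first block of the pair forwarded at the next step:
       after the initial exchange, an even rank holds blocks [rank, rank + 1]
       while an odd rank holds blocks [rank - 1, rank]. */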

    for (i = 1; i < (size / 2); i++) {
        MPI_Datatype new_rdtype, new_sdtype;
        int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
        const int i_parity = i % 2;
        recv_data_from[i_parity] =
            (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

        /* Create new indexed types for sending and receiving.
           We are sending data from ranks (send_data_from) and (send_data_from + 1).
           We are receiving data from ranks (recv_data_from[i_parity]) and
           (recv_data_from[i_parity] + 1).
        */

        new_scounts[0] = rcounts[send_data_from];
        new_scounts[1] = rcounts[(send_data_from + 1)];
        new_sdispls[0] = rdispls[send_data_from];
        new_sdispls[1] = rdispls[(send_data_from + 1)];
        err = Datatype::create_indexed(2, new_scounts, new_sdispls, rdtype,
                                       &new_sdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        new_sdtype->commit();

        new_rcounts[0] = rcounts[recv_data_from[i_parity]];
        new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
        new_rdispls[0] = rdispls[recv_data_from[i_parity]];
        new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
        err = Datatype::create_indexed(2, new_rcounts, new_rdispls, rdtype,
                                       &new_rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        new_rdtype->commit();

        tmprecv = (char*)rbuf;
        tmpsend = (char*)rbuf;

        /* Sendreceive */
        Request::sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
                          COLL_TAG_ALLGATHERV,
                          tmprecv, 1, new_rdtype, neighbor[i_parity],
                          COLL_TAG_ALLGATHERV,
                          comm, MPI_STATUS_IGNORE);

        send_data_from = recv_data_from[i_parity];

        Datatype::unref(new_sdtype);
        Datatype::unref(new_rdtype);
    }

    return MPI_SUCCESS;

 err_hndl:
    XBT_DEBUG("%s:%4d\tError occurred %d, rank %2d",
              __FILE__, line, err, rank);
    return err;
}

}
}
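
For context, here is a minimal sketch (not part of the file above) of the kind of user code this routine serves: a plain MPI_Allgatherv call where every rank contributes a different number of integers. The smpirun invocation and the `--cfg=smpi/allgatherv:ompi_neighborexchange` selector in the comment are assumptions about SMPI's algorithm-selection option, and the platform/hostfile names are placeholders.

/* Assumed invocation (6 ranks, so the even-size path is taken):
 *   smpirun --cfg=smpi/allgatherv:ompi_neighborexchange -np 6 \
 *           -platform platform.xml -hostfile hostfile ./allgatherv_demo
 */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char* argv[])
{
  MPI_Init(&argc, &argv);

  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  /* Variable-sized contributions: rank r sends r + 1 integers. */
  int scount = rank + 1;
  int* sbuf  = (int*)malloc(scount * sizeof(int));
  for (int i = 0; i < scount; i++)
    sbuf[i] = rank;

  /* Receive counts and displacements, known identically on every rank. */
  int* rcounts = (int*)malloc(size * sizeof(int));
  int* rdispls = (int*)malloc(size * sizeof(int));
  int total = 0;
  for (int r = 0; r < size; r++) {
    rcounts[r] = r + 1;
    rdispls[r] = total;
    total += rcounts[r];
  }
  int* rbuf = (int*)malloc(total * sizeof(int));

  /* Every rank ends up with the concatenation of all contributions. */
  MPI_Allgatherv(sbuf, scount, MPI_INT, rbuf, rcounts, rdispls, MPI_INT, MPI_COMM_WORLD);

  if (rank == 0) {
    for (int i = 0; i < total; i++)
      printf("%d ", rbuf[i]);
    printf("\n");
  }

  free(sbuf);
  free(rcounts);
  free(rdispls);
  free(rbuf);
  MPI_Finalize();
  return 0;
}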