[simgrid.git] / src/smpi/colls/allgatherv/allgatherv-ompi-neighborexchange.cpp
/* Copyright (c) 2013-2022. The SimGrid Team.
 * All rights reserved.                                                     */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

/*
 * ompi_coll_tuned_allgatherv_intra_neighborexchange
 *
 * Function:     allgatherv using N/2 steps (O(N))
 * Accepts:      Same arguments as MPI_Allgatherv
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Neighbor Exchange algorithm for allgather adapted for
 *               allgatherv.
 *               Described by Chen et al. in
 *               "Performance Evaluation of Allgather Algorithms on
 *                Terascale Linux Cluster with Fast Ethernet",
 *               Proceedings of the Eighth International Conference on
 *               High-Performance Computing in Asia-Pacific Region
 *               (HPCASIA'05), 2005.
 *
 *               Rank r exchanges a message with one of its neighbors and
 *               forwards the data further in the next step.
 *
 *               No additional memory requirements.
 *
 * Limitations:  The algorithm works only with an even number of processes.
 *               For an odd number of processes we switch to the ring algorithm.
 *
 * Example on 6 nodes:
 *  Initial state
 *    #     0      1      2      3      4      5
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]
 *   Step 0:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [ ]    [ ]    [ ]    [ ]
 *         [1]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [2]    [ ]    [ ]
 *         [ ]    [ ]    [3]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [4]
 *         [ ]    [ ]    [ ]    [ ]    [5]    [5]
 *   Step 1:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [ ]    [ ]    [0]
 *         [1]    [1]    [1]    [ ]    [ ]    [1]
 *         [ ]    [2]    [2]    [2]    [2]    [ ]
 *         [ ]    [3]    [3]    [3]    [3]    [ ]
 *         [4]    [ ]    [ ]    [4]    [4]    [4]
 *         [5]    [ ]    [ ]    [5]    [5]    [5]
 *   Step 2:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]    [3]    [3]
 *         [4]    [4]    [4]    [4]    [4]    [4]
 *         [5]    [5]    [5]    [5]    [5]    [5]
 */
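/*
 * Usage sketch (illustrative only): this routine serves an ordinary
 * MPI_Allgatherv call such as
 *
 *   // counts[] and displs[] are per-rank block sizes and displacements,
 *   // precomputed by the caller
 *   MPI_Allgatherv(sendbuf, counts[rank], MPI_INT,
 *                  recvbuf, counts, displs, MPI_INT, MPI_COMM_WORLD);
 *
 * Under SMPI the implementation used for MPI_Allgatherv can be chosen at run
 * time; the selector value matching this file is presumably
 * "ompi_neighborexchange" (an assumption based on the function name below;
 * see the SMPI documentation on collective algorithm selection).
 */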

#include "../colls_private.hpp"

namespace simgrid{
namespace smpi{

int
allgatherv__ompi_neighborexchange(const void *sbuf, int scount,
                                  MPI_Datatype sdtype,
                                  void* rbuf, const int *rcounts, const int *rdispls,
                                  MPI_Datatype rdtype,
                                  MPI_Comm comm)
{
    int line = -1;
    int rank, size;
    int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;

    int i, even_rank;
    int err = 0;
    ptrdiff_t slb, rlb, sext, rext;
    char *tmpsend = nullptr, *tmprecv = nullptr;

    size = comm->size();
    rank = comm->rank();

    if (size % 2) {
        XBT_INFO("allgatherv__ompi_neighborexchange: odd size %d, switching to ring algorithm", size);
        return allgatherv__ring(sbuf, scount, sdtype,
                                rbuf, rcounts, rdispls, rdtype,
                                comm);
    }

    XBT_DEBUG("coll:tuned:allgatherv_ompi_neighborexchange rank %d", rank);

    err = sdtype->extent(&slb, &sext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    err = rdtype->extent(&rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Initialization step:
       - if send buffer is not MPI_IN_PLACE, copy send buffer to
       the appropriate block of receive buffer
    */
    tmprecv = (char*) rbuf + rdispls[rank] * rext;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = Datatype::copy(tmpsend, scount, sdtype,
                             tmprecv, rcounts[rank], rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }
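    /* With MPI_IN_PLACE the caller has already placed its contribution at
       rbuf + rdispls[rank], so no local copy is needed. */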

    /* Determine neighbors, order in which blocks will arrive, etc. */
    even_rank = !(rank % 2);
    if (even_rank) {
        neighbor[0] = (rank + 1) % size;
        neighbor[1] = (rank - 1 + size) % size;
        recv_data_from[0] = rank;
        recv_data_from[1] = rank;
        offset_at_step[0] = (+2);
        offset_at_step[1] = (-2);
    } else {
        neighbor[0] = (rank - 1 + size) % size;
        neighbor[1] = (rank + 1) % size;
        recv_data_from[0] = neighbor[0];
        recv_data_from[1] = neighbor[0];
        offset_at_step[0] = (-2);
        offset_at_step[1] = (+2);
    }
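    /* For illustration, with size = 6: rank 2 (even) gets neighbor[0] = 3 and
       neighbor[1] = 1, while rank 3 (odd) gets neighbor[0] = 2 and neighbor[1] = 4,
       so ranks 2 and 3 form a pair.  Both start with recv_data_from = {2, 2};
       the +2/-2 offsets then walk the block pairs around the ring in opposite
       directions on alternating steps. */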

    /* Communication loop:
       - The first step is special: exchange a single block with neighbor[0].
       - Remaining steps:
         update recv_data_from according to the offset, and
         exchange two blocks with the appropriate neighbor.
         The send location becomes the previous receive location.
       Note that we need to create indexed datatypes to send and receive these
       blocks properly.
    */
    tmprecv = (char*)rbuf + rdispls[neighbor[0]] * rext;
    tmpsend = (char*)rbuf + rdispls[rank] * rext;
    Request::sendrecv(tmpsend, rcounts[rank], rdtype,
                      neighbor[0], COLL_TAG_ALLGATHERV,
                      tmprecv, rcounts[neighbor[0]], rdtype,
                      neighbor[0], COLL_TAG_ALLGATHERV,
                      comm, MPI_STATUS_IGNORE);
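    /* After this first exchange each pair {2k, 2k+1} holds blocks 2k and 2k+1,
       which corresponds to "Step 0" in the diagram above. */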

    /* Determine initial sending counts and displacements */
    if (even_rank) {
        send_data_from = rank;
    } else {
        send_data_from = recv_data_from[0];
    }
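    /* In both cases send_data_from is the even rank of the pair, i.e. the index
       of the first of the two consecutive blocks the pair just assembled and
       will forward in the next step. */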

    for (i = 1; i < (size / 2); i++) {
        MPI_Datatype new_rdtype, new_sdtype;
        int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
        const int i_parity = i % 2;
        recv_data_from[i_parity] =
            (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

        /* Create new indexed types for sending and receiving.
           We are sending data from ranks (send_data_from) and (send_data_from+1).
           We are receiving data from ranks (recv_data_from[i_parity]) and
           (recv_data_from[i_parity]+1).
        */

        new_scounts[0] = rcounts[send_data_from];
        new_scounts[1] = rcounts[(send_data_from + 1)];
        new_sdispls[0] = rdispls[send_data_from];
        new_sdispls[1] = rdispls[(send_data_from + 1)];
        err = Datatype::create_indexed(2, new_scounts, new_sdispls, rdtype,
                                       &new_sdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        new_sdtype->commit();

        new_rcounts[0] = rcounts[recv_data_from[i_parity]];
        new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
        new_rdispls[0] = rdispls[recv_data_from[i_parity]];
        new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
        err = Datatype::create_indexed(2, new_rcounts, new_rdispls, rdtype,
                                       &new_rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        new_rdtype->commit();
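        /* Both indexed datatypes describe blocks located inside rbuf itself, so the
           exchange below simply uses the base address of rbuf for sending and
           receiving. */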

        tmprecv = (char*)rbuf;
        tmpsend = (char*)rbuf;

        /* Sendreceive */
        Request::sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
                          COLL_TAG_ALLGATHERV,
                          tmprecv, 1, new_rdtype, neighbor[i_parity],
                          COLL_TAG_ALLGATHERV,
                          comm, MPI_STATUS_IGNORE);

        send_data_from = recv_data_from[i_parity];

        Datatype::unref(new_sdtype);
        Datatype::unref(new_rdtype);
    }
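    /* size/2 - 1 iterations of two blocks each, plus the initial single-block
       exchange and the rank's own block: every rank now holds all size blocks. */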

    return MPI_SUCCESS;

 err_hndl:
    XBT_WARN("%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank);
    return err;
}

} // namespace smpi
} // namespace simgrid