
[simgrid.git] src/smpi/colls/allgatherv/allgatherv-ompi-neighborexchange.cpp
/* Copyright (c) 2013-2023. The SimGrid Team.
 * All rights reserved.                                                     */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

/*
 * ompi_coll_tuned_allgatherv_intra_neighborexchange
 *
 * Function:     allgatherv using N/2 steps (O(N))
 * Accepts:      Same arguments as MPI_Allgatherv
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Neighbor Exchange algorithm for allgather, adapted for
 *               allgatherv.
 *               Described by Chen et al. in
 *               "Performance Evaluation of Allgather Algorithms on
 *                Terascale Linux Cluster with Fast Ethernet",
 *               Proceedings of the Eighth International Conference on
 *               High-Performance Computing in Asia-Pacific Region
 *               (HPCASIA'05), 2005
 *
 *               Rank r exchanges a message with one of its neighbors and
 *               forwards the data further in the next step.
 *
 *               No additional memory requirements.
 *
 * Limitations:  The algorithm works only on an even number of processes.
 *               For an odd number of processes we switch to the ring
 *               algorithm.
 *
 *  Example on 6 nodes:
 *   Initial state
 *    #     0      1      2      3      4      5
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]
 *   Step 0:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [ ]    [ ]    [ ]    [ ]
 *         [1]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [2]    [ ]    [ ]
 *         [ ]    [ ]    [3]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [4]
 *         [ ]    [ ]    [ ]    [ ]    [5]    [5]
 *   Step 1:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [ ]    [ ]    [0]
 *         [1]    [1]    [1]    [ ]    [ ]    [1]
 *         [ ]    [2]    [2]    [2]    [2]    [ ]
 *         [ ]    [3]    [3]    [3]    [3]    [ ]
 *         [4]    [ ]    [ ]    [4]    [4]    [4]
 *         [5]    [ ]    [ ]    [5]    [5]    [5]
 *   Step 2:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]    [3]    [3]
 *         [4]    [4]    [4]    [4]    [4]    [4]
 *         [5]    [5]    [5]    [5]    [5]    [5]
 */

#include "../colls_private.hpp"

namespace simgrid::smpi {

int allgatherv__ompi_neighborexchange(const void *sbuf, int scount,
                                      MPI_Datatype sdtype,
                                      void* rbuf, const int *rcounts, const int *rdispls,
                                      MPI_Datatype rdtype,
                                      MPI_Comm comm)
{
    int line = -1;
    int rank, size;
    int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;

    int i, even_rank;
    int err = 0;
    ptrdiff_t slb, rlb, sext, rext;
    char *tmpsend = nullptr, *tmprecv = nullptr;

    size = comm->size();
    rank = comm->rank();

    if (size % 2) {
        XBT_INFO("allgatherv__ompi_neighborexchange: odd size %d, switching to ring algorithm", size);
        return allgatherv__ring(sbuf, scount, sdtype,
                                rbuf, rcounts, rdispls, rdtype,
                                comm);
    }

    XBT_DEBUG("coll:tuned:allgatherv_ompi_neighborexchange rank %d", rank);

    err = sdtype->extent(&slb, &sext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    err = rdtype->extent(&rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Initialization step:
       - if the send buffer is not MPI_IN_PLACE, copy the send buffer into
         the appropriate block of the receive buffer
    */
    tmprecv = (char*) rbuf + rdispls[rank] * rext;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = Datatype::copy(tmpsend, scount, sdtype,
                             tmprecv, rcounts[rank], rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }
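
    /* At this point, in the 6-node example from the header comment, rank 2
       has placed its own contribution (block [2]) at
       rbuf + rdispls[2] * rext; every other block of rbuf is still unfilled. */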
118
    /* Determine neighbors, order in which blocks will arrive, etc. */
    even_rank = !(rank % 2);
    if (even_rank) {
        neighbor[0] = (rank + 1) % size;
        neighbor[1] = (rank - 1 + size) % size;
        recv_data_from[0] = rank;
        recv_data_from[1] = rank;
        offset_at_step[0] = (+2);
        offset_at_step[1] = (-2);
    } else {
        neighbor[0] = (rank - 1 + size) % size;
        neighbor[1] = (rank + 1) % size;
        recv_data_from[0] = neighbor[0];
        recv_data_from[1] = neighbor[0];
        offset_at_step[0] = (-2);
        offset_at_step[1] = (+2);
    }
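
    /* Illustration with size = 6, matching the diagram above:
       - rank 2 (even): neighbor[0] = 3, neighbor[1] = 1; it starts by trading
         its own block with rank 3, and the indices of the incoming blocks
         then move by +2 / -2 at the following steps.
       - rank 3 (odd): neighbor[0] = 2, neighbor[1] = 4; its first block comes
         from rank 2, hence recv_data_from starts at neighbor[0]. */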
136
    /* Communication loop:
       - First step is special: exchange a single block with neighbor[0].
       - Rest of the steps:
         update recv_data_from according to the offset, and
         exchange two blocks with the appropriate neighbor;
         the send location becomes the previous receive location.
       Note: we need to create indexed datatypes to send and receive these
       blocks properly.
    */
    tmprecv = (char*)rbuf + rdispls[neighbor[0]] * rext;
    tmpsend = (char*)rbuf + rdispls[rank] * rext;
    Request::sendrecv(tmpsend, rcounts[rank], rdtype,
                      neighbor[0], COLL_TAG_ALLGATHERV,
                      tmprecv, rcounts[neighbor[0]], rdtype,
                      neighbor[0], COLL_TAG_ALLGATHERV,
                      comm, MPI_STATUS_IGNORE);

    /* Determine initial sending counts and displacements */
    if (even_rank) {
        send_data_from = rank;
    } else {
        send_data_from = recv_data_from[0];
    }

    for (i = 1; i < (size / 2); i++) {
        MPI_Datatype new_rdtype, new_sdtype;
        int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
        const int i_parity = i % 2;
        recv_data_from[i_parity] =
            (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

        /* Create new indexed types for sending and receiving.
           We are sending data from ranks (send_data_from) and (send_data_from+1).
           We are receiving data from ranks (recv_data_from[i_parity]) and
           (recv_data_from[i_parity]+1).
        */
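
        /* Concretely, at step i = 1 on rank 2 of the 6-node example:
           send_data_from == 2, so the blocks sent are 2 and 3 (counts
           rcounts[2], rcounts[3] at displacements rdispls[2], rdispls[3]);
           recv_data_from[1] has just become 0, so blocks 0 and 1 are
           received. The indexed datatype lets a single sendrecv move both
           blocks even though they need not be adjacent in rbuf. */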

        new_scounts[0] = rcounts[send_data_from];
        new_scounts[1] = rcounts[(send_data_from + 1)];
        new_sdispls[0] = rdispls[send_data_from];
        new_sdispls[1] = rdispls[(send_data_from + 1)];
        err = Datatype::create_indexed(2, new_scounts, new_sdispls, rdtype,
                                       &new_sdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        new_sdtype->commit();

        new_rcounts[0] = rcounts[recv_data_from[i_parity]];
        new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
        new_rdispls[0] = rdispls[recv_data_from[i_parity]];
        new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
        err = Datatype::create_indexed(2, new_rcounts, new_rdispls, rdtype,
                                       &new_rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        new_rdtype->commit();

        tmprecv = (char*)rbuf;
        tmpsend = (char*)rbuf;

        /* Sendreceive */
        Request::sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
                          COLL_TAG_ALLGATHERV,
                          tmprecv, 1, new_rdtype, neighbor[i_parity],
                          COLL_TAG_ALLGATHERV,
                          comm, MPI_STATUS_IGNORE);

        send_data_from = recv_data_from[i_parity];

        Datatype::unref(new_sdtype);
        Datatype::unref(new_rdtype);
    }
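
    /* The initial exchange plus the (size/2 - 1) loop iterations add up to
       the size/2 communication steps announced in the header comment. */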

    return MPI_SUCCESS;

 err_hndl:
    XBT_WARN("%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank);
    return err;
}

} // namespace simgrid::smpi
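
For context, here is a minimal sketch of how an application would reach this code path. The collective is not called directly: a plain MPI_Allgatherv call is mapped onto one of SMPI's implementations by its collective selector. Everything below is illustrative; in particular, the --cfg key follows SMPI's usual per-collective naming (smpi/allgatherv), but check the SimGrid documentation for the exact value accepted by your version.

/* Hypothetical run command (assumption, not taken from this file):
 *   smpirun --cfg=smpi/allgatherv:ompi_neighborexchange -np 6 ... ./a.out
 */
#include <mpi.h>
#include <vector>

int main(int argc, char** argv)
{
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  /* Rank r contributes r+1 integers, so block sizes really vary
     (the "v" in allgatherv). */
  int scount = rank + 1;
  std::vector<int> sbuf(scount, rank);

  /* Every rank computes the same counts/displacements table. */
  std::vector<int> rcounts(size), rdispls(size);
  int total = 0;
  for (int r = 0; r < size; r++) {
    rcounts[r] = r + 1;
    rdispls[r] = total;
    total += rcounts[r];
  }
  std::vector<int> rbuf(total);

  /* With an even communicator size, SMPI can route this call to
     allgatherv__ompi_neighborexchange; with an odd size the function
     above falls back to the ring algorithm by itself. */
  MPI_Allgatherv(sbuf.data(), scount, MPI_INT,
                 rbuf.data(), rcounts.data(), rdispls.data(), MPI_INT,
                 MPI_COMM_WORLD);

  MPI_Finalize();
  return 0;
}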