simgrid.git: src/smpi/colls/allgather/allgather-ompi-neighborexchange.cpp
/* Copyright (c) 2013-2019. The SimGrid Team.
 * All rights reserved.                                                     */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

/*
 * ompi_coll_tuned_allgather_intra_neighborexchange
 *
 * Function:     allgather using N/2 steps (O(N))
 * Accepts:      Same arguments as MPI_Allgather
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Neighbor Exchange algorithm for allgather.
 *               Described by Chen et al. in
 *               "Performance Evaluation of Allgather Algorithms on
 *                Terascale Linux Cluster with Fast Ethernet",
 *               Proceedings of the Eighth International Conference on
 *               High-Performance Computing in Asia-Pacific Region
 *               (HPCASIA'05), 2005
 *
 *               Rank r exchanges a message with one of its neighbors and
 *               forwards the data further in the next step.
 *
 *               No additional memory requirements.
 *
 * Limitations:  The algorithm works only on an even number of processes.
 *               For an odd number of processes we switch to the ring
 *               algorithm.
 *
 * Example on 6 nodes:
 *  Initial state
 *    #     0      1      2      3      4      5
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]
 *   Step 0:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [ ]    [ ]    [ ]    [ ]
 *         [1]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [2]    [ ]    [ ]
 *         [ ]    [ ]    [3]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [4]
 *         [ ]    [ ]    [ ]    [ ]    [5]    [5]
 *   Step 1:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [ ]    [ ]    [0]
 *         [1]    [1]    [1]    [ ]    [ ]    [1]
 *         [ ]    [2]    [2]    [2]    [2]    [ ]
 *         [ ]    [3]    [3]    [3]    [3]    [ ]
 *         [4]    [ ]    [ ]    [4]    [4]    [4]
 *         [5]    [ ]    [ ]    [5]    [5]    [5]
 *   Step 2:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]    [3]    [3]
 *         [4]    [4]    [4]    [4]    [4]    [4]
 *         [5]    [5]    [5]    [5]    [5]    [5]
 */
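
/*
 * Minimal usage sketch. Assumption: SMPI's per-collective selector exposes
 * this routine under the "ompi_neighborexchange" value of the
 * "smpi/allgather" configuration key (e.g. run with
 * smpirun --cfg=smpi/allgather:ompi_neighborexchange). Any standard
 * MPI_Allgather call then resolves to this implementation:
 *
 *   #include <mpi.h>
 *   #include <vector>
 *
 *   int main(int argc, char* argv[])
 *   {
 *     MPI_Init(&argc, &argv);
 *     int rank, size;
 *     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 *     MPI_Comm_size(MPI_COMM_WORLD, &size);
 *     std::vector<int> all(size);           // one block per rank
 *     MPI_Allgather(&rank, 1, MPI_INT,      // each rank contributes its id
 *                   all.data(), 1, MPI_INT, MPI_COMM_WORLD);
 *     // now all[i] == i on every rank
 *     MPI_Finalize();
 *     return 0;
 *   }
 */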

#include "../colls_private.hpp"

namespace simgrid {
namespace smpi {

int
Coll_allgather_ompi_neighborexchange::allgather(void *sbuf, int scount,
                                                MPI_Datatype sdtype,
                                                void* rbuf, int rcount,
                                                MPI_Datatype rdtype,
                                                MPI_Comm comm)
{
   int line = -1;
   int rank, size;
   int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
   int i, even_rank;
   int err = 0;
   ptrdiff_t slb, rlb, sext, rext;
   char *tmpsend = NULL, *tmprecv = NULL;

   size = comm->size();
   rank = comm->rank();

   if (size % 2) {
      XBT_DEBUG(
                   "coll:tuned:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
                   size);
      return Coll_allgather_ring::allgather(sbuf, scount, sdtype,
                                            rbuf, rcount, rdtype,
                                            comm);
   }

   XBT_DEBUG(
                "coll:tuned:allgather_intra_neighborexchange rank %d", rank);

   err = sdtype->extent(&slb, &sext);
   if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

   err = rdtype->extent(&rlb, &rext);
   if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

   /* Initialization step:
      - if send buffer is not MPI_IN_PLACE, copy send buffer to appropriate block
        of receive buffer
   */
   tmprecv = (char*) rbuf + rank * rcount * rext;
   if (MPI_IN_PLACE != sbuf) {
      tmpsend = (char*) sbuf;
      Datatype::copy(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
   }
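   /* (With MPI_IN_PLACE, the MPI standard requires the caller to have
      already stored its contribution at block index 'rank' of rbuf, so no
      local copy is needed.) */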

   /* Determine neighbors, order in which blocks will arrive, etc. */
   even_rank = not(rank % 2);
   if (even_rank) {
      neighbor[0] = (rank + 1) % size;
      neighbor[1] = (rank - 1 + size) % size;
      recv_data_from[0] = rank;
      recv_data_from[1] = rank;
      offset_at_step[0] = (+2);
      offset_at_step[1] = (-2);
   } else {
      neighbor[0] = (rank - 1 + size) % size;
      neighbor[1] = (rank + 1) % size;
      recv_data_from[0] = neighbor[0];
      recv_data_from[1] = neighbor[0];
      offset_at_step[0] = (-2);
      offset_at_step[1] = (+2);
   }
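   /* Resulting schedule: an even rank pairs first with its right neighbor,
      an odd rank with its left one, so the first exchange leaves each pair
      holding its two own blocks; the +/-2 offsets then walk the incoming
      block indices around the ring, two positions per step, alternating
      between the two neighbors. */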

   /* Communication loop:
      - First step is special: exchange a single block with neighbor[0].
      - Rest of the steps:
        update recv_data_from according to offset, and
        exchange two blocks with the appropriate neighbor.
        The send location becomes the previous receive location.
   */
   tmprecv = (char*)rbuf + neighbor[0] * rcount * rext;
   tmpsend = (char*)rbuf + rank * rcount * rext;
   /* Sendreceive */
   Request::sendrecv(tmpsend, rcount, rdtype, neighbor[0],
                     COLL_TAG_ALLGATHER,
                     tmprecv, rcount, rdtype, neighbor[0],
                     COLL_TAG_ALLGATHER,
                     comm, MPI_STATUS_IGNORE);
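   /* After this first exchange both partners hold two adjacent blocks (their
      own and their partner's); every later step forwards blocks as a pair,
      which is why the loop below sends and receives 2 * rcount. */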

   /* Determine initial sending location */
   if (even_rank) {
      send_data_from = rank;
   } else {
      send_data_from = recv_data_from[0];
   }

   for (i = 1; i < (size / 2); i++) {
      const int i_parity = i % 2;
      recv_data_from[i_parity] =
         (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

      tmprecv = (char*)rbuf + recv_data_from[i_parity] * rcount * rext;
      tmpsend = (char*)rbuf + send_data_from * rcount * rext;

      /* Sendreceive */
      Request::sendrecv(tmpsend, 2 * rcount, rdtype,
                        neighbor[i_parity],
                        COLL_TAG_ALLGATHER,
                        tmprecv, 2 * rcount, rdtype,
                        neighbor[i_parity],
                        COLL_TAG_ALLGATHER,
                        comm, MPI_STATUS_IGNORE);

      send_data_from = recv_data_from[i_parity];
   }
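   /* Block count check: the initial copy plus the first exchange yield 2
      blocks, and each of the (size / 2 - 1) loop iterations delivers 2 more,
      for 2 + 2 * (size / 2 - 1) = size blocks per rank in size / 2 steps,
      matching the N/2-step bound stated in the header comment. */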

   return MPI_SUCCESS;

 err_hndl:
   XBT_DEBUG("%s:%4d\tError occurred %d, rank %2d",
             __FILE__, line, err, rank);
   return err;
}

} // namespace smpi
} // namespace simgrid