[simgrid.git] src/smpi/colls/allgather-ompi-neighborexchange.c
/*
 * ompi_coll_tuned_allgather_intra_neighborexchange
 *
 * Function:     allgather using N/2 steps (O(N))
 * Accepts:      Same arguments as MPI_Allgather
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Neighbor Exchange algorithm for allgather.
 *               Described by Chen et al. in
 *               "Performance Evaluation of Allgather Algorithms on
 *                Terascale Linux Cluster with Fast Ethernet",
 *               Proceedings of the Eighth International Conference on
 *               High-Performance Computing in the Asia-Pacific Region
 *               (HPCASIA'05), 2005
 *
 *               Rank r exchanges a message with one of its neighbors and
 *               forwards the data further in the next step.
 *
 *               No additional memory requirements.
 *
 * Limitations:  Algorithm works only on an even number of processes.
 *               For an odd number of processes we switch to the ring algorithm.
 *
 * Example on 6 nodes:
 *  Initial state
 *    #     0      1      2      3      4      5
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]
 *   Step 0:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [ ]    [ ]    [ ]    [ ]
 *         [1]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [2]    [ ]    [ ]
 *         [ ]    [ ]    [3]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [4]
 *         [ ]    [ ]    [ ]    [ ]    [5]    [5]
 *   Step 1:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [ ]    [ ]    [0]
 *         [1]    [1]    [1]    [ ]    [ ]    [1]
 *         [ ]    [2]    [2]    [2]    [2]    [ ]
 *         [ ]    [3]    [3]    [3]    [3]    [ ]
 *         [4]    [ ]    [ ]    [4]    [4]    [4]
 *         [5]    [ ]    [ ]    [5]    [5]    [5]
 *   Step 2:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]    [3]    [3]
 *         [4]    [4]    [4]    [4]    [4]    [4]
 *         [5]    [5]    [5]    [5]    [5]    [5]
 */
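/* Cost note (derived from the code below): with N processes the algorithm
 * performs N/2 exchange steps in total -- one single-block exchange followed
 * by N/2 - 1 two-block exchanges -- so each rank sends and receives N - 1
 * blocks overall while using only the receive buffer as working storage. */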

#include "colls_private.h"
int
smpi_coll_tuned_allgather_ompi_neighborexchange(void *sbuf, int scount,
                                                 MPI_Datatype sdtype,
                                                 void* rbuf, int rcount,
                                                 MPI_Datatype rdtype,
                                                 MPI_Comm comm
)
{
   int line = -1;
   int rank, size;
   int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
   int i, even_rank;
   int err = 0;
   ptrdiff_t slb, rlb, sext, rext;
   char *tmpsend = NULL, *tmprecv = NULL;

   size = smpi_comm_size(comm);
   rank = smpi_comm_rank(comm);

   if (size % 2) {
      XBT_DEBUG(
                   "coll:tuned:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
                   size);
      return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
                                                  rbuf, rcount, rdtype,
                                                  comm);
   }

   XBT_DEBUG(
                "coll:tuned:allgather_intra_neighborexchange rank %d", rank);

   err = smpi_datatype_extent (sdtype, &slb, &sext);
   if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

   err = smpi_datatype_extent (rdtype, &rlb, &rext);
   if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

   /* Initialization step:
      - if send buffer is not MPI_IN_PLACE, copy send buffer to appropriate block
        of receive buffer
   */
   tmprecv = (char*) rbuf + rank * rcount * rext;
   if (MPI_IN_PLACE != sbuf) {
      tmpsend = (char*) sbuf;
      smpi_datatype_copy (tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
   }

   /* Determine neighbors, order in which blocks will arrive, etc. */
   even_rank = !(rank % 2);
   if (even_rank) {
      neighbor[0] = (rank + 1) % size;
      neighbor[1] = (rank - 1 + size) % size;
      recv_data_from[0] = rank;
      recv_data_from[1] = rank;
      offset_at_step[0] = (+2);
      offset_at_step[1] = (-2);
   } else {
      neighbor[0] = (rank - 1 + size) % size;
      neighbor[1] = (rank + 1) % size;
      recv_data_from[0] = neighbor[0];
      recv_data_from[1] = neighbor[0];
      offset_at_step[0] = (-2);
      offset_at_step[1] = (+2);
   }
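   /* At this point the roles are fixed: every rank pairs up with neighbor[0]
      for the first (single-block) exchange and then alternates between
      neighbor[1] and neighbor[0].  recv_data_from[i % 2], advanced by
      offset_at_step[i % 2] before step i, is the index of the first of the
      two consecutive blocks received from neighbor[i % 2] at step i >= 1. */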

   /* Communication loop:
      - First step is special: exchange a single block with neighbor[0].
      - Rest of the steps:
        update recv_data_from according to offset, and
        exchange two blocks with the appropriate neighbor.
        The send location becomes the previous receive location.
   */
   tmprecv = (char*)rbuf + neighbor[0] * rcount * rext;
   tmpsend = (char*)rbuf + rank * rcount * rext;
   /* Sendreceive */
   smpi_mpi_sendrecv(tmpsend, rcount, rdtype, neighbor[0],
                                  COLL_TAG_ALLGATHER,
                                  tmprecv, rcount, rdtype, neighbor[0],
                                  COLL_TAG_ALLGATHER,
                                  comm, MPI_STATUS_IGNORE);

   /* Determine initial sending location */
   if (even_rank) {
      send_data_from = rank;
   } else {
      send_data_from = recv_data_from[0];
   }
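   /* After the first exchange each pair (2k, 2k+1) holds blocks 2k and 2k+1
      contiguously in rbuf, so every following step can forward two blocks at
      once; send_data_from is the even index 2k where that pair starts. */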

   for (i = 1; i < (size / 2); i++) {
      const int i_parity = i % 2;
      recv_data_from[i_parity] =
         (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

      tmprecv = (char*)rbuf + recv_data_from[i_parity] * rcount * rext;
      tmpsend = (char*)rbuf + send_data_from * rcount * rext;

      /* Sendreceive */
      smpi_mpi_sendrecv(tmpsend, 2 * rcount, rdtype,
                                     neighbor[i_parity],
                                     COLL_TAG_ALLGATHER,
                                     tmprecv, 2 * rcount, rdtype,
                                     neighbor[i_parity],
                                     COLL_TAG_ALLGATHER,
                                     comm, MPI_STATUS_IGNORE);

      send_data_from = recv_data_from[i_parity];
   }
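   /* The size/2 - 1 two-block exchanges plus the single-block exchange above
      add up to the advertised size/2 steps: every rank now holds all blocks. */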

   return MPI_SUCCESS;

 err_hndl:
   XBT_DEBUG( "%s:%4d\tError occurred %d, rank %2d",
                __FILE__, line, err, rank);
   return err;
}
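As a quick way to check the schedule against the 6-node tables in the header comment, here is a small standalone sketch, not part of SimGrid, that replays only the block bookkeeping of the function above (neighbor[], recv_data_from[], offset_at_step[]) and prints which source blocks each rank holds after every step. The file name nbx_schedule.c and the helper print_state() are made up for illustration.

/* nbx_schedule.c -- standalone illustration, not part of the library code above.
 * Replays the neighbor-exchange block schedule for an even communicator size
 * and prints which source blocks each rank holds after every step.
 *
 *   cc -o nbx_schedule nbx_schedule.c && ./nbx_schedule 6
 */
#include <stdio.h>
#include <stdlib.h>

/* print which blocks each rank holds; have[r * size + b] != 0 <=> rank r holds block b */
static void print_state(int step, const int *have, int size)
{
   printf("Step %d:\n", step);
   for (int r = 0; r < size; r++) {
      printf("  rank %d holds:", r);
      for (int b = 0; b < size; b++)
         if (have[r * size + b])
            printf(" [%d]", b);
      printf("\n");
   }
}

int main(int argc, char **argv)
{
   int size = (argc > 1) ? atoi(argv[1]) : 6;
   if (size < 2 || size % 2) {
      fprintf(stderr, "communicator size must be even and >= 2\n");
      return 1;
   }

   int *have = calloc((size_t)size * (size_t)size, sizeof(int));
   /* per-rank schedule state, mirroring the variables in the function above */
   int (*neighbor)[2]       = malloc((size_t)size * sizeof(*neighbor));
   int (*recv_data_from)[2] = malloc((size_t)size * sizeof(*recv_data_from));
   int (*offset_at_step)[2] = malloc((size_t)size * sizeof(*offset_at_step));
   if (!have || !neighbor || !recv_data_from || !offset_at_step)
      return 1;

   for (int r = 0; r < size; r++) {
      have[r * size + r] = 1;                        /* initial state: own block only */
      if (r % 2 == 0) {                              /* even rank */
         neighbor[r][0] = (r + 1) % size;
         neighbor[r][1] = (r - 1 + size) % size;
         recv_data_from[r][0] = recv_data_from[r][1] = r;
         offset_at_step[r][0] = +2;
         offset_at_step[r][1] = -2;
      } else {                                       /* odd rank */
         neighbor[r][0] = (r - 1 + size) % size;
         neighbor[r][1] = (r + 1) % size;
         recv_data_from[r][0] = recv_data_from[r][1] = neighbor[r][0];
         offset_at_step[r][0] = -2;
         offset_at_step[r][1] = +2;
      }
   }

   /* Step 0: each rank receives its partner's single block */
   for (int r = 0; r < size; r++)
      have[r * size + neighbor[r][0]] = 1;
   print_state(0, have, size);

   /* Steps 1 .. size/2 - 1: two consecutive blocks per exchange */
   for (int i = 1; i < size / 2; i++) {
      int p = i % 2;
      for (int r = 0; r < size; r++) {
         recv_data_from[r][p] =
            (recv_data_from[r][p] + offset_at_step[r][p] + size) % size;
         have[r * size + recv_data_from[r][p]]     = 1;  /* first block of the chunk  */
         have[r * size + recv_data_from[r][p] + 1] = 1;  /* second block of the chunk */
      }
      print_state(i, have, size);
   }

   free(have); free(neighbor); free(recv_data_from); free(offset_at_step);
   return 0;
}

Run with the default size of 6, the output reproduces the Step 0/1/2 tables from the header comment; any even size works, while odd sizes are rejected just as the real function falls back to the ring algorithm.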