[simgrid.git] src/smpi/colls/allgather-ompi-neighborexchange.c
/*
 * ompi_coll_tuned_allgather_intra_neighborexchange
 *
 * Function:     allgather using N/2 steps (O(N))
 * Accepts:      Same arguments as MPI_Allgather
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Neighbor Exchange algorithm for allgather.
 *               Described by Chen et al. in
 *               "Performance Evaluation of Allgather Algorithms on
 *                Terascale Linux Cluster with Fast Ethernet",
 *               Proceedings of the Eighth International Conference on
 *               High-Performance Computing in Asia-Pacific Region
 *               (HPCASIA'05), 2005
 *
 *               Rank r exchanges a message with one of its neighbors and
 *               forwards the data further in the next step.
 *
 *               No additional memory requirements.
 *
 * Limitations:  The algorithm works only on an even number of processes.
 *               For an odd number of processes we switch to the ring algorithm.
 *
 * Example on 6 nodes:
 *  Initial state
 *    #     0      1      2      3      4      5
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]
 *   Step 0:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [ ]    [ ]    [ ]    [ ]
 *         [1]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [2]    [ ]    [ ]
 *         [ ]    [ ]    [3]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [4]
 *         [ ]    [ ]    [ ]    [ ]    [5]    [5]
 *   Step 1:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [ ]    [ ]    [0]
 *         [1]    [1]    [1]    [ ]    [ ]    [1]
 *         [ ]    [2]    [2]    [2]    [2]    [ ]
 *         [ ]    [3]    [3]    [3]    [3]    [ ]
 *         [4]    [ ]    [ ]    [4]    [4]    [4]
 *         [5]    [ ]    [ ]    [5]    [5]    [5]
 *   Step 2:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]    [3]    [3]
 *         [4]    [4]    [4]    [4]    [4]    [4]
 *         [5]    [5]    [5]    [5]    [5]    [5]
 */

#include "colls_private.h"
#define MCA_COLL_BASE_TAG_ALLGATHER 555
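/* MCA_COLL_BASE_TAG_ALLGATHER tags all point-to-point traffic issued below. */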
int smpi_coll_tuned_allgather_ompi_neighborexchange(void *sbuf, int scount,
                                                    MPI_Datatype sdtype,
                                                    void* rbuf, int rcount,
                                                    MPI_Datatype rdtype,
                                                    MPI_Comm comm)
{
   int line = -1;
   int rank, size;
   int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
   int i, even_rank;
   int err = 0;
   ptrdiff_t slb, rlb, sext, rext;
   char *tmpsend = NULL, *tmprecv = NULL;

   size = smpi_comm_size(comm);
   rank = smpi_comm_rank(comm);

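   /* The pairwise exchange pattern needs an even communicator size; odd sizes
      fall back to the ring algorithm below. */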
   if (size % 2) {
      XBT_DEBUG(
          "coll:tuned:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
          size);
      return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
                                            rbuf, rcount, rdtype, comm);
   }

   XBT_DEBUG("coll:tuned:allgather_intra_neighborexchange rank %d", rank);

   err = smpi_datatype_extent(sdtype, &slb, &sext);
   if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

   err = smpi_datatype_extent(rdtype, &rlb, &rext);
   if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

   /* Initialization step:
      - if send buffer is not MPI_IN_PLACE, copy send buffer to appropriate block
        of receive buffer
   */
   tmprecv = (char*) rbuf + rank * rcount * rext;
   if (MPI_IN_PLACE != sbuf) {
      tmpsend = (char*) sbuf;
      smpi_datatype_copy(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
   }
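   /* With MPI_IN_PLACE the caller's own contribution already sits in its block
      of rbuf, so no copy is needed. */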

   /* Determine neighbors, order in which blocks will arrive, etc. */
   even_rank = !(rank % 2);
   if (even_rank) {
      neighbor[0] = (rank + 1) % size;
      neighbor[1] = (rank - 1 + size) % size;
      recv_data_from[0] = rank;
      recv_data_from[1] = rank;
      offset_at_step[0] = (+2);
      offset_at_step[1] = (-2);
   } else {
      neighbor[0] = (rank - 1 + size) % size;
      neighbor[1] = (rank + 1) % size;
      recv_data_from[0] = neighbor[0];
      recv_data_from[1] = neighbor[0];
      offset_at_step[0] = (-2);
      offset_at_step[1] = (+2);
   }
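   /* Even ranks pair up with (rank + 1) first, odd ranks with (rank - 1); the
      partner then alternates at every step.  recv_data_from[] tracks, per step
      parity, which block index will arrive next; it is advanced by +/-2 each
      time, since every exchange after the first carries two blocks. */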

   /* Communication loop:
      - First step is special: exchange a single block with neighbor[0].
      - Rest of the steps:
        update recv_data_from according to offset, and
        exchange two blocks with the appropriate neighbor;
        the send location becomes the previous receive location.
   */
   tmprecv = (char*)rbuf + neighbor[0] * rcount * rext;
   tmpsend = (char*)rbuf + rank * rcount * rext;
   /* Sendreceive: exchange a single block with neighbor[0] */
   smpi_mpi_sendrecv(tmpsend, rcount, rdtype, neighbor[0],
                     MCA_COLL_BASE_TAG_ALLGATHER,
                     tmprecv, rcount, rdtype, neighbor[0],
                     MCA_COLL_BASE_TAG_ALLGATHER,
                     comm, MPI_STATUS_IGNORE);

   /* Determine initial sending location */
   if (even_rank) {
      send_data_from = rank;
   } else {
      send_data_from = recv_data_from[0];
   }
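   /* send_data_from names the first of the two contiguous blocks forwarded at
      the next step: after step 0 an even rank holds blocks [rank, rank+1] and
      an odd rank holds [rank-1, rank].  The loop below performs the remaining
      size/2 - 1 exchanges, so the whole allgather takes size/2 steps. */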

   for (i = 1; i < (size / 2); i++) {
      const int i_parity = i % 2;
      recv_data_from[i_parity] =
         (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

      tmprecv = (char*)rbuf + recv_data_from[i_parity] * rcount * rext;
      tmpsend = (char*)rbuf + send_data_from * rcount * rext;

      /* Sendreceive: send the two contiguous blocks starting at send_data_from,
         receive the two blocks starting at recv_data_from[i_parity] */
      smpi_mpi_sendrecv(tmpsend, 2 * rcount, rdtype,
                        neighbor[i_parity],
                        MCA_COLL_BASE_TAG_ALLGATHER,
                        tmprecv, 2 * rcount, rdtype,
                        neighbor[i_parity],
                        MCA_COLL_BASE_TAG_ALLGATHER,
                        comm, MPI_STATUS_IGNORE);

      send_data_from = recv_data_from[i_parity];
   }

   return MPI_SUCCESS;

 err_hndl:
   XBT_DEBUG("%s:%4d\tError occurred %d, rank %2d",
             __FILE__, line, err, rank);
   return err;
}