[simgrid.git] / src / smpi / colls / allgatherv-ompi-neighborexchange.c
/*
 * ompi_coll_tuned_allgatherv_intra_neighborexchange
 *
 * Function:     allgatherv using N/2 steps (O(N))
 * Accepts:      Same arguments as MPI_Allgatherv
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Neighbor Exchange algorithm for allgather adapted for
 *               allgatherv.
 *               Described by Chen et al. in
 *               "Performance Evaluation of Allgather Algorithms on
 *                Terascale Linux Cluster with Fast Ethernet",
 *               Proceedings of the Eighth International Conference on
 *               High-Performance Computing in Asia-Pacific Region
 *               (HPCASIA'05), 2005
 *
 *               Rank r exchanges a message with one of its neighbors and
 *               forwards the data further in the next step.
 *
 *               No additional memory requirements.
 *
 * Limitations:  The algorithm works only on an even number of processes.
 *               For an odd number of processes we switch to the ring
 *               algorithm.
 *
 * Example on 6 nodes:
 *  Initial state
 *    #     0      1      2      3      4      5
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]
 *   Step 0:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [ ]    [ ]    [ ]    [ ]
 *         [1]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [2]    [ ]    [ ]
 *         [ ]    [ ]    [3]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [4]
 *         [ ]    [ ]    [ ]    [ ]    [5]    [5]
 *   Step 1:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [ ]    [ ]    [0]
 *         [1]    [1]    [1]    [ ]    [ ]    [1]
 *         [ ]    [2]    [2]    [2]    [2]    [ ]
 *         [ ]    [3]    [3]    [3]    [3]    [ ]
 *         [4]    [ ]    [ ]    [4]    [4]    [4]
 *         [5]    [ ]    [ ]    [5]    [5]    [5]
 *   Step 2:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]    [3]    [3]
 *         [4]    [4]    [4]    [4]    [4]    [4]
 *         [5]    [5]    [5]    [5]    [5]    [5]
 */

#include "colls_private.h"

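/* Illustrative sketch (not part of the original OpenMPI/SMPI code): given an
 * even communicator size, print which data blocks a rank receives at each
 * step of the neighbor exchange schedule implemented below.  The helper name
 * and the use of printf are assumptions made purely for exposition; the
 * schedule arithmetic mirrors the function that follows. */
#include <stdio.h>

static void allgatherv_neighborexchange_print_schedule(int rank, int size)
{
    int neighbor[2], offset_at_step[2], recv_data_from[2];
    int i;

    if (size % 2)
        return;                  /* the schedule is defined only for even sizes */

    if (rank % 2 == 0) {         /* even rank: exchange to the right first */
        neighbor[0] = (rank + 1) % size;
        neighbor[1] = (rank - 1 + size) % size;
        recv_data_from[0] = recv_data_from[1] = rank;
        offset_at_step[0] = +2;
        offset_at_step[1] = -2;
    } else {                     /* odd rank: exchange to the left first */
        neighbor[0] = (rank - 1 + size) % size;
        neighbor[1] = (rank + 1) % size;
        recv_data_from[0] = recv_data_from[1] = neighbor[0];
        offset_at_step[0] = -2;
        offset_at_step[1] = +2;
    }

    /* Step 0: a single block (the first neighbor's own data) is exchanged. */
    printf("rank %d, step 0: receive block {%d} from rank %d\n",
           rank, neighbor[0], neighbor[0]);

    /* Steps 1 .. size/2 - 1: two blocks per step, alternating neighbors. */
    for (i = 1; i < size / 2; i++) {
        const int parity = i % 2;
        recv_data_from[parity] =
            (recv_data_from[parity] + offset_at_step[parity] + size) % size;
        printf("rank %d, step %d: receive blocks {%d,%d} from rank %d\n",
               rank, i, recv_data_from[parity],
               (recv_data_from[parity] + 1) % size, neighbor[parity]);
    }
}
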
int
smpi_coll_tuned_allgatherv_ompi_neighborexchange(void *sbuf, int scount,
                                                 MPI_Datatype sdtype,
                                                 void* rbuf, int *rcounts, int *rdispls,
                                                 MPI_Datatype rdtype,
                                                 MPI_Comm comm)
{
    int line = -1;
    int rank, size;
    int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;

    int i, even_rank;
    int err = 0;
    ptrdiff_t slb, rlb, sext, rext;
    char *tmpsend = NULL, *tmprecv = NULL;


    size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    if (size % 2) {
        XBT_DEBUG(
            "coll:tuned:allgatherv_ompi_neighborexchange WARNING: odd size %d, switching to ring algorithm",
            size);
        return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
                                               rbuf, rcounts,
                                               rdispls, rdtype,
                                               comm);
    }

    XBT_DEBUG(
        "coll:tuned:allgatherv_ompi_neighborexchange rank %d", rank);

    err = smpi_datatype_extent (sdtype, &slb, &sext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    err = smpi_datatype_extent (rdtype, &rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Initialization step:
       - if send buffer is not MPI_IN_PLACE, copy send buffer to
       the appropriate block of receive buffer
    */
    tmprecv = (char*) rbuf + rdispls[rank] * rext;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = smpi_datatype_copy(tmpsend, scount, sdtype,
                                 tmprecv, rcounts[rank], rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    /* Determine neighbors, order in which blocks will arrive, etc. */
    even_rank = !(rank % 2);
    if (even_rank) {
        neighbor[0] = (rank + 1) % size;
        neighbor[1] = (rank - 1 + size) % size;
        recv_data_from[0] = rank;
        recv_data_from[1] = rank;
        offset_at_step[0] = (+2);
        offset_at_step[1] = (-2);
    } else {
        neighbor[0] = (rank - 1 + size) % size;
        neighbor[1] = (rank + 1) % size;
        recv_data_from[0] = neighbor[0];
        recv_data_from[1] = neighbor[0];
        offset_at_step[0] = (-2);
        offset_at_step[1] = (+2);
    }
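    /* For instance, with size = 6 (the example above): rank 0 gets
       neighbor = {1, 5} and rank 3 gets neighbor = {2, 4}; even ranks start
       by exchanging to the right, odd ranks to the left. */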

    /* Communication loop:
       - First step is special: exchange a single block with neighbor[0].
       - Rest of the steps:
         update recv_data_from according to the offset, and
         exchange two blocks with the appropriate neighbor.
         The send location becomes the previous receive location.
       Note, we need to create indexed datatypes to send and receive these
       blocks properly.
    */
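    /* Worked example (size = 6, rank 0, matching the picture above):
       step 0 exchanges block 0 with rank 1 and receives block 1;
       step 1 sends blocks {0,1} to rank 5 and receives blocks {4,5};
       step 2 sends blocks {4,5} to rank 1 and receives blocks {2,3}. */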
    tmprecv = (char*)rbuf + rdispls[neighbor[0]] * rext;
    tmpsend = (char*)rbuf + rdispls[rank] * rext;
    smpi_mpi_sendrecv(tmpsend, rcounts[rank], rdtype,
                      neighbor[0], COLL_TAG_ALLGATHERV,
                      tmprecv, rcounts[neighbor[0]], rdtype,
                      neighbor[0], COLL_TAG_ALLGATHERV,
                      comm, MPI_STATUS_IGNORE);

    /* Determine initial sending counts and displacements */
    if (even_rank) {
        send_data_from = rank;
    } else {
        send_data_from = recv_data_from[0];
    }

    for (i = 1; i < (size / 2); i++) {
        MPI_Datatype new_rdtype, new_sdtype;
        int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
        const int i_parity = i % 2;
        recv_data_from[i_parity] =
            (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

        /* Create new indexed types for sending and receiving.
           We are sending data from ranks (send_data_from) and (send_data_from + 1);
           we are receiving data from ranks (recv_data_from[i_parity]) and
           (recv_data_from[i_parity] + 1).
        */
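        /* E.g. for rank 0 at i = 1 in the 6-node example: send_data_from = 0,
           so the send type covers blocks {0,1}; recv_data_from[1] = 4, so the
           receive type covers blocks {4,5}, each with its own count and
           displacement taken from rcounts[]/rdispls[]. */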

        new_scounts[0] = rcounts[send_data_from];
        new_scounts[1] = rcounts[(send_data_from + 1)];
        new_sdispls[0] = rdispls[send_data_from];
        new_sdispls[1] = rdispls[(send_data_from + 1)];
        err = smpi_datatype_indexed(2, new_scounts, new_sdispls, rdtype,
                                    &new_sdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        smpi_datatype_commit(&new_sdtype);

        new_rcounts[0] = rcounts[recv_data_from[i_parity]];
        new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
        new_rdispls[0] = rdispls[recv_data_from[i_parity]];
        new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
        err = smpi_datatype_indexed(2, new_rcounts, new_rdispls, rdtype,
                                    &new_rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        smpi_datatype_commit(&new_rdtype);

        tmprecv = (char*)rbuf;
        tmpsend = (char*)rbuf;

        /* Sendreceive */
        smpi_mpi_sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
                          COLL_TAG_ALLGATHERV,
                          tmprecv, 1, new_rdtype, neighbor[i_parity],
                          COLL_TAG_ALLGATHERV,
                          comm, MPI_STATUS_IGNORE);

        send_data_from = recv_data_from[i_parity];

        smpi_datatype_free(&new_sdtype);
        smpi_datatype_free(&new_rdtype);
    }

    return MPI_SUCCESS;

 err_hndl:
    XBT_DEBUG("%s:%4d\tError occurred %d, rank %2d",
              __FILE__, line, err, rank);
    return err;
}
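
/* Hedged usage sketch (not part of the original file): a hypothetical driver
 * showing how a caller could set up the per-rank counts and displacements
 * expected by the routine above and then invoke it.  The helper name, the
 * fixed 64-rank bound and the "rank i contributes i+1 elements" pattern are
 * assumptions made for illustration only; 'dtype' would typically be MPI_INT
 * for the integer payload used here. */
static int allgatherv_neighborexchange_usage_sketch(MPI_Comm comm,
                                                    MPI_Datatype dtype)
{
    enum { MAX_RANKS = 64 };                      /* illustration-only bound */
    int rcounts[MAX_RANKS];
    int rdispls[MAX_RANKS];
    int sbuf[MAX_RANKS];
    int rbuf[MAX_RANKS * (MAX_RANKS + 1) / 2];
    int i, total = 0;
    int size = smpi_comm_size(comm);
    int rank = smpi_comm_rank(comm);

    if (size > MAX_RANKS)
        return MPI_SUCCESS;                       /* nothing to demonstrate */

    /* Rank i contributes (i + 1) elements, so every rank can compute all
       counts and displacements locally without an extra exchange. */
    for (i = 0; i < size; i++) {
        rcounts[i] = i + 1;
        rdispls[i] = total;
        total += rcounts[i];
    }
    for (i = 0; i < rcounts[rank]; i++)
        sbuf[i] = rank;                           /* payload: my rank, repeated */

    return smpi_coll_tuned_allgatherv_ompi_neighborexchange(sbuf, rcounts[rank],
                                                            dtype, rbuf, rcounts,
                                                            rdispls, dtype, comm);
}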