
SMPI colls are not really C++. But cleaner than before.
[simgrid.git] / src/smpi/colls/allgatherv/allgatherv-ompi-neighborexchange.cpp
/* Copyright (c) 2013-2014. The SimGrid Team.
 * All rights reserved.                                                     */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

/*
 * ompi_coll_tuned_allgatherv_intra_neighborexchange
 *
 * Function:     allgatherv using N/2 steps (O(N))
 * Accepts:      Same arguments as MPI_Allgatherv
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Neighbor Exchange algorithm for allgather adapted for
 *               allgatherv.
 *               Described by Chen et al. in
 *               "Performance Evaluation of Allgather Algorithms on
 *                Terascale Linux Cluster with Fast Ethernet",
 *               Proceedings of the Eighth International Conference on
 *               High-Performance Computing in Asia-Pacific Region
 *               (HPCASIA'05), 2005
 *
 *               Rank r exchanges a message with one of its neighbors and
 *               forwards the data further in the next step.
 *
 *               No additional memory requirements.
 *
 * Limitations:  Algorithm works only on an even number of processes.
 *               For an odd number of processes we switch to the ring algorithm.
 *
 * Example on 6 nodes:
 *  Initial state
 *    #     0      1      2      3      4      5
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]
 *   Step 0:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [ ]    [ ]    [ ]    [ ]
 *         [1]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [2]    [ ]    [ ]
 *         [ ]    [ ]    [3]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [4]
 *         [ ]    [ ]    [ ]    [ ]    [5]    [5]
 *   Step 1:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [ ]    [ ]    [0]
 *         [1]    [1]    [1]    [ ]    [ ]    [1]
 *         [ ]    [2]    [2]    [2]    [2]    [ ]
 *         [ ]    [3]    [3]    [3]    [3]    [ ]
 *         [4]    [ ]    [ ]    [4]    [4]    [4]
 *         [5]    [ ]    [ ]    [5]    [5]    [5]
 *   Step 2:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]    [3]    [3]
 *         [4]    [4]    [4]    [4]    [4]    [4]
 *         [5]    [5]    [5]    [5]    [5]    [5]
 */
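/* In total the algorithm needs size/2 communication steps: the first step
 * exchanges a single block between paired neighbors, and each of the remaining
 * size/2 - 1 steps forwards two blocks, so every rank ends up holding all
 * size blocks. */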

#include "../colls_private.h"

namespace simgrid{
namespace smpi{

int
Coll_allgatherv_ompi_neighborexchange::allgatherv(void *sbuf, int scount,
                                                  MPI_Datatype sdtype,
                                                  void* rbuf, int *rcounts, int *rdispls,
                                                  MPI_Datatype rdtype,
                                                  MPI_Comm comm)
{
    int line = -1;
    int rank, size;
    int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;

    int i, even_rank;
    int err = 0;
    ptrdiff_t slb, rlb, sext, rext;
    char *tmpsend = NULL, *tmprecv = NULL;

    size = comm->size();
    rank = comm->rank();

    if (size % 2) {
        XBT_DEBUG(
                     "coll:tuned:allgatherv_ompi_neighborexchange WARNING: odd size %d, switching to ring algorithm",
                     size);
        return Coll_allgatherv_ring::allgatherv(sbuf, scount, sdtype,
                                                     rbuf, rcounts,
                                                     rdispls, rdtype,
                                                     comm);
    }
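
    /* From here on the communicator size is known to be even, so every rank
       has a partner at each step of the neighbor exchange. */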

    XBT_DEBUG(
                 "coll:tuned:allgatherv_ompi_neighborexchange rank %d", rank);

    err = sdtype->extent(&slb, &sext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    err = rdtype->extent(&rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Initialization step:
       - if send buffer is not MPI_IN_PLACE, copy send buffer to
       the appropriate block of receive buffer
    */
    tmprecv = (char*) rbuf + rdispls[rank] * rext;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = Datatype::copy(tmpsend, scount, sdtype,
                              tmprecv, rcounts[rank], rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    /* Determine neighbors, order in which blocks will arrive, etc. */
    even_rank = !(rank % 2);
    if (even_rank) {
        neighbor[0] = (rank + 1) % size;
        neighbor[1] = (rank - 1 + size) % size;
        recv_data_from[0] = rank;
        recv_data_from[1] = rank;
        offset_at_step[0] = (+2);
        offset_at_step[1] = (-2);
    } else {
        neighbor[0] = (rank - 1 + size) % size;
        neighbor[1] = (rank + 1) % size;
        recv_data_from[0] = neighbor[0];
        recv_data_from[1] = neighbor[0];
        offset_at_step[0] = (-2);
        offset_at_step[1] = (+2);
    }
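
    /* For example, with size = 6: rank 2 (even) gets neighbor[0] = 3 and
       neighbor[1] = 1, while rank 3 (odd) gets neighbor[0] = 2 and
       neighbor[1] = 4; both therefore start by exchanging with each other. */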

    /* Communication loop:
       - First step is special: exchange a single block with neighbor[0].
       - Rest of the steps:
       update recv_data_from according to offset, and
       exchange two blocks with the appropriate neighbor.
       The send location becomes the previous receive location.
       Note, we need to create indexed datatypes to send and receive these
       blocks properly.
    */
    tmprecv = (char*)rbuf + rdispls[neighbor[0]] * rext;
    tmpsend = (char*)rbuf + rdispls[rank] * rext;
    Request::sendrecv(tmpsend, rcounts[rank], rdtype,
                                   neighbor[0], COLL_TAG_ALLGATHERV,
                                   tmprecv, rcounts[neighbor[0]], rdtype,
                                   neighbor[0], COLL_TAG_ALLGATHERV,
                                   comm, MPI_STATUS_IGNORE);
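
    /* After this first exchange each rank holds two blocks: its own and the
       one from neighbor[0] ("Step 0" in the diagram above). */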

    /* Determine initial sending counts and displacements */
    if (even_rank) {
        send_data_from = rank;
    } else {
        send_data_from = recv_data_from[0];
    }

    for (i = 1; i < (size / 2); i++) {
        MPI_Datatype new_rdtype, new_sdtype;
        int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
        const int i_parity = i % 2;
        recv_data_from[i_parity] =
            (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

        /* Create new indexed types for sending and receiving.
           We are sending data from ranks (send_data_from) and (send_data_from+1)
           We are receiving data from ranks (recv_data_from[i_parity]) and
           (recv_data_from[i_parity]+1).
        */

        new_scounts[0] = rcounts[send_data_from];
        new_scounts[1] = rcounts[(send_data_from + 1)];
        new_sdispls[0] = rdispls[send_data_from];
        new_sdispls[1] = rdispls[(send_data_from + 1)];
        err = Datatype::create_indexed(2, new_scounts, new_sdispls, rdtype,
                                      &new_sdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        new_sdtype->commit();

        new_rcounts[0] = rcounts[recv_data_from[i_parity]];
        new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
        new_rdispls[0] = rdispls[recv_data_from[i_parity]];
        new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
        err = Datatype::create_indexed(2, new_rcounts, new_rdispls, rdtype,
                                      &new_rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        new_rdtype->commit();

        tmprecv = (char*)rbuf;
        tmpsend = (char*)rbuf;

        /* Sendreceive */
        Request::sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
                                       COLL_TAG_ALLGATHERV,
                                       tmprecv, 1, new_rdtype, neighbor[i_parity],
                                       COLL_TAG_ALLGATHERV,
                                       comm, MPI_STATUS_IGNORE);

        send_data_from = recv_data_from[i_parity];

        Datatype::unref(new_sdtype);
        Datatype::unref(new_rdtype);
    }
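
    /* Step 0 plus the size/2 - 1 iterations above distribute all size blocks,
       so at this point rbuf is complete on every rank. */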

    return MPI_SUCCESS;

 err_hndl:
    XBT_DEBUG(  "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank);
    return err;
}

}
}
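
For illustration only (not part of the file above): a minimal sketch of the kind of MPI_Allgatherv call that an SMPI application makes and that the collective selector may route to the routine above. The per-rank counts used here are hypothetical; each rank contributes rank + 1 integers and every rank receives the concatenation of all contributions.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
  MPI_Init(&argc, &argv);

  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  /* Each rank contributes rank + 1 integers: counts differ per rank, hence allgatherv. */
  int scount = rank + 1;
  int *sbuf = (int *)malloc(scount * sizeof(int));
  for (int i = 0; i < scount; i++)
    sbuf[i] = rank;

  /* Receive counts and displacements describe every rank's block in rbuf. */
  int *rcounts = (int *)malloc(size * sizeof(int));
  int *rdispls = (int *)malloc(size * sizeof(int));
  int total = 0;
  for (int r = 0; r < size; r++) {
    rcounts[r] = r + 1;
    rdispls[r] = total;
    total += rcounts[r];
  }
  int *rbuf = (int *)malloc(total * sizeof(int));

  /* Gather every rank's contribution into rbuf on all ranks. */
  MPI_Allgatherv(sbuf, scount, MPI_INT, rbuf, rcounts, rdispls, MPI_INT, MPI_COMM_WORLD);

  if (rank == 0) {
    for (int i = 0; i < total; i++)
      printf("%d ", rbuf[i]);
    printf("\n");
  }

  free(sbuf);
  free(rbuf);
  free(rcounts);
  free(rdispls);
  MPI_Finalize();
  return 0;
}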