[simgrid.git] src/smpi/colls/allgather-ompi-neighborexchange.c

/* Copyright (c) 2013-2014. The SimGrid Team.
 * All rights reserved.                                                     */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

/*
 * ompi_coll_tuned_allgather_intra_neighborexchange
 *
 * Function:     allgather using N/2 steps (O(N))
 * Accepts:      Same arguments as MPI_Allgather
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Neighbor Exchange algorithm for allgather.
 *               Described by Chen et al. in
 *               "Performance Evaluation of Allgather Algorithms on
 *                Terascale Linux Cluster with Fast Ethernet",
 *               Proceedings of the Eighth International Conference on
 *               High-Performance Computing in Asia-Pacific Region
 *               (HPCASIA'05), 2005
 *
 *               Rank r exchanges a message with one of its neighbors and
 *               forwards the data further in the next step.
 *
 *               No additional memory requirements.
 *
 * Limitations:  The algorithm works only for an even number of processes.
 *               For an odd number of processes we switch to the ring
 *               algorithm.
 *
 * Example on 6 nodes:
 *  Initial state
 *    #     0      1      2      3      4      5
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]
 *   Step 0:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [ ]    [ ]    [ ]    [ ]
 *         [1]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [2]    [ ]    [ ]
 *         [ ]    [ ]    [3]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [4]
 *         [ ]    [ ]    [ ]    [ ]    [5]    [5]
 *   Step 1:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [ ]    [ ]    [0]
 *         [1]    [1]    [1]    [ ]    [ ]    [1]
 *         [ ]    [2]    [2]    [2]    [2]    [ ]
 *         [ ]    [3]    [3]    [3]    [3]    [ ]
 *         [4]    [ ]    [ ]    [4]    [4]    [4]
 *         [5]    [ ]    [ ]    [5]    [5]    [5]
 *   Step 2:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]    [3]    [3]
 *         [4]    [4]    [4]    [4]    [4]    [4]
 *         [5]    [5]    [5]    [5]    [5]    [5]
 */
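
/* Cost sketch: step 0 moves one block and each of the remaining
 * size/2 - 1 steps moves two blocks, so every process sends
 * 1 + 2 * (size/2 - 1) = size - 1 blocks in total, the same volume as the
 * ring algorithm but in half as many communication steps.
 *
 * A minimal calling sketch (illustrative only; inside SMPI this routine is
 * normally reached through the collective-selector machinery rather than
 * called directly, and the buffer setup below is an assumption):
 *
 *   int sendbuf[4] = {0, 1, 2, 3};    // 4 ints contributed per rank
 *   int *recvbuf = malloc(smpi_comm_size(comm) * 4 * sizeof(int));
 *   smpi_coll_tuned_allgather_ompi_neighborexchange(
 *       sendbuf, 4, MPI_INT, recvbuf, 4, MPI_INT, comm);
 */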

#include "colls_private.h"

int
smpi_coll_tuned_allgather_ompi_neighborexchange(void *sbuf, int scount,
                                                MPI_Datatype sdtype,
                                                void* rbuf, int rcount,
                                                MPI_Datatype rdtype,
                                                MPI_Comm comm)
{
   int line = -1;
   int rank, size;
   int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
   int i, even_rank;
   int err = 0;
   ptrdiff_t slb, rlb, sext, rext;
   char *tmpsend = NULL, *tmprecv = NULL;

   size = smpi_comm_size(comm);
   rank = smpi_comm_rank(comm);

   if (size % 2) {
      XBT_DEBUG(
         "coll:tuned:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
         size);
      return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
                                            rbuf, rcount, rdtype,
                                            comm);
   }

   XBT_DEBUG("coll:tuned:allgather_intra_neighborexchange rank %d", rank);

   err = smpi_datatype_extent (sdtype, &slb, &sext);
   if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

   err = smpi_datatype_extent (rdtype, &rlb, &rext);
   if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

   /* Initialization step:
      - if send buffer is not MPI_IN_PLACE, copy send buffer to appropriate
        block of receive buffer
   */
   tmprecv = (char*) rbuf + rank * rcount * rext;
   if (MPI_IN_PLACE != sbuf) {
      tmpsend = (char*) sbuf;
      smpi_datatype_copy (tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
   }

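   /* Note: with MPI_IN_PLACE the caller has already placed its contribution
      at block `rank` of rbuf, so no local copy is needed above. */
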
   /* Determine neighbors, order in which blocks will arrive, etc. */
   even_rank = !(rank % 2);
   if (even_rank) {
      neighbor[0] = (rank + 1) % size;
      neighbor[1] = (rank - 1 + size) % size;
      recv_data_from[0] = rank;
      recv_data_from[1] = rank;
      offset_at_step[0] = (+2);
      offset_at_step[1] = (-2);
   } else {
      neighbor[0] = (rank - 1 + size) % size;
      neighbor[1] = (rank + 1) % size;
      recv_data_from[0] = neighbor[0];
      recv_data_from[1] = neighbor[0];
      offset_at_step[0] = (-2);
      offset_at_step[1] = (+2);
   }
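   /* Worked example for size = 6: rank 2 (even) gets neighbor[0] = 3 and
      neighbor[1] = 1 and starts both directions at its own block
      (recv_data_from = {2, 2}); rank 3 (odd) gets neighbor[0] = 2 and
      neighbor[1] = 4 and starts at that neighbor's block
      (recv_data_from = {2, 2}). The +/-2 offsets then walk the block pairs
      around the ring on alternating steps. */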

   /* Communication loop:
      - First step is special: exchange a single block with neighbor[0].
      - Rest of the steps:
        update recv_data_from according to offset, and
        exchange two blocks with the appropriate neighbor.
        The send location becomes the previous receive location.
   */
   tmprecv = (char*)rbuf + neighbor[0] * rcount * rext;
   tmpsend = (char*)rbuf + rank * rcount * rext;
   /* Sendreceive */
   smpi_mpi_sendrecv(tmpsend, rcount, rdtype, neighbor[0],
                     COLL_TAG_ALLGATHER,
                     tmprecv, rcount, rdtype, neighbor[0],
                     COLL_TAG_ALLGATHER,
                     comm, MPI_STATUS_IGNORE);
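
   /* After this first exchange every pair {2k, 2k+1} holds blocks 2k and
      2k+1 contiguously in rbuf, which is what lets the remaining steps ship
      two blocks per message. */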

   /* Determine initial sending location */
   if (even_rank) {
      send_data_from = rank;
   } else {
      send_data_from = recv_data_from[0];
   }
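
   /* send_data_from always names the lower-indexed block of the two-block
      group most recently acquired, so each sendrecv below forwards exactly
      that contiguous pair. */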
   for (i = 1; i < (size / 2); i++) {
      const int i_parity = i % 2;
      recv_data_from[i_parity] =
         (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

      tmprecv = (char*)rbuf + recv_data_from[i_parity] * rcount * rext;
      tmpsend = (char*)rbuf + send_data_from * rcount * rext;

      /* Sendreceive */
      smpi_mpi_sendrecv(tmpsend, 2 * rcount, rdtype,
                        neighbor[i_parity],
                        COLL_TAG_ALLGATHER,
                        tmprecv, 2 * rcount, rdtype,
                        neighbor[i_parity],
                        COLL_TAG_ALLGATHER,
                        comm, MPI_STATUS_IGNORE);

      send_data_from = recv_data_from[i_parity];
   }
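
   /* One single-block step plus size/2 - 1 two-block steps: together with
      the local block, every rank has now received all `size` blocks, in the
      N/2 steps promised by the header comment. */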

   return MPI_SUCCESS;

 err_hndl:
   XBT_DEBUG("%s:%4d\tError occurred %d, rank %2d",
             __FILE__, line, err, rank);
   return err;
}