src/smpi/colls/allgatherv/allgatherv-ompi-neighborexchange.cpp

   1 /* Copyright (c) 2013-2019. The SimGrid Team.
   2  * All rights reserved.                                                     */
   3
   4 /* This program is free software; you can redistribute it and/or modify it
   5  * under the terms of the license (GNU LGPL) which comes with this package. */
   6
   7 /*
   8  * ompi_coll_tuned_allgatherv_intra_neighborexchange
   9  *
  10  * Function:     allgatherv using N/2 steps (O(N))
  11  * Accepts:      Same arguments as MPI_Allgatherv
  12  * Returns:      MPI_SUCCESS or error code
  13  *
  14  * Description:  Neighbor Exchange algorithm for allgather adapted for
  15  *               allgatherv.
  16  *               Described by Chen et.al. in
  17  *               "Performance Evaluation of Allgather Algorithms on
  18  *                Terascale Linux Cluster with Fast Ethernet",
  19  *               Proceedings of the Eighth International Conference on
  20  *               High-Performance Computing inn Asia-Pacific Region
  21  *               (HPCASIA'05), 2005
  22  *
  23  *               Rank r exchanges message with one of its neighbors and
  24  *               forwards the data further in the next step.
  25  *
  26  *               No additional memory requirements.
  27  *
  28  * Limitations:  Algorithm works only on even number of processes.
  29  *               For odd number of processes we switch to ring algorithm.
  30  *
  31  * Example on 6 nodes:
  32  *  Initial state
  33  *    #     0      1      2      3      4      5
  34  *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]
  35  *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]
  36  *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]
  37  *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]
  38  *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]
  39  *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]
  40  *   Step 0:
  41  *    #     0      1      2      3      4      5
  42  *         [0]    [0]    [ ]    [ ]    [ ]    [ ]
  43  *         [1]    [1]    [ ]    [ ]    [ ]    [ ]
  44  *         [ ]    [ ]    [2]    [2]    [ ]    [ ]
  45  *         [ ]    [ ]    [3]    [3]    [ ]    [ ]
  46  *         [ ]    [ ]    [ ]    [ ]    [4]    [4]
  47  *         [ ]    [ ]    [ ]    [ ]    [5]    [5]
  48  *   Step 1:
  49  *    #     0      1      2      3      4      5
  50  *         [0]    [0]    [0]    [ ]    [ ]    [0]
  51  *         [1]    [1]    [1]    [ ]    [ ]    [1]
  52  *         [ ]    [2]    [2]    [2]    [2]    [ ]
  53  *         [ ]    [3]    [3]    [3]    [3]    [ ]
  54  *         [4]    [ ]    [ ]    [4]    [4]    [4]
  55  *         [5]    [ ]    [ ]    [5]    [5]    [5]
  56  *   Step 2:
  57  *    #     0      1      2      3      4      5
  58  *         [0]    [0]    [0]    [0]    [0]    [0]
  59  *         [1]    [1]    [1]    [1]    [1]    [1]
  60  *         [2]    [2]    [2]    [2]    [2]    [2]
  61  *         [3]    [3]    [3]    [3]    [3]    [3]
  62  *         [4]    [4]    [4]    [4]    [4]    [4]
  63  *         [5]    [5]    [5]    [5]    [5]    [5]
  64  */
  65
  66 #include "../colls_private.hpp"
  67
  68 namespace simgrid{
  69 namespace smpi{
  70
  71 int
  72 allgatherv__ompi_neighborexchange(const void *sbuf, int scount,
  73                                   MPI_Datatype sdtype,
  74                                   void* rbuf, const int *rcounts, const int *rdispls,
  75                                   MPI_Datatype rdtype,
  76                                   MPI_Comm comm)
  77 {
  78     int line = -1;
  79     int rank, size;
  80     int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
  81
  82     int i, even_rank;
  83     int err = 0;
  84     ptrdiff_t slb, rlb, sext, rext;
  85     char *tmpsend = NULL, *tmprecv = NULL;
  86
  87
  88     size = comm->size();
  89     rank = comm->rank();
  90
  91     if (size % 2) {
  92         XBT_DEBUG("allgatherv__ompi_neighborexchange WARNING: odd size %d, switching to ring algorithm",
  93                      size);
  94         return allgatherv__ring(sbuf, scount, sdtype,
  95                                                      rbuf, rcounts,
  96                                                      rdispls, rdtype,
  97                                                      comm);
  98     }
  99
 100     XBT_DEBUG(
 101                  "coll:tuned:allgatherv_ompi_neighborexchange rank %d", rank);
 102
 103     err = sdtype->extent(&slb, &sext);
 104     if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
 105
 106     err = rdtype->extent(&rlb, &rext);
 107     if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
 108
 109     /* Initialization step:
 110        - if send buffer is not MPI_IN_PLACE, copy send buffer to
 111        the appropriate block of receive buffer
 112     */
 113     tmprecv = (char*) rbuf + rdispls[rank] * rext;
 114     if (MPI_IN_PLACE != sbuf) {
 115         tmpsend = (char*) sbuf;
 116         err = Datatype::copy(tmpsend, scount, sdtype,
 117                               tmprecv, rcounts[rank], rdtype);
 118         if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl;  }
 119     }
 120
 121     /* Determine neighbors, order in which blocks will arrive, etc. */
 122     even_rank = !(rank % 2);
 123     if (even_rank) {
 124         neighbor[0] = (rank + 1) % size;
 125         neighbor[1] = (rank - 1 + size) % size;
 126         recv_data_from[0] = rank;
 127         recv_data_from[1] = rank;
 128         offset_at_step[0] = (+2);
 129         offset_at_step[1] = (-2);
 130     } else {
 131         neighbor[0] = (rank - 1 + size) % size;
 132         neighbor[1] = (rank + 1) % size;
 133         recv_data_from[0] = neighbor[0];
 134         recv_data_from[1] = neighbor[0];
 135         offset_at_step[0] = (-2);
 136         offset_at_step[1] = (+2);
 137     }
 138
 139     /* Communication loop:
 140        - First step is special: exchange a single block with neighbor[0].
 141        - Rest of the steps:
 142        update recv_data_from according to offset, and
 143        exchange two blocks with appropriate neighbor.
 144        the send location becomes previous receve location.
 145        Note, we need to create indexed datatype to send and receive these
 146        blocks properly.
 147     */
 148     tmprecv = (char*)rbuf + rdispls[neighbor[0]] * rext;
 149     tmpsend = (char*)rbuf + rdispls[rank] * rext;
 150     Request::sendrecv(tmpsend, rcounts[rank], rdtype,
 151                                    neighbor[0], COLL_TAG_ALLGATHERV,
 152                                    tmprecv, rcounts[neighbor[0]], rdtype,
 153                                    neighbor[0], COLL_TAG_ALLGATHERV,
 154                                    comm, MPI_STATUS_IGNORE);
 155
 156
 157
 158
 159
 160     /* Determine initial sending counts and displacements*/
 161     if (even_rank) {
 162         send_data_from = rank;
 163     } else {
 164         send_data_from = recv_data_from[0];
 165     }
 166
 167     for (i = 1; i < (size / 2); i++) {
 168         MPI_Datatype new_rdtype, new_sdtype;
 169         int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
 170         const int i_parity = i % 2;
 171         recv_data_from[i_parity] =
 172             (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;
 173
 174         /* Create new indexed types for sending and receiving.
 175            We are sending data from ranks (send_data_from) and (send_data_from+1)
 176            We are receiving data from ranks (recv_data_from[i_parity]) and
 177            (recv_data_from[i_parity]+1).
 178         */
 179
 180         new_scounts[0] = rcounts[send_data_from];
 181         new_scounts[1] = rcounts[(send_data_from + 1)];
 182         new_sdispls[0] = rdispls[send_data_from];
 183         new_sdispls[1] = rdispls[(send_data_from + 1)];
 184         err = Datatype::create_indexed(2, new_scounts, new_sdispls, rdtype,
 185                                       &new_sdtype);
 186         if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
 187         new_sdtype->commit();
 188
 189         new_rcounts[0] = rcounts[recv_data_from[i_parity]];
 190         new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
 191         new_rdispls[0] = rdispls[recv_data_from[i_parity]];
 192         new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
 193         err = Datatype::create_indexed(2, new_rcounts, new_rdispls, rdtype,
 194                                       &new_rdtype);
 195         if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
 196         new_rdtype->commit();
 197
 198         tmprecv = (char*)rbuf;
 199         tmpsend = (char*)rbuf;
 200
 201         /* Sendreceive */
 202         Request::sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
 203                                        COLL_TAG_ALLGATHERV,
 204                                        tmprecv, 1, new_rdtype, neighbor[i_parity],
 205                                        COLL_TAG_ALLGATHERV,
 206                                        comm, MPI_STATUS_IGNORE);
 207
 208         send_data_from = recv_data_from[i_parity];
 209
 210         Datatype::unref(new_sdtype);
 211         Datatype::unref(new_rdtype);
 212     }
 213
 214     return MPI_SUCCESS;
 215
 216  err_hndl:
 217     XBT_DEBUG(  "%s:%4d\tError occurred %d, rank %2d",
 218                  __FILE__, line, err, rank);
 219     return err;
 220 }
 221
 222 }
 223 }