1 /* Copyright (c) 2013-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
8 * ompi_coll_tuned_allgatherv_intra_neighborexchange
10 * Function: allgatherv using N/2 steps (O(N))
11 * Accepts: Same arguments as MPI_Allgatherv
12 * Returns: MPI_SUCCESS or error code
14 * Description: Neighbor Exchange algorithm for allgather adapted for
16 * Described by Chen et al. in
17 * "Performance Evaluation of Allgather Algorithms on
18 * Terascale Linux Cluster with Fast Ethernet",
19 * Proceedings of the Eighth International Conference on
20 * High-Performance Computing in Asia-Pacific Region
23 * Rank r exchanges message with one of its neighbors and
24 * forwards the data further in the next step.
26 * No additional memory requirements.
28 * Limitations: Algorithm works only on even number of processes.
29 * For odd number of processes we switch to ring algorithm.
34 * [0] [ ] [ ] [ ] [ ] [ ]
35 * [ ] [1] [ ] [ ] [ ] [ ]
36 * [ ] [ ] [2] [ ] [ ] [ ]
37 * [ ] [ ] [ ] [3] [ ] [ ]
38 * [ ] [ ] [ ] [ ] [4] [ ]
39 * [ ] [ ] [ ] [ ] [ ] [5]
42 * [0] [0] [ ] [ ] [ ] [ ]
43 * [1] [1] [ ] [ ] [ ] [ ]
44 * [ ] [ ] [2] [2] [ ] [ ]
45 * [ ] [ ] [3] [3] [ ] [ ]
46 * [ ] [ ] [ ] [ ] [4] [4]
47 * [ ] [ ] [ ] [ ] [5] [5]
50 * [0] [0] [0] [ ] [ ] [0]
51 * [1] [1] [1] [ ] [ ] [1]
52 * [ ] [2] [2] [2] [2] [ ]
53 * [ ] [3] [3] [3] [3] [ ]
54 * [4] [ ] [ ] [4] [4] [4]
55 * [5] [ ] [ ] [5] [5] [5]
58 * [0] [0] [0] [0] [0] [0]
59 * [1] [1] [1] [1] [1] [1]
60 * [2] [2] [2] [2] [2] [2]
61 * [3] [3] [3] [3] [3] [3]
62 * [4] [4] [4] [4] [4] [4]
63 * [5] [5] [5] [5] [5] [5]
66 #include "colls_private.h"
69 smpi_coll_tuned_allgatherv_ompi_neighborexchange(void *sbuf, int scount,
71 void* rbuf, int *rcounts, int *rdispls,
77 int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
81 ptrdiff_t slb, rlb, sext, rext;
82 char *tmpsend = NULL, *tmprecv = NULL;
85 size = smpi_comm_size(comm);
86 rank = smpi_comm_rank(comm);
90 "coll:tuned:allgatherv_ompi_neighborexchange WARNING: odd size %d, switching to ring algorithm",
92 return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
99 "coll:tuned:allgatherv_ompi_neighborexchange rank %d", rank);
101 err = smpi_datatype_extent (sdtype, &slb, &sext);
102 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
104 err = smpi_datatype_extent (rdtype, &rlb, &rext);
105 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
107 /* Initialization step:
108 - if send buffer is not MPI_IN_PLACE, copy send buffer to
109 the appropriate block of receive buffer
111 tmprecv = (char*) rbuf + rdispls[rank] * rext;
112 if (MPI_IN_PLACE != sbuf) {
113 tmpsend = (char*) sbuf;
114 err = smpi_datatype_copy(tmpsend, scount, sdtype,
115 tmprecv, rcounts[rank], rdtype);
116 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
119 /* Determine neighbors, order in which blocks will arrive, etc. */
120 even_rank = !(rank % 2);
122 neighbor[0] = (rank + 1) % size;
123 neighbor[1] = (rank - 1 + size) % size;
124 recv_data_from[0] = rank;
125 recv_data_from[1] = rank;
126 offset_at_step[0] = (+2);
127 offset_at_step[1] = (-2);
129 neighbor[0] = (rank - 1 + size) % size;
130 neighbor[1] = (rank + 1) % size;
131 recv_data_from[0] = neighbor[0];
132 recv_data_from[1] = neighbor[0];
133 offset_at_step[0] = (-2);
134 offset_at_step[1] = (+2);
137 /* Communication loop:
138 - First step is special: exchange a single block with neighbor[0].
140 update recv_data_from according to offset, and
141 exchange two blocks with appropriate neighbor.
142 * the send location becomes the previous receive location.
143 Note, we need to create indexed datatype to send and receive these
146 tmprecv = (char*)rbuf + rdispls[neighbor[0]] * rext;
147 tmpsend = (char*)rbuf + rdispls[rank] * rext;
148 smpi_mpi_sendrecv(tmpsend, rcounts[rank], rdtype,
149 neighbor[0], COLL_TAG_ALLGATHERV,
150 tmprecv, rcounts[neighbor[0]], rdtype,
151 neighbor[0], COLL_TAG_ALLGATHERV,
152 comm, MPI_STATUS_IGNORE);
158 /* Determine initial sending counts and displacements*/
160 send_data_from = rank;
162 send_data_from = recv_data_from[0];
165 for (i = 1; i < (size / 2); i++) {
166 MPI_Datatype new_rdtype, new_sdtype;
167 int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
168 const int i_parity = i % 2;
169 recv_data_from[i_parity] =
170 (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;
172 /* Create new indexed types for sending and receiving.
173 We are sending data from ranks (send_data_from) and (send_data_from+1)
174 We are receiving data from ranks (recv_data_from[i_parity]) and
175 (recv_data_from[i_parity]+1).
178 new_scounts[0] = rcounts[send_data_from];
179 new_scounts[1] = rcounts[(send_data_from + 1)];
180 new_sdispls[0] = rdispls[send_data_from];
181 new_sdispls[1] = rdispls[(send_data_from + 1)];
182 err = smpi_datatype_indexed(2, new_scounts, new_sdispls, rdtype,
184 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
185 smpi_datatype_commit(&new_sdtype);
187 new_rcounts[0] = rcounts[recv_data_from[i_parity]];
188 new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
189 new_rdispls[0] = rdispls[recv_data_from[i_parity]];
190 new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
191 err = smpi_datatype_indexed(2, new_rcounts, new_rdispls, rdtype,
193 if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
194 smpi_datatype_commit(&new_rdtype);
196 tmprecv = (char*)rbuf;
197 tmpsend = (char*)rbuf;
200 smpi_mpi_sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
202 tmprecv, 1, new_rdtype, neighbor[i_parity],
204 comm, MPI_STATUS_IGNORE);
206 send_data_from = recv_data_from[i_parity];
208 smpi_datatype_free(&new_sdtype);
209 smpi_datatype_free(&new_rdtype);
215 XBT_DEBUG( "%s:%4d\tError occurred %d, rank %2d",
216 __FILE__, line, err, rank);