[simgrid.git] src/smpi/colls/allgather-ompi-neighborexchange.c
/*
 * ompi_coll_tuned_allgather_intra_neighborexchange
 *
 * Function:     allgather using N/2 steps (O(N))
 * Accepts:      Same arguments as MPI_Allgather
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Neighbor Exchange algorithm for allgather.
 *               Described by Chen et al. in
 *               "Performance Evaluation of Allgather Algorithms on
 *                Terascale Linux Cluster with Fast Ethernet",
 *               Proceedings of the Eighth International Conference on
 *               High-Performance Computing in the Asia-Pacific Region
 *               (HPCASIA'05), 2005
 *
 *               Rank r exchanges a message with one of its neighbors and
 *               forwards the data further in the next step.
 *
 *               No additional memory requirements.
 *
 * Limitations:  Algorithm works only on an even number of processes.
 *               For an odd number of processes we switch to the ring algorithm.
 *
 * Example on 6 nodes:
 *  Initial state
 *    #     0      1      2      3      4      5
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]
 *   Step 0:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [ ]    [ ]    [ ]    [ ]
 *         [1]    [1]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [2]    [ ]    [ ]
 *         [ ]    [ ]    [3]    [3]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [4]
 *         [ ]    [ ]    [ ]    [ ]    [5]    [5]
 *   Step 1:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [ ]    [ ]    [0]
 *         [1]    [1]    [1]    [ ]    [ ]    [1]
 *         [ ]    [2]    [2]    [2]    [2]    [ ]
 *         [ ]    [3]    [3]    [3]    [3]    [ ]
 *         [4]    [ ]    [ ]    [4]    [4]    [4]
 *         [5]    [ ]    [ ]    [5]    [5]    [5]
 *   Step 2:
 *    #     0      1      2      3      4      5
 *         [0]    [0]    [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]    [3]    [3]
 *         [4]    [4]    [4]    [4]    [4]    [4]
 *         [5]    [5]    [5]    [5]    [5]    [5]
 */
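/* Cost note (derived from the code below): with N processes the algorithm
 * performs N/2 exchange steps in total -- one single-block exchange followed
 * by N/2 - 1 two-block exchanges -- so each rank sends and receives N - 1
 * blocks overall while using only the receive buffer as working storage. */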

#include "colls_private.h"
int
smpi_coll_tuned_allgather_ompi_neighborexchange(void *sbuf, int scount,
                                                 MPI_Datatype sdtype,
                                                 void* rbuf, int rcount,
                                                 MPI_Datatype rdtype,
                                                 MPI_Comm comm
)
{
   int line = -1;
   int rank, size;
   int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
   int i, even_rank;
   int err = 0;
   ptrdiff_t slb, rlb, sext, rext;
   char *tmpsend = NULL, *tmprecv = NULL;

   size = smpi_comm_size(comm);
   rank = smpi_comm_rank(comm);

   if (size % 2) {
      XBT_DEBUG(
                   "coll:tuned:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
                   size);
      return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
                                                  rbuf, rcount, rdtype,
                                                  comm);
   }

   XBT_DEBUG(
                "coll:tuned:allgather_intra_neighborexchange rank %d", rank);

   err = smpi_datatype_extent (sdtype, &slb, &sext);
   if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

   err = smpi_datatype_extent (rdtype, &rlb, &rext);
   if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

   /* Initialization step:
      - if send buffer is not MPI_IN_PLACE, copy send buffer to appropriate block
        of receive buffer
   */
   tmprecv = (char*) rbuf + rank * rcount * rext;
   if (MPI_IN_PLACE != sbuf) {
      tmpsend = (char*) sbuf;
      smpi_datatype_copy (tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
   }

   /* Determine neighbors, order in which blocks will arrive, etc. */
   even_rank = !(rank % 2);
   if (even_rank) {
      neighbor[0] = (rank + 1) % size;
      neighbor[1] = (rank - 1 + size) % size;
      recv_data_from[0] = rank;
      recv_data_from[1] = rank;
      offset_at_step[0] = (+2);
      offset_at_step[1] = (-2);
   } else {
      neighbor[0] = (rank - 1 + size) % size;
      neighbor[1] = (rank + 1) % size;
      recv_data_from[0] = neighbor[0];
      recv_data_from[1] = neighbor[0];
      offset_at_step[0] = (-2);
      offset_at_step[1] = (+2);
   }
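   /* At this point the roles are fixed: every rank pairs up with neighbor[0]
      for the first (single-block) exchange and then alternates between
      neighbor[1] and neighbor[0].  recv_data_from[i % 2], advanced by
      offset_at_step[i % 2] before step i, is the index of the first of the
      two consecutive blocks received from neighbor[i % 2] at step i >= 1. */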

   /* Communication loop:
      - First step is special: exchange a single block with neighbor[0].
      - Rest of the steps:
        update recv_data_from according to offset, and
        exchange two blocks with the appropriate neighbor.
        The send location becomes the previous receive location.
   */
   tmprecv = (char*)rbuf + neighbor[0] * rcount * rext;
   tmpsend = (char*)rbuf + rank * rcount * rext;
   /* Sendreceive */
   smpi_mpi_sendrecv(tmpsend, rcount, rdtype, neighbor[0],
                                  COLL_TAG_ALLGATHER,
                                  tmprecv, rcount, rdtype, neighbor[0],
                                  COLL_TAG_ALLGATHER,
                                  comm, MPI_STATUS_IGNORE);

   /* Determine initial sending location */
   if (even_rank) {
      send_data_from = rank;
   } else {
      send_data_from = recv_data_from[0];
   }
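   /* After the first exchange each pair (2k, 2k+1) holds blocks 2k and 2k+1
      contiguously in rbuf, so every following step can forward two blocks at
      once; send_data_from is the even index 2k where that pair starts. */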

   for (i = 1; i < (size / 2); i++) {
      const int i_parity = i % 2;
      recv_data_from[i_parity] =
         (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

      tmprecv = (char*)rbuf + recv_data_from[i_parity] * rcount * rext;
      tmpsend = (char*)rbuf + send_data_from * rcount * rext;

      /* Sendreceive */
      smpi_mpi_sendrecv(tmpsend, 2 * rcount, rdtype,
                                     neighbor[i_parity],
                                     COLL_TAG_ALLGATHER,
                                     tmprecv, 2 * rcount, rdtype,
                                     neighbor[i_parity],
                                     COLL_TAG_ALLGATHER,
                                     comm, MPI_STATUS_IGNORE);

      send_data_from = recv_data_from[i_parity];
   }
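   /* The size/2 - 1 two-block exchanges plus the single-block exchange above
      add up to the advertised size/2 steps: every rank now holds all blocks. */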

   return MPI_SUCCESS;

 err_hndl:
   XBT_DEBUG( "%s:%4d\tError occurred %d, rank %2d",
                __FILE__, line, err, rank);
   return err;
}
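As a quick way to check the schedule against the 6-node tables in the header comment, here is a small standalone sketch, not part of SimGrid, that replays only the block bookkeeping of the function above (neighbor[], recv_data_from[], offset_at_step[]) and prints which source blocks each rank holds after every step. The file name nbx_schedule.c and the helper print_state() are made up for illustration.

/* nbx_schedule.c -- standalone illustration, not part of the library code above.
 * Replays the neighbor-exchange block schedule for an even communicator size
 * and prints which source blocks each rank holds after every step.
 *
 *   cc -o nbx_schedule nbx_schedule.c && ./nbx_schedule 6
 */
#include <stdio.h>
#include <stdlib.h>

/* print which blocks each rank holds; have[r * size + b] != 0 <=> rank r holds block b */
static void print_state(int step, const int *have, int size)
{
   printf("Step %d:\n", step);
   for (int r = 0; r < size; r++) {
      printf("  rank %d holds:", r);
      for (int b = 0; b < size; b++)
         if (have[r * size + b])
            printf(" [%d]", b);
      printf("\n");
   }
}

int main(int argc, char **argv)
{
   int size = (argc > 1) ? atoi(argv[1]) : 6;
   if (size < 2 || size % 2) {
      fprintf(stderr, "communicator size must be even and >= 2\n");
      return 1;
   }

   int *have = calloc((size_t)size * (size_t)size, sizeof(int));
   /* per-rank schedule state, mirroring the variables in the function above */
   int (*neighbor)[2]       = malloc((size_t)size * sizeof(*neighbor));
   int (*recv_data_from)[2] = malloc((size_t)size * sizeof(*recv_data_from));
   int (*offset_at_step)[2] = malloc((size_t)size * sizeof(*offset_at_step));
   if (!have || !neighbor || !recv_data_from || !offset_at_step)
      return 1;

   for (int r = 0; r < size; r++) {
      have[r * size + r] = 1;                        /* initial state: own block only */
      if (r % 2 == 0) {                              /* even rank */
         neighbor[r][0] = (r + 1) % size;
         neighbor[r][1] = (r - 1 + size) % size;
         recv_data_from[r][0] = recv_data_from[r][1] = r;
         offset_at_step[r][0] = +2;
         offset_at_step[r][1] = -2;
      } else {                                       /* odd rank */
         neighbor[r][0] = (r - 1 + size) % size;
         neighbor[r][1] = (r + 1) % size;
         recv_data_from[r][0] = recv_data_from[r][1] = neighbor[r][0];
         offset_at_step[r][0] = -2;
         offset_at_step[r][1] = +2;
      }
   }

   /* Step 0: each rank receives its partner's single block */
   for (int r = 0; r < size; r++)
      have[r * size + neighbor[r][0]] = 1;
   print_state(0, have, size);

   /* Steps 1 .. size/2 - 1: two consecutive blocks per exchange */
   for (int i = 1; i < size / 2; i++) {
      int p = i % 2;
      for (int r = 0; r < size; r++) {
         recv_data_from[r][p] =
            (recv_data_from[r][p] + offset_at_step[r][p] + size) % size;
         have[r * size + recv_data_from[r][p]]     = 1;  /* first block of the chunk  */
         have[r * size + recv_data_from[r][p] + 1] = 1;  /* second block of the chunk */
      }
      print_state(i, have, size);
   }

   free(have); free(neighbor); free(recv_data_from); free(offset_at_step);
   return 0;
}

Run with the default size of 6, the output reproduces the Step 0/1/2 tables from the header comment; any even size works, while odd sizes are rejected just as the real function falls back to the ring algorithm.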