[simgrid.git] / src/smpi/colls/allgatherv-ompi-bruck.c
/* Copyright (c) 2013-2014. The SimGrid Team.
 * All rights reserved.                                                     */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2009 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2009      University of Houston. All rights reserved.
 *
 * Additional copyrights may follow
 */

#include "colls_private.h"
/*
 * ompi_coll_tuned_allgatherv_intra_bruck
 *
 * Function:     allgatherv using O(log(N)) steps.
 * Accepts:      Same arguments as MPI_Allgatherv
 * Returns:      MPI_SUCCESS or error code
 *
 * Description:  Variation of the all-to-all algorithm described by Bruck et al. in
 *               "Efficient Algorithms for All-to-all Communications
 *                in Multiport Message-Passing Systems"
 * Note:         Unlike in the case of the allgather implementation, we rely on
 *               indexed datatypes to select the buffers appropriately.
 *               The only additional memory requirement is for the creation of
 *               temporary datatypes.
 * Example on 7 nodes (memory layout need not be in-order)
 *   Initial set up:
 *    #     0      1      2      3      4      5      6
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [ ]    [6]
 *   Step 0: send message to (rank - 2^0), receive message from (rank + 2^0)
 *    #     0      1      2      3      4      5      6
 *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]    [0]
 *         [1]    [1]    [ ]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [2]    [2]    [ ]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [3]    [3]    [ ]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [4]    [4]    [ ]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [5]    [5]    [ ]
 *         [ ]    [ ]    [ ]    [ ]    [ ]    [6]    [6]
 *   Step 1: send message to (rank - 2^1), receive message from (rank + 2^1).
 *           message contains the 2^1 blocks (rank) .. (rank + 2^1 - 1),
 *           with wrap around.
 *    #     0      1      2      3      4      5      6
 *         [0]    [ ]    [ ]    [ ]    [0]    [0]    [0]
 *         [1]    [1]    [ ]    [ ]    [ ]    [1]    [1]
 *         [2]    [2]    [2]    [ ]    [ ]    [ ]    [2]
 *         [3]    [3]    [3]    [3]    [ ]    [ ]    [ ]
 *         [ ]    [4]    [4]    [4]    [4]    [ ]    [ ]
 *         [ ]    [ ]    [5]    [5]    [5]    [5]    [ ]
 *         [ ]    [ ]    [ ]    [6]    [6]    [6]    [6]
 *   Step 2: send message to (rank - 2^2), receive message from (rank + 2^2).
 *           message size is "all remaining blocks"
 *    #     0      1      2      3      4      5      6
 *         [0]    [0]    [0]    [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]    [3]    [3]    [3]
 *         [4]    [4]    [4]    [4]    [4]    [4]    [4]
 *         [5]    [5]    [5]    [5]    [5]    [5]    [5]
 *         [6]    [6]    [6]    [6]    [6]    [6]    [6]
 */
int smpi_coll_tuned_allgatherv_ompi_bruck(void *sbuf, int scount,
                                          MPI_Datatype sdtype,
                                          void *rbuf, int *rcounts,
                                          int *rdispls,
                                          MPI_Datatype rdtype,
                                          MPI_Comm comm)
{
   int rank, size;
   int sendto, recvfrom, distance, blockcount, i;
   int *new_rcounts = NULL, *new_rdispls = NULL;
   int *new_scounts = NULL, *new_sdispls = NULL;
   ptrdiff_t slb, rlb, sext, rext;
   char *tmpsend = NULL, *tmprecv = NULL;
   MPI_Datatype new_rdtype, new_sdtype;

   size = smpi_comm_size(comm);
   rank = smpi_comm_rank(comm);

   XBT_DEBUG("coll:tuned:allgather_ompi_bruck rank %d", rank);

   smpi_datatype_extent (sdtype, &slb, &sext);

   smpi_datatype_extent (rdtype, &rlb, &rext);

   /* Initialization step:
      - if send buffer is not MPI_IN_PLACE, copy send buffer to block rank of
        the receive buffer.
   */
   tmprecv = (char*) rbuf + rdispls[rank] * rext;
   if (MPI_IN_PLACE != sbuf) {
      tmpsend = (char*) sbuf;
      smpi_datatype_copy(tmpsend, scount, sdtype,
                         tmprecv, rcounts[rank], rdtype);
   }

   /* Communication step:
      At every step i, rank r:
      - doubles the distance
      - sends a message of blockcount blocks, (rbuf[rank] .. rbuf[rank + blockcount - 1]),
        to rank (r - distance)
      - receives a message of blockcount blocks,
        (rbuf[r + distance] .. rbuf[r + distance + blockcount - 1]), from
        rank (r + distance)
      - blockcount doubles until the last step, when only the remaining data is
        exchanged.
   */
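   /* A single calloc backs all four index arrays; new_rdispls, new_scounts and
      new_sdispls below are offsets into the same block. */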
   new_rcounts = (int*) calloc(4*size, sizeof(int));
   new_rdispls = new_rcounts + size;
   new_scounts = new_rdispls + size;
   new_sdispls = new_scounts + size;

   for (distance = 1; distance < size; distance<<=1) {

      recvfrom = (rank + distance) % size;
      sendto = (rank - distance + size) % size;

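      /* Each step exchanges 'distance' blocks, except the last step, where
         only the (size - distance) blocks not yet received remain. */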
      if (distance <= (size >> 1)) {
         blockcount = distance;
      } else {
         blockcount = size - distance;
      }

      /* create send and receive datatypes */
      for (i = 0; i < blockcount; i++) {
          const int tmp_srank = (rank + i) % size;
          const int tmp_rrank = (recvfrom + i) % size;
          new_scounts[i] = rcounts[tmp_srank];
          new_sdispls[i] = rdispls[tmp_srank];
          new_rcounts[i] = rcounts[tmp_rrank];
          new_rdispls[i] = rdispls[tmp_rrank];
      }
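      /* Both derived datatypes index blocks of rbuf (hence rdtype on the send
         side as well): the blocks to forward already sit in the receive buffer
         at their final displacements. */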
      smpi_datatype_indexed(blockcount, new_scounts, new_sdispls,
                            rdtype, &new_sdtype);
      smpi_datatype_indexed(blockcount, new_rcounts, new_rdispls,
                            rdtype, &new_rdtype);

      smpi_datatype_commit(&new_sdtype);
      smpi_datatype_commit(&new_rdtype);

      /* Sendreceive */
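      /* rbuf is used as both send and receive buffer; a single element of each
         indexed type moves 'blockcount' blocks, and the sent and received
         block sets do not overlap. */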
      smpi_mpi_sendrecv(rbuf, 1, new_sdtype, sendto,
                        COLL_TAG_ALLGATHERV,
                        rbuf, 1, new_rdtype, recvfrom,
                        COLL_TAG_ALLGATHERV,
                        comm, MPI_STATUS_IGNORE);
      smpi_datatype_free(&new_sdtype);
      smpi_datatype_free(&new_rdtype);

   }

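   /* The other three index arrays alias the same allocation, so freeing
      new_rcounts releases them all. */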
   free(new_rcounts);

   return MPI_SUCCESS;

}