src/smpi/colls/alltoall/alltoall-mvapich-scatter-dest.cpp

   1 /* Copyright (c) 2013-2018. The SimGrid Team.
   2  * All rights reserved.                                                     */
   3
   4 /* This program is free software; you can redistribute it and/or modify it
   5  * under the terms of the license (GNU LGPL) which comes with this package. */
   6
   7 /*
   8  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
   9  *                         University Research and Technology
  10  *                         Corporation.  All rights reserved.
  11  * Copyright (c) 2004-2012 The University of Tennessee and The University
  12  *                         of Tennessee Research Foundation.  All rights
  13  *                         reserved.
  14  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  15  *                         University of Stuttgart.  All rights reserved.
  16  * Copyright (c) 2004-2005 The Regents of the University of California.
  17  *                         All rights reserved.
  18  * Copyright (c) 2008      Sun Microsystems, Inc.  All rights reserved.
  19  * Copyright (c) 2009      University of Houston. All rights reserved.
  20  *
  21  * Additional copyrights may follow
  22  */
  23
  24 /*
  25  *
  26  *  (C) 2001 by Argonne National Laboratory.
  27  *      See COPYRIGHT in top-level directory.
  28  */
  29 /* Copyright (c) 2001-2014, The Ohio State University. All rights
  30  * reserved.
  31  *
  32  * This file is part of the MVAPICH2 software package developed by the
  33  * team members of The Ohio State University's Network-Based Computing
  34  * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
  35  *
  36  * For detailed copyright and licensing information, please refer to the
  37  * copyright file COPYRIGHT in the top level MVAPICH2 directory.
  38  *
  39  */
  40
  41 //correct on stampede
  42 #define MV2_ALLTOALL_THROTTLE_FACTOR         4
  43
  44 #include "../colls_private.hpp"
  45 namespace simgrid{
  46 namespace smpi{
  47 int Coll_alltoall_mvapich2_scatter_dest::alltoall(
  48                             void *sendbuf,
  49                             int sendcount,
  50                             MPI_Datatype sendtype,
  51                             void *recvbuf,
  52                             int recvcount,
  53                             MPI_Datatype recvtype,
  54                             MPI_Comm comm)
  55 {
  56     int          comm_size, i, j;
  57     MPI_Aint     sendtype_extent = 0, recvtype_extent = 0;
  58     int mpi_errno=MPI_SUCCESS;
  59     int dst, rank;
  60     MPI_Request *reqarray;
  61     MPI_Status *starray;
  62
  63     if (recvcount == 0) return MPI_SUCCESS;
  64
  65     comm_size =  comm->size();
  66     rank = comm->rank();
  67
  68     /* Get extent of send and recv types */
  69     recvtype_extent = recvtype->get_extent();
  70     sendtype_extent = sendtype->get_extent();
  71
  72     /* Medium-size message. Use isend/irecv with scattered
  73      destinations. Use Tony Ladd's modification to post only
  74      a small number of isends/irecvs at a time. */
  75     /* FIXME: This converts the Alltoall to a set of blocking phases.
  76      Two alternatives should be considered:
  77      1) the choice of communication pattern could try to avoid
  78      contending routes in each phase
  79      2) rather than wait for all communication to finish (waitall),
  80      we could maintain constant queue size by using waitsome
  81      and posting new isend/irecv as others complete.  This avoids
  82      synchronization delays at the end of each block (when
  83      there are only a few isend/irecvs left)
  84      */
  85     int ii, ss, bblock;
  86
  87     //Stampede is configured with
  88     bblock = MV2_ALLTOALL_THROTTLE_FACTOR;//mv2_coll_param.alltoall_throttle_factor;
  89
  90     if (bblock >= comm_size) bblock = comm_size;
  91     /* If throttle_factor is n, each process posts n pairs of isend/irecv
  92      in each iteration. */
  93
  94     /* FIXME: This should use the memory macros (there are storage
  95      leaks here if there is an error, for example) */
  96     reqarray= (MPI_Request*)xbt_malloc(2*bblock*sizeof(MPI_Request));
  97
  98     starray=(MPI_Status *)xbt_malloc(2*bblock*sizeof(MPI_Status));
  99
 100     for (ii=0; ii<comm_size; ii+=bblock) {
 101         ss = comm_size-ii < bblock ? comm_size-ii : bblock;
 102         /* do the communication -- post ss sends and receives: */
 103         for ( i=0; i<ss; i++ ) {
 104             dst = (rank+i+ii) % comm_size;
 105             reqarray[i]=Request::irecv((char *)recvbuf +
 106                                       dst*recvcount*recvtype_extent,
 107                                       recvcount, recvtype, dst,
 108                                       COLL_TAG_ALLTOALL, comm);
 109
 110         }
 111         for ( i=0; i<ss; i++ ) {
 112             dst = (rank-i-ii+comm_size) % comm_size;
 113             reqarray[i+ss]=Request::isend((char *)sendbuf +
 114                                           dst*sendcount*sendtype_extent,
 115                                           sendcount, sendtype, dst,
 116                                           COLL_TAG_ALLTOALL, comm);
 117
 118         }
 119
 120         /* ... then wait for them to finish: */
 121         Request::waitall(2*ss,reqarray,starray);
 122
 123
 124         /* --BEGIN ERROR HANDLING-- */
 125         if (mpi_errno == MPI_ERR_IN_STATUS) {
 126                 for (j=0; j<2*ss; j++) {
 127                      if (starray[j].MPI_ERROR != MPI_SUCCESS) {
 128                          mpi_errno = starray[j].MPI_ERROR;
 129                      }
 130                 }
 131         }
 132     }
 133     /* --END ERROR HANDLING-- */
 134     xbt_free(starray);
 135     xbt_free(reqarray);
 136     return (mpi_errno);
 137
 138 }
 139 }
 140 }