1 /* Copyright (c) 2013-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
8 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
9 * University Research and Technology
10 * Corporation. All rights reserved.
11 * Copyright (c) 2004-2012 The University of Tennessee and The University
12 * of Tennessee Research Foundation. All rights reserved.
14 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
15 * University of Stuttgart. All rights reserved.
16 * Copyright (c) 2004-2005 The Regents of the University of California.
17 * All rights reserved.
18 * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
19 * Copyright (c) 2009 University of Houston. All rights reserved.
21 * Additional copyrights may follow
26 * (C) 2001 by Argonne National Laboratory.
27 * See COPYRIGHT in top-level directory.
29 /* Copyright (c) 2001-2014, The Ohio State University. All rights reserved.
32 * This file is part of the MVAPICH2 software package developed by the
33 * team members of The Ohio State University's Network-Based Computing
34 * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
36 * For detailed copyright and licensing information, please refer to the
37 * copyright file COPYRIGHT in the top level MVAPICH2 directory.
/* Throttle factor for the scatter_dest alltoall below: it is assigned to
 * bblock (capped at comm_size), i.e. the number of irecv/isend pairs each
 * process posts per phase before waiting for them all to complete. */
42 #define MV2_ALLTOALL_THROTTLE_FACTOR 4
44 #include "colls_private.h"
46 int smpi_coll_tuned_alltoall_mvapich2_scatter_dest(
49 MPI_Datatype sendtype,
52 MPI_Datatype recvtype,
56 MPI_Aint sendtype_extent = 0, recvtype_extent = 0;
57 int mpi_errno=MPI_SUCCESS;
59 MPI_Request *reqarray;
62 if (recvcount == 0) return MPI_SUCCESS;
64 comm_size = smpi_comm_size(comm);
65 rank = smpi_comm_rank(comm);
67 /* Get extent of send and recv types */
68 recvtype_extent = smpi_datatype_get_extent(recvtype);
69 sendtype_extent = smpi_datatype_get_extent(sendtype);
71 /* Medium-size message. Use isend/irecv with scattered
72 destinations. Use Tony Ladd's modification to post only
73 a small number of isends/irecvs at a time. */
74 /* FIXME: This converts the Alltoall to a set of blocking phases.
75 Two alternatives should be considered:
76 1) the choice of communication pattern could try to avoid
77 contending routes in each phase
78 2) rather than wait for all communication to finish (waitall),
79 we could maintain constant queue size by using waitsome
80 and posting new isend/irecv as others complete. This avoids
81 synchronization delays at the end of each block (when
82 there are only a few isend/irecvs left) */
86 //Stampede is configured with
87 bblock = MV2_ALLTOALL_THROTTLE_FACTOR;//mv2_coll_param.alltoall_throttle_factor;
89 if (bblock >= comm_size) bblock = comm_size;
90 /* If throttle_factor is n, each process posts n pairs of isend/irecv in each iteration. */
93 /* FIXME: This should use the memory macros (there are storage
94 leaks here if there is an error, for example) */
95 reqarray= (MPI_Request*)xbt_malloc(2*bblock*sizeof(MPI_Request));
97 starray=(MPI_Status *)xbt_malloc(2*bblock*sizeof(MPI_Status));
99 for (ii=0; ii<comm_size; ii+=bblock) {
100 ss = comm_size-ii < bblock ? comm_size-ii : bblock;
101 /* do the communication -- post ss sends and receives: */
102 for ( i=0; i<ss; i++ ) {
103 dst = (rank+i+ii) % comm_size;
104 reqarray[i]=smpi_mpi_irecv((char *)recvbuf +
105 dst*recvcount*recvtype_extent,
106 recvcount, recvtype, dst,
107 COLL_TAG_ALLTOALL, comm);
110 for ( i=0; i<ss; i++ ) {
111 dst = (rank-i-ii+comm_size) % comm_size;
112 reqarray[i+ss]=smpi_mpi_isend((char *)sendbuf +
113 dst*sendcount*sendtype_extent,
114 sendcount, sendtype, dst,
115 COLL_TAG_ALLTOALL, comm);
119 /* ... then wait for them to finish: */
120 smpi_mpi_waitall(2*ss,reqarray,starray);
123 /* --BEGIN ERROR HANDLING-- */
124 if (mpi_errno == MPI_ERR_IN_STATUS) {
125 for (j=0; j<2*ss; j++) {
126 if (starray[j].MPI_ERROR != MPI_SUCCESS) {
127 mpi_errno = starray[j].MPI_ERROR;
132 /* --END ERROR HANDLING-- */