From da62b5cbd53ece70108a4f0eba4757e75f3dc00a Mon Sep 17 00:00:00 2001 From: degomme Date: Thu, 13 Jun 2013 09:26:51 +0200 Subject: [PATCH] add scatter algos from ompi --- buildtools/Cmake/DefinePackages.cmake | 1 + src/include/smpi/smpi_interface.h | 9 + src/simgrid/sg_config.c | 8 + src/smpi/colls/colls.h | 16 ++ src/smpi/colls/scatter-ompi.c | 246 +++++++++++++++++++++++++ src/smpi/colls/smpi_openmpi_selector.c | 23 ++- src/smpi/smpi_coll.c | 8 + src/smpi/smpi_global.c | 7 + src/smpi/smpi_pmpi.c | 2 +- 9 files changed, 307 insertions(+), 13 deletions(-) create mode 100644 src/smpi/colls/scatter-ompi.c diff --git a/buildtools/Cmake/DefinePackages.cmake b/buildtools/Cmake/DefinePackages.cmake index 4612ad1cc0..4b5c74efcb 100644 --- a/buildtools/Cmake/DefinePackages.cmake +++ b/buildtools/Cmake/DefinePackages.cmake @@ -197,6 +197,7 @@ set(SMPI_SRC src/smpi/colls/reduce-ompi.c src/smpi/colls/gather-ompi.c src/smpi/colls/reduce_scatter-ompi.c + src/smpi/colls/scatter-ompi.c ) if(SMPI_F2C) diff --git a/src/include/smpi/smpi_interface.h b/src/include/smpi/smpi_interface.h index f90ef705c3..7ba0fdc34a 100644 --- a/src/include/smpi/smpi_interface.h +++ b/src/include/smpi/smpi_interface.h @@ -99,6 +99,15 @@ XBT_PUBLIC_DATA(int (*mpi_coll_reduce_scatter_fun) (void *sbuf, void *rbuf, int *rcounts, MPI_Datatype dtype, MPI_Op op,MPI_Comm comm)); +/** \ingroup MPI scatter + * \brief The list of all available scatter collectives + */ +XBT_PUBLIC_DATA(s_mpi_coll_description_t) mpi_coll_scatter_description[]; +XBT_PUBLIC_DATA(int (*mpi_coll_scatter_fun) + (void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, int recvcount, MPI_Datatype recvtype, + int root, MPI_Comm comm)); + XBT_PUBLIC(void) coll_help(const char *category, s_mpi_coll_description_t * table); XBT_PUBLIC(int) find_coll_description(s_mpi_coll_description_t * table, diff --git a/src/simgrid/sg_config.c b/src/simgrid/sg_config.c index d600f19af7..c29f5e8841 100644 --- a/src/simgrid/sg_config.c +++ 
b/src/simgrid/sg_config.c @@ -279,6 +279,9 @@ static void _sg_cfg_cb__coll_reduce(const char *name, int pos) static void _sg_cfg_cb__coll_reduce_scatter(const char *name, int pos){ _sg_cfg_cb__coll("reduce_scatter", mpi_coll_reduce_scatter_description, name, pos); } +static void _sg_cfg_cb__coll_scatter(const char *name, int pos){ /* configuration callback registered for the smpi/scatter option: forwards to _sg_cfg_cb__coll with the scatter description table */ + _sg_cfg_cb__coll("scatter", mpi_coll_scatter_description, name, pos); +} #endif /* callback of the inclusion path */ @@ -780,6 +783,11 @@ void sg_config_init(int *argc, char **argv) xbt_cfgelm_string, NULL, 1, 1, &_sg_cfg_cb__coll_reduce_scatter, NULL); + xbt_cfg_register(&_sg_cfg_set, "smpi/scatter", + "Which collective to use for scatter", + xbt_cfgelm_string, NULL, 1, 1, &_sg_cfg_cb__coll_scatter, + NULL); + xbt_cfg_register(&_sg_cfg_set, "smpi/allgatherv", "Which collective to use for allgatherv", xbt_cfgelm_string, NULL, 1, 1, &_sg_cfg_cb__coll_allgatherv, diff --git a/src/smpi/colls/colls.h b/src/smpi/colls/colls.h index 55be878d14..12627c06c8 100644 --- a/src/smpi/colls/colls.h +++ b/src/smpi/colls/colls.h @@ -227,4 +227,20 @@ COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, ompi_ring) COLL_REDUCE_SCATTERS(COLL_PROTO, COLL_NOsep) + +/************* + * SCATTER * + *************/ +#define COLL_SCATTER_SIG scatter, int, \ + (void *sendbuf, int sendcount, MPI_Datatype sendtype,\ + void *recvbuf, int recvcount, MPI_Datatype recvtype,\ + int root, MPI_Comm comm) + +#define COLL_SCATTERS(action, COLL_sep) \ +COLL_APPLY(action, COLL_SCATTER_SIG, ompi) COLL_sep \ +COLL_APPLY(action, COLL_SCATTER_SIG, ompi_basic_linear) COLL_sep \ +COLL_APPLY(action, COLL_SCATTER_SIG, ompi_binomial) + +COLL_SCATTERS(COLL_PROTO, COLL_NOsep) + #endif diff --git a/src/smpi/colls/scatter-ompi.c b/src/smpi/colls/scatter-ompi.c new file mode 100644 index 0000000000..205d60287c --- /dev/null +++ b/src/smpi/colls/scatter-ompi.c @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology 
+ * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "colls_private.h" +#include "coll_tuned_topo.h" + +#define MCA_COLL_BASE_TAG_SCATTER 111 +/* smpi_coll_tuned_scatter_ompi_binomial: scatter along the in-order binomial tree built for (comm, root). The root sends from sbuf (rotated into a temporary buffer when root is not rank 0 so that its own block comes first), even virtual ranks receive from tree_prev and forward sub-blocks to each tree_next child, odd virtual ranks only receive into rbuf. Returns MPI_SUCCESS, or the failing error code via err_hndl. */ +int +smpi_coll_tuned_scatter_ompi_binomial(void *sbuf, int scount, + MPI_Datatype sdtype, + void *rbuf, int rcount, + MPI_Datatype rdtype, + int root, + MPI_Comm comm + ) +{ + int line = -1; + int i; + int rank; + int vrank; + int size; + int total_send = 0; + char *ptmp = NULL; + char *tempbuf = NULL; + int err; + ompi_coll_tree_t* bmtree; + MPI_Status status; + MPI_Aint sextent, slb, strue_lb, strue_extent; + MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent; + + size = smpi_comm_size(comm); + rank = smpi_comm_rank(comm); + + XBT_DEBUG( + "smpi_coll_tuned_scatter_ompi_binomial rank %d", rank); + + /* create the binomial tree */ + +// COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root ); + bmtree = ompi_coll_tuned_topo_build_in_order_bmtree( comm, root);//ompi_ data->cached_in_order_bmtree; + + smpi_datatype_extent(sdtype, &slb, &sextent); + smpi_datatype_extent(sdtype, &strue_lb, &strue_extent); /* NOTE(review): same call as the previous line, so strue_lb and strue_extent receive the same values as slb and sextent; confirm whether a true-extent query was intended */ + smpi_datatype_extent(rdtype, &rlb, &rextent); + smpi_datatype_extent(rdtype, &rtrue_lb, &rtrue_extent); /* NOTE(review): same remark for the receive type */ + + vrank = (rank - root + size) % size; /* rank relative to root, so the root has vrank 0 */ + + if (rank == root) { + if (0 == root) { + /* root on 0, just use the send buffer */ + ptmp = (char *) sbuf; + if (rbuf != MPI_IN_PLACE) { + /* local copy to rbuf */ + err = smpi_datatype_copy(sbuf, scount, sdtype, + rbuf, rcount, rdtype); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + } + } else { + /* root is not on 0, allocate 
temp buffer for send */ + tempbuf = (char *) malloc(strue_extent + (scount*size - 1) * sextent); + if (NULL == tempbuf) { + err = MPI_ERR_OTHER; line = __LINE__; goto err_hndl; + } + + ptmp = tempbuf - slb; + + /* and rotate data so they will eventually be in the right place: blocks root..size-1 go first, blocks 0..root-1 after them */ + err = smpi_datatype_copy((char *) sbuf + sextent*root*scount, scount*(size-root), sdtype, + ptmp, scount*(size-root), sdtype); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + + + err = smpi_datatype_copy((char*)sbuf, scount*root, sdtype, + ptmp + sextent*scount*(size - root), scount*root, sdtype); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + + if (rbuf != MPI_IN_PLACE) { + /* local copy to rbuf */ + err = smpi_datatype_copy(ptmp, scount, sdtype, + rbuf, rcount, rdtype); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + } + } + total_send = scount; + } else if (!(vrank % 2)) { + /* non-root, non-leaf nodes, allocate temp buffer for recv + * the most we need is rcount*size/2 */ + tempbuf = (char *) malloc(rtrue_extent + (rcount*size - 1) * rextent); + if (NULL == tempbuf) { + err= MPI_ERR_OTHER; line = __LINE__; goto err_hndl; + } + + ptmp = tempbuf - rlb; + + sdtype = rdtype; + scount = rcount; + sextent = rextent; + total_send = scount; + } else { + /* leaf nodes, just use rbuf */ + ptmp = (char *) rbuf; + } + + if (!(vrank % 2)) { + if (rank != root) { + /* recv from parent on non-root */ + smpi_mpi_recv(ptmp, rcount*size, rdtype, bmtree->tree_prev, + MCA_COLL_BASE_TAG_SCATTER, comm, &status); + /* local copy to rbuf */ + err = smpi_datatype_copy(ptmp, scount, sdtype, + rbuf, rcount, rdtype); /* NOTE(review): err from this copy is not checked, unlike the other smpi_datatype_copy calls in this function */ + } + /* send to children on all non-leaf */ + for (i = 0; i < bmtree->tree_nextsize; i++) { + int mycount = 0, vkid; + /* figure out how much data I have to send to this child */ + vkid = (bmtree->tree_next[i] - root + size) % size; + mycount = vkid - vrank; + if (mycount > (size - vkid)) + mycount = size - vkid; + mycount *= scount; + + 
smpi_mpi_send(ptmp + total_send*sextent, mycount, sdtype, + bmtree->tree_next[i], + MCA_COLL_BASE_TAG_SCATTER, + comm); + + total_send += mycount; + } + + if (NULL != tempbuf) + free(tempbuf); + } else { + /* recv from parent on leaf nodes */ + smpi_mpi_recv(ptmp, rcount, rdtype, bmtree->tree_prev, + MCA_COLL_BASE_TAG_SCATTER, comm, &status); + } + + return MPI_SUCCESS; /* NOTE(review): bmtree is not released on this path or under err_hndl in the visible code; confirm who owns the tree returned by ompi_coll_tuned_topo_build_in_order_bmtree */ + + err_hndl: + if (NULL != tempbuf) + free(tempbuf); + + XBT_DEBUG( "%s:%4d\tError occurred %d, rank %2d", + __FILE__, line, err, rank); + return err; +} + +/* + * Linear functions are copied from the BASIC coll module + * they do not segment the message and are simple implementations + * but for some small number of nodes and/or small data sizes they + * are just as fast as tuned/tree based segmenting operations + * and as such may be selected by the decision functions + * These are copied into this module due to the way we select modules + * in V1. i.e. in V2 we will handle this differently and so will not + * have to duplicate code. + * JPG following the examples from other coll_tuned implementations. Dec06. + */ + +/* copied function (with appropriate renaming) starts here */ +/* + * scatter_intra (here: smpi_coll_tuned_scatter_ompi_basic_linear) + * + * Function: - basic scatter operation + * Accepts: - same arguments as MPI_Scatter() + * Returns: - MPI_SUCCESS or error code + */ +int +smpi_coll_tuned_scatter_ompi_basic_linear(void *sbuf, int scount, + MPI_Datatype sdtype, + void *rbuf, int rcount, + MPI_Datatype rdtype, + int root, + MPI_Comm comm + ) +{ + int i, rank, size, err; + char *ptmp; + ptrdiff_t lb, incr; + + /* Initialize */ + + rank = smpi_comm_rank(comm); + size = smpi_comm_size(comm); + + /* If not root, receive data. */ + + if (rank != root) { + smpi_mpi_recv(rbuf, rcount, rdtype, root, + MCA_COLL_BASE_TAG_SCATTER, + comm, MPI_STATUS_IGNORE); + return MPI_SUCCESS; + } + + /* I am the root, loop sending data. 
*/ + + err = smpi_datatype_extent(sdtype, &lb, &incr); + if (MPI_SUCCESS != err) { + return MPI_ERR_OTHER; + } + + incr *= scount; /* byte stride between the blocks of two consecutive ranks in sbuf */ + for (i = 0, ptmp = (char *) sbuf; i < size; ++i, ptmp += incr) { + + /* simple optimization: the root copies its own block locally instead of sending to itself */ + + if (i == rank) { + if (MPI_IN_PLACE != rbuf) { + err = + smpi_datatype_copy(ptmp, scount, sdtype, rbuf, rcount, + rdtype); + } + } else { + smpi_mpi_send(ptmp, scount, sdtype, i, + MCA_COLL_BASE_TAG_SCATTER, + comm); + } + if (MPI_SUCCESS != err) { + return err; + } + } + + /* All done */ + + return MPI_SUCCESS; +} diff --git a/src/smpi/colls/smpi_openmpi_selector.c b/src/smpi/colls/smpi_openmpi_selector.c index 15dac048e7..f5a23c4d92 100644 --- a/src/smpi/colls/smpi_openmpi_selector.c +++ b/src/smpi/colls/smpi_openmpi_selector.c @@ -568,12 +568,12 @@ int smpi_coll_tuned_gather_ompi(void *sbuf, int scount, rbuf, rcount, rdtype, root, comm); } -/* + int smpi_coll_tuned_scatter_ompi(void *sbuf, int scount, MPI_Datatype sdtype, void* rbuf, int rcount, MPI_Datatype rdtype, - int root, MPI_Comm comm, + int root, MPI_Comm comm ) { const size_t small_block_size = 300; @@ -581,28 +581,27 @@ int smpi_coll_tuned_scatter_ompi(void *sbuf, int scount, int communicator_size, rank; size_t dsize, block_size; - OPAL_OUTPUT((smpi_coll_tuned_stream, - "smpi_coll_tuned_scatter_ompi")); + XBT_DEBUG("smpi_coll_tuned_scatter_ompi"); communicator_size = smpi_comm_size(comm); - rank = ompi_comm_rank(comm); + rank = smpi_comm_rank(comm); // Determine block size if (root == rank) { - ompi_datatype_type_size(sdtype, &dsize); + dsize=smpi_datatype_size(sdtype); block_size = dsize * scount; } else { - ompi_datatype_type_size(rdtype, &dsize); + dsize=smpi_datatype_size(rdtype); block_size = dsize * rcount; } if ((communicator_size > small_comm_size) && (block_size < small_block_size)) { - return smpi_coll_tuned_scatter_intra_binomial (sbuf, scount, sdtype, + return smpi_coll_tuned_scatter_ompi_binomial (sbuf, scount, sdtype, rbuf, rcount, rdtype, - root, comm, 
module); + root, comm); } - return smpi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype, + return smpi_coll_tuned_scatter_ompi_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, - root, comm, module); -}*/ + root, comm); +} diff --git a/src/smpi/smpi_coll.c b/src/smpi/smpi_coll.c index 9140f3ad70..50ee98edc4 100644 --- a/src/smpi/smpi_coll.c +++ b/src/smpi/smpi_coll.c @@ -55,6 +55,13 @@ COLL_REDUCE_SCATTERS(COLL_DESCRIPTION, COLL_COMMA), {NULL, NULL, NULL} /* this array must be NULL terminated */ }; +s_mpi_coll_description_t mpi_coll_scatter_description[] = { /* table of the scatter implementations: the default entry maps to smpi_mpi_scatter, the others are expanded from COLL_SCATTERS */ + {"default", + "scatter default collective", + smpi_mpi_scatter}, +COLL_SCATTERS(COLL_DESCRIPTION, COLL_COMMA), + {NULL, NULL, NULL} /* this array must be NULL terminated */ +}; s_mpi_coll_description_t mpi_coll_alltoall_description[] = { {"default", @@ -153,6 +160,7 @@ int (*mpi_coll_alltoallv_fun)(void *, int*, int*, MPI_Datatype, void*, int*, int int (*mpi_coll_bcast_fun)(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm com); int (*mpi_coll_reduce_fun)(void *buf, void *rbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm); int (*mpi_coll_reduce_scatter_fun)(void *sbuf, void *rbuf, int *rcounts,MPI_Datatype dtype,MPI_Op op,MPI_Comm comm); +int (*mpi_coll_scatter_fun)(void *sendbuf, int sendcount, MPI_Datatype sendtype,void *recvbuf, int recvcount, MPI_Datatype recvtype,int root, MPI_Comm comm); /* pointer to the scatter implementation in use; the smpi_main hunk of this patch assigns it from mpi_coll_scatter_description */ struct s_proc_tree { int PROCTREE_A; int numChildren; diff --git a/src/smpi/smpi_global.c b/src/smpi/smpi_global.c index 5b05cfe912..d980dfc6b5 100644 --- a/src/smpi/smpi_global.c +++ b/src/smpi/smpi_global.c @@ -413,6 +413,13 @@ int smpi_main(int (*realmain) (int argc, char *argv[]),int argc, char *argv[]) mpi_coll_reduce_scatter_fun = (int (*)(void *sbuf, void *rbuf, int *rcounts,\ MPI_Datatype dtype,MPI_Op op,MPI_Comm comm)) mpi_coll_reduce_scatter_description[reduce_scatter_id].coll; + + int scatter_id = find_coll_description(mpi_coll_scatter_description, + 
sg_cfg_get_string("smpi/scatter")); + mpi_coll_scatter_fun = (int (*)(void *sendbuf, int sendcount, MPI_Datatype sendtype,\ + void *recvbuf, int recvcount, MPI_Datatype recvtype,\ + int root, MPI_Comm comm)) + mpi_coll_scatter_description[scatter_id].coll; smpi_global_init(); /* Clean IO before the run */ diff --git a/src/smpi/smpi_pmpi.c b/src/smpi/smpi_pmpi.c index 23621eac21..32fb308b0b 100644 --- a/src/smpi/smpi_pmpi.c +++ b/src/smpi/smpi_pmpi.c @@ -1735,7 +1735,7 @@ int PMPI_Scatter(void *sendbuf, int sendcount, MPI_Datatype sendtype, || recvtype == MPI_DATATYPE_NULL) { retval = MPI_ERR_TYPE; } else { - smpi_mpi_scatter(sendbuf, sendcount, sendtype, recvbuf, recvcount, + mpi_coll_scatter_fun(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, root, comm); retval = MPI_SUCCESS; } -- 2.20.1