From: degomme
Date: Thu, 13 Jun 2013 22:51:18 +0000 (+0200)
Subject: we now have an mpich selector, and a bunch of new algorithms (--cfg=smpi/coll_selecto...
X-Git-Tag: v3_9_90~271
X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/8293a833f9c68986f8bd174d5bf3d04eb62918d7

we now have an mpich selector, and a bunch of new algorithms (--cfg=smpi/coll_selector:mpich)
---

diff --git a/buildtools/Cmake/AddTests.cmake b/buildtools/Cmake/AddTests.cmake
index d46d860a6e..55fe3bda45 100644
--- a/buildtools/Cmake/AddTests.cmake
+++ b/buildtools/Cmake/AddTests.cmake
@@ -372,23 +372,23 @@ if(NOT enable_memcheck)
   endif()
 
-  FOREACH (GATHER_COLL default ompi ompi_basic_linear ompi_linear_sync ompi_binomial)
+  FOREACH (GATHER_COLL default ompi mpich ompi_basic_linear ompi_linear_sync ompi_binomial)
     ADD_TEST(smpi-gather-coll-${GATHER_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/gather:${GATHER_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/gather_coll.tesh)
   ENDFOREACH()
 
   FOREACH (ALLGATHER_COLL default 2dmesh 3dmesh bruck GB loosely_lr lr NTSLR NTSLR_NB pair rdb rhv ring SMP_NTS
-           smp_simple spreading_simple ompi ompi_neighborexchange)
+           smp_simple spreading_simple ompi mpich ompi_neighborexchange)
     ADD_TEST(smpi-allgather-coll-${ALLGATHER_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/allgather:${ALLGATHER_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/allgather_coll.tesh)
   ENDFOREACH()
 
-  FOREACH (ALLGATHERV_COLL default GB pair ring ompi ompi_neighborexchange)
+  FOREACH (ALLGATHERV_COLL default GB pair ring ompi mpich ompi_neighborexchange ompi_bruck)
     ADD_TEST(smpi-allgatherv-coll-${ALLGATHERV_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/allgatherv:${ALLGATHERV_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/allgatherv_coll.tesh)
   ENDFOREACH()
 
   FOREACH (ALLREDUCE_COLL default lr NTS rab1 rab2 rab_rdb rab_rsag rdb smp_binomial smp_binomial_pipeline
-           smp_rdb smp_rsag smp_rsag_lr smp_rsag_rab redbcast ompi ompi_ring_segmented)
+           smp_rdb smp_rsag smp_rsag_lr smp_rsag_rab redbcast ompi mpich ompi_ring_segmented)
     ADD_TEST(smpi-allreduce-coll-${ALLREDUCE_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/allreduce:${ALLREDUCE_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/allreduce_coll.tesh)
   ENDFOREACH()
 
@@ -407,23 +407,23 @@ if(NOT enable_memcheck)
 
   FOREACH (BCAST_COLL default arrival_nb arrival_pattern_aware arrival_pattern_aware_wait arrival_scatter
            binomial_tree flattree flattree_pipeline NTSB NTSL NTSL_Isend scatter_LR_allgather
-           scatter_rdb_allgather SMP_binary SMP_binomial SMP_linear ompi ompi_split_bintree ompi_pipeline)
+           scatter_rdb_allgather SMP_binary SMP_binomial SMP_linear ompi mpich ompi_split_bintree ompi_pipeline)
     ADD_TEST(smpi-bcast-coll-${BCAST_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/bcast:${BCAST_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/bcast_coll.tesh)
   ENDFOREACH()
 
-  FOREACH (REDUCE_COLL default arrival_pattern_aware binomial flat_tree NTSL scatter_gather ompi ompi_chain ompi_binary ompi_basic_linear ompi_binomial ompi_in_order_binary)
+  FOREACH (REDUCE_COLL default arrival_pattern_aware binomial flat_tree NTSL scatter_gather ompi mpich ompi_chain ompi_binary ompi_basic_linear ompi_binomial ompi_in_order_binary)
    ADD_TEST(smpi-reduce-coll-${REDUCE_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/reduce:${REDUCE_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/reduce_coll.tesh)
   ENDFOREACH()
 
-  FOREACH (REDUCE_SCATTER_COLL default ompi ompi_basic_recursivehalving ompi_ring)
+  FOREACH (REDUCE_SCATTER_COLL default ompi mpich ompi_basic_recursivehalving ompi_ring mpich_noncomm mpich_pair mpich_rdb)
     ADD_TEST(smpi-reduce_scatter-coll-${REDUCE_SCATTER_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/reduce_scatter:${REDUCE_SCATTER_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/reduce_scatter_coll.tesh)
   ENDFOREACH()
 
-  FOREACH (SCATTER_COLL default ompi ompi_basic_linear ompi_binomial)
+  FOREACH (SCATTER_COLL default ompi mpich ompi_basic_linear ompi_binomial)
     ADD_TEST(smpi-scatter-coll-${SCATTER_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/scatter:${SCATTER_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/scatter_coll.tesh)
   ENDFOREACH()
 
-  FOREACH (BARRIER_COLL default ompi ompi_basic_linear ompi_tree ompi_bruck ompi_recursivedoubling ompi_doublering)
+  FOREACH (BARRIER_COLL default ompi mpich ompi_basic_linear ompi_tree ompi_bruck ompi_recursivedoubling ompi_doublering)
     ADD_TEST(smpi-barrier-coll-${BARRIER_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/barrier:${BARRIER_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/barrier_coll.tesh)
   ENDFOREACH()
 
diff --git a/buildtools/Cmake/DefinePackages.cmake b/buildtools/Cmake/DefinePackages.cmake
index 6d5586ff42..b934337529 100644
--- a/buildtools/Cmake/DefinePackages.cmake
+++ b/buildtools/Cmake/DefinePackages.cmake
@@ -112,6 +112,7 @@ set(SMPI_SRC
   src/smpi/smpi_pmpi.c
   src/smpi/smpi_replay.c
   src/smpi/colls/smpi_openmpi_selector.c
+  src/smpi/colls/smpi_mpich_selector.c
   src/smpi/colls/colls_global.c
   src/smpi/colls/allgather-2dmesh.c
   src/smpi/colls/allgather-3dmesh.c
@@ -133,6 +134,8 @@ set(SMPI_SRC
   src/smpi/colls/allgatherv-pair.c
   src/smpi/colls/allgatherv-ring.c
   src/smpi/colls/allgatherv-ompi-neighborexchange.c
+  src/smpi/colls/allgatherv-ompi-bruck.c
+  src/smpi/colls/allgatherv-mpich-rdb.c
   src/smpi/colls/allreduce-lr.c
   src/smpi/colls/allreduce-NTS.c
   src/smpi/colls/allreduce-rab1.c
@@ -197,6 +200,7 @@ set(SMPI_SRC
   src/smpi/colls/reduce-ompi.c
   src/smpi/colls/gather-ompi.c
   src/smpi/colls/reduce_scatter-ompi.c
+  src/smpi/colls/reduce_scatter-mpich.c
   src/smpi/colls/scatter-ompi.c
   src/smpi/colls/barrier-ompi.c
   )
diff --git a/src/smpi/colls/allgatherv-ompi-bruck.c b/src/smpi/colls/allgatherv-ompi-bruck.c
new file mode 100644
index 0000000000..420f4302fd
--- /dev/null
+++ b/src/smpi/colls/allgatherv-ompi-bruck.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2009 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2009      University of Houston. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "colls_private.h"
+#define MCA_COLL_BASE_TAG_ALLGATHERV 444
+/*
+ * ompi_coll_tuned_allgatherv_intra_bruck
+ *
+ * Function:     allgatherv using O(log(N)) steps.
+ * Accepts:      Same arguments as MPI_Allgatherv
+ * Returns:      MPI_SUCCESS or error code
+ *
+ * Description:  Variation of the all-to-all algorithm described by Bruck et al. in
+ *               "Efficient Algorithms for All-to-all Communications
+ *                in Multiport Message-Passing Systems".
+ * Note:         Unlike the allgather implementation, we rely on an
+ *               indexed datatype to select the buffers appropriately.
+ *               The only additional memory requirement is for the creation of
+ *               temporary datatypes.
+ * Example on 7 nodes (memory layout need not be in-order)
+ *   Initial set up:
+ *    #     0      1      2      3      4      5      6
+ *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]    [ ]
+ *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]    [ ]
+ *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]    [ ]
+ *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]    [ ]
+ *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]    [ ]
+ *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]    [ ]
+ *         [ ]    [ ]    [ ]    [ ]    [ ]    [ ]    [6]
+ *   Step 0: send message to (rank - 2^0), receive message from (rank + 2^0)
+ *    #     0      1      2      3      4      5      6
+ *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]    [0]
+ *         [1]    [1]    [ ]    [ ]    [ ]    [ ]    [ ]
+ *         [ ]    [2]    [2]    [ ]    [ ]    [ ]    [ ]
+ *         [ ]    [ ]    [3]    [3]    [ ]    [ ]    [ ]
+ *         [ ]    [ ]    [ ]    [4]    [4]    [ ]    [ ]
+ *         [ ]    [ ]    [ ]    [ ]    [5]    [5]    [ ]
+ *         [ ]    [ ]    [ ]    [ ]    [ ]    [6]    [6]
+ *   Step 1: send message to (rank - 2^1), receive message from (rank + 2^1).
+ *           message contains all blocks from (rank) .. (rank + 2^2) with
+ *           wrap around.
+ *    #     0      1      2      3      4      5      6
+ *         [0]    [ ]    [ ]    [ ]    [0]    [0]    [0]
+ *         [1]    [1]    [ ]    [ ]    [ ]    [1]    [1]
+ *         [2]    [2]    [2]    [ ]    [ ]    [ ]    [2]
+ *         [3]    [3]    [3]    [3]    [ ]    [ ]    [ ]
+ *         [ ]    [4]    [4]    [4]    [4]    [ ]    [ ]
+ *         [ ]    [ ]    [5]    [5]    [5]    [5]    [ ]
+ *         [ ]    [ ]    [ ]    [6]    [6]    [6]    [6]
+ *   Step 2: send message to (rank - 2^2), receive message from (rank + 2^2).
+ *           message size is "all remaining blocks"
+ *    #     0      1      2      3      4      5      6
+ *         [0]    [0]    [0]    [0]    [0]    [0]    [0]
+ *         [1]    [1]    [1]    [1]    [1]    [1]    [1]
+ *         [2]    [2]    [2]    [2]    [2]    [2]    [2]
+ *         [3]    [3]    [3]    [3]    [3]    [3]    [3]
+ *         [4]    [4]    [4]    [4]    [4]    [4]    [4]
+ *         [5]    [5]    [5]    [5]    [5]    [5]    [5]
+ *         [6]    [6]    [6]    [6]    [6]    [6]    [6]
+ */
+int smpi_coll_tuned_allgatherv_ompi_bruck(void *sbuf, int scount,
+                                          MPI_Datatype sdtype,
+                                          void *rbuf, int *rcounts,
+                                          int *rdispls,
+                                          MPI_Datatype rdtype,
+                                          MPI_Comm comm)
+{
+   int rank, size;
+   int sendto, recvfrom, distance, blockcount, i;
+   int *new_rcounts = NULL, *new_rdispls = NULL;
+   int *new_scounts = NULL, *new_sdispls = NULL;
+   ptrdiff_t slb, rlb, sext, rext;
+   char *tmpsend = NULL, *tmprecv = NULL;
+   MPI_Datatype new_rdtype, new_sdtype;
+
+   size = smpi_comm_size(comm);
+   rank = smpi_comm_rank(comm);
+
+   XBT_DEBUG(
+                "coll:tuned:allgather_ompi_bruck rank %d", rank);
+
+   smpi_datatype_extent (sdtype, &slb, &sext);
+
+   smpi_datatype_extent (rdtype, &rlb, &rext);
+
+   /* Initialization step:
+      - if send buffer is not MPI_IN_PLACE, copy send buffer to block rank of
+        the receive buffer.
+   */
+   tmprecv = (char*) rbuf + rdispls[rank] * rext;
+   if (MPI_IN_PLACE != sbuf) {
+      tmpsend = (char*) sbuf;
+      smpi_datatype_copy(tmpsend, scount, sdtype,
+                         tmprecv, rcounts[rank], rdtype);
+   }
+
+   /* Communication step:
+      At every step i, rank r:
+      - doubles the distance
+      - sends message with blockcount blocks, (rbuf[rank] .. rbuf[rank + 2^i])
+        to rank (r - distance)
+      - receives message of blockcount blocks,
+        (rbuf[r + distance] ... rbuf[(r+distance) + 2^i]) from
+        rank (r + distance)
+      - blockcount doubles until the last step when only the remaining data is
+        exchanged.
+   */
+   blockcount = 1;
+   tmpsend = (char*) rbuf;
+
+   new_rcounts = (int*) calloc(4*size, sizeof(int));
+   new_rdispls = new_rcounts + size;
+   new_scounts = new_rdispls + size;
+   new_sdispls = new_scounts + size;
+
+   for (distance = 1; distance < size; distance<<=1) {
+
+      recvfrom = (rank + distance) % size;
+      sendto = (rank - distance + size) % size;
+
+      if (distance <= (size >> 1)) {
+         blockcount = distance;
+      } else {
+         blockcount = size - distance;
+      }
+
+      /* create send and receive datatypes */
+      for (i = 0; i < blockcount; i++) {
+          const int tmp_srank = (rank + i) % size;
+          const int tmp_rrank = (recvfrom + i) % size;
+          new_scounts[i] = rcounts[tmp_srank];
+          new_sdispls[i] = rdispls[tmp_srank];
+          new_rcounts[i] = rcounts[tmp_rrank];
+          new_rdispls[i] = rdispls[tmp_rrank];
+      }
+      smpi_datatype_indexed(blockcount, new_scounts, new_sdispls,
+                            rdtype, &new_sdtype);
+      smpi_datatype_indexed(blockcount, new_rcounts, new_rdispls,
+                            rdtype, &new_rdtype);
+
+      smpi_datatype_commit(&new_sdtype);
+      smpi_datatype_commit(&new_rdtype);
+
+      /* Sendreceive */
+      smpi_mpi_sendrecv(rbuf, 1, new_sdtype, sendto,
+                        MCA_COLL_BASE_TAG_ALLGATHERV,
+                        rbuf, 1, new_rdtype, recvfrom,
+                        MCA_COLL_BASE_TAG_ALLGATHERV,
+                        comm, MPI_STATUS_IGNORE);
+      smpi_datatype_free(&new_sdtype);
+      smpi_datatype_free(&new_rdtype);
+
+   }
+
+   free(new_rcounts);
+
+   return MPI_SUCCESS;
+
+}
+
diff --git a/src/smpi/colls/colls.h b/src/smpi/colls/colls.h
index 09e47ce818..3de368157b 100644
--- a/src/smpi/colls/colls.h
+++ b/src/smpi/colls/colls.h
@@ -32,7 +32,9 @@ COLL_APPLY(action, COLL_GATHER_SIG, ompi) COLL_sep \
 COLL_APPLY(action, COLL_GATHER_SIG, ompi_basic_linear) COLL_sep \
 COLL_APPLY(action, COLL_GATHER_SIG, ompi_binomial) COLL_sep \
-COLL_APPLY(action, COLL_GATHER_SIG, ompi_linear_sync) \
+COLL_APPLY(action, COLL_GATHER_SIG, ompi_linear_sync) COLL_sep \
+COLL_APPLY(action, COLL_GATHER_SIG, mpich) \
+
 
 COLL_GATHERS(COLL_PROTO, COLL_NOsep)
 
@@ -62,7 +64,9 @@ COLL_APPLY(action, COLL_ALLGATHER_SIG, SMP_NTS) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHER_SIG, smp_simple) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHER_SIG, spreading_simple) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHER_SIG, ompi) COLL_sep \
-COLL_APPLY(action, COLL_ALLGATHER_SIG, ompi_neighborexchange)
+COLL_APPLY(action, COLL_ALLGATHER_SIG, ompi_neighborexchange) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHER_SIG, mpich)
+
 
 COLL_ALLGATHERS(COLL_PROTO, COLL_NOsep)
 
@@ -79,7 +83,10 @@ COLL_APPLY(action, COLL_ALLGATHERV_SIG, GB) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHERV_SIG, pair) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHERV_SIG, ring) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi) COLL_sep \
-COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi_neighborexchange)
+COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi_neighborexchange) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi_bruck) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHERV_SIG, mpich) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHERV_SIG, mpich_rdb)
 
 COLL_ALLGATHERVS(COLL_PROTO, COLL_NOsep)
 
@@ -107,7 +114,8 @@ COLL_APPLY(action, COLL_ALLREDUCE_SIG, smp_rsag_lr) COLL_sep \
 COLL_APPLY(action, COLL_ALLREDUCE_SIG, smp_rsag_rab) COLL_sep \
 COLL_APPLY(action, COLL_ALLREDUCE_SIG, redbcast) COLL_sep \
 COLL_APPLY(action, COLL_ALLREDUCE_SIG, ompi) COLL_sep \
-COLL_APPLY(action, COLL_ALLREDUCE_SIG, ompi_ring_segmented)
+COLL_APPLY(action, COLL_ALLREDUCE_SIG, ompi_ring_segmented) COLL_sep \
+COLL_APPLY(action, COLL_ALLREDUCE_SIG, mpich)
 
 COLL_ALLREDUCES(COLL_PROTO, COLL_NOsep)
 
@@ -134,7 +142,8 @@
 COLL_APPLY(action, COLL_ALLTOALL_SIG, ring_light_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALL_SIG, ring_mpi_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALL_SIG, ring_one_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALL_SIG, simple) COLL_sep \
-COLL_APPLY(action, COLL_ALLTOALL_SIG, ompi)
+COLL_APPLY(action, COLL_ALLTOALL_SIG, ompi) COLL_sep \
+COLL_APPLY(action, COLL_ALLTOALL_SIG, mpich)
 
 COLL_ALLTOALLS(COLL_PROTO, COLL_NOsep)
 
@@ -156,7 +165,8 @@ COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring_light_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring_mpi_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring_one_barrier) COLL_sep \
-COLL_APPLY(action, COLL_ALLTOALLV_SIG, ompi)
+COLL_APPLY(action, COLL_ALLTOALLV_SIG, ompi) COLL_sep \
+COLL_APPLY(action, COLL_ALLTOALLV_SIG, mpich)
 
 COLL_ALLTOALLVS(COLL_PROTO, COLL_NOsep)
 
@@ -185,7 +195,8 @@ COLL_APPLY(action, COLL_BCAST_SIG, SMP_binomial) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, SMP_linear) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, ompi) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, ompi_split_bintree) COLL_sep \
-COLL_APPLY(action, COLL_BCAST_SIG, ompi_pipeline)
+COLL_APPLY(action, COLL_BCAST_SIG, ompi_pipeline) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, mpich)
 
 COLL_BCASTS(COLL_PROTO, COLL_NOsep)
 
@@ -209,7 +220,8 @@ COLL_APPLY(action, COLL_REDUCE_SIG, ompi_pipeline) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SIG, ompi_basic_linear) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SIG, ompi_in_order_binary) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SIG, ompi_binary) COLL_sep \
-COLL_APPLY(action, COLL_REDUCE_SIG, ompi_binomial)
+COLL_APPLY(action, COLL_REDUCE_SIG, ompi_binomial) COLL_sep \
+COLL_APPLY(action, COLL_REDUCE_SIG, mpich)
 
 COLL_REDUCES(COLL_PROTO, COLL_NOsep)
 
@@ -223,7 +235,12 @@ COLL_REDUCES(COLL_PROTO, COLL_NOsep)
 
 #define COLL_REDUCE_SCATTERS(action, COLL_sep) \
 COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, ompi) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, ompi_basic_recursivehalving) COLL_sep \
-COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, ompi_ring)
+COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, ompi_ring) COLL_sep \
+COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich) COLL_sep \
+COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich_pair) COLL_sep \
+COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich_rdb) COLL_sep \
+COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich_noncomm)
+
 
 COLL_REDUCE_SCATTERS(COLL_PROTO, COLL_NOsep)
 
@@ -239,7 +256,8 @@ COLL_REDUCE_SCATTERS(COLL_PROTO, COLL_NOsep)
 
 #define COLL_SCATTERS(action, COLL_sep) \
 COLL_APPLY(action, COLL_SCATTER_SIG, ompi) COLL_sep \
 COLL_APPLY(action, COLL_SCATTER_SIG, ompi_basic_linear) COLL_sep \
-COLL_APPLY(action, COLL_SCATTER_SIG, ompi_binomial)
+COLL_APPLY(action, COLL_SCATTER_SIG, ompi_binomial) COLL_sep \
+COLL_APPLY(action, COLL_SCATTER_SIG, mpich)
 
 COLL_SCATTERS(COLL_PROTO, COLL_NOsep)
 
@@ -256,7 +274,8 @@ COLL_APPLY(action, COLL_BARRIER_SIG, ompi_two_procs) COLL_sep \
 COLL_APPLY(action, COLL_BARRIER_SIG, ompi_tree) COLL_sep \
 COLL_APPLY(action, COLL_BARRIER_SIG, ompi_bruck) COLL_sep \
 COLL_APPLY(action, COLL_BARRIER_SIG, ompi_recursivedoubling) COLL_sep \
-COLL_APPLY(action, COLL_BARRIER_SIG, ompi_doublering)
+COLL_APPLY(action, COLL_BARRIER_SIG, ompi_doublering) COLL_sep \
+COLL_APPLY(action, COLL_BARRIER_SIG, mpich)
 
 COLL_BARRIERS(COLL_PROTO, COLL_NOsep)
 
diff --git a/src/smpi/colls/reduce_scatter-mpich.c b/src/smpi/colls/reduce_scatter-mpich.c
new file mode 100644
index 0000000000..77eaef5bf8
--- /dev/null
+++ b/src/smpi/colls/reduce_scatter-mpich.c
@@ -0,0 +1,493 @@
+#include "colls_private.h"
+#define MPIR_REDUCE_SCATTER_TAG 222
+
+static inline int MPIU_Mirror_permutation(unsigned int x, int bits)
+{
+    /* a mask for the high order bits that should be copied as-is */
+    int high_mask = ~((0x1 << bits) - 1);
+    int retval = x & high_mask;
+    int i;
+
+    for (i = 0; i < bits; ++i) {
+        unsigned int bitval = (x & (0x1 << i)) >> i; /* 0x1 or 0x0 */
+        retval |= bitval << ((bits - i) - 1);
+    }
+
+    return retval;
+}
+
+
+int smpi_coll_tuned_reduce_scatter_mpich_pair(void *sendbuf, void *recvbuf, int recvcounts[],
+                              MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
+{
+    int rank, comm_size, i;
+    MPI_Aint extent, true_extent, true_lb;
+    int *disps;
+    void *tmp_recvbuf;
+    int mpi_errno = MPI_SUCCESS;
+    int type_size, total_count, nbytes, dst, src;
+    int is_commutative;
+    comm_size = smpi_comm_size(comm);
+    rank = smpi_comm_rank(comm);
+
+    extent =smpi_datatype_get_extent(datatype);
+    smpi_datatype_extent(datatype, &true_lb, &true_extent);
+
+    if (smpi_op_is_commute(op)) {
+        is_commutative = 1;
+    }
+
+    disps = (int*)xbt_malloc( comm_size * sizeof(int));
+
+    total_count = 0;
+    for (i=0; i peer) {
+        /* we have the higher rank: send top half, recv bottom half */
+        recv_offset += size;
+    }
+    else {
+        /* we have the lower rank: recv top half, send bottom half */
+        send_offset += size;
+    }
+
+        smpi_mpi_sendrecv(outgoing_data + send_offset*true_extent,
+                          size, datatype, peer, MPIR_REDUCE_SCATTER_TAG,
+                          incoming_data + recv_offset*true_extent,
+                          size, datatype, peer, MPIR_REDUCE_SCATTER_TAG,
+                          comm, MPI_STATUS_IGNORE);
+        /* always perform the reduction at recv_offset, the data at send_offset
+           is now our peer's responsibility */
+        if (rank > peer) {
+            /* higher ranked value so need to call op(received_data, my_data) */
+            smpi_op_apply(op,
+                          incoming_data + recv_offset*true_extent,
+                          outgoing_data + recv_offset*true_extent,
+                          &size, &datatype );
+            buf0_was_inout = buf0_was_inout;
+        }
+        else {
+            /* lower ranked value so need to call op(my_data, received_data) */
+            smpi_op_apply( op,
+                          outgoing_data + recv_offset*true_extent,
+                          incoming_data + recv_offset*true_extent,
+                          &size, &datatype);
+            buf0_was_inout = !buf0_was_inout;
+        }
+
+        /* the next round of send/recv needs to happen within the block (of size
+           "size") that we just received and reduced */
+        send_offset = recv_offset;
+    }
+
+    xbt_assert(size == recvcounts[rank]);
+
+    /* copy the reduced data to the recvbuf */
+    result_ptr = (char *)(buf0_was_inout ? tmp_buf0 : tmp_buf1) + recv_offset * true_extent;
+    mpi_errno = smpi_datatype_copy(result_ptr, size, datatype,
+                                   recvbuf, size, datatype);
+    if (mpi_errno) return(mpi_errno);
+    return MPI_SUCCESS;
+}
+
+
+
+int smpi_coll_tuned_reduce_scatter_mpich_rdb(void *sendbuf, void *recvbuf, int recvcounts[],
+                              MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
+{
+    int rank, comm_size, i;
+    MPI_Aint extent, true_extent, true_lb;
+    int  *disps;
+    void *tmp_recvbuf, *tmp_results;
+    int mpi_errno = MPI_SUCCESS;
+    int type_size, dis[2], blklens[2], total_count, nbytes, dst;
+    int mask, dst_tree_root, my_tree_root, j, k;
+    int received;
+    MPI_Datatype sendtype, recvtype;
+    int nprocs_completed, tmp_mask, tree_root, is_commutative;
+    comm_size = smpi_comm_size(comm);
+    rank = smpi_comm_rank(comm);
+
+    extent =smpi_datatype_get_extent(datatype);
+    smpi_datatype_extent(datatype, &true_lb, &true_extent);
+
+    if (smpi_op_is_commute(op)) {
+        is_commutative = 1;
+    }
+
+    disps = (int*)xbt_malloc( comm_size * sizeof(int));
+
+    total_count = 0;
+    for (i=0; i> i;
+        dst_tree_root <<= i;
+
+        my_tree_root = rank >> i;
+        my_tree_root <<= i;
+
+        /* At step 1, processes exchange (n-n/p) amount of
+           data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
+           amount of data, and so forth. We use derived datatypes for this.
+
+           At each step, a process does not need to send data
+           indexed from my_tree_root to
+           my_tree_root+mask-1. Similarly, a process won't receive
+           data indexed from dst_tree_root to dst_tree_root+mask-1. */
+
+        /* calculate sendtype */
+        blklens[0] = blklens[1] = 0;
+        for (j=0; j comm_size) {
+            nprocs_completed = comm_size - my_tree_root - mask;
+            /* nprocs_completed is the number of processes in this
+               subtree that have all the data. Send data to others
+               in a tree fashion. First find root of current tree
+               that is being divided into two. k is the number of
+               least-significant bits in this process's rank that
+               must be zeroed out to find the rank of the root */
+            j = mask;
+            k = 0;
+            while (j) {
+                j >>= 1;
+                k++;
+            }
+            k--;
+
+            tmp_mask = mask >> 1;
+            while (tmp_mask) {
+                dst = rank ^ tmp_mask;
+
+                tree_root = rank >> k;
+                tree_root <<= k;
+
+                /* send only if this proc has data and destination
+                   doesn't have data. at any step, multiple processes
+                   can send if they have the data */
+                if ((dst > rank) &&
+                    (rank < tree_root + nprocs_completed)
+                    && (dst >= tree_root + nprocs_completed)) {
+                    /* send the current result */
+                    smpi_mpi_send(tmp_recvbuf, 1, recvtype,
+                                  dst, MPIR_REDUCE_SCATTER_TAG,
+                                  comm);
+                }
+                /* recv only if this proc. doesn't have data and sender
+                   has data */
+                else if ((dst < rank) &&
+                         (dst < tree_root + nprocs_completed) &&
+                         (rank >= tree_root + nprocs_completed)) {
+                    smpi_mpi_recv(tmp_recvbuf, 1, recvtype, dst,
+                                  MPIR_REDUCE_SCATTER_TAG,
+                                  comm, MPI_STATUS_IGNORE);
+                    received = 1;
+                }
+                tmp_mask >>= 1;
+                k--;
+            }
+        }
+
+        /* The following reduction is done here instead of after
+           the MPIC_Sendrecv_ft or MPIC_Recv_ft above. This is
+           because to do it above, in the noncommutative
+           case, we would need an extra temp buffer so as not to
+           overwrite temp_recvbuf, because temp_recvbuf may have
+           to be communicated to other processes in the
+           non-power-of-two case. To avoid that extra allocation,
+           we do the reduce here.
+        */
+        if (received) {
+            if (is_commutative || (dst_tree_root < my_tree_root)) {
+                {
+                    smpi_op_apply(op,
+                           tmp_recvbuf, tmp_results, &blklens[0],
+                           &datatype);
+                    smpi_op_apply(op,
+                           ((char *)tmp_recvbuf + dis[1]*extent),
+                           ((char *)tmp_results + dis[1]*extent),
+                           &blklens[1], &datatype);
+                }
+            }
+            else {
+                {
+                    smpi_op_apply(op,
+                           tmp_results, tmp_recvbuf, &blklens[0],
+                           &datatype);
+                    smpi_op_apply(op,
+                           ((char *)tmp_results + dis[1]*extent),
+                           ((char *)tmp_recvbuf + dis[1]*extent),
+                           &blklens[1], &datatype);
+                }
+                /* copy result back into tmp_results */
+                mpi_errno = smpi_datatype_copy(tmp_recvbuf, 1, recvtype,
+                                               tmp_results, 1, recvtype);
+                if (mpi_errno) return(mpi_errno);
+            }
+        }
+
+        //smpi_datatype_free(&sendtype);
+        //smpi_datatype_free(&recvtype);
+
+        mask <<= 1;
+        i++;
+    }
+
+    /* now copy final results from tmp_results to recvbuf */
+    mpi_errno = smpi_datatype_copy(((char *)tmp_results+disps[rank]*extent),
+                                   recvcounts[rank], datatype, recvbuf,
+                                   recvcounts[rank], datatype);
+    if (mpi_errno) return(mpi_errno);
+
+    return MPI_SUCCESS;
+}
+
diff --git a/src/smpi/colls/smpi_mpich_selector.c b/src/smpi/colls/smpi_mpich_selector.c
new file mode 100644
index 0000000000..c95613598f
--- /dev/null
+++ b/src/smpi/colls/smpi_mpich_selector.c
@@ -0,0 +1,696 @@
+/* selector for collective algorithms based on mpich decision logic */
+
+/* Copyright (c) 2009, 2010. The SimGrid Team.
+ * All rights reserved.                                                     */
+
+/* This program is free software; you can redistribute it and/or modify it
+ * under the terms of the license (GNU LGPL) which comes with this package. */
+
+#include "colls_private.h"
+
+/* This is the default implementation of allreduce. The algorithm is:
+
+   Algorithm: MPI_Allreduce
+
+   For the heterogeneous case, we call MPI_Reduce followed by MPI_Bcast
+   in order to meet the requirement that all processes must have the
+   same result. For the homogeneous case, we use the following algorithms.
+
+
+   For long messages and for builtin ops and if count >= pof2 (where
+   pof2 is the nearest power-of-two less than or equal to the number
+   of processes), we use Rabenseifner's algorithm (see
+   http://www.hlrs.de/mpi/myreduce.html).
+   This algorithm implements the allreduce in two steps: first a
+   reduce-scatter, followed by an allgather. A recursive-halving
+   algorithm (beginning with processes that are distance 1 apart) is
+   used for the reduce-scatter, and a recursive doubling
+   algorithm is used for the allgather. The non-power-of-two case is
+   handled by dropping to the nearest lower power-of-two: the first
+   few even-numbered processes send their data to their right neighbors
+   (rank+1), and the reduce-scatter and allgather happen among the remaining
+   power-of-two processes. At the end, the first few even-numbered
+   processes get the result from their right neighbors.
+
+   For the power-of-two case, the cost for the reduce-scatter is
+   lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
+   allgather is lgp.alpha + n.((p-1)/p).beta. Therefore, the
+   total cost is:
+   Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma
+
+   For the non-power-of-two case,
+   Cost = (2.floor(lgp)+2).alpha + (2.((p-1)/p) + 2).n.beta + n.(1+(p-1)/p).gamma
+
+
+   For short messages, for user-defined ops, and for count < pof2
+   we use a recursive doubling algorithm (similar to the one in
+   MPI_Allgather). We use this algorithm in the case of user-defined ops
+   because in this case derived datatypes are allowed, and the user
+   could pass basic datatypes on one process and derived on another as
+   long as the type maps are the same. Breaking up derived datatypes
+   to do the reduce-scatter is tricky.
+
+   Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma
+
+   Possible improvements:
+
+   End Algorithm: MPI_Allreduce
+*/
+int smpi_coll_tuned_allreduce_mpich(void *sbuf, void *rbuf, int count,
+                        MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
+{
+    size_t dsize, block_dsize;
+    int comm_size = smpi_comm_size(comm);
+    const size_t large_message = 2048; //MPIR_PARAM_ALLREDUCE_SHORT_MSG_SIZE
+
+    dsize = smpi_datatype_size(dtype);
+    block_dsize = dsize * count;
+
+
+    /* find nearest power-of-two less than or equal to comm_size */
+    int pof2 = 1;
+    while (pof2 <= comm_size) pof2 <<= 1;
+    pof2 >>=1;
+
+    if (block_dsize > large_message && count >= pof2 && smpi_op_is_commute(op)) {
+      //for long messages
+       return (smpi_coll_tuned_allreduce_rab_rsag (sbuf, rbuf,
+                                                      count, dtype,
+                                                      op, comm));
+    }else {
+      //for short ones and count < pof2
+      return (smpi_coll_tuned_allreduce_rdb (sbuf, rbuf,
+                                                count, dtype,
+                                                op, comm));
+    }
+}
+
+
+/* This is the default implementation of alltoall. The algorithm is:
+
+   Algorithm: MPI_Alltoall
+
+   We use four algorithms for alltoall. For short messages and
+   (comm_size >= 8), we use the algorithm by Jehoshua Bruck et al,
+   IEEE TPDS, Nov. 1997. It is a store-and-forward algorithm that
+   takes lgp steps. Because of the extra communication, the bandwidth
+   requirement is (n/2).lgp.beta.
+
+   Cost = lgp.alpha + (n/2).lgp.beta
+
+   where n is the total amount of data a process needs to send to all
+   other processes.
+
+   For medium size messages and (short messages for comm_size < 8), we
+   use an algorithm that posts all irecvs and isends and then does a
+   waitall. We scatter the order of sources and destinations among the
+   processes, so that all processes don't try to send/recv to/from the
+   same process at the same time.
+
+   *** Modification: We post only a small number of isends and irecvs
+   at a time and wait on them as suggested by Tony Ladd. ***
+   *** See comments below about an additional modification that
+   we may want to consider ***
+
+   For long messages and power-of-two number of processes, we use a
+   pairwise exchange algorithm, which takes p-1 steps. We
+   calculate the pairs by using an exclusive-or algorithm:
+   for (i=1; i=8 -> bruck
+
+// medium size messages and (short messages for comm_size < 8), we
+// use an algorithm that posts all irecvs and isends and then does a
+// waitall.
+
+// For long messages and power-of-two number of processes, we use a
+// pairwise exchange algorithm
+
+// For a non-power-of-two number of processes, we use an
+// algorithm in which, in step i, each process receives from (rank-i)
+// and sends to (rank+i).
+
+
+    dsize = smpi_datatype_size(sdtype);
+    block_dsize = dsize * scount;
+
+    if ((block_dsize < short_size) && (communicator_size >= 8)) {
+        return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype,
+                                              rbuf, rcount, rdtype,
+                                              comm);
+
+    } else if (block_dsize < medium_size) {
+        return smpi_coll_tuned_alltoall_simple(sbuf, scount, sdtype,
+                                               rbuf, rcount, rdtype,
+                                               comm);
+    }else if (communicator_size%2){
+        return smpi_coll_tuned_alltoall_ring(sbuf, scount, sdtype,
+                                             rbuf, rcount, rdtype,
+                                             comm);
+    }
+
+    return smpi_coll_tuned_alltoall_pair (sbuf, scount, sdtype,
+                                          rbuf, rcount, rdtype,
+                                          comm);
+}
+
+int smpi_coll_tuned_alltoallv_mpich(void *sbuf, int *scounts, int *sdisps,
+                                    MPI_Datatype sdtype,
+                                    void *rbuf, int *rcounts, int *rdisps,
+                                    MPI_Datatype rdtype,
+                                    MPI_Comm comm
+                                    )
+{
+    /* For starters, just keep the original algorithm. */
+    return smpi_coll_tuned_alltoallv_bruck(sbuf, scounts, sdisps, sdtype,
+                                           rbuf, rcounts, rdisps, rdtype,
+                                           comm);
+}
+
+
+int smpi_coll_tuned_barrier_mpich(MPI_Comm comm)
+{
+    return smpi_coll_tuned_barrier_ompi_bruck(comm);
+}
+
+/* This is the default implementation of broadcast. The algorithm is:
+
+   Algorithm: MPI_Bcast
+
+   For short messages, we use a binomial tree algorithm.
+   Cost = lgp.alpha + n.lgp.beta
+
+   For long messages, we do a scatter followed by an allgather.
+   We first scatter the buffer using a binomial tree algorithm. This costs
+   lgp.alpha + n.((p-1)/p).beta
+   If the datatype is contiguous and the communicator is homogeneous,
+   we treat the data as bytes and divide (scatter) it among processes
+   by using ceiling division. For the noncontiguous or heterogeneous
+   cases, we first pack the data into a temporary buffer by using
+   MPI_Pack, scatter it as bytes, and unpack it after the allgather.
+
+   For the allgather, we use a recursive doubling algorithm for
+   medium-size messages and power-of-two number of processes. This
+   takes lgp steps. In each step pairs of processes exchange all the
+   data they have (we take care of non-power-of-two situations). This
+   costs approximately lgp.alpha + n.((p-1)/p).beta. (Approximately
+   because it may be slightly more in the non-power-of-two case, but
+   it's still a logarithmic algorithm.) Therefore, for long messages
+   Total Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta
+
+   Note that this algorithm has twice the latency of the tree algorithm
+   we use for short messages, but requires lower bandwidth: 2.n.beta
+   versus n.lgp.beta. Therefore, for long messages and when lgp > 2,
+   this algorithm will perform better.
+
+   For long messages and for medium-size messages and non-power-of-two
+   processes, we use a ring algorithm for the allgather, which
+   takes p-1 steps, because it performs better than recursive doubling.
+   Total Cost = (lgp+p-1).alpha + 2.n.((p-1)/p).beta
+
+   Possible improvements:
+   For clusters of SMPs, we may want to do something differently to
+   take advantage of shared memory on each node.
+
+   End Algorithm: MPI_Bcast
+*/
+
+
+int smpi_coll_tuned_bcast_mpich(void *buff, int count,
+                                MPI_Datatype datatype, int root,
+                                MPI_Comm comm
+                                )
+{
+    /* Decision function based on MX results for
+       messages up to 36MB and communicator sizes up to 64 nodes */
+    const size_t small_message_size = 12288;
+    const size_t intermediate_message_size = 524288;
+
+    int communicator_size;
+    //int segsize = 0;
+    size_t message_size, dsize;
+
+    communicator_size = smpi_comm_size(comm);
+
+    /* else we need data size for decision function */
+    dsize = smpi_datatype_size(datatype);
+    message_size = dsize * (unsigned long)count;   /* needed for decision */
+
+    /* Handle messages of small and intermediate size, and
+       single-element broadcasts */
+    if ((message_size < small_message_size) || (communicator_size <= 8)) {
+        /* Binomial without segmentation */
+        return smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype,
+                                                    root, comm);
+
+    } else if (message_size < intermediate_message_size && !(communicator_size%2)) {
+        // SplittedBinary with 1KB segments
+        return smpi_coll_tuned_bcast_scatter_rdb_allgather(buff, count, datatype,
+                                                           root, comm);
+
+    }
+    //Handle large message sizes
+    return smpi_coll_tuned_bcast_scatter_LR_allgather (buff, count, datatype,
+                                                       root, comm);
+
+}
+
+
+
+/* This is the default implementation of reduce. The algorithm is:
+
+   Algorithm: MPI_Reduce
+
+   For long messages and for builtin ops and if count >= pof2 (where
+   pof2 is the nearest power-of-two less than or equal to the number
+   of processes), we use Rabenseifner's algorithm (see
+   http://www.hlrs.de/organization/par/services/models/mpi/myreduce.html ).
+   This algorithm implements the reduce in two steps: first a
+   reduce-scatter, followed by a gather to the root. A
+   recursive-halving algorithm (beginning with processes that are
+   distance 1 apart) is used for the reduce-scatter, and a binomial tree
+   algorithm is used for the gather. The non-power-of-two case is
+   handled by dropping to the nearest lower power-of-two: the first
+   few odd-numbered processes send their data to their left neighbors
+   (rank-1), and the reduce-scatter happens among the remaining
+   power-of-two processes. If the root is one of the excluded
+   processes, then after the reduce-scatter, rank 0 sends its result to
+   the root and exits; the root now acts as rank 0 in the binomial tree
+   algorithm for gather.
+
+   For the power-of-two case, the cost for the reduce-scatter is
+   lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
+   gather to root is lgp.alpha + n.((p-1)/p).beta. Therefore, the
+   total cost is:
+   Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma
+
+   For the non-power-of-two case, assuming the root is not one of the
+   odd-numbered processes that get excluded in the reduce-scatter,
+   Cost = (2.floor(lgp)+1).alpha + (2.((p-1)/p) + 1).n.beta +
+           n.(1+(p-1)/p).gamma
+
+
+   For short messages, user-defined ops, and count < pof2, we use a
+   binomial tree algorithm for both short and long messages.
+
+   Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma
+
+
+   We use the binomial tree algorithm in the case of user-defined ops
+   because in this case derived datatypes are allowed, and the user
+   could pass basic datatypes on one process and derived on another as
+   long as the type maps are the same. Breaking up derived datatypes
+   to do the reduce-scatter is tricky.
+
+   FIXME: Per the MPI-2.1 standard this case is not possible. We
+   should be able to use the reduce-scatter/gather approach as long as
+   count >= pof2.  [goodell@ 2009-01-21]
+
+   Possible improvements:
+
+   End Algorithm: MPI_Reduce
+*/
+
+
+int smpi_coll_tuned_reduce_mpich( void *sendbuf, void *recvbuf,
+                                  int count, MPI_Datatype datatype,
+                                  MPI_Op op, int root,
+                                  MPI_Comm comm
+                                  )
+{
+    int communicator_size=0;
+    //int segsize = 0;
+    size_t message_size, dsize;
+    communicator_size = smpi_comm_size(comm);
+
+    /* need data size for decision function */
+    dsize=smpi_datatype_size(datatype);
+    message_size = dsize * count;   /* needed for decision */
+
+    int pof2 = 1;
+    while (pof2 <= communicator_size) pof2 <<= 1;
+    pof2 >>= 1;
+
+
+    if ((count < pof2) || (message_size < 2048) || !smpi_op_is_commute(op)) {
+        return smpi_coll_tuned_reduce_binomial (sendbuf, recvbuf, count, datatype, op, root, comm);
+    }
+    return smpi_coll_tuned_reduce_scatter_gather(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
+                                                 segsize, max_requests*/);
+}
+
+
+
+/* This is the default implementation of reduce_scatter. The algorithm is:
+
+   Algorithm: MPI_Reduce_scatter
+
+   If the operation is commutative, for short and medium-size
+   messages, we use a recursive-halving
+   algorithm in which the first p/2 processes send the second n/2 data
+   to their counterparts in the other half and receive the first n/2
+   data from them. This procedure continues recursively, halving the
+   data communicated at each step, for a total of lgp steps. If the
+   number of processes is not a power-of-two, we convert it to the
+   nearest lower power-of-two by having the first few even-numbered
+   processes send their data to the neighboring odd-numbered process
+   at (rank+1). Those odd-numbered processes compute the result for
+   their left neighbor as well in the recursive halving algorithm, and
+   then at the end send the result back to the processes that didn't
+   participate.
+   Therefore, if p is a power-of-two,
+   Cost = lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma
+   If p is not a power-of-two,
+   Cost = (floor(lgp)+2).alpha + n.(1+(p-1+n)/p).beta + n.(1+(p-1)/p).gamma
+   The above cost in the non power-of-two case is approximate because
+   there is some imbalance in the amount of work each process does
+   because some processes do the work of their neighbors as well.
+
+   For commutative operations and very long messages we use
+   a pairwise exchange algorithm similar to
+   the one used in MPI_Alltoall. At step i, each process sends n/p
+   amount of data to (rank+i) and receives n/p amount of data from
+   (rank-i).
+   Cost = (p-1).alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma
+
+
+   If the operation is not commutative, we do the following:
+
+   We use a recursive doubling algorithm, which
+   takes lgp steps. At step 1, processes exchange (n-n/p) amount of
+   data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
+   amount of data, and so forth.
+
+   Cost = lgp.alpha + n.(lgp-(p-1)/p).beta + n.(lgp-(p-1)/p).gamma
+
+   Possible improvements:
+
+   End Algorithm: MPI_Reduce_scatter
+*/
+
+
+int smpi_coll_tuned_reduce_scatter_mpich( void *sbuf, void *rbuf,
+                                          int *rcounts,
+                                          MPI_Datatype dtype,
+                                          MPI_Op op,
+                                          MPI_Comm comm
+                                          )
+{
+    int comm_size, i;
+    size_t total_message_size, dsize;
+    int zerocounts = 0;
+
+    XBT_DEBUG("smpi_coll_tuned_reduce_scatter_mpich");
+
+    comm_size = smpi_comm_size(comm);
+    // We need data size for decision function
+    dsize=smpi_datatype_size(dtype);
+    total_message_size = 0;
+    for (i = 0; i < comm_size; i++) {
+        total_message_size += rcounts[i];
+        if (0 == rcounts[i]) {
+            zerocounts = 1;
+        }
+    }
+
+    if( smpi_op_is_commute(op) &&  total_message_size > 524288) {
+        return smpi_coll_tuned_reduce_scatter_mpich_pair (sbuf, rbuf, rcounts,
+                                                          dtype, op,
+                                                          comm);
+    }else if (!smpi_op_is_commute(op)) {
+        int is_block_regular = 1;
+        for (i = 0; i < (comm_size - 1); ++i) {
+            if (rcounts[i] != rcounts[i+1]) {
+                is_block_regular = 0;
+                break;
+            }
+        }
+
+        /* slightly retask pof2 to mean pof2 equal or greater, not always greater as it is above */
+        int pof2 = 1;
+        while (pof2 < comm_size) pof2 <<= 1;
+
+        if (pof2 == comm_size && is_block_regular) {
+            /* noncommutative, pof2 size, and block regular */
+            return smpi_coll_tuned_reduce_scatter_mpich_noncomm(sbuf, rbuf, rcounts, dtype, op, comm);
+        }
+
+        return smpi_coll_tuned_reduce_scatter_mpich_rdb(sbuf, rbuf, rcounts, dtype, op, comm);
+    }else{
+        return smpi_coll_tuned_reduce_scatter_mpich_rdb(sbuf, rbuf, rcounts, dtype, op, comm);
+    }
+}
+
+
+/* This is the default implementation of allgather. The algorithm is:
+
+   Algorithm: MPI_Allgather
+
+   For short messages and non-power-of-two no. of processes, we use
+   the algorithm from the Jehoshua Bruck et al IEEE TPDS Nov 97
+   paper. It is a variant of the dissemination algorithm for
+   barrier. It takes ceiling(lg p) steps.
+
+   Cost = lgp.alpha + n.((p-1)/p).beta
+   where n is total size of data gathered on each process.
+
+   For short or medium-size messages and power-of-two no. of
+   processes, we use the recursive doubling algorithm.
+
+   Cost = lgp.alpha + n.((p-1)/p).beta
+
+   TODO: On TCP, we may want to use recursive doubling instead of the Bruck
+   algorithm in all cases because of the pairwise-exchange property of
+   recursive doubling (see Benson et al paper in Euro PVM/MPI
+   2003).
+
+   It is interesting to note that either of the above algorithms for
+   MPI_Allgather has the same cost as the tree algorithm for MPI_Gather!
+
+   For long messages or medium-size messages and non-power-of-two
+   no. of processes, we use a ring algorithm. In the first step, each
+   process i sends its contribution to process i+1 and receives
+   the contribution from process i-1 (with wrap-around). From the
+   second step onwards, each process i forwards to process i+1 the
+   data it received from process i-1 in the previous step. This takes
+   a total of p-1 steps.
+
+   Cost = (p-1).alpha + n.((p-1)/p).beta
+
+   We use this algorithm instead of recursive doubling for long
+   messages because we find that this communication pattern (nearest
+   neighbor) performs twice as fast as recursive doubling for long
+   messages (on Myrinet and IBM SP).
+
+   Possible improvements:
+
+   End Algorithm: MPI_Allgather
+*/
+
+int smpi_coll_tuned_allgather_mpich(void *sbuf, int scount,
+                                    MPI_Datatype sdtype,
+                                    void* rbuf, int rcount,
+                                    MPI_Datatype rdtype,
+                                    MPI_Comm comm
+                                    )
+{
+    int communicator_size, pow2_size;
+    size_t dsize, total_dsize;
+
+    communicator_size = smpi_comm_size(comm);
+
+    /* Determine complete data size */
+    dsize=smpi_datatype_size(sdtype);
+    total_dsize = dsize * scount * communicator_size;
+
+    for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<=1);
+
+    /* Decision as in MPICH-2
+       presented in Thakur et al. "Optimization of Collective Communication
+       Operations in MPICH", International Journal of High Performance Computing
+       Applications, Vol. 19, No. 1, 49-66 (2005)
+       - for power-of-two processes and small and medium size messages
+         (up to 512KB) use recursive doubling
+       - for non-power-of-two processes and small messages (80KB) use bruck,
+       - for everything else use ring.
+    */
+    if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
+        return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
+                                             rbuf, rcount, rdtype,
+                                             comm);
+    } else if (total_dsize <= 81920) {
+        return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
+                                               rbuf, rcount, rdtype,
+                                               comm);
+    }
+    return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
+                                          rbuf, rcount, rdtype,
+                                          comm);
+}
+
+
+/* This is the default implementation of allgatherv. The algorithm is:
+
+   Algorithm: MPI_Allgatherv
+
+   For short messages and non-power-of-two no. of processes, we use
+   the algorithm from the Jehoshua Bruck et al IEEE TPDS Nov 97
+   paper. It is a variant of the dissemination algorithm for
+   barrier. It takes ceiling(lg p) steps.
+
+   Cost = lgp.alpha + n.((p-1)/p).beta
+   where n is total size of data gathered on each process.
+
+   For short or medium-size messages and power-of-two no. of
+   processes, we use the recursive doubling algorithm.
+
+   Cost = lgp.alpha + n.((p-1)/p).beta
+
+   TODO: On TCP, we may want to use recursive doubling instead of the Bruck
+   algorithm in all cases because of the pairwise-exchange property of
+   recursive doubling (see Benson et al paper in Euro PVM/MPI
+   2003).
+
+   For long messages or medium-size messages and non-power-of-two
+   no. of processes, we use a ring algorithm. In the first step, each
+   process i sends its contribution to process i+1 and receives
+   the contribution from process i-1 (with wrap-around). From the
+   second step onwards, each process i forwards to process i+1 the
+   data it received from process i-1 in the previous step. This takes
+   a total of p-1 steps.
+
+   Cost = (p-1).alpha + n.((p-1)/p).beta
+
+   Possible improvements:
+
+   End Algorithm: MPI_Allgatherv
+*/
+int smpi_coll_tuned_allgatherv_mpich(void *sbuf, int scount,
+                                     MPI_Datatype sdtype,
+                                     void* rbuf, int *rcounts,
+                                     int *rdispls,
+                                     MPI_Datatype rdtype,
+                                     MPI_Comm  comm
+                                     )
+{
+    int communicator_size, pow2_size;
+    size_t dsize, total_dsize;
+
+    communicator_size = smpi_comm_size(comm);
+
+    /* Determine complete data size */
+    dsize=smpi_datatype_size(sdtype);
+    total_dsize = dsize * scount * communicator_size;
+
+    for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<=1);
+
+    if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
+        return smpi_coll_tuned_allgatherv_mpich_rdb(sbuf, scount, sdtype,
+                                                    rbuf, rcounts, rdispls, rdtype,
+                                                    comm);
+    } else if (total_dsize <= 81920) {
+        return smpi_coll_tuned_allgatherv_ompi_bruck(sbuf, scount, sdtype,
+                                                     rbuf, rcounts, rdispls, rdtype,
+                                                     comm);
+    }
+    return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
+                                           rbuf, rcounts, rdispls, rdtype,
+                                           comm);
+}
+
+/* This is the default implementation of gather. The algorithm is:
+
+   Algorithm: MPI_Gather
+
+   We use a binomial tree algorithm for both short and long
+   messages. At nodes other than leaf nodes we need to allocate a
+   temporary buffer to store the incoming message. If the root is not
+   rank 0, for very small messages, we pack it into a temporary
+   contiguous buffer and reorder it to be placed in the right
+   order. For small (but not very small) messages, we use a derived
+   datatype to unpack the incoming data into non-contiguous buffers in
+   the right order. In the heterogeneous case we first pack the
+   buffers by using MPI_Pack and then do the gather.
+
+   Cost = lgp.alpha + n.((p-1)/p).beta
+   where n is the total size of the data gathered at the root.
+
+   Possible improvements:
+
+   End Algorithm: MPI_Gather
+*/
+
+int smpi_coll_tuned_gather_mpich(void *sbuf, int scount,
+                                 MPI_Datatype sdtype,
+                                 void* rbuf, int rcount,
+                                 MPI_Datatype rdtype,
+                                 int root,
+                                 MPI_Comm comm
+                                 )
+{
+    return smpi_coll_tuned_gather_ompi_binomial (sbuf, scount, sdtype,
+                                                 rbuf, rcount, rdtype,
+                                                 root, comm);
+}
+
+/* This is the default implementation of scatter. The algorithm is:
+
+   Algorithm: MPI_Scatter
+
+   We use a binomial tree algorithm for both short and
+   long messages. At nodes other than leaf nodes we need to allocate
+   a temporary buffer to store the incoming message. If the root is
+   not rank 0, we reorder the sendbuf in order of relative ranks by
+   copying it into a temporary buffer, so that all the sends from the
+   root are contiguous and in the right order. In the heterogeneous
+   case, we first pack the buffer by using MPI_Pack and then do the
+   scatter.
+
+   Cost = lgp.alpha + n.((p-1)/p).beta
+   where n is the total size of the data to be scattered from the root.
+
+   Possible improvements:
+
+   End Algorithm: MPI_Scatter
+*/
+
+
+int smpi_coll_tuned_scatter_mpich(void *sbuf, int scount,
+                                  MPI_Datatype sdtype,
+                                  void* rbuf, int rcount,
+                                  MPI_Datatype rdtype,
+                                  int root, MPI_Comm comm
+                                  )
+{
+    return smpi_coll_tuned_scatter_ompi_binomial (sbuf, scount, sdtype,
+                                                  rbuf, rcount, rdtype,
+                                                  root, comm);
+}
+
diff --git a/src/smpi/smpi_mpi_dt.c b/src/smpi/smpi_mpi_dt.c
index e42513b15f..b233c413d4 100644
--- a/src/smpi/smpi_mpi_dt.c
+++ b/src/smpi/smpi_mpi_dt.c
@@ -424,7 +424,7 @@ int smpi_datatype_contiguous(int count, MPI_Datatype old_type, MPI_Datatype* new
 int smpi_datatype_vector(int count, int blocklen, int stride, MPI_Datatype old_type, MPI_Datatype* new_type)
 {
   int retval;
-  if (blocklen<=0) return MPI_ERR_ARG;
+  if (blocklen<0) return MPI_ERR_ARG;
   MPI_Aint lb = 0;
   MPI_Aint ub = 0;
   if(count>0){
@@ -565,7 +565,7 @@ void free_hvector(MPI_Datatype* d){
 int smpi_datatype_hvector(int count, int blocklen, MPI_Aint stride, MPI_Datatype old_type, MPI_Datatype* new_type)
 {
   int retval;
-  if (blocklen<=0) return MPI_ERR_ARG;
+  if (blocklen<0) return MPI_ERR_ARG;
   MPI_Aint lb = 0;
   MPI_Aint ub = 0;
   if(count>0){
@@ -726,7 +726,7 @@ int smpi_datatype_indexed(int count, int* blocklens, int* indices, MPI_Datatype
   }
 
   for(i=0; i< count; i++){
-    if   (blocklens[i]<=0)
+    if   (blocklens[i]<0)
       return MPI_ERR_ARG;
 
     size += blocklens[i];
@@ -886,7 +886,7 @@ int smpi_datatype_hindexed(int count, int* blocklens, MPI_Aint* indices, MPI_Dat
     ub=indices[0] + blocklens[0]*smpi_datatype_ub(old_type);
   }
   for(i=0; i< count; i++){
-    if   (blocklens[i]<=0)
+    if   (blocklens[i]<0)
       return MPI_ERR_ARG;
 
     size += blocklens[i];
@@ -1054,7 +1054,7 @@ int smpi_datatype_struct(int count, int* blocklens, MPI_Aint* indices, MPI_Datat
   int forced_lb=0;
   int forced_ub=0;
   for(i=0; i< count; i++){
-    if   (blocklens[i]<=0)
+    if   (blocklens[i]<0)
       return MPI_ERR_ARG;
     if (old_types[i]->has_subtype == 1)
       contiguous=0;
diff --git a/teshsuite/smpi/barrier_coll.tesh b/teshsuite/smpi/barrier_coll.tesh
index a65b0de18c..f5d2c79901 100644
--- a/teshsuite/smpi/barrier_coll.tesh
+++ b/teshsuite/smpi/barrier_coll.tesh
@@ -3,8 +3,41 @@
 ! output sort
 
 p Test scatter
-$ ../../bin/smpirun -map -hostfile ${srcdir:=.}/hostfile -platform ${srcdir:=.}/../../examples/msg/small_platform.xml -np 16 --log=xbt_cfg.thres:critical ./barrier_coll
-> ... Barrier ....
+$ ../../bin/smpirun -map -hostfile ${srcdir:=.}/hostfile -platform ${srcdir:=.}/../../examples/msg/small_platform.xml -np 16 --log=xbt_cfg.thres:critical ./scatter
+> [0] ok.
+> [0] ok.
+> [10] ok.
+> [10] ok.
+> [11] ok.
+> [11] ok.
+> [12] ok.
+> [12] ok.
+> [13] ok.
+> [13] ok.
+> [14] ok.
+> [14] ok.
+> [15] ok.
+> [15] ok.
+> [1] ok.
+> [1] ok.
+> [2] ok.
+> [2] ok.
+> [3] ok.
+> [3] ok.
+> [4] ok.
+> [4] ok.
+> [5] ok.
+> [5] ok.
+> [6] ok.
+> [6] ok.
+> [7] ok.
+> [7] ok.
+> [8] ok.
+> [8] ok.
+> [9] ok.
+> [9] ok.
+> ** IBM Test Result: ...
+> ** Small Test Result: ...
 > You requested to use 16 processes, but there is only 5 processes in your hostfile...
 > [0.000000] [surf_config/INFO] Switching workstation model to compound since you changed the network and/or cpu model(s)
 > [rank 0] -> Tremblay
@@ -23,4 +56,3 @@ $ ../../bin/smpirun -map -hostfile ${srcdir:=.}/hostfile -platform ${srcdir:=.}/
 > [rank 7] -> Fafard
 > [rank 8] -> Ginette
 > [rank 9] -> Bourassa
-
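
A minimal sketch of how the options added by this commit are meant to be used at run time, assuming an application binary ./my_app (hypothetical) and the hostfile and platform file from the tesh examples above:

$ smpirun -hostfile ./hostfile -platform ./small_platform.xml -np 16 --cfg=smpi/coll_selector:mpich ./my_app
$ smpirun -hostfile ./hostfile -platform ./small_platform.xml -np 16 --cfg=smpi/reduce_scatter:mpich_rdb ./my_app

The first command routes every collective through the MPICH-style decision functions in smpi_mpich_selector.c; the second forces one specific algorithm for a single collective, exactly as the AddTests.cmake loops above do for each tested algorithm.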