From: Augustin Degomme
Date: Tue, 11 Jun 2013 13:26:33 +0000 (+0200)
Subject: add allgatherv algo from ompi
X-Git-Tag: v3_9_90~276^2~15
X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/c25acb156967c222e2512e0168767fe488e7d25c?hp=d77e8454ea1c527ebec2b4629dcd3e90ecccf51e

add allgatherv algo from ompi
---
diff --git a/buildtools/Cmake/AddTests.cmake b/buildtools/Cmake/AddTests.cmake
index e7c8abb716..bef08edd8d 100644
--- a/buildtools/Cmake/AddTests.cmake
+++ b/buildtools/Cmake/AddTests.cmake
@@ -376,7 +376,7 @@ if(NOT enable_memcheck)
                            smp_simple spreading_simple ompi ompi_neighborexchange)
     ADD_TEST(smpi-allgather-coll-${ALLGATHER_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/allgather:${ALLGATHER_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/allgather_coll.tesh)
   ENDFOREACH()
-  FOREACH (ALLGATHERV_COLL default GB pair ring ompi)
+  FOREACH (ALLGATHERV_COLL default GB pair ring ompi ompi_neighborexchange)
     ADD_TEST(smpi-allgatherv-coll-${ALLGATHERV_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/allgatherv:${ALLGATHERV_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/allgatherv_coll.tesh)
   ENDFOREACH()
   FOREACH (ALLREDUCE_COLL default lr NTS rab1 rab2 rab_rdb
diff --git a/buildtools/Cmake/DefinePackages.cmake b/buildtools/Cmake/DefinePackages.cmake
index ce0f758b6f..2e31a12106 100644
--- a/buildtools/Cmake/DefinePackages.cmake
+++ b/buildtools/Cmake/DefinePackages.cmake
@@ -132,6 +132,7 @@ set(SMPI_SRC
   src/smpi/colls/allgatherv-GB.c
   src/smpi/colls/allgatherv-pair.c
   src/smpi/colls/allgatherv-ring.c
+  src/smpi/colls/allgatherv-ompi-neighborexchange.c
   src/smpi/colls/allreduce-lr.c
   src/smpi/colls/allreduce-NTS.c
   src/smpi/colls/allreduce-rab1.c
diff --git a/src/smpi/colls/allgatherv-ompi-neighborexchange.c b/src/smpi/colls/allgatherv-ompi-neighborexchange.c
new file mode 100644
index 0000000000..f80e681f64
--- /dev/null
+++ b/src/smpi/colls/allgatherv-ompi-neighborexchange.c
@@ -0,0 +1,214 @@
+
+/*
+ * ompi_coll_tuned_allgatherv_intra_neighborexchange
+ *
+ * Function:     allgatherv using N/2 steps (O(N))
+ * Accepts:      Same arguments as MPI_Allgatherv
+ * Returns:      MPI_SUCCESS or error code
+ *
+ * Description:  Neighbor Exchange algorithm for allgather adapted for
+ *               allgatherv.
+ *               Described by Chen et al. in
+ *               "Performance Evaluation of Allgather Algorithms on
+ *               Terascale Linux Cluster with Fast Ethernet",
+ *               Proceedings of the Eighth International Conference on
+ *               High-Performance Computing in Asia-Pacific Region
+ *               (HPCASIA'05), 2005
+ *
+ *               Rank r exchanges a message with one of its neighbors and
+ *               forwards the data further in the next step.
+ *
+ *               No additional memory requirements.
+ *
+ * Limitations:  The algorithm works only on an even number of processes.
+ *               For an odd number of processes we switch to the ring algorithm.
+ *
+ * Example on 6 nodes:
+ *   Initial state
+ *     #     0      1      2      3      4      5
+ *          [0]    [ ]    [ ]    [ ]    [ ]    [ ]
+ *          [ ]    [1]    [ ]    [ ]    [ ]    [ ]
+ *          [ ]    [ ]    [2]    [ ]    [ ]    [ ]
+ *          [ ]    [ ]    [ ]    [3]    [ ]    [ ]
+ *          [ ]    [ ]    [ ]    [ ]    [4]    [ ]
+ *          [ ]    [ ]    [ ]    [ ]    [ ]    [5]
+ *   Step 0:
+ *     #     0      1      2      3      4      5
+ *          [0]    [0]    [ ]    [ ]    [ ]    [ ]
+ *          [1]    [1]    [ ]    [ ]    [ ]    [ ]
+ *          [ ]    [ ]    [2]    [2]    [ ]    [ ]
+ *          [ ]    [ ]    [3]    [3]    [ ]    [ ]
+ *          [ ]    [ ]    [ ]    [ ]    [4]    [4]
+ *          [ ]    [ ]    [ ]    [ ]    [5]    [5]
+ *   Step 1:
+ *     #     0      1      2      3      4      5
+ *          [0]    [0]    [0]    [ ]    [ ]    [0]
+ *          [1]    [1]    [1]    [ ]    [ ]    [1]
+ *          [ ]    [2]    [2]    [2]    [2]    [ ]
+ *          [ ]    [3]    [3]    [3]    [3]    [ ]
+ *          [4]    [ ]    [ ]    [4]    [4]    [4]
+ *          [5]    [ ]    [ ]    [5]    [5]    [5]
+ *   Step 2:
+ *     #     0      1      2      3      4      5
+ *          [0]    [0]    [0]    [0]    [0]    [0]
+ *          [1]    [1]    [1]    [1]    [1]    [1]
+ *          [2]    [2]    [2]    [2]    [2]    [2]
+ *          [3]    [3]    [3]    [3]    [3]    [3]
+ *          [4]    [4]    [4]    [4]    [4]    [4]
+ *          [5]    [5]    [5]    [5]    [5]    [5]
+ */
+
+ #include "colls_private.h"
+ #define MCA_COLL_BASE_TAG_ALLGATHERV 444
+
+int
+smpi_coll_tuned_allgatherv_ompi_neighborexchange(void *sbuf, int scount,
+                                                 MPI_Datatype sdtype,
+                                                 void* rbuf, int *rcounts, int *rdispls,
+                                                 MPI_Datatype rdtype,
+                                                 MPI_Comm comm)
+{
+    int line = -1;
+    int rank, size;
+    int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
+
+    int i, even_rank;
+    int err = 0;
+    ptrdiff_t slb, rlb, sext, rext;
+    char *tmpsend = NULL, *tmprecv = NULL;
+
+
+    size = smpi_comm_size(comm);
+    rank = smpi_comm_rank(comm);
+
+    if (size % 2) {
+        XBT_DEBUG(
+            "coll:tuned:allgatherv_ompi_neighborexchange WARNING: odd size %d, switching to ring algorithm",
+            size);
+        return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
+                                               rbuf, rcounts,
+                                               rdispls, rdtype,
+                                               comm);
+    }
+
+    XBT_DEBUG(
+        "coll:tuned:allgatherv_ompi_neighborexchange rank %d", rank);
+
+    err = smpi_datatype_extent (sdtype, &slb, &sext);
+    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+    err = smpi_datatype_extent (rdtype, &rlb, &rext);
+    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+    /* Initialization step:
+       - if send buffer is not MPI_IN_PLACE, copy send buffer to
+         the appropriate block of receive buffer
+    */
+    tmprecv = (char*) rbuf + rdispls[rank] * rext;
+    if (MPI_IN_PLACE != sbuf) {
+        tmpsend = (char*) sbuf;
+        err = smpi_datatype_copy(tmpsend, scount, sdtype,
+                                 tmprecv, rcounts[rank], rdtype);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+    }
+
+    /* Determine neighbors, order in which blocks will arrive, etc. */
+    even_rank = !(rank % 2);
+    if (even_rank) {
+        neighbor[0] = (rank + 1) % size;
+        neighbor[1] = (rank - 1 + size) % size;
+        recv_data_from[0] = rank;
+        recv_data_from[1] = rank;
+        offset_at_step[0] = (+2);
+        offset_at_step[1] = (-2);
+    } else {
+        neighbor[0] = (rank - 1 + size) % size;
+        neighbor[1] = (rank + 1) % size;
+        recv_data_from[0] = neighbor[0];
+        recv_data_from[1] = neighbor[0];
+        offset_at_step[0] = (-2);
+        offset_at_step[1] = (+2);
+    }
+
+    /* Communication loop:
+       - First step is special: exchange a single block with neighbor[0].
+       - Rest of the steps:
+         update recv_data_from according to offset, and
+         exchange two blocks with the appropriate neighbor.
+         The send location becomes the previous receive location.
+       Note: we need to create an indexed datatype to send and receive these
+       blocks properly.
+ */ + tmprecv = (char*)rbuf + rdispls[neighbor[0]] * rext; + tmpsend = (char*)rbuf + rdispls[rank] * rext; + smpi_mpi_sendrecv(tmpsend, rcounts[rank], rdtype, + neighbor[0], MCA_COLL_BASE_TAG_ALLGATHERV, + tmprecv, rcounts[neighbor[0]], rdtype, + neighbor[0], MCA_COLL_BASE_TAG_ALLGATHERV, + comm, MPI_STATUS_IGNORE); + + + + + + /* Determine initial sending counts and displacements*/ + if (even_rank) { + send_data_from = rank; + } else { + send_data_from = recv_data_from[0]; + } + + for (i = 1; i < (size / 2); i++) { + MPI_Datatype new_rdtype, new_sdtype; + int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2]; + const int i_parity = i % 2; + recv_data_from[i_parity] = + (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size; + + /* Create new indexed types for sending and receiving. + We are sending data from ranks (send_data_from) and (send_data_from+1) + We are receiving data from ranks (recv_data_from[i_parity]) and + (recv_data_from[i_parity]+1). + */ + + new_scounts[0] = rcounts[send_data_from]; + new_scounts[1] = rcounts[(send_data_from + 1)]; + new_sdispls[0] = rdispls[send_data_from]; + new_sdispls[1] = rdispls[(send_data_from + 1)]; + err = smpi_datatype_indexed(2, new_scounts, new_sdispls, rdtype, + &new_sdtype); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + smpi_datatype_commit(&new_sdtype); + + new_rcounts[0] = rcounts[recv_data_from[i_parity]]; + new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)]; + new_rdispls[0] = rdispls[recv_data_from[i_parity]]; + new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)]; + err = smpi_datatype_indexed(2, new_rcounts, new_rdispls, rdtype, + &new_rdtype); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + smpi_datatype_commit(&new_rdtype); + + tmprecv = (char*)rbuf; + tmpsend = (char*)rbuf; + + /* Sendreceive */ + smpi_mpi_sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity], + MCA_COLL_BASE_TAG_ALLGATHERV, + tmprecv, 1, new_rdtype, neighbor[i_parity], + MCA_COLL_BASE_TAG_ALLGATHERV, + comm, MPI_STATUS_IGNORE); + + send_data_from = recv_data_from[i_parity]; + + smpi_datatype_free(&new_sdtype); + smpi_datatype_free(&new_rdtype); + } + + return MPI_SUCCESS; + + err_hndl: + XBT_DEBUG( "%s:%4d\tError occurred %d, rank %2d", + __FILE__, line, err, rank); + return err; +} diff --git a/src/smpi/colls/colls.h b/src/smpi/colls/colls.h index 6743650295..ee51cacf82 100644 --- a/src/smpi/colls/colls.h +++ b/src/smpi/colls/colls.h @@ -62,7 +62,8 @@ COLL_ALLGATHERS(COLL_PROTO, COLL_NOsep) COLL_APPLY(action, COLL_ALLGATHERV_SIG, GB) COLL_sep \ COLL_APPLY(action, COLL_ALLGATHERV_SIG, pair) COLL_sep \ COLL_APPLY(action, COLL_ALLGATHERV_SIG, ring) COLL_sep \ -COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi) +COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi) COLL_sep \ +COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi_neighborexchange) COLL_ALLGATHERVS(COLL_PROTO, COLL_NOsep) diff --git a/src/smpi/colls/smpi_openmpi_selector.c b/src/smpi/colls/smpi_openmpi_selector.c index 2ea07e3178..36f901feae 100644 --- a/src/smpi/colls/smpi_openmpi_selector.c +++ b/src/smpi/colls/smpi_openmpi_selector.c @@ -448,17 +448,17 @@ int smpi_coll_tuned_allgather_ompi(void *sbuf, int scount, - for everything else use ring. 
     */
     if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
-        return smpi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
+        return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
                                                                  rbuf, rcount, rdtype,
-                                                                 comm, module);
+                                                                 comm);
     } else if (total_dsize <= 81920) {
-        return smpi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
+        return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
                                              rbuf, rcount, rdtype,
-                                             comm, module);
+                                             comm);
     }
-    return smpi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
+    return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
                                         rbuf, rcount, rdtype,
-                                        comm, module);
+                                        comm);
 
 #endif /* defined(USE_MPICH2_DECISION) */
 }
@@ -500,15 +500,15 @@ int smpi_coll_tuned_allgatherv_ompi(void *sbuf, int scount,
                                                   comm);
     } else {
-//        if (communicator_size % 2) {
+        if (communicator_size % 2) {
             return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
                                                    rbuf, rcounts,
                                                    rdispls, rdtype,
                                                    comm);
-/*        } else {
-            return smpi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
+        } else {
+            return smpi_coll_tuned_allgatherv_ompi_neighborexchange(sbuf, scount, sdtype,
                                                                      rbuf, rcounts, rdispls, rdtype,
-                                                                     comm, module);
-        }*/
+                                                                     comm);
+        }
     }
 }
 /*
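
For reference, a minimal stand-alone check program that exercises the new collective. It is not part of this patch: the committed coverage is the allgatherv_coll.tesh test wired up in AddTests.cmake above, and the file name, host file, platform file and run line shown here are only illustrative assumptions. Each rank contributes rank+1 integers, so the varying-counts path of MPI_Allgatherv is exercised; with an even number of ranks SMPI can dispatch to smpi_coll_tuned_allgatherv_ompi_neighborexchange, and with an odd number the implementation falls back to smpi_coll_tuned_allgatherv_ring, as noted in its header comment.

/* allgatherv_check.c -- hypothetical driver, not part of this commit. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
  int rank, size, i, r;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  /* Rank r contributes r+1 copies of the value r. */
  int scount = rank + 1;
  int *sbuf = malloc(scount * sizeof(int));
  for (i = 0; i < scount; i++)
    sbuf[i] = rank;

  /* Block r starts after 1 + 2 + ... + r elements. */
  int *rcounts = malloc(size * sizeof(int));
  int *rdispls = malloc(size * sizeof(int));
  int total = 0;
  for (r = 0; r < size; r++) {
    rcounts[r] = r + 1;
    rdispls[r] = total;
    total += rcounts[r];
  }
  int *rbuf = malloc(total * sizeof(int));

  MPI_Allgatherv(sbuf, scount, MPI_INT,
                 rbuf, rcounts, rdispls, MPI_INT, MPI_COMM_WORLD);

  /* Verify: block r must hold r+1 copies of r on every rank. */
  int errors = 0;
  for (r = 0; r < size; r++)
    for (i = 0; i < rcounts[r]; i++)
      if (rbuf[rdispls[r] + i] != r)
        errors++;
  if (rank == 0)
    printf("allgatherv check: %d error(s)\n", errors);

  free(sbuf);
  free(rcounts);
  free(rdispls);
  free(rbuf);
  MPI_Finalize();
  return 0;
}

A possible run line, assuming the usual smpirun front-end and the smpi/allgatherv configuration key used by the tests above (hostfile and platform.xml are placeholders):

smpirun -np 6 -hostfile hostfile -platform platform.xml --cfg=smpi/allgatherv:ompi_neighborexchange ./allgatherv_check

With -np 5 the odd-size XBT_DEBUG warning fires and the ring algorithm is used instead.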