From: Augustin Degomme Date: Mon, 10 Jun 2013 16:13:37 +0000 (+0200) Subject: add one more bcast algo X-Git-Tag: v3_9_90~295 X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/0716a43321411df330eded3271a921b18574a26d add one more bcast algo --- diff --git a/buildtools/Cmake/AddTests.cmake b/buildtools/Cmake/AddTests.cmake index cda54a3256..7bd0ec8782 100644 --- a/buildtools/Cmake/AddTests.cmake +++ b/buildtools/Cmake/AddTests.cmake @@ -397,7 +397,7 @@ if(NOT enable_memcheck) ENDFOREACH() FOREACH (BCAST_COLL default arrival_nb arrival_pattern_aware arrival_pattern_aware_wait arrival_scatter binomial_tree flattree flattree_pipeline NTSB NTSL NTSL_Isend scatter_LR_allgather - scatter_rdb_allgather SMP_binary SMP_binomial SMP_linear ompi) + scatter_rdb_allgather SMP_binary SMP_binomial SMP_linear ompi ompi_split_bintree) ADD_TEST(smpi-bcast-coll-${BCAST_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/bcast:${BCAST_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/bcast_coll.tesh) ENDFOREACH() FOREACH (REDUCE_COLL default arrival_pattern_aware binomial flat_tree NTSL scatter_gather ompi) diff --git a/buildtools/Cmake/DefinePackages.cmake b/buildtools/Cmake/DefinePackages.cmake index 9b251020ee..f9587289a9 100644 --- a/buildtools/Cmake/DefinePackages.cmake +++ b/buildtools/Cmake/DefinePackages.cmake @@ -183,6 +183,7 @@ set(SMPI_SRC src/smpi/colls/bcast-SMP-binary.c src/smpi/colls/bcast-SMP-binomial.c src/smpi/colls/bcast-SMP-linear.c + src/smpi/colls/bcast-ompi-split-bintree.c src/smpi/colls/reduce-arrival-pattern-aware.c src/smpi/colls/reduce-binomial.c src/smpi/colls/reduce-flat-tree.c diff --git a/src/smpi/colls/bcast-ompi-split-bintree.c b/src/smpi/colls/bcast-ompi-split-bintree.c new file mode 100644 index 0000000000..a68ebd2bca --- /dev/null +++ b/src/smpi/colls/bcast-ompi-split-bintree.c @@ -0,0 +1,461 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2009 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2009 University of Houston. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer listed + * in this license in the documentation and/or other materials + * provided with the distribution. + + * - Neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * The copyright holders provide no reassurances that the source code + * provided does not infringe any patent, copyright, or any other + * intellectual property rights of third parties. The copyright holders + * disclaim any liability to any recipient for claims brought against + * recipient by any third party for infringement of that parties + * intellectual property rights. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + */ + + #include "colls_private.h" + #define MAXTREEFANOUT 32 + typedef struct ompi_coll_tree_t { + int32_t tree_root; + int32_t tree_fanout; + int32_t tree_bmtree; + int32_t tree_prev; + int32_t tree_next[MAXTREEFANOUT]; + int32_t tree_nextsize; + } ompi_coll_tree_t; + + ompi_coll_tree_t* + ompi_coll_tuned_topo_build_tree( int fanout, + MPI_Comm com, + int root ); + + +/* + * Some static helpers. + */ +static int pown( int fanout, int num ) +{ + int j, p = 1; + if( num < 0 ) return 0; + if (1==num) return fanout; + if (2==fanout) { + return p<MAXTREEFANOUT) { + XBT_DEBUG("coll:tuned:topo_build_tree invalid fanout %d bigger than max %d", fanout, MAXTREEFANOUT); + return NULL; + } + + /* + * Get size and rank of the process in this communicator + */ + size = smpi_comm_size(comm); + rank = smpi_comm_rank(comm); + + tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t)); + if (!tree) { + XBT_DEBUG("coll:tuned:topo_build_tree PANIC::out of memory"); + return NULL; + } + + tree->tree_root = MPI_UNDEFINED; + tree->tree_nextsize = MPI_UNDEFINED; + + /* + * Set root + */ + tree->tree_root = root; + + /* + * Initialize tree + */ + tree->tree_fanout = fanout; + tree->tree_bmtree = 0; + tree->tree_root = root; + tree->tree_prev = -1; + tree->tree_nextsize = 0; + for( i = 0; i < fanout; i++ ) { + tree->tree_next[i] = -1; + } + + /* return if we have less than 2 processes */ + if( size < 2 ) { + return tree; + } + + /* + * Shift all ranks by root, so that the algorithm can be + * designed as if root would be always 0 + * shiftedrank should be used in calculating distances + * and position in tree + */ + shiftedrank = rank - root; + if( shiftedrank < 0 ) { + shiftedrank += size; + } + + /* calculate my level */ + level = calculate_level( fanout, shiftedrank ); + delta = pown( fanout, level ); + + /* find my children */ + for( i = 0; i < fanout; i++ ) { + schild = shiftedrank + delta * (i+1); + if( schild < size ) { + tree->tree_next[i] = (schild+root)%size; + tree->tree_nextsize = tree->tree_nextsize + 1; + } else { + break; + } + } + + /* find my parent */ + slimit = calculate_num_nodes_up_to_level( fanout, level ); + sparent = shiftedrank; + if( sparent < fanout ) { + sparent = 0; + } else { + while( sparent >= slimit ) { + sparent -= delta/fanout; + } + } + tree->tree_prev = (sparent+root)%size; + + return tree; +} + + +int +smpi_coll_tuned_bcast_ompi_split_bintree ( void* buffer, + int count, + MPI_Datatype datatype, + int root, + MPI_Comm comm) +{ + int segsize ; + int rank, size; + int segindex, i, lr, pair; + int segcount[2]; /* Number ompi_request_wait_allof elements sent with each segment */ + uint32_t counts[2]; + int num_segments[2]; /* Number of segmenets */ + int sendcount[2]; /* the same like segcount, except for the last segment */ + size_t realsegsize[2]; + char *tmpbuf[2]; + size_t type_size; + ptrdiff_t type_extent; + + + MPI_Request base_req, new_req; + ompi_coll_tree_t *tree; +// mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; +// mca_coll_tuned_comm_t *data = tuned_module->tuned_data; + + size = smpi_comm_size(comm); + rank = smpi_comm_rank(comm); + + + //compute again segsize + const size_t intermediate_message_size = 370728; + size_t message_size = smpi_datatype_size(datatype) * (unsigned long)count; + if(message_size < intermediate_message_size) + segsize = 1024 ; + else + segsize = 1024 << 3; + + XBT_DEBUG("ompi_coll_tuned_bcast_intra_split_bintree rank %d root %d ss %5d", rank, root, segsize); + + if (size == 1) { + return MPI_SUCCESS; + } + + /* setup the binary tree topology. */ + tree = ompi_coll_tuned_topo_build_tree(2,comm,root); + + type_size = smpi_datatype_size( datatype ); + + /* Determine number of segments and number of elements per segment */ + counts[0] = count/2; + if (count % 2 != 0) counts[0]++; + counts[1] = count - counts[0]; + if ( segsize > 0 ) { + /* Note that ompi_datatype_type_size() will never return a negative + value in typelng; it returns an int [vs. an unsigned type] + because of the MPI spec. */ + if (segsize < ((uint32_t) type_size)) { + segsize = type_size; /* push segsize up to hold one type */ + } + segcount[0] = segcount[1] = segsize / type_size; + num_segments[0] = counts[0]/segcount[0]; + if ((counts[0] % segcount[0]) != 0) num_segments[0]++; + num_segments[1] = counts[1]/segcount[1]; + if ((counts[1] % segcount[1]) != 0) num_segments[1]++; + } else { + segcount[0] = counts[0]; + segcount[1] = counts[1]; + num_segments[0] = num_segments[1] = 1; + } + + /* if the message is too small to be split into segments */ + if( (counts[0] == 0 || counts[1] == 0) || + (segsize > counts[0] * type_size) || + (segsize > counts[1] * type_size) ) { + /* call linear version here ! */ + return (smpi_coll_tuned_bcast_SMP_linear ( buffer, count, datatype, + root, comm)); + } + type_extent = smpi_datatype_get_extent(datatype); + + + /* Determine real segment size */ + realsegsize[0] = segcount[0] * type_extent; + realsegsize[1] = segcount[1] * type_extent; + + /* set the buffer pointers */ + tmpbuf[0] = (char *) buffer; + tmpbuf[1] = (char *) buffer+counts[0] * type_extent; + + /* Step 1: + Root splits the buffer in 2 and sends segmented message down the branches. + Left subtree of the tree receives first half of the buffer, while right + subtree receives the remaining message. + */ + + /* determine if I am left (0) or right (1), (root is right) */ + lr = ((rank + size - root)%size + 1)%2; + + /* root code */ + if( rank == root ) { + /* determine segment count */ + sendcount[0] = segcount[0]; + sendcount[1] = segcount[1]; + /* for each segment */ + for (segindex = 0; segindex < num_segments[0]; segindex++) { + /* for each child */ + for( i = 0; i < tree->tree_nextsize && i < 2; i++ ) { + if (segindex >= num_segments[i]) { /* no more segments */ + continue; + } + /* determine how many elements are being sent in this round */ + if(segindex == (num_segments[i] - 1)) + sendcount[i] = counts[i] - segindex*segcount[i]; + /* send data */ + smpi_mpi_send(tmpbuf[i], sendcount[i], datatype, + tree->tree_next[i], 777, comm); + /* update tmp buffer */ + tmpbuf[i] += realsegsize[i]; + } + } + } + + /* intermediate nodes code */ + else if( tree->tree_nextsize > 0 ) { + /* Intermediate nodes: + * It will receive segments only from one half of the data. + * Which one is determined by whether the node belongs to the "left" or "right" + * subtree. Topoloby building function builds binary tree such that + * odd "shifted ranks" ((rank + size - root)%size) are on the left subtree, + * and even on the right subtree. + * + * Create the pipeline. We first post the first receive, then in the loop we + * post the next receive and after that wait for the previous receive to complete + * and we disseminating the data to all children. + */ + sendcount[lr] = segcount[lr]; + base_req=smpi_mpi_irecv(tmpbuf[lr], sendcount[lr], datatype, + tree->tree_prev, 777, + comm); + + for( segindex = 1; segindex < num_segments[lr]; segindex++ ) { + /* determine how many elements to expect in this round */ + if( segindex == (num_segments[lr] - 1)) + sendcount[lr] = counts[lr] - segindex*segcount[lr]; + /* post new irecv */ + new_req = smpi_mpi_irecv( tmpbuf[lr] + realsegsize[lr], sendcount[lr], + datatype, tree->tree_prev, 777, + comm); + + /* wait for and forward current segment */ + smpi_mpi_waitall( 1, &base_req, MPI_STATUSES_IGNORE ); + for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children (segcount[lr]) */ + smpi_mpi_send( tmpbuf[lr], segcount[lr], datatype, + tree->tree_next[i], 777, + comm); + } /* end of for each child */ + + /* upate the base request */ + base_req = new_req; + /* go to the next buffer (ie. the one corresponding to the next recv) */ + tmpbuf[lr] += realsegsize[lr]; + } /* end of for segindex */ + + /* wait for the last segment and forward current segment */ + smpi_mpi_waitall( 1, &base_req, MPI_STATUSES_IGNORE ); + for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children */ + smpi_mpi_send(tmpbuf[lr], sendcount[lr], datatype, + tree->tree_next[i], 777, comm); + } /* end of for each child */ + } + + /* leaf nodes */ + else { + /* Just consume segments as fast as possible */ + sendcount[lr] = segcount[lr]; + for (segindex = 0; segindex < num_segments[lr]; segindex++) { + /* determine how many elements to expect in this round */ + if (segindex == (num_segments[lr] - 1)) sendcount[lr] = counts[lr] - segindex*segcount[lr]; + /* receive segments */ + smpi_mpi_recv(tmpbuf[lr], sendcount[lr], datatype, + tree->tree_prev, 777, + comm, MPI_STATUS_IGNORE); + /* update the initial pointer to the buffer */ + tmpbuf[lr] += realsegsize[lr]; + } + } + + /* reset the buffer pointers */ + tmpbuf[0] = (char *) buffer; + tmpbuf[1] = (char *) buffer+counts[0] * type_extent; + + /* Step 2: + Find your immediate pair (identical node in opposite subtree) and SendRecv + data buffer with them. + The tree building function ensures that + if (we are not root) + if we are in the left subtree (lr == 0) our pair is (rank+1)%size. + if we are in the right subtree (lr == 1) our pair is (rank-1)%size + If we have even number of nodes the rank (size-1) will pair up with root. + */ + if (lr == 0) { + pair = (rank+1)%size; + } else { + pair = (rank+size-1)%size; + } + + if ( (size%2) != 0 && rank != root) { + + smpi_mpi_sendrecv( tmpbuf[lr], counts[lr], datatype, + pair, 777, + tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype, + pair, 777, + comm, MPI_STATUS_IGNORE); + } else if ( (size%2) == 0 ) { + /* root sends right buffer to the last node */ + if( rank == root ) { + smpi_mpi_send(tmpbuf[1], counts[1], datatype, + (root+size-1)%size, 777, comm); + + } + /* last node receives right buffer from the root */ + else if (rank == (root+size-1)%size) { + smpi_mpi_recv(tmpbuf[1], counts[1], datatype, + root, 777, + comm, MPI_STATUS_IGNORE); + } + /* everyone else exchanges buffers */ + else { + smpi_mpi_sendrecv( tmpbuf[lr], counts[lr], datatype, + pair, 777, + tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype, + pair, 777, + comm, MPI_STATUS_IGNORE); + } + } + return (MPI_SUCCESS); + + +} + diff --git a/src/smpi/colls/colls.h b/src/smpi/colls/colls.h index 54647a00d7..ddbf183a8c 100644 --- a/src/smpi/colls/colls.h +++ b/src/smpi/colls/colls.h @@ -165,7 +165,8 @@ COLL_APPLY(action, COLL_BCAST_SIG, scatter_rdb_allgather) COLL_sep \ COLL_APPLY(action, COLL_BCAST_SIG, SMP_binary) COLL_sep \ COLL_APPLY(action, COLL_BCAST_SIG, SMP_binomial) COLL_sep \ COLL_APPLY(action, COLL_BCAST_SIG, SMP_linear) COLL_sep \ -COLL_APPLY(action, COLL_BCAST_SIG, ompi) +COLL_APPLY(action, COLL_BCAST_SIG, ompi) COLL_sep \ +COLL_APPLY(action, COLL_BCAST_SIG, ompi_split_bintree) COLL_BCASTS(COLL_PROTO, COLL_NOsep) diff --git a/src/smpi/colls/smpi_openmpi_selector.c b/src/smpi/colls/smpi_openmpi_selector.c index 6b463731c8..13e5a41ebb 100644 --- a/src/smpi/colls/smpi_openmpi_selector.c +++ b/src/smpi/colls/smpi_openmpi_selector.c @@ -129,7 +129,7 @@ int smpi_coll_tuned_bcast_ompi(void *buff, int count, { /* Decision function based on MX results for messages up to 36MB and communicator sizes up to 64 nodes */ - //const size_t small_message_size = 2048; + const size_t small_message_size = 2048; const size_t intermediate_message_size = 370728; //const double a_p16 = 3.2118e-6; /* [1 / byte] */ //const double b_p16 = 8.7936; @@ -138,11 +138,11 @@ int smpi_coll_tuned_bcast_ompi(void *buff, int count, //const double a_p128 = 1.6134e-6; /* [1 / byte] */ //const double b_p128 = 2.1102; - //int communicator_size; + int communicator_size; //int segsize = 0; size_t message_size, dsize; - //communicator_size = smpi_comm_size(comm); + communicator_size = smpi_comm_size(comm); /* else we need data size for decision function */ dsize = smpi_datatype_size(datatype); @@ -150,21 +150,17 @@ int smpi_coll_tuned_bcast_ompi(void *buff, int count, /* Handle messages of small and intermediate size, and single-element broadcasts */ - if ((message_size < /*small_message_size*/intermediate_message_size) || (count <= 1)) { + if ((message_size < small_message_size) || (count <= 1)) { /* Binomial without segmentation */ - //segsize = 0; return smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype, - root, comm/* - segsize*/); + root, comm); - } /*else if (message_size < intermediate_message_size) { + } else if (message_size < intermediate_message_size) { // SplittedBinary with 1KB segments - segsize = 1024; - return smpi_coll_tuned_bcast_split_bintree(buff, count, datatype, - root, comm - segsize); + return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype, + root, comm); - } + } /* Handle large message sizes else if (communicator_size < (a_p128 * message_size + b_p128)) { Pipeline with 128KB segments @@ -173,14 +169,12 @@ int smpi_coll_tuned_bcast_ompi(void *buff, int count, root, comm, module, segsize); - } else if (communicator_size < 13) { + }*/ else if (communicator_size < 13) { // Split Binary with 8KB segments - segsize = 1024 << 3; - return smpi_coll_tuned_bcast_intra_split_bintree(buff, count, datatype, - root, comm, module, - segsize); + return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype, + root, comm); - } else if (communicator_size < (a_p64 * message_size + b_p64)) { + } /*else if (communicator_size < (a_p64 * message_size + b_p64)) { // Pipeline with 64KB segments segsize = 1024 << 6; return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,