--- /dev/null
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ * University Research and Technology
+ * Corporation. All rights reserved.
+ * Copyright (c) 2004-2009 The University of Tennessee and The University
+ * of Tennessee Research Foundation. All rights
+ * reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ * University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ * All rights reserved.
+ * Copyright (c) 2009 University of Houston. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer listed
+ * in this license in the documentation and/or other materials
+ * provided with the distribution.
+
+ * - Neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * The copyright holders provide no reassurances that the source code
+ * provided does not infringe any patent, copyright, or any other
+ * intellectual property rights of third parties. The copyright holders
+ * disclaim any liability to any recipient for claims brought against
+ * recipient by any third party for infringement of that parties
+ * intellectual property rights.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ */
+
+ #include "colls_private.h"
+ #define MAXTREEFANOUT 32
+ typedef struct ompi_coll_tree_t {
+ int32_t tree_root;
+ int32_t tree_fanout;
+ int32_t tree_bmtree;
+ int32_t tree_prev;
+ int32_t tree_next[MAXTREEFANOUT];
+ int32_t tree_nextsize;
+ } ompi_coll_tree_t;
+
+ ompi_coll_tree_t*
+ ompi_coll_tuned_topo_build_tree( int fanout,
+ MPI_Comm com,
+ int root );
+
+
+/*
+ * Some static helpers.
+ */
+static int pown( int fanout, int num )
+{
+ int j, p = 1;
+ if( num < 0 ) return 0;
+ if (1==num) return fanout;
+ if (2==fanout) {
+ return p<<num;
+ }
+ else {
+ for( j = 0; j < num; j++ ) { p*= fanout; }
+ }
+ return p;
+}
+
+static int calculate_level( int fanout, int rank )
+{
+ int level, num;
+ if( rank < 0 ) return -1;
+ for( level = 0, num = 0; num <= rank; level++ ) {
+ num += pown(fanout, level);
+ }
+ return level-1;
+}
+
+static int calculate_num_nodes_up_to_level( int fanout, int level )
+{
+ /* just use geometric progression formula for sum:
+ a^0+a^1+...a^(n-1) = (a^n-1)/(a-1) */
+ return ((pown(fanout,level) - 1)/(fanout - 1));
+}
+
+/*
+ * And now the building functions.
+ *
+ * An example for fanout = 2, comm_size = 7
+ *
+ * 0 <-- delta = 1 (fanout^0)
+ * / \
+ * 1 2 <-- delta = 2 (fanout^1)
+ * / \ / \
+ * 3 5 4 6 <-- delta = 4 (fanout^2)
+ */
+
+ompi_coll_tree_t*
+ompi_coll_tuned_topo_build_tree( int fanout,
+ MPI_Comm comm,
+ int root )
+{
+ int rank, size;
+ int schild, sparent;
+ int level; /* location of my rank in the tree structure of size */
+ int delta; /* number of nodes on my level */
+ int slimit; /* total number of nodes on levels above me */
+ int shiftedrank;
+ int i;
+ ompi_coll_tree_t* tree;
+
+ XBT_DEBUG( "coll:tuned:topo_build_tree Building fo %d rt %d", fanout, root);
+
+ if (fanout<1) {
+ XBT_DEBUG( "coll:tuned:topo_build_tree invalid fanout %d", fanout);
+ return NULL;
+ }
+ if (fanout>MAXTREEFANOUT) {
+ XBT_DEBUG("coll:tuned:topo_build_tree invalid fanout %d bigger than max %d", fanout, MAXTREEFANOUT);
+ return NULL;
+ }
+
+ /*
+ * Get size and rank of the process in this communicator
+ */
+ size = smpi_comm_size(comm);
+ rank = smpi_comm_rank(comm);
+
+ tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
+ if (!tree) {
+ XBT_DEBUG("coll:tuned:topo_build_tree PANIC::out of memory");
+ return NULL;
+ }
+
+ tree->tree_root = MPI_UNDEFINED;
+ tree->tree_nextsize = MPI_UNDEFINED;
+
+ /*
+ * Set root
+ */
+ tree->tree_root = root;
+
+ /*
+ * Initialize tree
+ */
+ tree->tree_fanout = fanout;
+ tree->tree_bmtree = 0;
+ tree->tree_root = root;
+ tree->tree_prev = -1;
+ tree->tree_nextsize = 0;
+ for( i = 0; i < fanout; i++ ) {
+ tree->tree_next[i] = -1;
+ }
+
+ /* return if we have less than 2 processes */
+ if( size < 2 ) {
+ return tree;
+ }
+
+ /*
+ * Shift all ranks by root, so that the algorithm can be
+ * designed as if root would be always 0
+ * shiftedrank should be used in calculating distances
+ * and position in tree
+ */
+ shiftedrank = rank - root;
+ if( shiftedrank < 0 ) {
+ shiftedrank += size;
+ }
+
+ /* calculate my level */
+ level = calculate_level( fanout, shiftedrank );
+ delta = pown( fanout, level );
+
+ /* find my children */
+ for( i = 0; i < fanout; i++ ) {
+ schild = shiftedrank + delta * (i+1);
+ if( schild < size ) {
+ tree->tree_next[i] = (schild+root)%size;
+ tree->tree_nextsize = tree->tree_nextsize + 1;
+ } else {
+ break;
+ }
+ }
+
+ /* find my parent */
+ slimit = calculate_num_nodes_up_to_level( fanout, level );
+ sparent = shiftedrank;
+ if( sparent < fanout ) {
+ sparent = 0;
+ } else {
+ while( sparent >= slimit ) {
+ sparent -= delta/fanout;
+ }
+ }
+ tree->tree_prev = (sparent+root)%size;
+
+ return tree;
+}
+
+
+int
+smpi_coll_tuned_bcast_ompi_split_bintree ( void* buffer,
+ int count,
+ MPI_Datatype datatype,
+ int root,
+ MPI_Comm comm)
+{
+ int segsize ;
+ int rank, size;
+ int segindex, i, lr, pair;
+ int segcount[2]; /* Number ompi_request_wait_allof elements sent with each segment */
+ uint32_t counts[2];
+ int num_segments[2]; /* Number of segmenets */
+ int sendcount[2]; /* the same like segcount, except for the last segment */
+ size_t realsegsize[2];
+ char *tmpbuf[2];
+ size_t type_size;
+ ptrdiff_t type_extent;
+
+
+ MPI_Request base_req, new_req;
+ ompi_coll_tree_t *tree;
+// mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
+// mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
+
+ size = smpi_comm_size(comm);
+ rank = smpi_comm_rank(comm);
+
+
+ //compute again segsize
+ const size_t intermediate_message_size = 370728;
+ size_t message_size = smpi_datatype_size(datatype) * (unsigned long)count;
+ if(message_size < intermediate_message_size)
+ segsize = 1024 ;
+ else
+ segsize = 1024 << 3;
+
+ XBT_DEBUG("ompi_coll_tuned_bcast_intra_split_bintree rank %d root %d ss %5d", rank, root, segsize);
+
+ if (size == 1) {
+ return MPI_SUCCESS;
+ }
+
+ /* setup the binary tree topology. */
+ tree = ompi_coll_tuned_topo_build_tree(2,comm,root);
+
+ type_size = smpi_datatype_size( datatype );
+
+ /* Determine number of segments and number of elements per segment */
+ counts[0] = count/2;
+ if (count % 2 != 0) counts[0]++;
+ counts[1] = count - counts[0];
+ if ( segsize > 0 ) {
+ /* Note that ompi_datatype_type_size() will never return a negative
+ value in typelng; it returns an int [vs. an unsigned type]
+ because of the MPI spec. */
+ if (segsize < ((uint32_t) type_size)) {
+ segsize = type_size; /* push segsize up to hold one type */
+ }
+ segcount[0] = segcount[1] = segsize / type_size;
+ num_segments[0] = counts[0]/segcount[0];
+ if ((counts[0] % segcount[0]) != 0) num_segments[0]++;
+ num_segments[1] = counts[1]/segcount[1];
+ if ((counts[1] % segcount[1]) != 0) num_segments[1]++;
+ } else {
+ segcount[0] = counts[0];
+ segcount[1] = counts[1];
+ num_segments[0] = num_segments[1] = 1;
+ }
+
+ /* if the message is too small to be split into segments */
+ if( (counts[0] == 0 || counts[1] == 0) ||
+ (segsize > counts[0] * type_size) ||
+ (segsize > counts[1] * type_size) ) {
+ /* call linear version here ! */
+ return (smpi_coll_tuned_bcast_SMP_linear ( buffer, count, datatype,
+ root, comm));
+ }
+ type_extent = smpi_datatype_get_extent(datatype);
+
+
+ /* Determine real segment size */
+ realsegsize[0] = segcount[0] * type_extent;
+ realsegsize[1] = segcount[1] * type_extent;
+
+ /* set the buffer pointers */
+ tmpbuf[0] = (char *) buffer;
+ tmpbuf[1] = (char *) buffer+counts[0] * type_extent;
+
+ /* Step 1:
+ Root splits the buffer in 2 and sends segmented message down the branches.
+ Left subtree of the tree receives first half of the buffer, while right
+ subtree receives the remaining message.
+ */
+
+ /* determine if I am left (0) or right (1), (root is right) */
+ lr = ((rank + size - root)%size + 1)%2;
+
+ /* root code */
+ if( rank == root ) {
+ /* determine segment count */
+ sendcount[0] = segcount[0];
+ sendcount[1] = segcount[1];
+ /* for each segment */
+ for (segindex = 0; segindex < num_segments[0]; segindex++) {
+ /* for each child */
+ for( i = 0; i < tree->tree_nextsize && i < 2; i++ ) {
+ if (segindex >= num_segments[i]) { /* no more segments */
+ continue;
+ }
+ /* determine how many elements are being sent in this round */
+ if(segindex == (num_segments[i] - 1))
+ sendcount[i] = counts[i] - segindex*segcount[i];
+ /* send data */
+ smpi_mpi_send(tmpbuf[i], sendcount[i], datatype,
+ tree->tree_next[i], 777, comm);
+ /* update tmp buffer */
+ tmpbuf[i] += realsegsize[i];
+ }
+ }
+ }
+
+ /* intermediate nodes code */
+ else if( tree->tree_nextsize > 0 ) {
+ /* Intermediate nodes:
+ * It will receive segments only from one half of the data.
+ * Which one is determined by whether the node belongs to the "left" or "right"
+ * subtree. Topoloby building function builds binary tree such that
+ * odd "shifted ranks" ((rank + size - root)%size) are on the left subtree,
+ * and even on the right subtree.
+ *
+ * Create the pipeline. We first post the first receive, then in the loop we
+ * post the next receive and after that wait for the previous receive to complete
+ * and we disseminating the data to all children.
+ */
+ sendcount[lr] = segcount[lr];
+ base_req=smpi_mpi_irecv(tmpbuf[lr], sendcount[lr], datatype,
+ tree->tree_prev, 777,
+ comm);
+
+ for( segindex = 1; segindex < num_segments[lr]; segindex++ ) {
+ /* determine how many elements to expect in this round */
+ if( segindex == (num_segments[lr] - 1))
+ sendcount[lr] = counts[lr] - segindex*segcount[lr];
+ /* post new irecv */
+ new_req = smpi_mpi_irecv( tmpbuf[lr] + realsegsize[lr], sendcount[lr],
+ datatype, tree->tree_prev, 777,
+ comm);
+
+ /* wait for and forward current segment */
+ smpi_mpi_waitall( 1, &base_req, MPI_STATUSES_IGNORE );
+ for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children (segcount[lr]) */
+ smpi_mpi_send( tmpbuf[lr], segcount[lr], datatype,
+ tree->tree_next[i], 777,
+ comm);
+ } /* end of for each child */
+
+ /* upate the base request */
+ base_req = new_req;
+ /* go to the next buffer (ie. the one corresponding to the next recv) */
+ tmpbuf[lr] += realsegsize[lr];
+ } /* end of for segindex */
+
+ /* wait for the last segment and forward current segment */
+ smpi_mpi_waitall( 1, &base_req, MPI_STATUSES_IGNORE );
+ for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children */
+ smpi_mpi_send(tmpbuf[lr], sendcount[lr], datatype,
+ tree->tree_next[i], 777, comm);
+ } /* end of for each child */
+ }
+
+ /* leaf nodes */
+ else {
+ /* Just consume segments as fast as possible */
+ sendcount[lr] = segcount[lr];
+ for (segindex = 0; segindex < num_segments[lr]; segindex++) {
+ /* determine how many elements to expect in this round */
+ if (segindex == (num_segments[lr] - 1)) sendcount[lr] = counts[lr] - segindex*segcount[lr];
+ /* receive segments */
+ smpi_mpi_recv(tmpbuf[lr], sendcount[lr], datatype,
+ tree->tree_prev, 777,
+ comm, MPI_STATUS_IGNORE);
+ /* update the initial pointer to the buffer */
+ tmpbuf[lr] += realsegsize[lr];
+ }
+ }
+
+ /* reset the buffer pointers */
+ tmpbuf[0] = (char *) buffer;
+ tmpbuf[1] = (char *) buffer+counts[0] * type_extent;
+
+ /* Step 2:
+ Find your immediate pair (identical node in opposite subtree) and SendRecv
+ data buffer with them.
+ The tree building function ensures that
+ if (we are not root)
+ if we are in the left subtree (lr == 0) our pair is (rank+1)%size.
+ if we are in the right subtree (lr == 1) our pair is (rank-1)%size
+ If we have even number of nodes the rank (size-1) will pair up with root.
+ */
+ if (lr == 0) {
+ pair = (rank+1)%size;
+ } else {
+ pair = (rank+size-1)%size;
+ }
+
+ if ( (size%2) != 0 && rank != root) {
+
+ smpi_mpi_sendrecv( tmpbuf[lr], counts[lr], datatype,
+ pair, 777,
+ tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
+ pair, 777,
+ comm, MPI_STATUS_IGNORE);
+ } else if ( (size%2) == 0 ) {
+ /* root sends right buffer to the last node */
+ if( rank == root ) {
+ smpi_mpi_send(tmpbuf[1], counts[1], datatype,
+ (root+size-1)%size, 777, comm);
+
+ }
+ /* last node receives right buffer from the root */
+ else if (rank == (root+size-1)%size) {
+ smpi_mpi_recv(tmpbuf[1], counts[1], datatype,
+ root, 777,
+ comm, MPI_STATUS_IGNORE);
+ }
+ /* everyone else exchanges buffers */
+ else {
+ smpi_mpi_sendrecv( tmpbuf[lr], counts[lr], datatype,
+ pair, 777,
+ tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
+ pair, 777,
+ comm, MPI_STATUS_IGNORE);
+ }
+ }
+ return (MPI_SUCCESS);
+
+
+}
+