AND: Algorithmique Numérique Distribuée

Public GIT Repository
add one more bcast algo
author    Augustin Degomme <degomme@idpann.imag.fr>
          Mon, 10 Jun 2013 16:13:37 +0000 (18:13 +0200)
committer Augustin Degomme <degomme@idpann.imag.fr>
          Mon, 10 Jun 2013 16:13:37 +0000 (18:13 +0200)
buildtools/Cmake/AddTests.cmake
buildtools/Cmake/DefinePackages.cmake
src/smpi/colls/bcast-ompi-split-bintree.c [new file with mode: 0644]
src/smpi/colls/colls.h
src/smpi/colls/smpi_openmpi_selector.c

index cda54a3..7bd0ec8 100644
@@ -397,7 +397,7 @@ if(NOT enable_memcheck)
     ENDFOREACH()
     FOREACH (BCAST_COLL default arrival_nb arrival_pattern_aware arrival_pattern_aware_wait arrival_scatter
                        binomial_tree flattree flattree_pipeline NTSB NTSL NTSL_Isend scatter_LR_allgather
-                       scatter_rdb_allgather SMP_binary SMP_binomial SMP_linear ompi)
+                       scatter_rdb_allgather SMP_binary SMP_binomial SMP_linear ompi ompi_split_bintree)
                ADD_TEST(smpi-bcast-coll-${BCAST_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/bcast:${BCAST_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/bcast_coll.tesh)
     ENDFOREACH()
     FOREACH (REDUCE_COLL default arrival_pattern_aware binomial flat_tree NTSL scatter_gather ompi)
index 9b25102..f958728 100644
@@ -183,6 +183,7 @@ set(SMPI_SRC
   src/smpi/colls/bcast-SMP-binary.c
   src/smpi/colls/bcast-SMP-binomial.c
   src/smpi/colls/bcast-SMP-linear.c
+  src/smpi/colls/bcast-ompi-split-bintree.c
   src/smpi/colls/reduce-arrival-pattern-aware.c
   src/smpi/colls/reduce-binomial.c
   src/smpi/colls/reduce-flat-tree.c
diff --git a/src/smpi/colls/bcast-ompi-split-bintree.c b/src/smpi/colls/bcast-ompi-split-bintree.c
new file mode 100644
index 0000000..a68ebd2
--- /dev/null
@@ -0,0 +1,461 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2009 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2009      University of Houston. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ *  Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer listed
+ *   in this license in the documentation and/or other materials
+ *   provided with the distribution.
+
+ * - Neither the name of the copyright holders nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+
+ * The copyright holders provide no reassurances that the source code
+ * provided does not infringe any patent, copyright, or any other
+ * intellectual property rights of third parties.  The copyright holders
+ * disclaim any liability to any recipient for claims brought against
+ * recipient by any third party for infringement of that parties
+ * intellectual property rights.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ */
+#include "colls_private.h"
+
+#define MAXTREEFANOUT 32
+
+typedef struct ompi_coll_tree_t {
+    int32_t tree_root;
+    int32_t tree_fanout;
+    int32_t tree_bmtree;
+    int32_t tree_prev;
+    int32_t tree_next[MAXTREEFANOUT];
+    int32_t tree_nextsize;
+} ompi_coll_tree_t;
+
+ompi_coll_tree_t*
+ompi_coll_tuned_topo_build_tree( int fanout,
+                                 MPI_Comm comm,
+                                 int root );
+
+/*
+ * Some static helpers.
+ */
+static int pown( int fanout, int num )
+{
+    int j, p = 1;
+    if( num < 0 ) return 0;
+    if (1==num) return fanout;
+    if (2==fanout) {
+        return p<<num;
+    }
+    else {
+        for( j = 0; j < num; j++ ) { p*= fanout; }
+    }
+    return p;
+}
+
+static int calculate_level( int fanout, int rank )
+{
+    int level, num;
+    if( rank < 0 ) return -1;
+    for( level = 0, num = 0; num <= rank; level++ ) {
+        num += pown(fanout, level);
+    }
+    return level-1;
+}
+
+static int calculate_num_nodes_up_to_level( int fanout, int level )
+{
+    /* just use geometric progression formula for sum:
+       a^0+a^1+...a^(n-1) = (a^n-1)/(a-1) */
+    return ((pown(fanout,level) - 1)/(fanout - 1));
+}
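+
+/*
+ * Worked example with fanout = 2: the levels hold 1, 2, 4, ... nodes, so
+ * calculate_level(2, 4) accumulates num = 1, 3, 7 and returns 2 (shifted
+ * rank 4 sits on level 2 of the tree pictured below), and
+ * calculate_num_nodes_up_to_level(2, 2) = (2^2 - 1)/(2 - 1) = 3 is the
+ * number of nodes on the levels above it.
+ */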
+
+/*
+ * And now the building functions.
+ *
+ * An example for fanout = 2, comm_size = 7
+ *
+ *              0           <-- delta = 1 (fanout^0)
+ *            /   \
+ *           1     2        <-- delta = 2 (fanout^1)
+ *          / \   / \
+ *         3   5 4   6      <-- delta = 4 (fanout^2)
+ */
+
+ompi_coll_tree_t*
+ompi_coll_tuned_topo_build_tree( int fanout,
+                                 MPI_Comm comm,
+                                 int root )
+{
+    int rank, size;
+    int schild, sparent;
+    int level;  /* my level in the tree structure */
+    int delta;  /* number of nodes at my level */
+    int slimit; /* total number of nodes on the levels above mine */
+    int shiftedrank;
+    int i;
+    ompi_coll_tree_t* tree;
+
+    XBT_DEBUG( "coll:tuned:topo_build_tree Building fo %d rt %d", fanout, root);
+
+    if (fanout<1) {
+        XBT_DEBUG( "coll:tuned:topo_build_tree invalid fanout %d", fanout);
+        return NULL;
+    }
+    if (fanout>MAXTREEFANOUT) {
+        XBT_DEBUG("coll:tuned:topo_build_tree invalid fanout %d bigger than max %d", fanout, MAXTREEFANOUT);
+        return NULL;
+    }
+
+    /* 
+     * Get size and rank of the process in this communicator 
+     */
+    size = smpi_comm_size(comm);
+    rank = smpi_comm_rank(comm);
+
+    tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
+    if (!tree) {
+        XBT_DEBUG("coll:tuned:topo_build_tree PANIC::out of memory");
+        return NULL;
+    }
+
+    /*
+     * Initialize the tree
+     */
+    tree->tree_root     = root;
+    tree->tree_fanout   = fanout;
+    tree->tree_bmtree   = 0;
+    tree->tree_prev     = -1;
+    tree->tree_nextsize = 0;
+    for( i = 0; i < fanout; i++ ) {
+        tree->tree_next[i] = -1;
+    }
+
+    /* return if we have fewer than 2 processes */
+    if( size < 2 ) {
+        return tree;
+    }
+  
+    /*
+     * Shift all ranks by root so that the algorithm can be designed
+     * as if the root were always rank 0; shiftedrank is used when
+     * calculating distances and positions in the tree.
+     */
+    shiftedrank = rank - root;
+    if( shiftedrank < 0 ) {
+        shiftedrank += size;
+    }
+
+    /* calculate my level */
+    level = calculate_level( fanout, shiftedrank );
+    delta = pown( fanout, level );
+
+    /* find my children */
+    for( i = 0; i < fanout; i++ ) {
+        schild = shiftedrank + delta * (i+1);
+        if( schild < size ) {
+            tree->tree_next[i] = (schild+root)%size;
+            tree->tree_nextsize = tree->tree_nextsize + 1;
+        } else {
+            break;
+        }
+    }
+    
+    /* find my parent */
+    slimit = calculate_num_nodes_up_to_level( fanout, level );
+    sparent = shiftedrank;
+    if( sparent < fanout ) {
+        sparent = 0;
+    } else {
+        while( sparent >= slimit ) {
+            sparent -= delta/fanout;
+        }
+    }
+    tree->tree_prev = (sparent+root)%size;
+  
+    return tree;
+}
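+
+/*
+ * Example (fanout = 2, size = 7, root = 0): rank 1 has shiftedrank 1,
+ * level 1 and delta 2, so its children are tree_next = {3, 5} and its
+ * parent is tree_prev = 0, matching the diagram above.
+ */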
+
+int
+smpi_coll_tuned_bcast_ompi_split_bintree ( void* buffer,
+                                            int count, 
+                                            MPI_Datatype datatype, 
+                                            int root,
+                                            MPI_Comm comm)
+{
+    int segsize ;
+    int rank, size;
+    int segindex, i, lr, pair;
+    int segcount[2];       /* number of elements sent with each segment */
+    uint32_t counts[2];
+    int num_segments[2];   /* number of segments */
+    int sendcount[2];      /* same as segcount, except for the last segment */
+    size_t realsegsize[2];
+    char *tmpbuf[2];
+    size_t type_size;
+    ptrdiff_t type_extent;
+    
+    
+    MPI_Request base_req, new_req;
+    ompi_coll_tree_t *tree;
+//    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
+//    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
+
+    size = smpi_comm_size(comm);
+    rank = smpi_comm_rank(comm);
+
+
+    /* recompute the segment size the OMPI selector would have chosen */
+    const size_t intermediate_message_size = 370728;
+    size_t message_size = smpi_datatype_size(datatype) * (unsigned long)count;
+    if(message_size < intermediate_message_size)
+      segsize = 1024;
+    else
+      segsize = 1024 << 3;
+
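+    /* e.g. a 100 000-byte message falls below the 370 728-byte threshold
+       and is cut into 1 KiB segments; larger messages use 8 KiB segments */
+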
+    XBT_DEBUG("smpi_coll_tuned_bcast_ompi_split_bintree rank %d root %d ss %5d", rank, root, segsize);
+
+    if (size == 1) {
+        return MPI_SUCCESS;
+    }
+
+    /* setup the binary tree topology. */
+    tree = ompi_coll_tuned_topo_build_tree(2,comm,root);
+
+    type_size = smpi_datatype_size( datatype );
+
+    /* Determine number of segments and number of elements per segment */
+    counts[0] = count/2;
+    if (count % 2 != 0) counts[0]++;
+    counts[1] = count - counts[0];
+    if ( segsize > 0 ) {
+        /* Note that smpi_datatype_size() never returns a negative value
+           in type_size; datatype sizes are ints [vs. an unsigned type]
+           because of the MPI spec. */
+        if (segsize < ((uint32_t) type_size)) {
+            segsize = type_size; /* push segsize up to hold one type */
+        }
+        segcount[0] = segcount[1] = segsize / type_size; 
+        num_segments[0] = counts[0]/segcount[0];
+        if ((counts[0] % segcount[0]) != 0) num_segments[0]++;
+        num_segments[1] = counts[1]/segcount[1];
+        if ((counts[1] % segcount[1]) != 0) num_segments[1]++;
+    } else {
+        segcount[0]     = counts[0];
+        segcount[1]     = counts[1];
+        num_segments[0] = num_segments[1] = 1;
+    }
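+    /* worked example: count = 1000 MPI_INT elements (type_size = 4) gives
+       counts = {500, 500}; with segsize = 1024, segcount = {256, 256} and
+       num_segments = {2, 2} since 500 = 256 + 244 */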
+
+    /* if the message is too small to be split into segments */
+    if( (counts[0] == 0 || counts[1] == 0) ||
+        (segsize > counts[0] * type_size) ||
+        (segsize > counts[1] * type_size) ) {
+        /* message too small to be pipelined: free the tree and fall back
+           to the linear broadcast */
+        free(tree);
+        return (smpi_coll_tuned_bcast_SMP_linear ( buffer, count, datatype,
+                                                   root, comm));
+    }
+    type_extent = smpi_datatype_get_extent(datatype);
+
+    
+    /* Determine real segment size */
+    realsegsize[0] = segcount[0] * type_extent;
+    realsegsize[1] = segcount[1] * type_extent;
+  
+    /* set the buffer pointers */
+    tmpbuf[0] = (char *) buffer;
+    tmpbuf[1] = (char *) buffer+counts[0] * type_extent;
+
+    /* Step 1:
+       The root splits the buffer in two and sends a segmented half down
+       each branch: the left subtree receives the first half of the buffer
+       and the right subtree the remaining half.
+    */
+
+    /* determine if I am left (0) or right (1), (root is right) */
+    lr = ((rank + size - root)%size + 1)%2;
+  
+    /* root code */
+    if( rank == root ) {
+        /* determine segment count */
+        sendcount[0] = segcount[0]; 
+        sendcount[1] = segcount[1];
+        /* for each segment */
+        for (segindex = 0; segindex < num_segments[0]; segindex++) {
+            /* for each child */
+            for( i = 0; i < tree->tree_nextsize && i < 2; i++ ) {
+                if (segindex >= num_segments[i]) { /* no more segments */
+                    continue;
+                }
+                /* determine how many elements are being sent in this round */
+                if(segindex == (num_segments[i] - 1)) 
+                    sendcount[i] = counts[i] - segindex*segcount[i];
+                /* send data */
+                smpi_mpi_send(tmpbuf[i], sendcount[i], datatype,
+                                  tree->tree_next[i], 777, comm);
+                /* update tmp buffer */
+                tmpbuf[i] += realsegsize[i];
+            }
+        }
+    } 
+    
+    /* intermediate nodes code */
+    else if( tree->tree_nextsize > 0 ) { 
+        /* Intermediate nodes:
+         * An intermediate node receives segments from only one half of the
+         * data; which half is determined by whether the node belongs to the
+         * "left" or "right" subtree. The topology-building function builds
+         * the binary tree so that odd "shifted ranks" ((rank + size - root)
+         * % size) fall in the left subtree and even ones in the right.
+         *
+         * Create the pipeline: post the first receive, then in each loop
+         * iteration post the next receive, wait for the previous receive to
+         * complete, and disseminate the data to all children.
+         */
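+        /* pipeline trace for num_segments[lr] = 3: post irecv(seg 0);
+           loop iteration 1: post irecv(seg 1), wait(seg 0), send seg 0 to
+           the children; iteration 2: post irecv(seg 2), wait(seg 1), send
+           seg 1; after the loop: wait(seg 2), send seg 2 */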
+        sendcount[lr] = segcount[lr];
+        base_req=smpi_mpi_irecv(tmpbuf[lr], sendcount[lr], datatype,
+                           tree->tree_prev, 777,
+                           comm);
+
+        for( segindex = 1; segindex < num_segments[lr]; segindex++ ) {
+            /* determine how many elements to expect in this round */
+            if( segindex == (num_segments[lr] - 1)) 
+                sendcount[lr] = counts[lr] - segindex*segcount[lr];
+            /* post new irecv */
+            new_req = smpi_mpi_irecv( tmpbuf[lr] + realsegsize[lr], sendcount[lr],
+                                datatype, tree->tree_prev, 777, 
+                                comm);
+
+            /* wait for and forward current segment */
+            smpi_mpi_waitall( 1, &base_req, MPI_STATUSES_IGNORE );
+            for( i = 0; i < tree->tree_nextsize; i++ ) {  /* send data to children (segcount[lr]) */
+                smpi_mpi_send( tmpbuf[lr], segcount[lr], datatype,
+                                   tree->tree_next[i], 777,
+                                   comm);
+            } /* end of for each child */
+
+            /* update the base request */
+            base_req = new_req;
+            /* advance to the next buffer (i.e. the one for the next recv) */
+            tmpbuf[lr] += realsegsize[lr];
+        } /* end of for segindex */
+
+        /* wait for the last segment and forward it to the children */
+        smpi_mpi_waitall( 1, &base_req, MPI_STATUSES_IGNORE );
+        for( i = 0; i < tree->tree_nextsize; i++ ) {  /* send data to children */
+            smpi_mpi_send(tmpbuf[lr], sendcount[lr], datatype,
+                              tree->tree_next[i], 777, comm);
+        } /* end of for each child */
+    } 
+  
+    /* leaf nodes */
+    else { 
+        /* Just consume segments as fast as possible */
+        sendcount[lr] = segcount[lr];
+        for (segindex = 0; segindex < num_segments[lr]; segindex++) {
+            /* determine how many elements to expect in this round */
+            if (segindex == (num_segments[lr] - 1)) sendcount[lr] = counts[lr] - segindex*segcount[lr];
+            /* receive segments */
+            smpi_mpi_recv(tmpbuf[lr], sendcount[lr], datatype,
+                              tree->tree_prev, 777,
+                              comm, MPI_STATUS_IGNORE);
+            /* update the initial pointer to the buffer */
+            tmpbuf[lr] += realsegsize[lr];
+        }
+    }
+
+    /* reset the buffer pointers */
+    tmpbuf[0] = (char *) buffer;
+    tmpbuf[1] = (char *) buffer+counts[0] * type_extent;
+
+    /* Step 2:
+       Find the immediate pair (the matching node in the opposite subtree)
+       and exchange data buffers with it via sendrecv.
+       The tree-building function ensures that, for every non-root node:
+       - in the left subtree (lr == 0) the pair is (rank+1)%size,
+       - in the right subtree (lr == 1) the pair is (rank-1)%size.
+       If the number of nodes is even, rank (size-1) pairs up with the root.
+    */
+    if (lr == 0) {
+        pair = (rank+1)%size;
+    } else {
+        pair = (rank+size-1)%size;
+    }
+
+    if ( (size%2) != 0 && rank != root) { 
+
+        smpi_mpi_sendrecv( tmpbuf[lr], counts[lr], datatype,
+                                        pair, 777,
+                                        tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
+                                        pair, 777,
+                                        comm, MPI_STATUS_IGNORE);
+    } else if ( (size%2) == 0 ) {
+        /* root sends right buffer to the last node */
+        if( rank == root ) {
+            smpi_mpi_send(tmpbuf[1], counts[1], datatype,
+                              (root+size-1)%size, 777, comm);
+
+        } 
+        /* last node receives right buffer from the root */
+        else if (rank == (root+size-1)%size) {
+            smpi_mpi_recv(tmpbuf[1], counts[1], datatype,
+                              root, 777,
+                              comm, MPI_STATUS_IGNORE);
+        } 
+        /* everyone else exchanges buffers */
+        else {
+            smpi_mpi_sendrecv( tmpbuf[lr], counts[lr], datatype,
+                                            pair, 777,
+                                            tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
+                                            pair, 777,
+                                            comm, MPI_STATUS_IGNORE); 
+        }
+    }
+    free(tree);
+    return (MPI_SUCCESS);
+}
+
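As a quick sanity check of the left/right split and the Step 2 pairing rule above, here is a minimal standalone sketch (plain C, no MPI; the size = 7 and root = 0 values are arbitrary) that prints each rank's subtree and exchange partner:

    #include <stdio.h>

    int main(void)
    {
        const int size = 7, root = 0;             /* illustrative values */
        for (int rank = 0; rank < size; rank++) {
            /* same formulas as smpi_coll_tuned_bcast_ompi_split_bintree */
            int lr   = ((rank + size - root) % size + 1) % 2;  /* 0 = left */
            int pair = (lr == 0) ? (rank + 1) % size
                                 : (rank + size - 1) % size;
            printf("rank %d: %s subtree, pair %d\n",
                   rank, lr ? "right" : "left", pair);
        }
        return 0;
    }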
index 54647a0..ddbf183 100644
@@ -165,7 +165,8 @@ COLL_APPLY(action, COLL_BCAST_SIG, scatter_rdb_allgather) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, SMP_binary) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, SMP_binomial) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, SMP_linear) COLL_sep \
-COLL_APPLY(action, COLL_BCAST_SIG, ompi)
+COLL_APPLY(action, COLL_BCAST_SIG, ompi) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, ompi_split_bintree)
 
 COLL_BCASTS(COLL_PROTO, COLL_NOsep)
 
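The colls.h hunk above plugs the new algorithm into an X-macro table: every consumer of COLL_BCASTS expands each COLL_APPLY entry once, so one added line yields both a prototype and a selector entry. A simplified, self-contained model of the pattern (the macro names BCASTS, PROTO, NAME and COMMA are shortened stand-ins, not the actual SimGrid macros):

    #include <stdio.h>

    /* the table: one entry per broadcast algorithm */
    #define BCASTS(action, sep) \
        action(binomial_tree) sep \
        action(ompi) sep \
        action(ompi_split_bintree)

    /* one expansion generates prototypes ... */
    #define PROTO(name) int bcast_##name(void);
    #define NOSEP
    BCASTS(PROTO, NOSEP)

    /* ... another builds the name list a runtime selector can search */
    #define NAME(name) #name
    #define COMMA ,
    static const char *bcast_names[] = { BCASTS(NAME, COMMA) };

    int bcast_binomial_tree(void)      { return 0; }
    int bcast_ompi(void)               { return 0; }
    int bcast_ompi_split_bintree(void) { return 0; }

    int main(void)
    {
        for (unsigned i = 0; i < sizeof bcast_names / sizeof *bcast_names; i++)
            printf("%s\n", bcast_names[i]);
        return 0;
    }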
index 6b46373..13e5a41 100644
@@ -129,7 +129,7 @@ int smpi_coll_tuned_bcast_ompi(void *buff, int count,
 {
     /* Decision function based on MX results for 
        messages up to 36MB and communicator sizes up to 64 nodes */
-    //const size_t small_message_size = 2048;
+    const size_t small_message_size = 2048;
     const size_t intermediate_message_size = 370728;
     //const double a_p16  = 3.2118e-6; /* [1 / byte] */
     //const double b_p16  = 8.7936;   
@@ -138,11 +138,11 @@ int smpi_coll_tuned_bcast_ompi(void *buff, int count,
     //const double a_p128 = 1.6134e-6; /* [1 / byte] */
     //const double b_p128 = 2.1102;
 
-    //int communicator_size;
+    int communicator_size;
     //int segsize = 0;
     size_t message_size, dsize;
 
-    //communicator_size = smpi_comm_size(comm);
+    communicator_size = smpi_comm_size(comm);
 
     /* else we need data size for decision function */
     dsize = smpi_datatype_size(datatype);
@@ -150,21 +150,17 @@ int smpi_coll_tuned_bcast_ompi(void *buff, int count,
 
     /* Handle messages of small and intermediate size, and 
        single-element broadcasts */
-    if ((message_size < /*small_message_size*/intermediate_message_size) || (count <= 1)) {
+    if ((message_size < small_message_size) || (count <= 1)) {
         /* Binomial without segmentation */
-        //segsize = 0;
         return  smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype, 
-                                                      root, comm/*
-                                                      segsize*/);
+                                                      root, comm);
 
-    } /*else if (message_size < intermediate_message_size) {
+    } else if (message_size < intermediate_message_size) {
         // SplittedBinary with 1KB segments
-        segsize = 1024;
-        return smpi_coll_tuned_bcast_split_bintree(buff, count, datatype, 
-                                                         root, comm
-                                                         segsize);
+        return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype, 
+                                                         root, comm);
 
-    } 
+    } /*
      Handle large message sizes 
     else if (communicator_size < (a_p128 * message_size + b_p128)) {
          Pipeline with 128KB segments 
@@ -173,14 +169,12 @@ int smpi_coll_tuned_bcast_ompi(void *buff, int count,
                                                      root, comm, module,
                                                      segsize);
 
-    } else if (communicator_size < 13) {
+    }*/ else if (communicator_size < 13) {
         // Split Binary with 8KB segments 
-        segsize = 1024 << 3;
-        return smpi_coll_tuned_bcast_intra_split_bintree(buff, count, datatype, 
-                                                         root, comm, module,
-                                                         segsize);
+        return smpi_coll_tuned_bcast_ompi_split_bintree(buff, count, datatype, 
+                                                         root, comm);
        
-    } else if (communicator_size < (a_p64 * message_size + b_p64)) {
+    } /*else if (communicator_size < (a_p64 * message_size + b_p64)) {
         // Pipeline with 64KB segments 
         segsize = 1024 << 6;
         return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,