AND (Algorithmique Numérique Distribuée)

Public GIT Repository
add new allgather algo from ompi
author     Augustin Degomme <degomme@idpann.imag.fr>
           Tue, 11 Jun 2013 09:02:21 +0000 (11:02 +0200)
committer  Augustin Degomme <degomme@idpann.imag.fr>
           Tue, 11 Jun 2013 15:53:59 +0000 (17:53 +0200)
buildtools/Cmake/AddTests.cmake
buildtools/Cmake/DefinePackages.cmake
src/smpi/colls/allgather-ompi-neighborexchange.c [new file with mode: 0644]
src/smpi/colls/colls.h
src/smpi/colls/smpi_openmpi_selector.c

index 139057c..e7c8abb 100644
@@ -373,7 +373,7 @@ if(NOT enable_memcheck)
 
     FOREACH (ALLGATHER_COLL default  2dmesh 3dmesh bruck GB loosely_lr lr
                            NTSLR NTSLR_NB pair rdb  rhv ring SMP_NTS
-                           smp_simple spreading_simple ompi)
+                           smp_simple spreading_simple ompi ompi_neighborexchange)
         ADD_TEST(smpi-allgather-coll-${ALLGATHER_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/allgather:${ALLGATHER_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/allgather_coll.tesh)
     ENDFOREACH()
     FOREACH (ALLGATHERV_COLL default GB pair ring ompi)
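The loop above only registers the tesh tests. For a regular SMPI run, the same algorithm can be requested through the smpi/allgather configuration item; a minimal invocation sketch (hostfile, platform file and binary names are placeholders, not part of this commit):

  smpirun -np 16 -hostfile ./hostfile -platform ./platform.xml \
          --cfg=smpi/allgather:ompi_neighborexchange ./allgather_test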
index 866eb45..ce0f758 100644
@@ -128,6 +128,7 @@ set(SMPI_SRC
   src/smpi/colls/allgather-SMP-NTS.c
   src/smpi/colls/allgather-smp-simple.c
   src/smpi/colls/allgather-spreading-simple.c
+  src/smpi/colls/allgather-ompi-neighborexchange.c
   src/smpi/colls/allgatherv-GB.c  
   src/smpi/colls/allgatherv-pair.c
   src/smpi/colls/allgatherv-ring.c
diff --git a/src/smpi/colls/allgather-ompi-neighborexchange.c b/src/smpi/colls/allgather-ompi-neighborexchange.c
new file mode 100644
index 0000000..238163e
--- /dev/null
@@ -0,0 +1,175 @@
+/*
+ * ompi_coll_tuned_allgather_intra_neighborexchange
+ *
+ * Function:     allgather using N/2 steps (O(N))
+ * Accepts:      Same arguments as MPI_Allgather
+ * Returns:      MPI_SUCCESS or error code
+ *
+ * Description:  Neighbor Exchange algorithm for allgather.
+ *               Described by Chen et al. in
+ *               "Performance Evaluation of Allgather Algorithms on 
+ *                Terascale Linux Cluster with Fast Ethernet",
+ *               Proceedings of the Eighth International Conference on 
+ *               High-Performance Computing in Asia-Pacific Region
+ *               (HPCASIA'05), 2005
+ * 
+ *               Rank r exchanges message with one of its neighbors and
+ *               forwards the data further in the next step.
+ *
+ *               No additional memory requirements.
+ * 
+ * Limitations:  The algorithm works only on an even number of processes.
+ *               For an odd number of processes we switch to the ring algorithm.
+ * 
+ * Example on 6 nodes:
+ *  Initial state
+ *    #     0      1      2      3      4      5
+ *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]
+ *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]
+ *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]
+ *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]
+ *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]
+ *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]
+ *   Step 0:
+ *    #     0      1      2      3      4      5
+ *         [0]    [0]    [ ]    [ ]    [ ]    [ ]
+ *         [1]    [1]    [ ]    [ ]    [ ]    [ ]
+ *         [ ]    [ ]    [2]    [2]    [ ]    [ ]
+ *         [ ]    [ ]    [3]    [3]    [ ]    [ ]
+ *         [ ]    [ ]    [ ]    [ ]    [4]    [4]
+ *         [ ]    [ ]    [ ]    [ ]    [5]    [5]
+ *   Step 1:
+ *    #     0      1      2      3      4      5
+ *         [0]    [0]    [0]    [ ]    [ ]    [0]
+ *         [1]    [1]    [1]    [ ]    [ ]    [1]
+ *         [ ]    [2]    [2]    [2]    [2]    [ ]
+ *         [ ]    [3]    [3]    [3]    [3]    [ ]
+ *         [4]    [ ]    [ ]    [4]    [4]    [4]
+ *         [5]    [ ]    [ ]    [5]    [5]    [5]
+ *   Step 2:
+ *    #     0      1      2      3      4      5
+ *         [0]    [0]    [0]    [0]    [0]    [0]
+ *         [1]    [1]    [1]    [1]    [1]    [1]
+ *         [2]    [2]    [2]    [2]    [2]    [2]
+ *         [3]    [3]    [3]    [3]    [3]    [3]
+ *         [4]    [4]    [4]    [4]    [4]    [4]
+ *         [5]    [5]    [5]    [5]    [5]    [5]
+ */
+ #include "colls_private.h"
+ #define MCA_COLL_BASE_TAG_ALLGATHER 555
+int 
+smpi_coll_tuned_allgather_ompi_neighborexchange(void *sbuf, int scount,
+                                                 MPI_Datatype sdtype,
+                                                 void* rbuf, int rcount,
+                                                 MPI_Datatype rdtype,
+                                                 MPI_Comm comm
+)
+{
+   int line = -1;
+   int rank, size;
+   int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
+   int i, even_rank;
+   int err = 0;
+   ptrdiff_t slb, rlb, sext, rext;
+   char *tmpsend = NULL, *tmprecv = NULL;
+
+   size = smpi_comm_size(comm);
+   rank = smpi_comm_rank(comm);
+
+   if (size % 2) {
+      XBT_DEBUG(
+                   "coll:tuned:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm", 
+                   size);
+      return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
+                                                  rbuf, rcount, rdtype,
+                                                  comm);
+   }
+
+   XBT_DEBUG(
+                "coll:tuned:allgather_intra_neighborexchange rank %d", rank);
+
+   err = smpi_datatype_extent (sdtype, &slb, &sext);
+   if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+   err = smpi_datatype_extent (rdtype, &rlb, &rext);
+   if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+   /* Initialization step:
+      - if send buffer is not MPI_IN_PLACE, copy send buffer to appropriate block
+        of receive buffer
+   */
+   tmprecv = (char*) rbuf + rank * rcount * rext;
+   if (MPI_IN_PLACE != sbuf) {
+      tmpsend = (char*) sbuf;
+      smpi_datatype_copy (tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
+   } 
+
+   /* Determine neighbors, order in which blocks will arrive, etc. */
+   even_rank = !(rank % 2);
+   if (even_rank) {
+      neighbor[0] = (rank + 1) % size;
+      neighbor[1] = (rank - 1 + size) % size;
+      recv_data_from[0] = rank;
+      recv_data_from[1] = rank;
+      offset_at_step[0] = (+2);
+      offset_at_step[1] = (-2);
+   } else {
+      neighbor[0] = (rank - 1 + size) % size;
+      neighbor[1] = (rank + 1) % size;
+      recv_data_from[0] = neighbor[0];
+      recv_data_from[1] = neighbor[0];
+      offset_at_step[0] = (-2);
+      offset_at_step[1] = (+2);
+   }
+
+   /* Communication loop:
+      - First step is special: exchange a single block with neighbor[0].
+      - Rest of the steps: 
+        update recv_data_from according to offset, and 
+        exchange two blocks with appropriate neighbor.
+        The send location becomes the previous receive location.
+   */
+   tmprecv = (char*)rbuf + neighbor[0] * rcount * rext;
+   tmpsend = (char*)rbuf + rank * rcount * rext;
+   /* Sendreceive */
+   smpi_mpi_sendrecv(tmpsend, rcount, rdtype, neighbor[0],
+                                  MCA_COLL_BASE_TAG_ALLGATHER,
+                                  tmprecv, rcount, rdtype, neighbor[0],
+                                  MCA_COLL_BASE_TAG_ALLGATHER,
+                                  comm, MPI_STATUS_IGNORE);
+
+   /* Determine initial sending location */
+   if (even_rank) {
+      send_data_from = rank;
+   } else {
+      send_data_from = recv_data_from[0];
+   }
+
+   for (i = 1; i < (size / 2); i++) {
+      const int i_parity = i % 2;
+      recv_data_from[i_parity] = 
+         (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;
+
+      tmprecv = (char*)rbuf + recv_data_from[i_parity] * rcount * rext;
+      tmpsend = (char*)rbuf + send_data_from * rcount * rext;
+      
+      /* Sendreceive */
+      smpi_mpi_sendrecv(tmpsend, 2 * rcount, rdtype, 
+                                     neighbor[i_parity], 
+                                     MCA_COLL_BASE_TAG_ALLGATHER,
+                                     tmprecv, 2 * rcount, rdtype,
+                                     neighbor[i_parity],
+                                     MCA_COLL_BASE_TAG_ALLGATHER,
+                                     comm, MPI_STATUS_IGNORE);
+
+      send_data_from = recv_data_from[i_parity];
+   }
+
+   return MPI_SUCCESS;
+
+ err_hndl:
+   XBT_DEBUG( "%s:%4d\tError occurred %d, rank %2d",
+                __FILE__, line, err, rank);
+   return err;
+}
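The 6-node tables in the header comment can be regenerated mechanically. The following standalone sketch is not part of the commit: it only mirrors the neighbor/offset bookkeeping of the routine above (no MPI calls) and prints which blocks every rank sends and receives at each step, which makes it easy to cross-check the pattern for other even sizes.

/* trace_neighborexchange.c -- illustration only, not part of this commit.
 * Reproduces the neighbor/offset bookkeeping of
 * smpi_coll_tuned_allgather_ompi_neighborexchange for an even process count
 * and prints the block movements of every step. */
#include <stdio.h>

static void trace_rank(int rank, int size)
{
  int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
  int even_rank = !(rank % 2);

  if (even_rank) {
    neighbor[0] = (rank + 1) % size;
    neighbor[1] = (rank - 1 + size) % size;
    recv_data_from[0] = recv_data_from[1] = rank;
    offset_at_step[0] = +2;
    offset_at_step[1] = -2;
  } else {
    neighbor[0] = (rank - 1 + size) % size;
    neighbor[1] = (rank + 1) % size;
    recv_data_from[0] = recv_data_from[1] = neighbor[0];
    offset_at_step[0] = -2;
    offset_at_step[1] = +2;
  }

  /* Step 0: a single block is exchanged with neighbor[0]. */
  printf("rank %d step 0: block %d -> rank %d, block %d <- rank %d\n",
         rank, rank, neighbor[0], neighbor[0], neighbor[0]);
  send_data_from = even_rank ? rank : recv_data_from[0];

  /* Remaining size/2 - 1 steps: two contiguous blocks per exchange. */
  for (int i = 1; i < size / 2; i++) {
    int p = i % 2;
    recv_data_from[p] = (recv_data_from[p] + offset_at_step[p] + size) % size;
    printf("rank %d step %d: blocks %d,%d -> rank %d, blocks %d,%d <- rank %d\n",
           rank, i, send_data_from, send_data_from + 1, neighbor[p],
           recv_data_from[p], recv_data_from[p] + 1, neighbor[p]);
    send_data_from = recv_data_from[p];
  }
}

int main(void)
{
  for (int rank = 0; rank < 6; rank++)   /* same size as the example above */
    trace_rank(rank, 6);
  return 0;
}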
index 9af9ad3..6743650 100644
@@ -45,7 +45,8 @@ COLL_APPLY(action, COLL_ALLGATHER_SIG, ring) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHER_SIG, SMP_NTS) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHER_SIG, smp_simple) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHER_SIG, spreading_simple) COLL_sep \
-COLL_APPLY(action, COLL_ALLGATHER_SIG, ompi)
+COLL_APPLY(action, COLL_ALLGATHER_SIG, ompi) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHER_SIG, ompi_neighborexchange)
 
 COLL_ALLGATHERS(COLL_PROTO, COLL_NOsep)
 
index 55f9879..2ea07e3 100644
@@ -426,15 +426,15 @@ int smpi_coll_tuned_allgather_ompi(void *sbuf, int scount,
                                                          comm);
         }
     } else {
-        //if (communicator_size % 2) {
+        if (communicator_size % 2) {
             return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype, 
                                                         rbuf, rcount, rdtype, 
                                                         comm);
-        /*} else {
-            return  smpi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype,
+        } else {
+            return  smpi_coll_tuned_allgather_ompi_neighborexchange(sbuf, scount, sdtype,
                                                                      rbuf, rcount, rdtype,
-                                                                     comm, module);
-        }*/
+                                                                     comm);
+        }
     }
    
 #if defined(USE_MPICH2_DECISION)
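On the application side nothing changes: a plain MPI_Allgather call still enters this selector, which in the branch above now resolves to the neighbor-exchange routine for even communicator sizes and keeps the ring fallback for odd ones. A self-contained SMPI test program sketch (illustration only, not part of the commit):

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
  int rank, size;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int sendval = rank;                          /* one block of one int per rank */
  int *recvbuf = malloc(size * sizeof(int));   /* receives one block from every rank */

  /* Which algorithm serves this call (ring, neighbor exchange, ...) is decided
     by the selector or by --cfg=smpi/allgather:..., not by the application. */
  MPI_Allgather(&sendval, 1, MPI_INT, recvbuf, 1, MPI_INT, MPI_COMM_WORLD);

  if (rank == 0)
    printf("gathered %d blocks\n", size);

  free(recvbuf);
  MPI_Finalize();
  return 0;
}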