AND Algorithmique Numérique Distribuée

Public GIT Repository
add allgatherv algo from ompi
author    Augustin Degomme <degomme@idpann.imag.fr>
          Tue, 11 Jun 2013 13:26:33 +0000 (15:26 +0200)
committer Augustin Degomme <degomme@idpann.imag.fr>
          Tue, 11 Jun 2013 15:53:59 +0000 (17:53 +0200)
buildtools/Cmake/AddTests.cmake
buildtools/Cmake/DefinePackages.cmake
src/smpi/colls/allgatherv-ompi-neighborexchange.c [new file with mode: 0644]
src/smpi/colls/colls.h
src/smpi/colls/smpi_openmpi_selector.c

diff --git a/buildtools/Cmake/AddTests.cmake b/buildtools/Cmake/AddTests.cmake
index e7c8abb..bef08ed 100644
@@ -376,7 +376,7 @@ if(NOT enable_memcheck)
                            smp_simple spreading_simple ompi ompi_neighborexchange)
         ADD_TEST(smpi-allgather-coll-${ALLGATHER_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/allgather:${ALLGATHER_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/allgather_coll.tesh)
     ENDFOREACH()
-    FOREACH (ALLGATHERV_COLL default GB pair ring ompi)
+    FOREACH (ALLGATHERV_COLL default GB pair ring ompi ompi_neighborexchange)
         ADD_TEST(smpi-allgatherv-coll-${ALLGATHERV_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/allgatherv:${ALLGATHERV_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/allgatherv_coll.tesh)
     ENDFOREACH()
     FOREACH (ALLREDUCE_COLL default lr NTS rab1 rab2 rab_rdb
diff --git a/buildtools/Cmake/DefinePackages.cmake b/buildtools/Cmake/DefinePackages.cmake
index ce0f758..2e31a12 100644
@@ -132,6 +132,7 @@ set(SMPI_SRC
   src/smpi/colls/allgatherv-GB.c  
   src/smpi/colls/allgatherv-pair.c
   src/smpi/colls/allgatherv-ring.c
+  src/smpi/colls/allgatherv-ompi-neighborexchange.c
   src/smpi/colls/allreduce-lr.c
   src/smpi/colls/allreduce-NTS.c
   src/smpi/colls/allreduce-rab1.c
diff --git a/src/smpi/colls/allgatherv-ompi-neighborexchange.c b/src/smpi/colls/allgatherv-ompi-neighborexchange.c
new file mode 100644
index 0000000..f80e681
--- /dev/null
+++ b/src/smpi/colls/allgatherv-ompi-neighborexchange.c
@@ -0,0 +1,214 @@
+
+/*
+ * ompi_coll_tuned_allgatherv_intra_neighborexchange
+ *
+ * Function:     allgatherv using N/2 steps (O(N))
+ * Accepts:      Same arguments as MPI_Allgatherv
+ * Returns:      MPI_SUCCESS or error code
+ *
+ * Description:  Neighbor Exchange algorithm for allgather adapted for 
+ *               allgatherv.
+ *               Described by Chen et al. in
+ *               "Performance Evaluation of Allgather Algorithms on
+ *                Terascale Linux Cluster with Fast Ethernet",
+ *               Proceedings of the Eighth International Conference on
+ *               High-Performance Computing in Asia-Pacific Region
+ *               (HPCASIA'05), 2005
+ * 
+ *               Rank r exchanges a message with one of its neighbors and
+ *               forwards the received data further in the next step.
+ *
+ *               No additional memory requirements.
+ * 
+ * Limitations:  The algorithm works only with an even number of processes;
+ *               for an odd number of processes we switch to the ring
+ *               algorithm. After the initial single-block exchange every
+ *               rank gains two further blocks per step, so all size blocks
+ *               are collected in size/2 steps.
+ *
+ * Example on 6 nodes:
+ *  Initial state
+ *    #     0      1      2      3      4      5
+ *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]
+ *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]
+ *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]
+ *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]
+ *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]
+ *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]
+ *   Step 0:
+ *    #     0      1      2      3      4      5
+ *         [0]    [0]    [ ]    [ ]    [ ]    [ ]
+ *         [1]    [1]    [ ]    [ ]    [ ]    [ ]
+ *         [ ]    [ ]    [2]    [2]    [ ]    [ ]
+ *         [ ]    [ ]    [3]    [3]    [ ]    [ ]
+ *         [ ]    [ ]    [ ]    [ ]    [4]    [4]
+ *         [ ]    [ ]    [ ]    [ ]    [5]    [5]
+ *   Step 1:
+ *    #     0      1      2      3      4      5
+ *         [0]    [0]    [0]    [ ]    [ ]    [0]
+ *         [1]    [1]    [1]    [ ]    [ ]    [1]
+ *         [ ]    [2]    [2]    [2]    [2]    [ ]
+ *         [ ]    [3]    [3]    [3]    [3]    [ ]
+ *         [4]    [ ]    [ ]    [4]    [4]    [4]
+ *         [5]    [ ]    [ ]    [5]    [5]    [5]
+ *   Step 2:
+ *    #     0      1      2      3      4      5
+ *         [0]    [0]    [0]    [0]    [0]    [0]
+ *         [1]    [1]    [1]    [1]    [1]    [1]
+ *         [2]    [2]    [2]    [2]    [2]    [2]
+ *         [3]    [3]    [3]    [3]    [3]    [3]
+ *         [4]    [4]    [4]    [4]    [4]    [4]
+ *         [5]    [5]    [5]    [5]    [5]    [5]
+ */
+#include "colls_private.h"
+#define MCA_COLL_BASE_TAG_ALLGATHERV 444
+int 
+smpi_coll_tuned_allgatherv_ompi_neighborexchange(void *sbuf, int scount,
+                                                  MPI_Datatype sdtype,
+                                                  void* rbuf, int *rcounts, int *rdispls,
+                                                  MPI_Datatype rdtype,
+                                                  MPI_Comm comm)
+{
+    int line = -1;
+    int rank, size;
+    int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
+  
+    int i, even_rank;
+    int err = 0;
+    ptrdiff_t slb, rlb, sext, rext;
+    char *tmpsend = NULL, *tmprecv = NULL;
+
+
+    size = smpi_comm_size(comm);
+    rank = smpi_comm_rank(comm);
+
+    if (size % 2) {
+        XBT_DEBUG(
+                     "coll:tuned:allgatherv_ompi_neighborexchange WARNING: odd size %d, switching to ring algorithm", 
+                     size);
+        return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
+                                                     rbuf, rcounts, 
+                                                     rdispls, rdtype,
+                                                     comm);
+    }
+
+    XBT_DEBUG(
+                 "coll:tuned:allgatherv_ompi_neighborexchange rank %d", rank);
+
+    err = smpi_datatype_extent (sdtype, &slb, &sext);
+    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+    err = smpi_datatype_extent (rdtype, &rlb, &rext);
+    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+    /* Initialization step:
+       - if the send buffer is not MPI_IN_PLACE, copy it into the
+         appropriate block of the receive buffer.
+    */
+    tmprecv = (char*) rbuf + rdispls[rank] * rext;
+    if (MPI_IN_PLACE != sbuf) {
+        tmpsend = (char*) sbuf;
+        err = smpi_datatype_copy(tmpsend, scount, sdtype, 
+                              tmprecv, rcounts[rank], rdtype);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl;  }
+    } 
+
+    /* Determine neighbors, order in which blocks will arrive, etc. */
+    even_rank = !(rank % 2);
+    if (even_rank) {
+        neighbor[0] = (rank + 1) % size;
+        neighbor[1] = (rank - 1 + size) % size;
+        recv_data_from[0] = rank;
+        recv_data_from[1] = rank;
+        offset_at_step[0] = (+2);
+        offset_at_step[1] = (-2);
+    } else {
+        neighbor[0] = (rank - 1 + size) % size;
+        neighbor[1] = (rank + 1) % size;
+        recv_data_from[0] = neighbor[0];
+        recv_data_from[1] = neighbor[0];
+        offset_at_step[0] = (-2);
+        offset_at_step[1] = (+2);
+    }
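+    /* Even ranks pair first with rank+1, odd ranks with rank-1; later steps
+       alternate between the two neighbors. recv_data_from[p] names the first
+       of the two consecutive blocks expected from neighbor[p] at steps of
+       parity p, advanced by offset_at_step[p] each time. */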
+
+    /* Communication loop:
+       - First step is special: exchange a single block with neighbor[0].
+       - Rest of the steps:
+         update recv_data_from according to offset, and
+         exchange two blocks with the appropriate neighbor;
+         the send location becomes the previous receive location.
+       Note: we need to create an indexed datatype to send and receive these
+       blocks properly.
+    */
+    tmprecv = (char*)rbuf + rdispls[neighbor[0]] * rext;
+    tmpsend = (char*)rbuf + rdispls[rank] * rext;
+    smpi_mpi_sendrecv(tmpsend, rcounts[rank], rdtype, 
+                                   neighbor[0], MCA_COLL_BASE_TAG_ALLGATHERV,
+                                   tmprecv, rcounts[neighbor[0]], rdtype, 
+                                   neighbor[0], MCA_COLL_BASE_TAG_ALLGATHERV,
+                                   comm, MPI_STATUS_IGNORE);
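+    /* After this first exchange each pair {2k, 2k+1} of ranks holds both
+       of the pair's blocks ("Step 0" in the diagram above). */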
+
+    /* Determine initial sending counts and displacements */
+    if (even_rank) {
+        send_data_from = rank;
+    } else {
+        send_data_from = recv_data_from[0];
+    }
+
+    for (i = 1; i < (size / 2); i++) {
+        MPI_Datatype new_rdtype, new_sdtype;
+        int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
+        const int i_parity = i % 2;
+        recv_data_from[i_parity] = 
+            (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;
+
+        /* Create new indexed types for sending and receiving.
+           We are sending data from ranks (send_data_from) and (send_data_from+1).
+           We are receiving data from ranks (recv_data_from[i_parity]) and
+           (recv_data_from[i_parity]+1).
+        */
+        
+        new_scounts[0] = rcounts[send_data_from];
+        new_scounts[1] = rcounts[(send_data_from + 1)];
+        new_sdispls[0] = rdispls[send_data_from];
+        new_sdispls[1] = rdispls[(send_data_from + 1)];
+        err = smpi_datatype_indexed(2, new_scounts, new_sdispls, rdtype, 
+                                      &new_sdtype);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+        smpi_datatype_commit(&new_sdtype);
+
+        new_rcounts[0] = rcounts[recv_data_from[i_parity]];
+        new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
+        new_rdispls[0] = rdispls[recv_data_from[i_parity]];
+        new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
+        err = smpi_datatype_indexed(2, new_rcounts, new_rdispls, rdtype, 
+                                      &new_rdtype);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+        smpi_datatype_commit(&new_rdtype);
+      
+        tmprecv = (char*)rbuf;
+        tmpsend = (char*)rbuf;
+      
+        /* Sendreceive */
+        smpi_mpi_sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
+                                       MCA_COLL_BASE_TAG_ALLGATHERV,
+                                       tmprecv, 1, new_rdtype, neighbor[i_parity],
+                                       MCA_COLL_BASE_TAG_ALLGATHERV,
+                                       comm, MPI_STATUS_IGNORE);
+
+        send_data_from = recv_data_from[i_parity];
+      
+        smpi_datatype_free(&new_sdtype);
+        smpi_datatype_free(&new_rdtype);
+    }
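+    /* One initial exchange plus (size/2 - 1) loop iterations: size/2
+       communication steps in total, as advertised above. */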
+
+    return MPI_SUCCESS;
+
+ err_hndl:
+    XBT_DEBUG(  "%s:%4d\tError occurred %d, rank %2d",
+                 __FILE__, line, err, rank);
+    return err;
+}
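
For context, here is a minimal sketch, not part of this commit, of user code that exercises the new collective under SMPI: a plain MPI_Allgatherv with uneven per-rank counts. The tesh tests added above force the implementation with "--cfg smpi/allgatherv:ompi_neighborexchange"; with smpirun the equivalent option is assumed to be "--cfg=smpi/allgatherv:ompi_neighborexchange". The file name allgatherv_demo.c is hypothetical.

/* allgatherv_demo.c: rank r contributes r+1 ints, so counts are uneven. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
    int rank, size, i, total = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int scount = rank + 1;
    int *sbuf = malloc(scount * sizeof(int));
    for (i = 0; i < scount; i++)
        sbuf[i] = rank;

    /* Every rank computes the same counts/displacements vectors. */
    int *rcounts = malloc(size * sizeof(int));
    int *rdispls = malloc(size * sizeof(int));
    for (i = 0; i < size; i++) {
        rcounts[i] = i + 1;
        rdispls[i] = total;
        total += rcounts[i];
    }
    int *rbuf = malloc(total * sizeof(int));

    MPI_Allgatherv(sbuf, scount, MPI_INT,
                   rbuf, rcounts, rdispls, MPI_INT, MPI_COMM_WORLD);

    if (rank == 0) {
        for (i = 0; i < total; i++)
            printf("%d ", rbuf[i]);   /* expect 0 1 1 2 2 2 ... */
        printf("\n");
    }

    free(sbuf); free(rcounts); free(rdispls); free(rbuf);
    MPI_Finalize();
    return 0;
}

Note that the selector below only reaches smpi_coll_tuned_allgatherv_ompi_neighborexchange for even communicator sizes; on odd sizes the function itself falls back to smpi_coll_tuned_allgatherv_ring.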
diff --git a/src/smpi/colls/colls.h b/src/smpi/colls/colls.h
index 6743650..ee51cac 100644
@@ -62,7 +62,8 @@ COLL_ALLGATHERS(COLL_PROTO, COLL_NOsep)
 COLL_APPLY(action, COLL_ALLGATHERV_SIG, GB) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHERV_SIG, pair) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHERV_SIG, ring) COLL_sep \
-COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi)
+COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi_neighborexchange)
 
 COLL_ALLGATHERVS(COLL_PROTO, COLL_NOsep)
 
diff --git a/src/smpi/colls/smpi_openmpi_selector.c b/src/smpi/colls/smpi_openmpi_selector.c
index 2ea07e3..36f901f 100644
@@ -448,17 +448,17 @@ int smpi_coll_tuned_allgather_ompi(void *sbuf, int scount,
        - for everything else use ring.
     */
     if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
-        return smpi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype, 
+        return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype, 
                                                                  rbuf, rcount, rdtype, 
-                                                                 comm, module);
+                                                                 comm);
     } else if (total_dsize <= 81920) { 
-        return smpi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype, 
+        return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype, 
                                                      rbuf, rcount, rdtype,
-                                                     comm, module);
+                                                     comm);
     } 
-    return smpi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype, 
+    return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype, 
                                                 rbuf, rcount, rdtype,
-                                                comm, module);
+                                                comm);
 #endif  /* defined(USE_MPICH2_DECISION) */
 }
 
@@ -500,15 +500,15 @@ int smpi_coll_tuned_allgatherv_ompi(void *sbuf, int scount,
                                                       comm);
 
     } else {
-//        if (communicator_size % 2) {
+        if (communicator_size % 2) {
             return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype, 
                                                          rbuf, rcounts, rdispls, rdtype, 
                                                          comm);
-/*        } else {
-            return  smpi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
+        } else {
+            return  smpi_coll_tuned_allgatherv_ompi_neighborexchange(sbuf, scount, sdtype,
                                                                       rbuf, rcounts, rdispls, rdtype, 
-                                                                      comm, module);
-        }*/
+                                                                      comm);
+        }
     }
 }
 /*