Add Bcast SMP Collectives from MVAPICH2
author     Augustin Degomme <augustin.degomme@imag.fr>
           Thu, 7 Aug 2014 22:31:40 +0000 (00:31 +0200)
committer  Augustin Degomme <augustin.degomme@imag.fr>
           Thu, 7 Aug 2014 22:31:48 +0000 (00:31 +0200)
There are still a few problems with the pipelined algorithm from Open MPI; for now, default to another algorithm for these pipelined variants.

buildtools/Cmake/AddTests.cmake
buildtools/Cmake/DefinePackages.cmake
src/smpi/colls/bcast-mvapich-smp.c [new file with mode: 0644]
src/smpi/colls/bcast-ompi-pipeline.c
src/smpi/colls/colls.h
src/smpi/colls/smpi_mvapich2_selector.c
src/smpi/colls/smpi_mvapich2_selector_stampede.h
src/smpi/smpi_comm.c

diff --git a/buildtools/Cmake/AddTests.cmake b/buildtools/Cmake/AddTests.cmake
index 54e669e..c3b7424 100644
@@ -403,7 +403,7 @@ IF(NOT enable_memcheck)
     ENDFOREACH()
     FOREACH (BCAST_COLL default arrival_pattern_aware arrival_pattern_aware_wait arrival_scatter
                         binomial_tree flattree flattree_pipeline NTSB NTSL NTSL_Isend scatter_LR_allgather
-                        scatter_rdb_allgather SMP_binary SMP_binomial SMP_linear ompi mpich ompi_split_bintree ompi_pipeline mvapich2 impi)
+                        scatter_rdb_allgather SMP_binary SMP_binomial SMP_linear ompi mpich ompi_split_bintree ompi_pipeline mvapich2 mvapich2_intra_node mvapich2_knomial_intra_node impi)
       ADD_TESH(tesh-smpi-bcast-coll-${BCAST_COLL} --cfg smpi/bcast:${BCAST_COLL} --setenv bindir=${CMAKE_BINARY_DIR}/teshsuite/smpi/bcast --cd ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/bcast bcast_coll.tesh)
     ENDFOREACH()
     FOREACH (REDUCE_COLL default arrival_pattern_aware binomial flat_tree NTSL scatter_gather ompi mpich ompi_chain ompi_binary ompi_basic_linear ompi_binomial ompi_in_order_binary mvapich2 mvapich2_knomial mvapich2_two_level impi rab)
diff --git a/buildtools/Cmake/DefinePackages.cmake b/buildtools/Cmake/DefinePackages.cmake
index 50cc1c5..c4ebec6 100644
@@ -210,6 +210,7 @@ set(SMPI_SRC
   src/smpi/colls/bcast-flattree.c
   src/smpi/colls/bcast-ompi-pipeline.c
   src/smpi/colls/bcast-ompi-split-bintree.c
+  src/smpi/colls/bcast-mvapich-smp.c
   src/smpi/colls/bcast-scatter-LR-allgather.c
   src/smpi/colls/bcast-scatter-rdb-allgather.c
   src/smpi/colls/coll_tuned_topo.c
diff --git a/src/smpi/colls/bcast-mvapich-smp.c b/src/smpi/colls/bcast-mvapich-smp.c
new file mode 100644
index 0000000..d40f2db
--- /dev/null
+++ b/src/smpi/colls/bcast-mvapich-smp.c
@@ -0,0 +1,388 @@
+/* Copyright (c) 2013-2014. The SimGrid Team.
+ * All rights reserved.                                                     */
+
+/* This program is free software; you can redistribute it and/or modify it
+ * under the terms of the license (GNU LGPL) which comes with this package. */
+
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2009 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ *
+ * Additional copyrights may follow
+ */
+ /* -*- Mode: C; c-basic-offset:4 ; -*- */
+/* Copyright (c) 2001-2014, The Ohio State University. All rights
+ * reserved.
+ *
+ * This file is part of the MVAPICH2 software package developed by the
+ * team members of The Ohio State University's Network-Based Computing
+ * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
+ *
+ * For detailed copyright and licensing information, please refer to the
+ * copyright file COPYRIGHT in the top level MVAPICH2 directory.
+ */
+/*
+ *
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "colls_private.h"
+
+
+extern int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
+                           int root, MPI_Comm comm_ptr);
+
+extern int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
+                                      int root, MPI_Comm comm_ptr);
+
+extern int zcpy_knomial_factor;
+extern int mv2_pipelined_zcpy_knomial_factor;
+extern int bcast_segment_size;
+extern int mv2_inter_node_knomial_factor;
+extern int mv2_intra_node_knomial_factor;
+extern int mv2_bcast_two_level_system_size;
+#define INTRA_NODE_ROOT 0
+
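+/* The MVAPICH2 entry points below are remapped onto existing SMPI
+ * collectives; the pipelined variants fall back to the MPICH
+ * implementation for now, as the Open MPI pipelined algorithm still
+ * misbehaves here (see the commit message). */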
+#define MPIR_Pipelined_Bcast_Zcpy_MV2 smpi_coll_tuned_bcast_mpich
+#define MPIR_Pipelined_Bcast_MV2 smpi_coll_tuned_bcast_mpich
+#define MPIR_Bcast_binomial_MV2 smpi_coll_tuned_bcast_binomial_tree
+#define MPIR_Bcast_scatter_ring_allgather_shm_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
+#define MPIR_Bcast_scatter_doubling_allgather_MV2 smpi_coll_tuned_bcast_scatter_rdb_allgather
+#define MPIR_Bcast_scatter_ring_allgather_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
+#define MPIR_Shmem_Bcast_MV2 smpi_coll_tuned_bcast_mpich
+#define MPIR_Bcast_tune_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
+#define MPIR_Bcast_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
+#define MPIR_Knomial_Bcast_intra_node_MV2 smpi_coll_tuned_bcast_mvapich2_knomial_intra_node
+#define MPIR_Bcast_intra_MV2 smpi_coll_tuned_bcast_mvapich2_intra_node
+
+#define mv2_bcast_two_level_system_size  64
+#define mv2_bcast_short_msg             16384
+#define mv2_bcast_large_msg            (512*1024)
+#define mv2_knomial_intra_node_threshold 131072
+#define mv2_scatter_rd_inter_leader_bcast 1
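+
+/* Message-size thresholds (in bytes), mirroring the MVAPICH2 defaults:
+ * when the two-level scheme is disabled, messages up to
+ * mv2_bcast_short_msg use a binomial tree; within the two-level scheme,
+ * the plain shared-memory broadcast is used up to
+ * mv2_knomial_intra_node_threshold and the k-nomial intra-node broadcast
+ * beyond it. */
+
+/* Inter-node phase of the two-level broadcast: the root first hands the
+ * buffer over to its node leader if the two differ, then the leaders
+ * broadcast among themselves over leader_comm. */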
+int smpi_coll_tuned_bcast_mvapich2_inter_node(void *buffer,
+                                                 int count,
+                                                 MPI_Datatype datatype,
+                                                 int root,
+                                                 MPI_Comm  comm)
+{
+    int rank;
+    int mpi_errno = MPI_SUCCESS;
+    MPI_Comm shmem_comm, leader_comm;
+    int local_rank, local_size, global_rank = -1;
+    int leader_root, leader_of_root;
+
+
+    rank = smpi_comm_rank(comm);
+    //comm_size = smpi_comm_size(comm);
+
+
+    if (MV2_Bcast_function==NULL){
+      MV2_Bcast_function=smpi_coll_tuned_bcast_mpich;
+    }
+    
+    if (MV2_Bcast_intra_node_function==NULL){
+      MV2_Bcast_intra_node_function= smpi_coll_tuned_bcast_mpich;
+    }
+    
+    if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
+      smpi_comm_init_smp(comm);
+    }
+    
+    shmem_comm = smpi_comm_get_intra_comm(comm);
+    local_rank = smpi_comm_rank(shmem_comm);
+    local_size = smpi_comm_size(shmem_comm);
+
+    leader_comm = smpi_comm_get_leaders_comm(comm);
+
+    if ((local_rank == 0) && (local_size > 1)) {
+      global_rank = smpi_comm_rank(leader_comm);
+    }
+
+    int* leaders_map = smpi_comm_get_leaders_map(comm);
+    leader_of_root = smpi_group_rank(smpi_comm_group(comm),leaders_map[root]);
+    leader_root = smpi_group_rank(smpi_comm_group(leader_comm),leaders_map[root]);
+    
+    
+    if (local_size > 1) {
+        if ((local_rank == 0) && (root != rank) && (leader_root == global_rank)) {
+            smpi_mpi_recv(buffer, count, datatype, root,
+                                     COLL_TAG_BCAST, comm, MPI_STATUS_IGNORE);
+        }
+        if ((local_rank != 0) && (root == rank)) {
+            smpi_mpi_send(buffer, count, datatype,
+                                     leader_of_root, COLL_TAG_BCAST, comm);
+        }
+    }
+#if defined(_MCST_SUPPORT_)
+    if (comm_ptr->ch.is_mcast_ok) {
+        mpi_errno = MPIR_Mcast_inter_node_MV2(buffer, count, datatype, root, comm_ptr,
+                                              errflag);
+        if (mpi_errno == MPI_SUCCESS) {
+            goto fn_exit;
+        } else {
+            goto fn_fail;
+        }
+    }
+#endif
+/*
+    if (local_rank == 0) {
+        leader_comm = smpi_comm_get_leaders_comm(comm);
+        root = leader_root;
+    }
+
+    if (MV2_Bcast_function == &MPIR_Pipelined_Bcast_MV2) {
+        mpi_errno = MPIR_Pipelined_Bcast_MV2(buffer, count, datatype,
+                                             root, comm);
+    } else if (MV2_Bcast_function == &MPIR_Bcast_scatter_ring_allgather_shm_MV2) {
+        mpi_errno = MPIR_Bcast_scatter_ring_allgather_shm_MV2(buffer, count,
+                                                              datatype, root,
+                                                              comm);
+    } else */{
+        if (local_rank == 0) {
+      /*      if (MV2_Bcast_function == &MPIR_Knomial_Bcast_inter_node_wrapper_MV2) {
+                mpi_errno = MPIR_Knomial_Bcast_inter_node_wrapper_MV2(buffer, count,
+                                                              datatype, root,
+                                                              comm);
+            } else {*/
+                mpi_errno = MV2_Bcast_function(buffer, count, datatype,
+                                               leader_root, leader_comm);
+          //  }
+        }
+    }
+
+    return mpi_errno;
+}
+
+
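+/* Intra-node k-nomial broadcast (k = mv2_intra_node_knomial_factor, 4 by
+ * default): each rank receives the buffer once from its parent, then
+ * forwards it to at most k-1 children per tree level. */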
+int smpi_coll_tuned_bcast_mvapich2_knomial_intra_node(void *buffer,
+                                      int count,
+                                      MPI_Datatype datatype,
+                                      int root, MPI_Comm  comm)
+{
+    int local_size = 0, rank;
+    int mpi_errno = MPI_SUCCESS;
+    MPI_Request *reqarray = NULL;
+    MPI_Status *starray = NULL;
+    int src, dst, mask, relative_rank;
+    int k;
+    if (MV2_Bcast_function==NULL){
+      MV2_Bcast_function=smpi_coll_tuned_bcast_mpich;
+    }
+    
+    if (MV2_Bcast_intra_node_function==NULL){
+      MV2_Bcast_intra_node_function= smpi_coll_tuned_bcast_mpich;
+    }
+    
+    if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
+      smpi_comm_init_smp(comm);
+    }
+    
+    local_size = smpi_comm_size(comm);
+    rank = smpi_comm_rank(comm);
+
+
+    reqarray=(MPI_Request *)xbt_malloc(2 * mv2_intra_node_knomial_factor * sizeof (MPI_Request));
+
+    starray=(MPI_Status *)xbt_malloc(2 * mv2_intra_node_knomial_factor * sizeof (MPI_Status));
+
+    /* intra-node k-nomial bcast  */
+    if (local_size > 1) {
+        relative_rank = (rank >= root) ? rank - root : rank - root + local_size;
+        mask = 0x1;
+
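+        /* Receive phase: walk up the tree until the level where this
+         * rank's relative id stops being a multiple of k*mask; the owner
+         * of that digit group is the parent to receive from. */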
+        while (mask < local_size) {
+            if (relative_rank % (mv2_intra_node_knomial_factor * mask)) {
+                src = relative_rank / (mv2_intra_node_knomial_factor * mask) *
+                    (mv2_intra_node_knomial_factor * mask) + root;
+                if (src >= local_size) {
+                    src -= local_size;
+                }
+
+                smpi_mpi_recv(buffer, count, datatype, src,
+                                         COLL_TAG_BCAST, comm,
+                                         MPI_STATUS_IGNORE);
+                break;
+            }
+            mask *= mv2_intra_node_knomial_factor;
+        }
+        mask /= mv2_intra_node_knomial_factor;
+
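+        /* Send phase: walk back down the levels below the one we
+         * received at, posting up to k-1 non-blocking sends per level
+         * and waiting for all of them before descending further. */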
+        while (mask > 0) {
+            int reqs = 0;
+            for (k = 1; k < mv2_intra_node_knomial_factor; k++) {
+                if (relative_rank + mask * k < local_size) {
+                    dst = rank + mask * k;
+                    if (dst >= local_size) {
+                        dst -= local_size;
+                    }
+                    reqarray[reqs++]=smpi_mpi_isend(buffer, count, datatype, dst,
+                                              COLL_TAG_BCAST, comm);
+                }
+            }
+            smpi_mpi_waitall(reqs, reqarray, starray);
+
+            mask /= mv2_intra_node_knomial_factor;
+        }
+    }
+
+    xbt_free(reqarray);
+    xbt_free(starray);
+
+    return mpi_errno;
+}
+
+
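+/* Top-level SMP-aware broadcast: picks between the two-level
+ * (leader-based inter-node + shared-memory intra-node) scheme and a flat
+ * algorithm, depending on message size and communicator size. */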
+int smpi_coll_tuned_bcast_mvapich2_intra_node(void *buffer,
+                         int count,
+                         MPI_Datatype datatype,
+                         int root, MPI_Comm  comm)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int comm_size;
+    int two_level_bcast = 1;
+    size_t nbytes = 0; 
+    int is_homogeneous, is_contig;
+    MPI_Aint type_size;
+    void *tmp_buf = NULL;
+    MPI_Comm shmem_comm;
+
+    if (count == 0)
+        return MPI_SUCCESS;
+    if (MV2_Bcast_function==NULL){
+      MV2_Bcast_function=smpi_coll_tuned_bcast_mpich;
+    }
+    
+    if (MV2_Bcast_intra_node_function==NULL){
+      MV2_Bcast_intra_node_function= smpi_coll_tuned_bcast_mpich;
+    }
+    
+    if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
+      smpi_comm_init_smp(comm);
+    }
+    
+    comm_size = smpi_comm_size(comm);
+   // rank = smpi_comm_rank(comm);
+/*
+    if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)*/
+        is_contig = 1;
+/*    else {
+        MPID_Datatype_get_ptr(datatype, dtp);
+        is_contig = dtp->is_contig;
+    }
+*/
+    is_homogeneous = 1;
+#ifdef MPID_HAS_HETERO
+    if (comm_ptr->is_hetero)
+        is_homogeneous = 0;
+#endif
+
+    /* MPI_Type_size() might not give the accurate size of the packed
+     * datatype for heterogeneous systems (because of padding, encoding,
+     * etc). On the other hand, MPI_Pack_size() can become very
+     * expensive, depending on the implementation, especially for
+     * heterogeneous systems. We want to use MPI_Type_size() wherever
+     * possible, and MPI_Pack_size() in other places.
+     */
+    if (is_homogeneous) {
+        type_size=smpi_datatype_size(datatype);
+    } /*else {*/
+/*        MPIR_Pack_size_impl(1, datatype, &type_size);*/
+/*    }*/
+    nbytes = (size_t) (count) * (type_size);
+    if (comm_size <= mv2_bcast_two_level_system_size) {
+        if (nbytes > mv2_bcast_short_msg && nbytes < mv2_bcast_large_msg) {
+            two_level_bcast = 1;
+        } else {
+            two_level_bcast = 0;
+        }
+    }
+
+    if ((two_level_bcast == 1
+#if defined(_MCST_SUPPORT_)
+            || comm_ptr->ch.is_mcast_ok
+#endif
+        )) {
+
+        if (!is_contig || !is_homogeneous) {
+            tmp_buf=(void *)xbt_malloc(nbytes);
+
+            /* TODO: Pipeline the packing and communication */
+           // position = 0;
+/*            if (rank == root) {*/
+/*                mpi_errno =*/
+/*                    MPIR_Pack_impl(buffer, count, datatype, tmp_buf, nbytes, &position);*/
+/*                if (mpi_errno)*/
+/*                    MPIU_ERR_POP(mpi_errno);*/
+/*            }*/
+        }
+
+        shmem_comm = smpi_comm_get_intra_comm(comm);
+        if (!is_contig || !is_homogeneous) {
+            mpi_errno =
+                MPIR_Bcast_inter_node_helper_MV2(tmp_buf, nbytes, MPI_BYTE,
+                                                 root, comm);
+        } else {
+            mpi_errno =
+                MPIR_Bcast_inter_node_helper_MV2(buffer, count, datatype, root,
+                                                 comm);
+        }
+
+        /* We are now done with the inter-node phase */
+        if (nbytes <= mv2_knomial_intra_node_threshold) {
+            if (!is_contig || !is_homogeneous) {
+                mpi_errno = MPIR_Shmem_Bcast_MV2(tmp_buf, nbytes, MPI_BYTE,
+                                                 root, shmem_comm);
+            } else {
+                mpi_errno = MPIR_Shmem_Bcast_MV2(buffer, count, datatype,
+                                                 root, shmem_comm);
+            }
+        } else {
+            if (!is_contig || !is_homogeneous) {
+                mpi_errno =
+                    MPIR_Knomial_Bcast_intra_node_MV2(tmp_buf, nbytes,
+                                                      MPI_BYTE,
+                                                      INTRA_NODE_ROOT,
+                                                      shmem_comm);
+            } else {
+                mpi_errno =
+                    MPIR_Knomial_Bcast_intra_node_MV2(buffer, count,
+                                                      datatype,
+                                                      INTRA_NODE_ROOT,
+                                                      shmem_comm);
+            }
+        }
+
+
+    } else {
+        if (nbytes <= mv2_bcast_short_msg) {
+            mpi_errno = MPIR_Bcast_binomial_MV2(buffer, count, datatype, root,
+                                                comm);
+        } else {
+            if (mv2_scatter_rd_inter_leader_bcast) {
+                mpi_errno = MPIR_Bcast_scatter_ring_allgather_MV2(buffer, count,
+                                                                  datatype,
+                                                                  root,
+                                                                  comm);
+            } else {
+                mpi_errno =
+                    MPIR_Bcast_scatter_doubling_allgather_MV2(buffer, count,
+                                                              datatype, root,
+                                                              comm);
+            }
+        }
+    }
+
+    if (tmp_buf != NULL)
+        xbt_free(tmp_buf);
+
+    return mpi_errno;
+}
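
The mask arithmetic in the two k-nomial phases is compact but dense. The following stand-alone sketch (not part of the patch; size, k and root are illustrative constants) replays the same receive/send schedule as smpi_coll_tuned_bcast_mvapich2_knomial_intra_node and prints who talks to whom:

#include <stdio.h>

/* Illustrative sketch only: trace the k-nomial broadcast schedule. */
int main(void)
{
    const int size = 8, k = 4, root = 0;
    int rank, j;
    for (rank = 0; rank < size; rank++) {
        int relative_rank = (rank >= root) ? rank - root : rank - root + size;
        int mask = 0x1;
        /* Receive phase: find the parent, as in the new file. */
        while (mask < size) {
            if (relative_rank % (k * mask)) {
                int src = relative_rank / (k * mask) * (k * mask) + root;
                if (src >= size)
                    src -= size;
                printf("rank %d receives from rank %d\n", rank, src);
                break;
            }
            mask *= k;
        }
        mask /= k;
        /* Send phase: forward to up to k-1 children per level. */
        while (mask > 0) {
            for (j = 1; j < k; j++) {
                if (relative_rank + mask * j < size) {
                    int dst = rank + mask * j;
                    if (dst >= size)
                        dst -= size;
                    printf("rank %d sends to rank %d\n", rank, dst);
                }
            }
            mask /= k;
        }
    }
    return 0;
}

With these constants, rank 0 sends to rank 4 at the top level and to ranks 1-3 below it, while rank 4 forwards to ranks 5-7.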
diff --git a/src/smpi/colls/bcast-ompi-pipeline.c b/src/smpi/colls/bcast-ompi-pipeline.c
index 2547d19..3aebd8c 100644
@@ -44,7 +44,7 @@ int smpi_coll_tuned_bcast_ompi_pipeline( void* buffer,
 
     size = smpi_comm_size(comm);
     rank = smpi_comm_rank(comm);
-    xbt_assert( size > 1 );
+    if (size == 1) return MPI_SUCCESS;
 
 
     const double a_p16  = 3.2118e-6; /* [1 / byte] */
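
Replacing the assertion with an early return lets the pipelined code handle single-process communicators gracefully instead of aborting the simulation.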
diff --git a/src/smpi/colls/colls.h b/src/smpi/colls/colls.h
index ee84cf5..ff58270 100644
@@ -226,6 +226,9 @@ COLL_APPLY(action, COLL_BCAST_SIG, ompi_split_bintree) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, ompi_pipeline) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, mpich) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, mvapich2)   COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, mvapich2_inter_node)   COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, mvapich2_intra_node)   COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, mvapich2_knomial_intra_node)   COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, impi)   COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, automatic)
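
Registering the new entries in colls.h is what makes them selectable by name, e.g. --cfg smpi/bcast:mvapich2_knomial_intra_node, matching the test loop extended in AddTests.cmake above.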
 
diff --git a/src/smpi/colls/smpi_mvapich2_selector.c b/src/smpi/colls/smpi_mvapich2_selector.c
index 1ede76b..c3a6022 100644
@@ -640,9 +640,9 @@ int smpi_coll_tuned_bcast_mvapich2(void *buffer,
 
             /* We are now done with the inter-node phase */
 
-                if (MV2_Bcast_intra_node_function == &MPIR_Knomial_Bcast_intra_node_MV2) {
+
                     root = INTRA_NODE_ROOT;
-                }
+
 
                 if (!is_contig || !is_homogeneous) {
                     mpi_errno = MV2_Bcast_intra_node_function(tmp_buf, nbytes,
diff --git a/src/smpi/colls/smpi_mvapich2_selector_stampede.h b/src/smpi/colls/smpi_mvapich2_selector_stampede.h
index bf44fb7..65796c2 100644
@@ -1002,18 +1002,23 @@ int mv2_pipelined_zcpy_knomial_factor = -1;
 int bcast_segment_size = 8192;
 int mv2_inter_node_knomial_factor = 4;
 int mv2_intra_node_knomial_factor = 4;
+#define mv2_bcast_two_level_system_size  64
+#define mv2_bcast_short_msg             16384
+#define mv2_bcast_large_msg            (512*1024)
+
 #define INTRA_NODE_ROOT 0
 
 #define MPIR_Pipelined_Bcast_Zcpy_MV2 smpi_coll_tuned_bcast_mpich
 #define MPIR_Pipelined_Bcast_MV2 smpi_coll_tuned_bcast_mpich
-#define MPIR_Bcast_binomial_MV2 smpi_coll_tuned_bcast_mpich
-#define MPIR_Bcast_scatter_ring_allgather_shm_MV2 smpi_coll_tuned_bcast_mpich
-#define MPIR_Bcast_scatter_doubling_allgather_MV2 smpi_coll_tuned_bcast_mpich
-#define MPIR_Bcast_scatter_ring_allgather_MV2 smpi_coll_tuned_bcast_mpich
+#define MPIR_Bcast_binomial_MV2 smpi_coll_tuned_bcast_binomial_tree
+#define MPIR_Bcast_scatter_ring_allgather_shm_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
+#define MPIR_Bcast_scatter_doubling_allgather_MV2 smpi_coll_tuned_bcast_scatter_rdb_allgather
+#define MPIR_Bcast_scatter_ring_allgather_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
 #define MPIR_Shmem_Bcast_MV2 smpi_coll_tuned_bcast_mpich
-#define MPIR_Bcast_tune_inter_node_helper_MV2 smpi_coll_tuned_bcast_mpich
-#define MPIR_Knomial_Bcast_intra_node_MV2 smpi_coll_tuned_bcast_mpich
-#define MPIR_Bcast_intra_MV2 smpi_coll_tuned_bcast_mpich
+#define MPIR_Bcast_tune_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
+#define MPIR_Bcast_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
+#define MPIR_Knomial_Bcast_intra_node_MV2 smpi_coll_tuned_bcast_mvapich2_knomial_intra_node
+#define MPIR_Bcast_intra_MV2 smpi_coll_tuned_bcast_mvapich2_intra_node
 
 static void init_mv2_bcast_tables_stampede(){
  //Stampede,
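
With the real ports available, the Stampede selector's fallback defines now resolve to the MVAPICH2-derived implementations instead of uniformly falling back to smpi_coll_tuned_bcast_mpich.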
diff --git a/src/smpi/smpi_comm.c b/src/smpi/smpi_comm.c
index 3cf01de..2b23e7c 100644
@@ -277,7 +277,6 @@ void smpi_comm_init_smp(MPI_Comm comm){
   int comm_size =smpi_comm_size(comm);
 
   if(smpi_privatize_global_variables){ //we need to switch here, as the called function may silently touch global variables
-     XBT_VERB("Applying operation, switch to the right data frame ");
      switch_data_segment(smpi_process_index());
    }
   //identify neighbours in comm