Update copyright lines for 2022.

[simgrid.git] / src / smpi / colls / smpi_mpich_selector.cpp
diff --git a/src/smpi/colls/smpi_mpich_selector.cpp b/src/smpi/colls/smpi_mpich_selector.cpp

index 7fd3910..16f01f0 100644 (file)
--- a/src/smpi/colls/smpi_mpich_selector.cpp
+++ b/src/smpi/colls/smpi_mpich_selector.cpp
@@ -1,15 +1,17 @@
  /* selector for collective algorithms based on mpich decision logic */
  
-/* Copyright (c) 2009-2010, 2013-2017. The SimGrid Team.
+/* Copyright (c) 2009-2022. The SimGrid Team.
   * All rights reserved.                                                     */
  
  /* This program is free software; you can redistribute it and/or modify it
   * under the terms of the license (GNU LGPL) which comes with this package. */
  
-#include "colls_private.h"
+#include "colls_private.hpp"
+
+#include <memory>
  
  /* This is the default implementation of allreduce. The algorithm is:
-   
+
     Algorithm: MPI_Allreduce
  
     For the heterogeneous case, we call MPI_Reduce followed by MPI_Bcast
@@ -19,12 +21,12 @@
  
     For long messages and for builtin ops and if count >= pof2 (where
     pof2 is the nearest power-of-two less than or equal to the number
-   of processes), we use Rabenseifner's algorithm (see 
+   of processes), we use Rabenseifner's algorithm (see
     http://www.hlrs.de/mpi/myreduce.html).
     This algorithm implements the allreduce in two steps: first a
     reduce-scatter, followed by an allgather. A recursive-halving
     algorithm (beginning with processes that are distance 1 apart) is
-   used for the reduce-scatter, and a recursive doubling 
+   used for the reduce-scatter, and a recursive doubling
     algorithm is used for the allgather. The non-power-of-two case is
     handled by dropping to the nearest lower power-of-two: the first
     few even-numbered processes send their data to their right neighbors
@@ -32,33 +34,33 @@
     power-of-two processes. At the end, the first few even-numbered
     processes get the result from their right neighbors.
  
-   For the power-of-two case, the cost for the reduce-scatter is 
+   For the power-of-two case, the cost for the reduce-scatter is
     lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
     allgather lgp.alpha + n.((p-1)/p).beta. Therefore, the
     total cost is:
     Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma
  
-   For the non-power-of-two case, 
+   For the non-power-of-two case,
     Cost = (2.floor(lgp)+2).alpha + (2.((p-1)/p) + 2).n.beta + n.(1+(p-1)/p).gamma
  
-   
-   For short messages, for user-defined ops, and for count < pof2 
+
+   For short messages, for user-defined ops, and for count < pof2
     we use a recursive doubling algorithm (similar to the one in
     MPI_Allgather). We use this algorithm in the case of user-defined ops
     because in this case derived datatypes are allowed, and the user
     could pass basic datatypes on one process and derived on another as
     long as the type maps are the same. Breaking up derived datatypes
-   to do the reduce-scatter is tricky. 
+   to do the reduce-scatter is tricky.
  
     Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma
  
-   Possible improvements: 
+   Possible improvements:
  
     End Algorithm: MPI_Allreduce
  */
  namespace simgrid{
  namespace smpi{
-int Coll_allreduce_mpich::allreduce(void *sbuf, void *rbuf, int count,
+int allreduce__mpich(const void *sbuf, void *rbuf, int count,
                          MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
  {
      size_t dsize, block_dsize;
@@ -68,6 +70,14 @@ int Coll_allreduce_mpich::allreduce(void *sbuf, void *rbuf, int count,
      dsize = dtype->size();
      block_dsize = dsize * count;
  
+    /*MPICH uses SMP algorithms for all commutative ops now*/
+    if (not comm->is_smp_comm()) {
+      if(comm->get_leaders_comm()==MPI_COMM_NULL){
+        comm->init_smp();
+      }
+      if(op->is_commutative())
+        return allreduce__mvapich2_two_level(sbuf, rbuf,count, dtype, op, comm);
+    }
  
      /* find nearest power-of-two less than or equal to comm_size */
      int pof2 = 1;
@@ -76,20 +86,16 @@ int Coll_allreduce_mpich::allreduce(void *sbuf, void *rbuf, int count,
  
      if (block_dsize > large_message && count >= pof2 && (op==MPI_OP_NULL || op->is_commutative())) {
        //for long messages
-       return (Coll_allreduce_rab_rdb::allreduce (sbuf, rbuf, 
-                                                                   count, dtype,
-                                                                   op, comm));
+       return allreduce__rab_rdb(sbuf, rbuf, count, dtype, op, comm);
      }else {
        //for short ones and count < pof2
-      return (Coll_allreduce_rdb::allreduce (sbuf, rbuf, 
-                                                                   count, dtype,
-                                                                   op, comm));
+      return allreduce__rdb(sbuf, rbuf, count, dtype, op, comm);
      }
  }
  
  
  /* This is the default implementation of alltoall. The algorithm is:
-   
+
     Algorithm: MPI_Alltoall
  
     We use four algorithms for alltoall. For short messages and
@@ -109,9 +115,9 @@ int Coll_allreduce_mpich::allreduce(void *sbuf, void *rbuf, int count,
     processes, so that all processes don't try to send/recv to/from the
     same process at the same time.
  
-   *** Modification: We post only a small number of isends and irecvs 
+   *** Modification: We post only a small number of isends and irecvs
     at a time and wait on them as suggested by Tony Ladd. ***
-   *** See comments below about an additional modification that 
+   *** See comments below about an additional modification that
     we may want to consider ***
  
     For long messages and power-of-two number of processes, we use a
@@ -122,23 +128,23 @@ int Coll_allreduce_mpich::allreduce(void *sbuf, void *rbuf, int count,
     This algorithm doesn't work if the number of processes is not a power of
     two. For a non-power-of-two number of processes, we use an
     algorithm in which, in step i, each process  receives from (rank-i)
-   and sends to (rank+i). 
+   and sends to (rank+i).
  
     Cost = (p-1).alpha + n.beta
  
     where n is the total amount of data a process needs to send to all
     other processes.
  
-   Possible improvements: 
+   Possible improvements:
  
     End Algorithm: MPI_Alltoall
  */
  
-int Coll_alltoall_mpich::alltoall( void *sbuf, int scount, 
-                                             MPI_Datatype sdtype,
-                                             void* rbuf, int rcount, 
-                                             MPI_Datatype rdtype, 
-                                             MPI_Comm comm)
+int alltoall__mpich(const void *sbuf, int scount,
+                    MPI_Datatype sdtype,
+                    void* rbuf, int rcount,
+                    MPI_Datatype rdtype,
+                    MPI_Comm comm)
  {
      int communicator_size;
      size_t dsize, block_dsize;
@@ -147,69 +153,69 @@ int Coll_alltoall_mpich::alltoall( void *sbuf, int scount,
      unsigned int short_size=256;
      unsigned int medium_size=32768;
      //short size and comm_size >=8   -> bruck
-    
+
  //     medium size messages and (short messages for comm_size < 8), we
  //     use an algorithm that posts all irecvs and isends and then does a
-//     waitall. 
-    
+//     waitall.
+
  //    For long messages and power-of-two number of processes, we use a
  //   pairwise exchange algorithm
  
  //   For a non-power-of-two number of processes, we use an
  //   algorithm in which, in step i, each process  receives from (rank-i)
-//   and sends to (rank+i). 
+//   and sends to (rank+i).
  
  
      dsize = sdtype->size();
      block_dsize = dsize * scount;
  
      if ((block_dsize < short_size) && (communicator_size >= 8)) {
-        return Coll_alltoall_bruck::alltoall(sbuf, scount, sdtype, 
-                                                    rbuf, rcount, rdtype,
-                                                    comm);
+        return alltoall__bruck(sbuf, scount, sdtype,
+                               rbuf, rcount, rdtype,
+                               comm);
  
      } else if (block_dsize < medium_size) {
-        return Coll_alltoall_basic_linear::alltoall(sbuf, scount, sdtype, 
-                                                           rbuf, rcount, rdtype, 
-                                                           comm);
+        return alltoall__mvapich2_scatter_dest(sbuf, scount, sdtype,
+                                               rbuf, rcount, rdtype,
+                                               comm);
      }else if (communicator_size%2){
-        return Coll_alltoall_ring::alltoall(sbuf, scount, sdtype, 
-                                                           rbuf, rcount, rdtype, 
-                                                           comm);
+        return alltoall__pair(sbuf, scount, sdtype,
+                              rbuf, rcount, rdtype,
+                              comm);
      }
  
-    return Coll_alltoall_ring::alltoall (sbuf, scount, sdtype,
-                                                    rbuf, rcount, rdtype,
-                                                    comm);
+    return alltoall__ring(sbuf, scount, sdtype,
+                          rbuf, rcount, rdtype,
+                          comm);
  }
  
-int Coll_alltoallv_mpich::alltoallv(void *sbuf, int *scounts, int *sdisps,
-                                              MPI_Datatype sdtype,
-                                              void *rbuf, int *rcounts, int *rdisps,
-                                              MPI_Datatype rdtype,
-                                              MPI_Comm  comm
-                                              )
+int alltoallv__mpich(const void *sbuf, const int *scounts, const int *sdisps,
+                     MPI_Datatype sdtype,
+                     void *rbuf, const int *rcounts, const int *rdisps,
+                     MPI_Datatype rdtype,
+                     MPI_Comm  comm
+                     )
  {
      /* For starters, just keep the original algorithm. */
-    return Coll_alltoallv_bruck::alltoallv(sbuf, scounts, sdisps, sdtype, 
-                                                        rbuf, rcounts, rdisps,rdtype,
-                                                        comm);
+    return alltoallv__bruck(sbuf, scounts, sdisps, sdtype,
+                            rbuf, rcounts, rdisps,rdtype,
+                            comm);
  }
  
  
-int Coll_barrier_mpich::barrier(MPI_Comm  comm)
-{   
-    return Coll_barrier_ompi_bruck::barrier(comm);
+int barrier__mpich(MPI_Comm  comm)
+{
+    return barrier__ompi_bruck(comm);
  }
  
  /* This is the default implementation of broadcast. The algorithm is:
-   
+
     Algorithm: MPI_Bcast
  
-   For short messages, we use a binomial tree algorithm. 
+   For short messages, we use a binomial tree algorithm.
     Cost = lgp.alpha + n.lgp.beta
  
-   For long messages, we do a scatter followed by an allgather. 
+   For long messages, we do a scatter followed by an allgather.
     We first scatter the buffer using a binomial tree algorithm. This costs
     lgp.alpha + n.((p-1)/p).beta
     If the datatype is contiguous and the communicator is homogeneous,
@@ -218,7 +224,7 @@ int Coll_barrier_mpich::barrier(MPI_Comm  comm)
     cases, we first pack the data into a temporary buffer by using
     MPI_Pack, scatter it as bytes, and unpack it after the allgather.
  
-   For the allgather, we use a recursive doubling algorithm for 
+   For the allgather, we use a recursive doubling algorithm for
     medium-size messages and power-of-two number of processes. This
     takes lgp steps. In each step pairs of processes exchange all the
     data they have (we take care of non-power-of-two situations). This
@@ -232,12 +238,12 @@ int Coll_barrier_mpich::barrier(MPI_Comm  comm)
     versus n.lgp.beta. Therefore, for long messages and when lgp > 2,
     this algorithm will perform better.
  
-   For long messages and for medium-size messages and non-power-of-two 
-   processes, we use a ring algorithm for the allgather, which 
+   For long messages and for medium-size messages and non-power-of-two
+   processes, we use a ring algorithm for the allgather, which
     takes p-1 steps, because it performs better than recursive doubling.
     Total Cost = (lgp+p-1).alpha + 2.n.((p-1)/p).beta
  
-   Possible improvements: 
+   Possible improvements:
     For clusters of SMPs, we may want to do something differently to
     take advantage of shared memory on each node.
  
@@ -245,12 +251,12 @@ int Coll_barrier_mpich::barrier(MPI_Comm  comm)
  */
  
  
-int Coll_bcast_mpich::bcast(void *buff, int count,
-                                          MPI_Datatype datatype, int root,
-                                          MPI_Comm  comm
+int bcast__mpich(void *buff, int count,
+                 MPI_Datatype datatype, int root,
+                 MPI_Comm  comm
                                            )
  {
-    /* Decision function based on MX results for 
+    /* Decision function based on MX results for
         messages up to 36MB and communicator sizes up to 64 nodes */
      const size_t small_message_size = 12288;
      const size_t intermediate_message_size = 524288;
@@ -259,40 +265,45 @@ int Coll_bcast_mpich::bcast(void *buff, int count,
      //int segsize = 0;
      size_t message_size, dsize;
  
+    if (not comm->is_smp_comm()) {
+      if(comm->get_leaders_comm()==MPI_COMM_NULL){
+        comm->init_smp();
+      }
+      if(comm->is_uniform())
+        return bcast__SMP_binomial(buff, count, datatype, root, comm);
+    }
+
      communicator_size = comm->size();
  
      /* we need the data size for the decision function */
      dsize = datatype->size();
      message_size = dsize * (unsigned long)count;   /* needed for decision */
  
-    /* Handle messages of small and intermediate size, and 
+    /* Handle messages of small and intermediate size, and
         single-element broadcasts */
      if ((message_size < small_message_size) || (communicator_size <= 8)) {
          /* Binomial without segmentation */
-        return  Coll_bcast_binomial_tree::bcast (buff, count, datatype, 
-                                                      root, comm);
+        return  bcast__binomial_tree(buff, count, datatype, root, comm);
  
      } else if (message_size < intermediate_message_size && !(communicator_size%2)) {
          // Scatter followed by a recursive-doubling allgather
-        return Coll_bcast_scatter_rdb_allgather::bcast(buff, count, datatype, 
-                                                         root, comm);
+        return bcast__scatter_rdb_allgather(buff, count, datatype, root, comm);
  
      }
-     //Handle large message sizes 
-     return Coll_bcast_scatter_LR_allgather::bcast (buff, count, datatype, 
-                                                     root, comm);
-                                                         
+     //Handle large message sizes
+     return bcast__scatter_LR_allgather(buff, count, datatype, root, comm);
+
  }
  
  
  
  /* This is the default implementation of reduce. The algorithm is:
-   
+
     Algorithm: MPI_Reduce
  
     For long messages and for builtin ops and if count >= pof2 (where
     pof2 is the nearest power-of-two less than or equal to the number
-   of processes), we use Rabenseifner's algorithm (see 
+   of processes), we use Rabenseifner's algorithm (see
     http://www.hlrs.de/organization/par/services/models/mpi/myreduce.html ).
     This algorithm implements the reduce in two steps: first a
     reduce-scatter, followed by a gather to the root. A
@@ -307,7 +318,7 @@ int Coll_bcast_mpich::bcast(void *buff, int count,
     the root and exits; the root now acts as rank 0 in the binomial tree
     algorithm for gather.
  
-   For the power-of-two case, the cost for the reduce-scatter is 
+   For the power-of-two case, the cost for the reduce-scatter is
     lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
     gather to root is lgp.alpha + n.((p-1)/p).beta. Therefore, the
     total cost is:
@@ -315,12 +326,12 @@ int Coll_bcast_mpich::bcast(void *buff, int count,
  
     For the non-power-of-two case, assuming the root is not one of the
     odd-numbered processes that get excluded in the reduce-scatter,
-   Cost = (2.floor(lgp)+1).alpha + (2.((p-1)/p) + 1).n.beta + 
+   Cost = (2.floor(lgp)+1).alpha + (2.((p-1)/p) + 1).n.beta +
             n.(1+(p-1)/p).gamma
  
  
     For short messages, user-defined ops, and count < pof2, we use a
-   binomial tree algorithm for both short and long messages. 
+   binomial tree algorithm for both short and long messages.
  
     Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma
  
@@ -335,21 +346,29 @@ int Coll_bcast_mpich::bcast(void *buff, int count,
     should be able to use the reduce-scatter/gather approach as long as
     count >= pof2.  [goodell@ 2009-01-21]
  
-   Possible improvements: 
+   Possible improvements:
  
     End Algorithm: MPI_Reduce
  */
  
  
-int Coll_reduce_mpich::reduce( void *sendbuf, void *recvbuf,
+int reduce__mpich(const void *sendbuf, void *recvbuf,
                                              int count, MPI_Datatype  datatype,
                                              MPI_Op   op, int root,
                                              MPI_Comm   comm
                                              )
  {
      int communicator_size=0;
-    //int segsize = 0;
      size_t message_size, dsize;
+
+    if (not comm->is_smp_comm()) {
+      if(comm->get_leaders_comm()==MPI_COMM_NULL){
+        comm->init_smp();
+      }
+      if (op->is_commutative() == 1)
+        return reduce__mvapich2_two_level(sendbuf, recvbuf, count, datatype, op, root, comm);
+    }
+
      communicator_size = comm->size();
  
      /* need data size for decision function */
@@ -361,10 +380,9 @@ int Coll_reduce_mpich::reduce( void *sendbuf, void *recvbuf,
      pof2 >>= 1;
  
      if ((count < pof2) || (message_size < 2048) || (op != MPI_OP_NULL && not op->is_commutative())) {
-      return Coll_reduce_binomial::reduce(sendbuf, recvbuf, count, datatype, op, root, comm); 
+      return reduce__binomial(sendbuf, recvbuf, count, datatype, op, root, comm);
      }
-        return Coll_reduce_scatter_gather::reduce(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
-                                                     segsize, max_requests*/);
+    return reduce__scatter_gather(sendbuf, recvbuf, count, datatype, op, root, comm);
  }
  
  
@@ -417,12 +435,12 @@ int Coll_reduce_mpich::reduce( void *sendbuf, void *recvbuf,
  */
  
  
-int Coll_reduce_scatter_mpich::reduce_scatter( void *sbuf, void *rbuf,
-                                                    int *rcounts,
-                                                    MPI_Datatype dtype,
-                                                    MPI_Op  op,
-                                                    MPI_Comm  comm
-                                                    )
+int reduce_scatter__mpich(const void *sbuf, void *rbuf,
+                          const int *rcounts,
+                          MPI_Datatype dtype,
+                          MPI_Op  op,
+                          MPI_Comm  comm
+                          )
  {
      int comm_size, i;
      size_t total_message_size;
@@ -430,23 +448,21 @@ int Coll_reduce_scatter_mpich::reduce_scatter( void *sbuf, void *rbuf,
      if(sbuf==rbuf)sbuf=MPI_IN_PLACE; //restore MPI_IN_PLACE as these algorithms handle it
  
      XBT_DEBUG("Coll_reduce_scatter_mpich::reduce");
-    
+
      comm_size = comm->size();
-    // We need data size for decision function 
+    // We need data size for decision function
      total_message_size = 0;
-    for (i = 0; i < comm_size; i++) { 
+    for (i = 0; i < comm_size; i++) {
          total_message_size += rcounts[i];
      }
  
-    if( (op==MPI_OP_NULL || op->is_commutative()) &&  total_message_size > 524288) { 
-        return Coll_reduce_scatter_mpich_pair::reduce_scatter (sbuf, rbuf, rcounts, 
-                                                                    dtype, op, 
-                                                                    comm);
+    if( (op==MPI_OP_NULL || op->is_commutative()) &&  total_message_size > 524288) {
+        return reduce_scatter__mpich_pair(sbuf, rbuf, rcounts, dtype, op, comm);
      } else if ((op != MPI_OP_NULL && not op->is_commutative())) {
-      int is_block_regular = 1;
+      bool is_block_regular = true;
        for (i = 0; i < (comm_size - 1); ++i) {
          if (rcounts[i] != rcounts[i + 1]) {
-          is_block_regular = 0;
+          is_block_regular = false;
            break;
          }
        }
@@ -458,18 +474,18 @@ int Coll_reduce_scatter_mpich::reduce_scatter( void *sbuf, void *rbuf,
  
        if (pof2 == comm_size && is_block_regular) {
          /* noncommutative, pof2 size, and block regular */
-        return Coll_reduce_scatter_mpich_noncomm::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
+        return reduce_scatter__mpich_noncomm(sbuf, rbuf, rcounts, dtype, op, comm);
        }
  
-      return Coll_reduce_scatter_mpich_rdb::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
-    }else{      
-       return Coll_reduce_scatter_mpich_rdb::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
+      return reduce_scatter__mpich_rdb(sbuf, rbuf, rcounts, dtype, op, comm);
+    }else{
+       return reduce_scatter__mpich_rdb(sbuf, rbuf, rcounts, dtype, op, comm);
      }
  }
  
  
  /* This is the default implementation of allgather. The algorithm is:
-   
+
     Algorithm: MPI_Allgather
  
     For short messages and non-power-of-two no. of processes, we use
@@ -508,17 +524,17 @@ int Coll_reduce_scatter_mpich::reduce_scatter( void *sbuf, void *rbuf,
     neighbor) performs twice as fast as recursive doubling for long
     messages (on Myrinet and IBM SP).
  
-   Possible improvements: 
+   Possible improvements:
  
     End Algorithm: MPI_Allgather
  */
  
-int Coll_allgather_mpich::allgather(void *sbuf, int scount, 
-                                              MPI_Datatype sdtype,
-                                              void* rbuf, int rcount, 
-                                              MPI_Datatype rdtype, 
-                                              MPI_Comm  comm
-                                              )
+int allgather__mpich(const void *sbuf, int scount,
+                     MPI_Datatype sdtype,
+                     void* rbuf, int rcount,
+                     MPI_Datatype rdtype,
+                     MPI_Comm  comm
+                     )
  {
      int communicator_size, pow2_size;
      size_t dsize, total_dsize;
@@ -527,36 +543,30 @@ int Coll_allgather_mpich::allgather(void *sbuf, int scount,
  
      /* Determine complete data size */
      dsize=sdtype->size();
-    total_dsize = dsize * scount * communicator_size;   
-   
-    for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1); 
+    total_dsize = dsize * scount * communicator_size;
+
+    for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1);
  
-    /* Decision as in MPICH-2 
-       presented in Thakur et.al. "Optimization of Collective Communication 
-       Operations in MPICH", International Journal of High Performance Computing 
+    /* Decision as in MPICH-2
+       presented in Thakur et.al. "Optimization of Collective Communication
+       Operations in MPICH", International Journal of High Performance Computing
         Applications, Vol. 19, No. 1, 49-66 (2005)
-       - for power-of-two processes and small and medium size messages 
+       - for power-of-two processes and small and medium size messages
         (up to 512KB) use recursive doubling
         - for non-power-of-two processes and small messages (80KB) use bruck,
         - for everything else use ring.
      */
      if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
-        return Coll_allgather_rdb::allgather(sbuf, scount, sdtype, 
-                                                                 rbuf, rcount, rdtype, 
-                                                                 comm);
-    } else if (total_dsize <= 81920) { 
-        return Coll_allgather_bruck::allgather(sbuf, scount, sdtype, 
-                                                     rbuf, rcount, rdtype,
-                                                     comm);
-    } 
-    return Coll_allgather_ring::allgather(sbuf, scount, sdtype, 
-                                                rbuf, rcount, rdtype,
-                                                comm);
+        return allgather__rdb(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
+    } else if (total_dsize <= 81920) {
+        return allgather__bruck(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
+    }
+    return allgather__ring(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
  }
  
  
  /* This is the default implementation of allgatherv. The algorithm is:
-   
+
     Algorithm: MPI_Allgatherv
  
     For short messages and non-power-of-two no. of processes, we use
@@ -587,17 +597,17 @@ int Coll_allgather_mpich::allgather(void *sbuf, int scount,
  
     Cost = (p-1).alpha + n.((p-1)/p).beta
  
-   Possible improvements: 
+   Possible improvements:
  
     End Algorithm: MPI_Allgatherv
  */
-int Coll_allgatherv_mpich::allgatherv(void *sbuf, int scount, 
-                                               MPI_Datatype sdtype,
-                                               void* rbuf, int *rcounts, 
-                                               int *rdispls,
-                                               MPI_Datatype rdtype, 
-                                               MPI_Comm  comm
-                                               )
+int allgatherv__mpich(const void *sbuf, int scount,
+                      MPI_Datatype sdtype,
+                      void* rbuf, const int *rcounts,
+                      const int *rdispls,
+                      MPI_Datatype rdtype,
+                      MPI_Comm  comm
+                      )
  {
      int communicator_size, pow2_size,i;
      size_t total_dsize;
@@ -610,25 +620,19 @@ int Coll_allgatherv_mpich::allgatherv(void *sbuf, int scount,
          total_dsize += rcounts[i];
      if (total_dsize == 0)
        return MPI_SUCCESS;
-    
-    for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1); 
+
+    for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1);
  
      if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
-        return Coll_allgatherv_mpich_rdb::allgatherv(sbuf, scount, sdtype, 
-                                                                 rbuf, rcounts, rdispls, rdtype, 
-                                                                 comm);
-    } else if (total_dsize <= 81920) { 
-        return Coll_allgatherv_ompi_bruck::allgatherv(sbuf, scount, sdtype, 
-                                                     rbuf, rcounts, rdispls, rdtype,
-                                                     comm);
-    } 
-    return Coll_allgatherv_mpich_ring::allgatherv(sbuf, scount, sdtype,
-                                                rbuf, rcounts, rdispls, rdtype,
-                                                comm);
+        return allgatherv__mpich_rdb(sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, comm);
+    } else if (total_dsize <= 81920) {
+        return allgatherv__ompi_bruck(sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, comm);
+    }
+    return allgatherv__mpich_ring(sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, comm);
  }
  
  /* This is the default implementation of gather. The algorithm is:
-   
+
     Algorithm: MPI_Gather
  
     We use a binomial tree algorithm for both short and long
@@ -644,65 +648,61 @@ int Coll_allgatherv_mpich::allgatherv(void *sbuf, int scount,
     Cost = lgp.alpha + n.((p-1)/p).beta
     where n is the total size of the data gathered at the root.
  
-   Possible improvements: 
+   Possible improvements:
  
     End Algorithm: MPI_Gather
  */
  
-int Coll_gather_mpich::gather(void *sbuf, int scount, 
-                                           MPI_Datatype sdtype,
-                                           void* rbuf, int rcount, 
-                                           MPI_Datatype rdtype, 
-                                           int root,
-                                           MPI_Comm  comm
-                                           )
+int gather__mpich(const void *sbuf, int scount,
+                  MPI_Datatype sdtype,
+                  void* rbuf, int rcount,
+                  MPI_Datatype rdtype,
+                  int root,
+                  MPI_Comm  comm
+                  )
  {
-        return Coll_gather_ompi_binomial::gather (sbuf, scount, sdtype, 
-                                                      rbuf, rcount, rdtype, 
-                                                      root, comm);
+    return gather__ompi_binomial(sbuf, scount, sdtype,
+                                 rbuf, rcount, rdtype,
+                                 root, comm);
  }
  
  /* This is the default implementation of scatter. The algorithm is:
-   
+
     Algorithm: MPI_Scatter
  
     We use a binomial tree algorithm for both short and
     long messages. At nodes other than leaf nodes we need to allocate
     a temporary buffer to store the incoming message. If the root is
-   not rank 0, we reorder the sendbuf in order of relative ranks by 
+   not rank 0, we reorder the sendbuf in order of relative ranks by
     copying it into a temporary buffer, so that all the sends from the
     root are contiguous and in the right order. In the heterogeneous
     case, we first pack the buffer by using MPI_Pack and then do the
-   scatter. 
+   scatter.
  
     Cost = lgp.alpha + n.((p-1)/p).beta
     where n is the total size of the data to be scattered from the root.
  
-   Possible improvements: 
+   Possible improvements:
  
     End Algorithm: MPI_Scatter
  */
  
  
-int Coll_scatter_mpich::scatter(void *sbuf, int scount, 
-                                            MPI_Datatype sdtype,
-                                            void* rbuf, int rcount, 
-                                            MPI_Datatype rdtype, 
-                                            int root, MPI_Comm  comm
-                                            )
+int scatter__mpich(const void *sbuf, int scount,
+                   MPI_Datatype sdtype,
+                   void* rbuf, int rcount,
+                   MPI_Datatype rdtype,
+                   int root, MPI_Comm  comm
+                   )
  {
+  std::unique_ptr<unsigned char[]> tmp_buf;
    if(comm->rank()!=root){
-      sbuf=xbt_malloc(rcount*rdtype->get_extent());
-      scount=rcount;
-      sdtype=rdtype;
-  }
-  int ret= Coll_scatter_ompi_binomial::scatter (sbuf, scount, sdtype,
-                                                       rbuf, rcount, rdtype, 
-                                                       root, comm);
-  if(comm->rank()!=root){
-      xbt_free(sbuf);
+    tmp_buf = std::make_unique<unsigned char[]>(rcount * rdtype->get_extent());
+    sbuf   = tmp_buf.get();
+    scount = rcount;
+    sdtype = rdtype;
    }
-  return ret;
+  return scatter__ompi_binomial(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm);
  }
  }
  }