From: degomme
Date: Thu, 13 Jun 2013 22:51:18 +0000 (+0200)
Subject: we now have an mpich selector, and a bunch of new algorithms (--cfg=smpi/coll_selecto...
X-Git-Tag: v3_9_90~271
X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/8293a833f9c68986f8bd174d5bf3d04eb62918d7

we now have an mpich selector, and a bunch of new algorithms (--cfg=smpi/coll_selector:mpich)
---

diff --git a/buildtools/Cmake/AddTests.cmake b/buildtools/Cmake/AddTests.cmake
index d46d860a6e..55fe3bda45 100644
--- a/buildtools/Cmake/AddTests.cmake
+++ b/buildtools/Cmake/AddTests.cmake
@@ -372,23 +372,23 @@ if(NOT enable_memcheck)
   endif()
 
-  FOREACH (GATHER_COLL default ompi ompi_basic_linear ompi_linear_sync ompi_binomial)
+  FOREACH (GATHER_COLL default ompi mpich ompi_basic_linear ompi_linear_sync ompi_binomial)
     ADD_TEST(smpi-gather-coll-${GATHER_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/gather:${GATHER_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/gather_coll.tesh)
   ENDFOREACH()
 
   FOREACH (ALLGATHER_COLL default 2dmesh 3dmesh bruck GB loosely_lr lr NTSLR NTSLR_NB pair rdb rhv ring SMP_NTS
-           smp_simple spreading_simple ompi ompi_neighborexchange)
+           smp_simple spreading_simple ompi mpich ompi_neighborexchange)
     ADD_TEST(smpi-allgather-coll-${ALLGATHER_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/allgather:${ALLGATHER_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/allgather_coll.tesh)
   ENDFOREACH()
 
-  FOREACH (ALLGATHERV_COLL default GB pair ring ompi ompi_neighborexchange)
+  FOREACH (ALLGATHERV_COLL default GB pair ring ompi mpich ompi_neighborexchange ompi_bruck)
     ADD_TEST(smpi-allgatherv-coll-${ALLGATHERV_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/allgatherv:${ALLGATHERV_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/allgatherv_coll.tesh)
   ENDFOREACH()
 
   FOREACH (ALLREDUCE_COLL default lr NTS rab1 rab2 rab_rdb rab_rsag rdb smp_binomial smp_binomial_pipeline
-           smp_rdb smp_rsag smp_rsag_lr smp_rsag_rab redbcast ompi ompi_ring_segmented)
+           smp_rdb smp_rsag smp_rsag_lr smp_rsag_rab redbcast ompi mpich ompi_ring_segmented)
     ADD_TEST(smpi-allreduce-coll-${ALLREDUCE_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/allreduce:${ALLREDUCE_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/allreduce_coll.tesh)
   ENDFOREACH()
 
@@ -407,23 +407,23 @@ if(NOT enable_memcheck)
 
   FOREACH (BCAST_COLL default arrival_nb arrival_pattern_aware arrival_pattern_aware_wait arrival_scatter
            binomial_tree flattree flattree_pipeline NTSB NTSL NTSL_Isend scatter_LR_allgather
-           scatter_rdb_allgather SMP_binary SMP_binomial SMP_linear ompi ompi_split_bintree ompi_pipeline)
+           scatter_rdb_allgather SMP_binary SMP_binomial SMP_linear ompi mpich ompi_split_bintree ompi_pipeline)
     ADD_TEST(smpi-bcast-coll-${BCAST_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/bcast:${BCAST_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/bcast_coll.tesh)
   ENDFOREACH()
 
-  FOREACH (REDUCE_COLL default arrival_pattern_aware binomial flat_tree NTSL scatter_gather ompi ompi_chain ompi_binary ompi_basic_linear ompi_binomial ompi_in_order_binary)
+  FOREACH (REDUCE_COLL default arrival_pattern_aware binomial flat_tree NTSL scatter_gather ompi mpich ompi_chain ompi_binary ompi_basic_linear ompi_binomial ompi_in_order_binary)
    ADD_TEST(smpi-reduce-coll-${REDUCE_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/reduce:${REDUCE_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/reduce_coll.tesh)
   ENDFOREACH()
 
-  FOREACH (REDUCE_SCATTER_COLL default ompi ompi_basic_recursivehalving ompi_ring)
+  FOREACH (REDUCE_SCATTER_COLL default ompi mpich ompi_basic_recursivehalving ompi_ring mpich_noncomm mpich_pair mpich_rdb)
     ADD_TEST(smpi-reduce_scatter-coll-${REDUCE_SCATTER_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/reduce_scatter:${REDUCE_SCATTER_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/reduce_scatter_coll.tesh)
   ENDFOREACH()
 
-  FOREACH (SCATTER_COLL default ompi ompi_basic_linear ompi_binomial)
+  FOREACH (SCATTER_COLL default ompi mpich ompi_basic_linear ompi_binomial)
     ADD_TEST(smpi-scatter-coll-${SCATTER_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/scatter:${SCATTER_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/scatter_coll.tesh)
   ENDFOREACH()
 
-  FOREACH (BARRIER_COLL default ompi ompi_basic_linear ompi_tree ompi_bruck ompi_recursivedoubling ompi_doublering)
+  FOREACH (BARRIER_COLL default ompi mpich ompi_basic_linear ompi_tree ompi_bruck ompi_recursivedoubling ompi_doublering)
     ADD_TEST(smpi-barrier-coll-${BARRIER_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/barrier:${BARRIER_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/barrier_coll.tesh)
   ENDFOREACH()
 
diff --git a/buildtools/Cmake/DefinePackages.cmake b/buildtools/Cmake/DefinePackages.cmake
index 6d5586ff42..b934337529 100644
--- a/buildtools/Cmake/DefinePackages.cmake
+++ b/buildtools/Cmake/DefinePackages.cmake
@@ -112,6 +112,7 @@ set(SMPI_SRC
   src/smpi/smpi_pmpi.c
   src/smpi/smpi_replay.c
   src/smpi/colls/smpi_openmpi_selector.c
+  src/smpi/colls/smpi_mpich_selector.c
   src/smpi/colls/colls_global.c
   src/smpi/colls/allgather-2dmesh.c
   src/smpi/colls/allgather-3dmesh.c
@@ -133,6 +134,8 @@ set(SMPI_SRC
   src/smpi/colls/allgatherv-pair.c
   src/smpi/colls/allgatherv-ring.c
   src/smpi/colls/allgatherv-ompi-neighborexchange.c
+  src/smpi/colls/allgatherv-ompi-bruck.c
+  src/smpi/colls/allgatherv-mpich-rdb.c
   src/smpi/colls/allreduce-lr.c
   src/smpi/colls/allreduce-NTS.c
   src/smpi/colls/allreduce-rab1.c
@@ -197,6 +200,7 @@ set(SMPI_SRC
   src/smpi/colls/reduce-ompi.c
   src/smpi/colls/gather-ompi.c
   src/smpi/colls/reduce_scatter-ompi.c
+  src/smpi/colls/reduce_scatter-mpich.c
   src/smpi/colls/scatter-ompi.c
   src/smpi/colls/barrier-ompi.c
   )
diff --git a/src/smpi/colls/allgatherv-ompi-bruck.c b/src/smpi/colls/allgatherv-ompi-bruck.c
new file mode 100644
index 0000000000..420f4302fd
--- /dev/null
+++ b/src/smpi/colls/allgatherv-ompi-bruck.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2009 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2009      University of Houston. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "colls_private.h"
+#define MCA_COLL_BASE_TAG_ALLGATHERV 444
+/*
+ * ompi_coll_tuned_allgatherv_intra_bruck
+ *
+ * Function:     allgatherv using O(log(N)) steps.
+ * Accepts:      Same arguments as MPI_Allgatherv
+ * Returns:      MPI_SUCCESS or error code
+ *
+ * Description:  Variation of the all-to-all algorithm described by Bruck et al. in
+ *               "Efficient Algorithms for All-to-all Communications
+ *                in Multiport Message-Passing Systems".
+ * Note:         Unlike the allgather implementation, we rely on an
+ *               indexed datatype to select the buffers appropriately.
+ *               The only additional memory requirement is for the creation of
+ *               temporary datatypes.
+ * Example on 7 nodes (memory layout need not be in-order)
+ *   Initial set up:
+ *    #     0      1      2      3      4      5      6
+ *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]    [ ]
+ *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]    [ ]
+ *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]    [ ]
+ *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]    [ ]
+ *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]    [ ]
+ *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]    [ ]
+ *         [ ]    [ ]    [ ]    [ ]    [ ]    [ ]    [6]
+ *   Step 0: send message to (rank - 2^0), receive message from (rank + 2^0)
+ *    #     0      1      2      3      4      5      6
+ *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]    [0]
+ *         [1]    [1]    [ ]    [ ]    [ ]    [ ]    [ ]
+ *         [ ]    [2]    [2]    [ ]    [ ]    [ ]    [ ]
+ *         [ ]    [ ]    [3]    [3]    [ ]    [ ]    [ ]
+ *         [ ]    [ ]    [ ]    [4]    [4]    [ ]    [ ]
+ *         [ ]    [ ]    [ ]    [ ]    [5]    [5]    [ ]
+ *         [ ]    [ ]    [ ]    [ ]    [ ]    [6]    [6]
+ *   Step 1: send message to (rank - 2^1), receive message from (rank + 2^1).
+ *           message contains all blocks from (rank) .. (rank + 2^2) with
+ *           wrap around.
+ *    #     0      1      2      3      4      5      6
+ *         [0]    [ ]    [ ]    [ ]    [0]    [0]    [0]
+ *         [1]    [1]    [ ]    [ ]    [ ]    [1]    [1]
+ *         [2]    [2]    [2]    [ ]    [ ]    [ ]    [2]
+ *         [3]    [3]    [3]    [3]    [ ]    [ ]    [ ]
+ *         [ ]    [4]    [4]    [4]    [4]    [ ]    [ ]
+ *         [ ]    [ ]    [5]    [5]    [5]    [5]    [ ]
+ *         [ ]    [ ]    [ ]    [6]    [6]    [6]    [6]
+ *   Step 2: send message to (rank - 2^2), receive message from (rank + 2^2).
+ *           message size is "all remaining blocks"
+ *    #     0      1      2      3      4      5      6
+ *         [0]    [0]    [0]    [0]    [0]    [0]    [0]
+ *         [1]    [1]    [1]    [1]    [1]    [1]    [1]
+ *         [2]    [2]    [2]    [2]    [2]    [2]    [2]
+ *         [3]    [3]    [3]    [3]    [3]    [3]    [3]
+ *         [4]    [4]    [4]    [4]    [4]    [4]    [4]
+ *         [5]    [5]    [5]    [5]    [5]    [5]    [5]
+ *         [6]    [6]    [6]    [6]    [6]    [6]    [6]
+ */
+int smpi_coll_tuned_allgatherv_ompi_bruck(void *sbuf, int scount,
+                                          MPI_Datatype sdtype,
+                                          void *rbuf, int *rcounts,
+                                          int *rdispls,
+                                          MPI_Datatype rdtype,
+                                          MPI_Comm comm)
+{
+   int rank, size;
+   int sendto, recvfrom, distance, blockcount, i;
+   int *new_rcounts = NULL, *new_rdispls = NULL;
+   int *new_scounts = NULL, *new_sdispls = NULL;
+   ptrdiff_t slb, rlb, sext, rext;
+   char *tmpsend = NULL, *tmprecv = NULL;
+   MPI_Datatype new_rdtype, new_sdtype;
+
+   size = smpi_comm_size(comm);
+   rank = smpi_comm_rank(comm);
+
+   XBT_DEBUG(
+                "coll:tuned:allgather_ompi_bruck rank %d", rank);
+
+   smpi_datatype_extent (sdtype, &slb, &sext);
+
+   smpi_datatype_extent (rdtype, &rlb, &rext);
+
+   /* Initialization step:
+      - if send buffer is not MPI_IN_PLACE, copy send buffer to block rank of
+        the receive buffer.
+   */
+   tmprecv = (char*) rbuf + rdispls[rank] * rext;
+   if (MPI_IN_PLACE != sbuf) {
+      tmpsend = (char*) sbuf;
+      smpi_datatype_copy(tmpsend, scount, sdtype,
+                         tmprecv, rcounts[rank], rdtype);
+   }
+
+   /* Communication step:
+      At every step i, rank r:
+      - doubles the distance
+      - sends message with blockcount blocks, (rbuf[rank] .. rbuf[rank + 2^i])
+        to rank (r - distance)
+      - receives message of blockcount blocks,
+        (rbuf[r + distance] ... rbuf[(r+distance) + 2^i]) from
+        rank (r + distance)
+      - blockcount doubles until the last step when only the remaining data is
+        exchanged.
+   */
+   blockcount = 1;
+   tmpsend = (char*) rbuf;
+
+   new_rcounts = (int*) calloc(4*size, sizeof(int));
+   new_rdispls = new_rcounts + size;
+   new_scounts = new_rdispls + size;
+   new_sdispls = new_scounts + size;
+
+   for (distance = 1; distance < size; distance<<=1) {
+
+      recvfrom = (rank + distance) % size;
+      sendto = (rank - distance + size) % size;
+
+      if (distance <= (size >> 1)) {
+         blockcount = distance;
+      } else {
+         blockcount = size - distance;
+      }
+
+      /* create send and receive datatypes */
+      for (i = 0; i < blockcount; i++) {
+          const int tmp_srank = (rank + i) % size;
+          const int tmp_rrank = (recvfrom + i) % size;
+          new_scounts[i] = rcounts[tmp_srank];
+          new_sdispls[i] = rdispls[tmp_srank];
+          new_rcounts[i] = rcounts[tmp_rrank];
+          new_rdispls[i] = rdispls[tmp_rrank];
+      }
+      smpi_datatype_indexed(blockcount, new_scounts, new_sdispls,
+                            rdtype, &new_sdtype);
+      smpi_datatype_indexed(blockcount, new_rcounts, new_rdispls,
+                            rdtype, &new_rdtype);
+
+      smpi_datatype_commit(&new_sdtype);
+      smpi_datatype_commit(&new_rdtype);
+
+      /* Sendreceive */
+      smpi_mpi_sendrecv(rbuf, 1, new_sdtype, sendto,
+                        MCA_COLL_BASE_TAG_ALLGATHERV,
+                        rbuf, 1, new_rdtype, recvfrom,
+                        MCA_COLL_BASE_TAG_ALLGATHERV,
+                        comm, MPI_STATUS_IGNORE);
+      smpi_datatype_free(&new_sdtype);
+      smpi_datatype_free(&new_rdtype);
+
+   }
+
+   free(new_rcounts);
+
+   return MPI_SUCCESS;
+
+}
+
diff --git a/src/smpi/colls/colls.h b/src/smpi/colls/colls.h
index 09e47ce818..3de368157b 100644
--- a/src/smpi/colls/colls.h
+++ b/src/smpi/colls/colls.h
@@ -32,7 +32,9 @@ COLL_APPLY(action, COLL_GATHER_SIG, ompi) COLL_sep \
 COLL_APPLY(action, COLL_GATHER_SIG, ompi_basic_linear) COLL_sep \
 COLL_APPLY(action, COLL_GATHER_SIG, ompi_binomial) COLL_sep \
-COLL_APPLY(action, COLL_GATHER_SIG, ompi_linear_sync) \
+COLL_APPLY(action, COLL_GATHER_SIG, ompi_linear_sync) COLL_sep \
+COLL_APPLY(action, COLL_GATHER_SIG, mpich) \
+
 
 COLL_GATHERS(COLL_PROTO, COLL_NOsep)
 
@@ -62,7 +64,9 @@ COLL_APPLY(action, COLL_ALLGATHER_SIG, SMP_NTS) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHER_SIG, smp_simple) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHER_SIG, spreading_simple) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHER_SIG, ompi) COLL_sep \
-COLL_APPLY(action, COLL_ALLGATHER_SIG, ompi_neighborexchange)
+COLL_APPLY(action, COLL_ALLGATHER_SIG, ompi_neighborexchange) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHER_SIG, mpich)
+
 
 COLL_ALLGATHERS(COLL_PROTO, COLL_NOsep)
 
@@ -79,7 +83,10 @@ COLL_APPLY(action, COLL_ALLGATHERV_SIG, GB) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHERV_SIG, pair) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHERV_SIG, ring) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi) COLL_sep \
-COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi_neighborexchange)
+COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi_neighborexchange) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi_bruck) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHERV_SIG, mpich) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHERV_SIG, mpich_rdb)
 
 COLL_ALLGATHERVS(COLL_PROTO, COLL_NOsep)
 
@@ -107,7 +114,8 @@ COLL_APPLY(action, COLL_ALLREDUCE_SIG, smp_rsag_lr) COLL_sep \
 COLL_APPLY(action, COLL_ALLREDUCE_SIG, smp_rsag_rab) COLL_sep \
 COLL_APPLY(action, COLL_ALLREDUCE_SIG, redbcast) COLL_sep \
 COLL_APPLY(action, COLL_ALLREDUCE_SIG, ompi) COLL_sep \
-COLL_APPLY(action, COLL_ALLREDUCE_SIG, ompi_ring_segmented)
+COLL_APPLY(action, COLL_ALLREDUCE_SIG, ompi_ring_segmented) COLL_sep \
+COLL_APPLY(action, COLL_ALLREDUCE_SIG, mpich)
 
 COLL_ALLREDUCES(COLL_PROTO, COLL_NOsep)
 
@@ -134,7 +142,8 @@
 COLL_APPLY(action, COLL_ALLTOALL_SIG, ring_light_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALL_SIG, ring_mpi_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALL_SIG, ring_one_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALL_SIG, simple) COLL_sep \
-COLL_APPLY(action, COLL_ALLTOALL_SIG, ompi)
+COLL_APPLY(action, COLL_ALLTOALL_SIG, ompi) COLL_sep \
+COLL_APPLY(action, COLL_ALLTOALL_SIG, mpich)
 
 COLL_ALLTOALLS(COLL_PROTO, COLL_NOsep)
 
@@ -156,7 +165,8 @@ COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring_light_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring_mpi_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring_one_barrier) COLL_sep \
-COLL_APPLY(action, COLL_ALLTOALLV_SIG, ompi)
+COLL_APPLY(action, COLL_ALLTOALLV_SIG, ompi) COLL_sep \
+COLL_APPLY(action, COLL_ALLTOALLV_SIG, mpich)
 
 COLL_ALLTOALLVS(COLL_PROTO, COLL_NOsep)
 
@@ -185,7 +195,8 @@ COLL_APPLY(action, COLL_BCAST_SIG, SMP_binomial) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, SMP_linear) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, ompi) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, ompi_split_bintree) COLL_sep \
-COLL_APPLY(action, COLL_BCAST_SIG, ompi_pipeline)
+COLL_APPLY(action, COLL_BCAST_SIG, ompi_pipeline) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, mpich)
 
 COLL_BCASTS(COLL_PROTO, COLL_NOsep)
 
@@ -209,7 +220,8 @@ COLL_APPLY(action, COLL_REDUCE_SIG, ompi_pipeline) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SIG, ompi_basic_linear) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SIG, ompi_in_order_binary) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SIG, ompi_binary) COLL_sep \
-COLL_APPLY(action, COLL_REDUCE_SIG, ompi_binomial)
+COLL_APPLY(action, COLL_REDUCE_SIG, ompi_binomial) COLL_sep \
+COLL_APPLY(action, COLL_REDUCE_SIG, mpich)
 
 COLL_REDUCES(COLL_PROTO, COLL_NOsep)
 
@@ -223,7 +235,12 @@ COLL_REDUCES(COLL_PROTO, COLL_NOsep)
 
 #define COLL_REDUCE_SCATTERS(action, COLL_sep) \
 COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, ompi) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, ompi_basic_recursivehalving) COLL_sep \
-COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, ompi_ring)
+COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, ompi_ring) COLL_sep \
+COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich) COLL_sep \
+COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich_pair) COLL_sep \
+COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich_rdb) COLL_sep \
+COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich_noncomm)
+
 
 COLL_REDUCE_SCATTERS(COLL_PROTO, COLL_NOsep)
 
@@ -239,7 +256,8 @@ COLL_REDUCE_SCATTERS(COLL_PROTO, COLL_NOsep)
 
 #define COLL_SCATTERS(action, COLL_sep) \
 COLL_APPLY(action, COLL_SCATTER_SIG, ompi) COLL_sep \
 COLL_APPLY(action, COLL_SCATTER_SIG, ompi_basic_linear) COLL_sep \
-COLL_APPLY(action, COLL_SCATTER_SIG, ompi_binomial)
+COLL_APPLY(action, COLL_SCATTER_SIG, ompi_binomial) COLL_sep \
+COLL_APPLY(action, COLL_SCATTER_SIG, mpich)
 
 COLL_SCATTERS(COLL_PROTO, COLL_NOsep)
 
@@ -256,7 +274,8 @@ COLL_APPLY(action, COLL_BARRIER_SIG, ompi_two_procs) COLL_sep \
 COLL_APPLY(action, COLL_BARRIER_SIG, ompi_tree) COLL_sep \
 COLL_APPLY(action, COLL_BARRIER_SIG, ompi_bruck) COLL_sep \
 COLL_APPLY(action, COLL_BARRIER_SIG, ompi_recursivedoubling) COLL_sep \
-COLL_APPLY(action, COLL_BARRIER_SIG, ompi_doublering)
+COLL_APPLY(action, COLL_BARRIER_SIG, ompi_doublering) COLL_sep \
+COLL_APPLY(action, COLL_BARRIER_SIG, mpich)
 
 COLL_BARRIERS(COLL_PROTO, COLL_NOsep)
 
diff --git a/src/smpi/colls/reduce_scatter-mpich.c b/src/smpi/colls/reduce_scatter-mpich.c
new file mode 100644
index 0000000000..77eaef5bf8
--- /dev/null
+++ b/src/smpi/colls/reduce_scatter-mpich.c
@@ -0,0 +1,493 @@
+#include "colls_private.h"
+#define MPIR_REDUCE_SCATTER_TAG 222
+
+static inline int MPIU_Mirror_permutation(unsigned int x, int bits)
+{
+    /* a mask for the high order bits that should be copied as-is */
+    int high_mask = ~((0x1 << bits) - 1);
+    int retval = x & high_mask;
+    int i;
+
+    for (i = 0; i < bits; ++i) {
+        unsigned int bitval = (x & (0x1 << i)) >> i; /* 0x1 or 0x0 */
+        retval |= bitval << ((bits - i) - 1);
+    }
+
+    return retval;
+}
+
+
+int smpi_coll_tuned_reduce_scatter_mpich_pair(void *sendbuf, void *recvbuf, int recvcounts[],
+                              MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
+{
+    int rank, comm_size, i;
+    MPI_Aint extent, true_extent, true_lb;
+    int *disps;
+    void *tmp_recvbuf;
+    int mpi_errno = MPI_SUCCESS;
+    int type_size, total_count, nbytes, dst, src;
+    int is_commutative;
+    comm_size = smpi_comm_size(comm);
+    rank = smpi_comm_rank(comm);
+
+    extent =smpi_datatype_get_extent(datatype);
+    smpi_datatype_extent(datatype, &true_lb, &true_extent);
+
+    if (smpi_op_is_commute(op)) {
+        is_commutative = 1;
+    }
+
+    disps = (int*)xbt_malloc( comm_size * sizeof(int));
+
+    total_count = 0;
+    for (i=0; i peer) {
+        /* we have the higher rank: send top half, recv bottom half */
+        recv_offset += size;
+    }
+    else {
+        /* we have the lower rank: recv top half, send bottom half */
+        send_offset += size;
+    }
+
+        smpi_mpi_sendrecv(outgoing_data + send_offset*true_extent,
+                          size, datatype, peer, MPIR_REDUCE_SCATTER_TAG,
+                          incoming_data + recv_offset*true_extent,
+                          size, datatype, peer, MPIR_REDUCE_SCATTER_TAG,
+                          comm, MPI_STATUS_IGNORE);
+        /* always perform the reduction at recv_offset, the data at send_offset
+           is now our peer's responsibility */
+        if (rank > peer) {
+            /* higher ranked value so need to call op(received_data, my_data) */
+            smpi_op_apply(op,
+                          incoming_data + recv_offset*true_extent,
+                          outgoing_data + recv_offset*true_extent,
+                          &size, &datatype );
+            buf0_was_inout = buf0_was_inout;
+        }
+        else {
+            /* lower ranked value so need to call op(my_data, received_data) */
+            smpi_op_apply( op,
+                          outgoing_data + recv_offset*true_extent,
+                          incoming_data + recv_offset*true_extent,
+                          &size, &datatype);
+            buf0_was_inout = !buf0_was_inout;
+        }
+
+        /* the next round of send/recv needs to happen within the block (of size
+           "size") that we just received and reduced */
+        send_offset = recv_offset;
+    }
+
+    xbt_assert(size == recvcounts[rank]);
+
+    /* copy the reduced data to the recvbuf */
+    result_ptr = (char *)(buf0_was_inout ? tmp_buf0 : tmp_buf1) + recv_offset * true_extent;
+    mpi_errno = smpi_datatype_copy(result_ptr, size, datatype,
+                                   recvbuf, size, datatype);
+    if (mpi_errno) return(mpi_errno);
+    return MPI_SUCCESS;
+}
+
+
+
+int smpi_coll_tuned_reduce_scatter_mpich_rdb(void *sendbuf, void *recvbuf, int recvcounts[],
+                              MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
+{
+    int rank, comm_size, i;
+    MPI_Aint extent, true_extent, true_lb;
+    int  *disps;
+    void *tmp_recvbuf, *tmp_results;
+    int mpi_errno = MPI_SUCCESS;
+    int type_size, dis[2], blklens[2], total_count, nbytes, dst;
+    int mask, dst_tree_root, my_tree_root, j, k;
+    int received;
+    MPI_Datatype sendtype, recvtype;
+    int nprocs_completed, tmp_mask, tree_root, is_commutative;
+    comm_size = smpi_comm_size(comm);
+    rank = smpi_comm_rank(comm);
+
+    extent =smpi_datatype_get_extent(datatype);
+    smpi_datatype_extent(datatype, &true_lb, &true_extent);
+
+    if (smpi_op_is_commute(op)) {
+        is_commutative = 1;
+    }
+
+    disps = (int*)xbt_malloc( comm_size * sizeof(int));
+
+    total_count = 0;
+    for (i=0; i> i;
+        dst_tree_root <<= i;
+
+        my_tree_root = rank >> i;
+        my_tree_root <<= i;
+
+        /* At step 1, processes exchange (n-n/p) amount of
+           data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
+           amount of data, and so forth. We use derived datatypes for this.
+
+           At each step, a process does not need to send data
+           indexed from my_tree_root to
+           my_tree_root+mask-1. Similarly, a process won't receive
+           data indexed from dst_tree_root to dst_tree_root+mask-1. */
+
+        /* calculate sendtype */
+        blklens[0] = blklens[1] = 0;
+        for (j=0; j comm_size) {
+            nprocs_completed = comm_size - my_tree_root - mask;
+            /* nprocs_completed is the number of processes in this
+               subtree that have all the data. Send data to others
+               in a tree fashion. First find root of current tree
+               that is being divided into two. k is the number of
+               least-significant bits in this process's rank that
+               must be zeroed out to find the rank of the root */
+            j = mask;
+            k = 0;
+            while (j) {
+                j >>= 1;
+                k++;
+            }
+            k--;
+
+            tmp_mask = mask >> 1;
+            while (tmp_mask) {
+                dst = rank ^ tmp_mask;
+
+                tree_root = rank >> k;
+                tree_root <<= k;
+
+                /* send only if this proc has data and destination
+                   doesn't have data. at any step, multiple processes
+                   can send if they have the data */
+                if ((dst > rank) &&
+                    (rank < tree_root + nprocs_completed)
+                    && (dst >= tree_root + nprocs_completed)) {
+                    /* send the current result */
+                    smpi_mpi_send(tmp_recvbuf, 1, recvtype,
+                                  dst, MPIR_REDUCE_SCATTER_TAG,
+                                  comm);
+                }
+                /* recv only if this proc. doesn't have data and sender
+                   has data */
+                else if ((dst < rank) &&
+                         (dst < tree_root + nprocs_completed) &&
+                         (rank >= tree_root + nprocs_completed)) {
+                    smpi_mpi_recv(tmp_recvbuf, 1, recvtype, dst,
+                                  MPIR_REDUCE_SCATTER_TAG,
+                                  comm, MPI_STATUS_IGNORE);
+                    received = 1;
+                }
+                tmp_mask >>= 1;
+                k--;
+            }
+        }
+
+        /* The following reduction is done here instead of after
+           the MPIC_Sendrecv_ft or MPIC_Recv_ft above. This is
+           because to do it above, in the noncommutative
+           case, we would need an extra temp buffer so as not to
+           overwrite temp_recvbuf, because temp_recvbuf may have
+           to be communicated to other processes in the
+           non-power-of-two case. To avoid that extra allocation,
+           we do the reduce here.
+        */
+        if (received) {
+            if (is_commutative || (dst_tree_root < my_tree_root)) {
+                {
+                    smpi_op_apply(op,
+                           tmp_recvbuf, tmp_results, &blklens[0],
+                           &datatype);
+                    smpi_op_apply(op,
+                           ((char *)tmp_recvbuf + dis[1]*extent),
+                           ((char *)tmp_results + dis[1]*extent),
+                           &blklens[1], &datatype);
+                }
+            }
+            else {
+                {
+                    smpi_op_apply(op,
+                           tmp_results, tmp_recvbuf, &blklens[0],
+                           &datatype);
+                    smpi_op_apply(op,
+                           ((char *)tmp_results + dis[1]*extent),
+                           ((char *)tmp_recvbuf + dis[1]*extent),
+                           &blklens[1], &datatype);
+                }
+                /* copy result back into tmp_results */
+                mpi_errno = smpi_datatype_copy(tmp_recvbuf, 1, recvtype,
+                                               tmp_results, 1, recvtype);
+                if (mpi_errno) return(mpi_errno);
+            }
+        }
+
+        //smpi_datatype_free(&sendtype);
+        //smpi_datatype_free(&recvtype);
+
+        mask <<= 1;
+        i++;
+    }
+
+    /* now copy final results from tmp_results to recvbuf */
+    mpi_errno = smpi_datatype_copy(((char *)tmp_results+disps[rank]*extent),
+                                   recvcounts[rank], datatype, recvbuf,
+                                   recvcounts[rank], datatype);
+    if (mpi_errno) return(mpi_errno);
+
+    return MPI_SUCCESS;
+}
+
diff --git a/src/smpi/colls/smpi_mpich_selector.c b/src/smpi/colls/smpi_mpich_selector.c
new file mode 100644
index 0000000000..c95613598f
--- /dev/null
+++ b/src/smpi/colls/smpi_mpich_selector.c
@@ -0,0 +1,696 @@
+/* selector for collective algorithms based on mpich decision logic */
+
+/* Copyright (c) 2009, 2010. The SimGrid Team.
+ * All rights reserved.                                                     */
+
+/* This program is free software; you can redistribute it and/or modify it
+ * under the terms of the license (GNU LGPL) which comes with this package. */
+
+#include "colls_private.h"
+
+/* This is the default implementation of allreduce. The algorithm is:
+
+   Algorithm: MPI_Allreduce
+
+   For the heterogeneous case, we call MPI_Reduce followed by MPI_Bcast
+   in order to meet the requirement that all processes must have the
+   same result. For the homogeneous case, we use the following algorithms.
+
+
+   For long messages and for builtin ops and if count >= pof2 (where
+   pof2 is the nearest power-of-two less than or equal to the number
+   of processes), we use Rabenseifner's algorithm (see
+   http://www.hlrs.de/mpi/myreduce.html).
+   This algorithm implements the allreduce in two steps: first a
+   reduce-scatter, followed by an allgather. A recursive-halving
+   algorithm (beginning with processes that are distance 1 apart) is
+   used for the reduce-scatter, and a recursive doubling
+   algorithm is used for the allgather. The non-power-of-two case is
+   handled by dropping to the nearest lower power-of-two: the first
+   few even-numbered processes send their data to their right neighbors
+   (rank+1), and the reduce-scatter and allgather happen among the remaining
+   power-of-two processes. At the end, the first few even-numbered
+   processes get the result from their right neighbors.
+
+   For the power-of-two case, the cost for the reduce-scatter is
+   lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
+   allgather is lgp.alpha + n.((p-1)/p).beta. Therefore, the
+   total cost is:
+   Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma
+
+   For the non-power-of-two case,
+   Cost = (2.floor(lgp)+2).alpha + (2.((p-1)/p) + 2).n.beta + n.(1+(p-1)/p).gamma
+
+
+   For short messages, for user-defined ops, and for count < pof2
+   we use a recursive doubling algorithm (similar to the one in
+   MPI_Allgather). We use this algorithm in the case of user-defined ops
+   because in this case derived datatypes are allowed, and the user
+   could pass basic datatypes on one process and derived on another as
+   long as the type maps are the same. Breaking up derived datatypes
+   to do the reduce-scatter is tricky.
+
+   Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma
+
+   Possible improvements:
+
+   End Algorithm: MPI_Allreduce
+*/
+int smpi_coll_tuned_allreduce_mpich(void *sbuf, void *rbuf, int count,
+                        MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
+{
+    size_t dsize, block_dsize;
+    int comm_size = smpi_comm_size(comm);
+    const size_t large_message = 2048; //MPIR_PARAM_ALLREDUCE_SHORT_MSG_SIZE
+
+    dsize = smpi_datatype_size(dtype);
+    block_dsize = dsize * count;
+
+
+    /* find nearest power-of-two less than or equal to comm_size */
+    int pof2 = 1;
+    while (pof2 <= comm_size) pof2 <<= 1;
+    pof2 >>=1;
+
+    if (block_dsize > large_message && count >= pof2 && smpi_op_is_commute(op)) {
+      //for long messages
+       return (smpi_coll_tuned_allreduce_rab_rsag (sbuf, rbuf,
+                                                      count, dtype,
+                                                      op, comm));
+    }else {
+      //for short ones and count < pof2
+      return (smpi_coll_tuned_allreduce_rdb (sbuf, rbuf,
+                                                count, dtype,
+                                                op, comm));
+    }
+}
+
+
+/* This is the default implementation of alltoall. The algorithm is:
+
+   Algorithm: MPI_Alltoall
+
+   We use four algorithms for alltoall. For short messages and
+   (comm_size >= 8), we use the algorithm by Jehoshua Bruck et al,
+   IEEE TPDS, Nov. 1997. It is a store-and-forward algorithm that
+   takes lgp steps. Because of the extra communication, the bandwidth
+   requirement is (n/2).lgp.beta.
+
+   Cost = lgp.alpha + (n/2).lgp.beta
+
+   where n is the total amount of data a process needs to send to all
+   other processes.
+
+   For medium size messages and (short messages for comm_size < 8), we
+   use an algorithm that posts all irecvs and isends and then does a
+   waitall. We scatter the order of sources and destinations among the
+   processes, so that all processes don't try to send/recv to/from the
+   same process at the same time.
+
+   *** Modification: We post only a small number of isends and irecvs
+   at a time and wait on them as suggested by Tony Ladd. ***
+   *** See comments below about an additional modification that
+   we may want to consider ***
+
+   For long messages and power-of-two number of processes, we use a
+   pairwise exchange algorithm, which takes p-1 steps. We
+   calculate the pairs by using an exclusive-or algorithm:
+   for (i=1; i=8 -> bruck
+
+// medium size messages and (short messages for comm_size < 8), we
+// use an algorithm that posts all irecvs and isends and then does a
+// waitall.
+
+// For long messages and power-of-two number of processes, we use a
+// pairwise exchange algorithm
+
+// For a non-power-of-two number of processes, we use an
+// algorithm in which, in step i, each process receives from (rank-i)
+// and sends to (rank+i).
+
+
+    dsize = smpi_datatype_size(sdtype);
+    block_dsize = dsize * scount;
+
+    if ((block_dsize < short_size) && (communicator_size >= 8)) {
+        return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype,
+                                              rbuf, rcount, rdtype,
+                                              comm);
+
+    } else if (block_dsize < medium_size) {
+        return smpi_coll_tuned_alltoall_simple(sbuf, scount, sdtype,
+                                               rbuf, rcount, rdtype,
+                                               comm);
+    }else if (communicator_size%2){
+        return smpi_coll_tuned_alltoall_ring(sbuf, scount, sdtype,
+                                             rbuf, rcount, rdtype,
+                                             comm);
+    }
+
+    return smpi_coll_tuned_alltoall_pair (sbuf, scount, sdtype,
+                                          rbuf, rcount, rdtype,
+                                          comm);
+}
+
+int smpi_coll_tuned_alltoallv_mpich(void *sbuf, int *scounts, int *sdisps,
+                                    MPI_Datatype sdtype,
+                                    void *rbuf, int *rcounts, int *rdisps,
+                                    MPI_Datatype rdtype,
+                                    MPI_Comm comm
+                                    )
+{
+    /* For starters, just keep the original algorithm. */
+    return smpi_coll_tuned_alltoallv_bruck(sbuf, scounts, sdisps, sdtype,
+                                           rbuf, rcounts, rdisps, rdtype,
+                                           comm);
+}
+
+
+int smpi_coll_tuned_barrier_mpich(MPI_Comm comm)
+{
+    return smpi_coll_tuned_barrier_ompi_bruck(comm);
+}
+
+/* This is the default implementation of broadcast. The algorithm is:
+
+   Algorithm: MPI_Bcast
+
+   For short messages, we use a binomial tree algorithm.
+   Cost = lgp.alpha + n.lgp.beta
+
+   For long messages, we do a scatter followed by an allgather.
+   We first scatter the buffer using a binomial tree algorithm. This costs
+   lgp.alpha + n.((p-1)/p).beta
+   If the datatype is contiguous and the communicator is homogeneous,
+   we treat the data as bytes and divide (scatter) it among processes
+   by using ceiling division. For the noncontiguous or heterogeneous
+   cases, we first pack the data into a temporary buffer by using
+   MPI_Pack, scatter it as bytes, and unpack it after the allgather.
+
+   For the allgather, we use a recursive doubling algorithm for
+   medium-size messages and power-of-two number of processes. This
+   takes lgp steps. In each step pairs of processes exchange all the
+   data they have (we take care of non-power-of-two situations). This
+   costs approximately lgp.alpha + n.((p-1)/p).beta. (Approximately
+   because it may be slightly more in the non-power-of-two case, but
+   it's still a logarithmic algorithm.) Therefore, for long messages
+   Total Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta
+
+   Note that this algorithm has twice the latency of the tree algorithm
+   we use for short messages, but requires lower bandwidth: 2.n.beta
+   versus n.lgp.beta. Therefore, for long messages and when lgp > 2,
+   this algorithm will perform better.
+
+   For long messages and for medium-size messages and non-power-of-two
+   processes, we use a ring algorithm for the allgather, which
+   takes p-1 steps, because it performs better than recursive doubling.
+   Total Cost = (lgp+p-1).alpha + 2.n.((p-1)/p).beta
+
+   Possible improvements:
+   For clusters of SMPs, we may want to do something differently to
+   take advantage of shared memory on each node.
+
+   End Algorithm: MPI_Bcast
+*/
+
+
+int smpi_coll_tuned_bcast_mpich(void *buff, int count,
+                                MPI_Datatype datatype, int root,
+                                MPI_Comm comm
+                                )
+{
+    /* Decision function based on MX results for
+       messages up to 36MB and communicator sizes up to 64 nodes */
+    const size_t small_message_size = 12288;
+    const size_t intermediate_message_size = 524288;
+
+    int communicator_size;
+    //int segsize = 0;
+    size_t message_size, dsize;
+
+    communicator_size = smpi_comm_size(comm);
+
+    /* else we need data size for decision function */
+    dsize = smpi_datatype_size(datatype);
+    message_size = dsize * (unsigned long)count;   /* needed for decision */
+
+    /* Handle messages of small and intermediate size, and
+       single-element broadcasts */
+    if ((message_size < small_message_size) || (communicator_size <= 8)) {
+        /* Binomial without segmentation */
+        return smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype,
+                                                    root, comm);
+
+    } else if (message_size < intermediate_message_size && !(communicator_size%2)) {
+        // SplittedBinary with 1KB segments
+        return smpi_coll_tuned_bcast_scatter_rdb_allgather(buff, count, datatype,
+                                                           root, comm);
+
+    }
+    //Handle large message sizes
+    return smpi_coll_tuned_bcast_scatter_LR_allgather (buff, count, datatype,
+                                                       root, comm);
+
+}
+
+
+
+/* This is the default implementation of reduce. The algorithm is:
+
+   Algorithm: MPI_Reduce
+
+   For long messages and for builtin ops and if count >= pof2 (where
+   pof2 is the nearest power-of-two less than or equal to the number
+   of processes), we use Rabenseifner's algorithm (see
+   http://www.hlrs.de/organization/par/services/models/mpi/myreduce.html ).
+   This algorithm implements the reduce in two steps: first a
+   reduce-scatter, followed by a gather to the root. A
+   recursive-halving algorithm (beginning with processes that are
+   distance 1 apart) is used for the reduce-scatter, and a binomial tree
+   algorithm is used for the gather. The non-power-of-two case is
+   handled by dropping to the nearest lower power-of-two: the first
+   few odd-numbered processes send their data to their left neighbors
+   (rank-1), and the reduce-scatter happens among the remaining
+   power-of-two processes. If the root is one of the excluded
+   processes, then after the reduce-scatter, rank 0 sends its result to
+   the root and exits; the root now acts as rank 0 in the binomial tree
+   algorithm for gather.
+
+   For the power-of-two case, the cost for the reduce-scatter is
+   lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
+   gather to root is lgp.alpha + n.((p-1)/p).beta. Therefore, the
+   total cost is:
+   Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma
+
+   For the non-power-of-two case, assuming the root is not one of the
+   odd-numbered processes that get excluded in the reduce-scatter,
+   Cost = (2.floor(lgp)+1).alpha + (2.((p-1)/p) + 1).n.beta +
+           n.(1+(p-1)/p).gamma
+
+
+   For short messages, user-defined ops, and count < pof2, we use a
+   binomial tree algorithm for both short and long messages.
+
+   Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma
+
+
+   We use the binomial tree algorithm in the case of user-defined ops
+   because in this case derived datatypes are allowed, and the user
+   could pass basic datatypes on one process and derived on another as
+   long as the type maps are the same. Breaking up derived datatypes
+   to do the reduce-scatter is tricky.
+
+   FIXME: Per the MPI-2.1 standard this case is not possible. We
+   should be able to use the reduce-scatter/gather approach as long as
+   count >= pof2.  [goodell@ 2009-01-21]
+
+   Possible improvements:
+
+   End Algorithm: MPI_Reduce
+*/
+
+
+int smpi_coll_tuned_reduce_mpich( void *sendbuf, void *recvbuf,
+                                  int count, MPI_Datatype datatype,
+                                  MPI_Op op, int root,
+                                  MPI_Comm comm
+                                  )
+{
+    int communicator_size=0;
+    //int segsize = 0;
+    size_t message_size, dsize;
+    communicator_size = smpi_comm_size(comm);
+
+    /* need data size for decision function */
+    dsize=smpi_datatype_size(datatype);
+    message_size = dsize * count;   /* needed for decision */
+
+    int pof2 = 1;
+    while (pof2 <= communicator_size) pof2 <<= 1;
+    pof2 >>= 1;
+
+
+    if ((count < pof2) || (message_size < 2048) || !smpi_op_is_commute(op)) {
+        return smpi_coll_tuned_reduce_binomial (sendbuf, recvbuf, count, datatype, op, root, comm);
+    }
+    return smpi_coll_tuned_reduce_scatter_gather(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
+                                                 segsize, max_requests*/);
+}
+
+
+
+/* This is the default implementation of reduce_scatter. The algorithm is:
+
+   Algorithm: MPI_Reduce_scatter
+
+   If the operation is commutative, for short and medium-size
+   messages, we use a recursive-halving
+   algorithm in which the first p/2 processes send the second n/2 data
+   to their counterparts in the other half and receive the first n/2
+   data from them. This procedure continues recursively, halving the
+   data communicated at each step, for a total of lgp steps. If the
+   number of processes is not a power-of-two, we convert it to the
+   nearest lower power-of-two by having the first few even-numbered
+   processes send their data to the neighboring odd-numbered process
+   at (rank+1). Those odd-numbered processes compute the result for
+   their left neighbor as well in the recursive halving algorithm, and
+   then at the end send the result back to the processes that didn't
+   participate.
+   Therefore, if p is a power-of-two,
+   Cost = lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma
+   If p is not a power-of-two,
+   Cost = (floor(lgp)+2).alpha + n.(1+(p-1+n)/p).beta + n.(1+(p-1)/p).gamma
+   The above cost in the non power-of-two case is approximate because
+   there is some imbalance in the amount of work each process does
+   because some processes do the work of their neighbors as well.
+
+   For commutative operations and very long messages we use
+   a pairwise exchange algorithm similar to
+   the one used in MPI_Alltoall. At step i, each process sends n/p
+   amount of data to (rank+i) and receives n/p amount of data from
+   (rank-i).
+   Cost = (p-1).alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma
+
+
+   If the operation is not commutative, we do the following:
+
+   We use a recursive doubling algorithm, which
+   takes lgp steps. At step 1, processes exchange (n-n/p) amount of
+   data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
+   amount of data, and so forth.
+
+   Cost = lgp.alpha + n.(lgp-(p-1)/p).beta + n.(lgp-(p-1)/p).gamma
+
+   Possible improvements:
+
+   End Algorithm: MPI_Reduce_scatter
+*/
+
+
+int smpi_coll_tuned_reduce_scatter_mpich( void *sbuf, void *rbuf,
+                                          int *rcounts,
+                                          MPI_Datatype dtype,
+                                          MPI_Op op,
+                                          MPI_Comm comm
+                                          )
+{
+    int comm_size, i;
+    size_t total_message_size, dsize;
+    int zerocounts = 0;
+
+    XBT_DEBUG("smpi_coll_tuned_reduce_scatter_mpich");
+
+    comm_size = smpi_comm_size(comm);
+    // We need data size for decision function
+    dsize=smpi_datatype_size(dtype);
+    total_message_size = 0;
+    for (i = 0; i < comm_size; i++) {
+        total_message_size += rcounts[i];
+        if (0 == rcounts[i]) {
+            zerocounts = 1;
+        }
+    }
+
+    if( smpi_op_is_commute(op) &&  total_message_size > 524288) {
+        return smpi_coll_tuned_reduce_scatter_mpich_pair (sbuf, rbuf, rcounts,
+                                                          dtype, op,
+                                                          comm);
+    }else if (!smpi_op_is_commute(op)) {
+        int is_block_regular = 1;
+        for (i = 0; i < (comm_size - 1); ++i) {
+            if (rcounts[i] != rcounts[i+1]) {
+                is_block_regular = 0;
+                break;
+            }
+        }
+
+        /* slightly retask pof2 to mean pof2 equal or greater, not always greater as it is above */
+        int pof2 = 1;
+        while (pof2 < comm_size) pof2 <<= 1;
+
+        if (pof2 == comm_size && is_block_regular) {
+            /* noncommutative, pof2 size, and block regular */
+            return smpi_coll_tuned_reduce_scatter_mpich_noncomm(sbuf, rbuf, rcounts, dtype, op, comm);
+        }
+
+        return smpi_coll_tuned_reduce_scatter_mpich_rdb(sbuf, rbuf, rcounts, dtype, op, comm);
+    }else{
+        return smpi_coll_tuned_reduce_scatter_mpich_rdb(sbuf, rbuf, rcounts, dtype, op, comm);
+    }
+}
+
+
+/* This is the default implementation of allgather. The algorithm is:
+
+   Algorithm: MPI_Allgather
+
+   For short messages and non-power-of-two no. of processes, we use
+   the algorithm from the Jehoshua Bruck et al IEEE TPDS Nov 97
+   paper. It is a variant of the dissemination algorithm for
+   barrier. It takes ceiling(lg p) steps.
+
+   Cost = lgp.alpha + n.((p-1)/p).beta
+   where n is total size of data gathered on each process.
+
+   For short or medium-size messages and power-of-two no. of
+   processes, we use the recursive doubling algorithm.
+
+   Cost = lgp.alpha + n.((p-1)/p).beta
+
+   TODO: On TCP, we may want to use recursive doubling instead of the Bruck
+   algorithm in all cases because of the pairwise-exchange property of
+   recursive doubling (see Benson et al paper in Euro PVM/MPI
+   2003).
+
+   It is interesting to note that either of the above algorithms for
+   MPI_Allgather has the same cost as the tree algorithm for MPI_Gather!
+
+   For long messages or medium-size messages and non-power-of-two
+   no. of processes, we use a ring algorithm. In the first step, each
+   process i sends its contribution to process i+1 and receives
+   the contribution from process i-1 (with wrap-around). From the
+   second step onwards, each process i forwards to process i+1 the
+   data it received from process i-1 in the previous step. This takes
+   a total of p-1 steps.
+
+   Cost = (p-1).alpha + n.((p-1)/p).beta
+
+   We use this algorithm instead of recursive doubling for long
+   messages because we find that this communication pattern (nearest
+   neighbor) performs twice as fast as recursive doubling for long
+   messages (on Myrinet and IBM SP).
+
+   Possible improvements:
+
+   End Algorithm: MPI_Allgather
+*/
+
+int smpi_coll_tuned_allgather_mpich(void *sbuf, int scount,
+                                    MPI_Datatype sdtype,
+                                    void* rbuf, int rcount,
+                                    MPI_Datatype rdtype,
+                                    MPI_Comm comm
+                                    )
+{
+    int communicator_size, pow2_size;
+    size_t dsize, total_dsize;
+
+    communicator_size = smpi_comm_size(comm);
+
+    /* Determine complete data size */
+    dsize=smpi_datatype_size(sdtype);
+    total_dsize = dsize * scount * communicator_size;
+
+    for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<=1);
+
+    /* Decision as in MPICH-2
+       presented in Thakur et al. "Optimization of Collective Communication
+       Operations in MPICH", International Journal of High Performance Computing
+       Applications, Vol. 19, No. 1, 49-66 (2005)
+       - for power-of-two processes and small and medium size messages
+         (up to 512KB) use recursive doubling
+       - for non-power-of-two processes and small messages (80KB) use bruck,
+       - for everything else use ring.
+    */
+    if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
+        return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
+                                             rbuf, rcount, rdtype,
+                                             comm);
+    } else if (total_dsize <= 81920) {
+        return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
+                                               rbuf, rcount, rdtype,
+                                               comm);
+    }
+    return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
+                                          rbuf, rcount, rdtype,
+                                          comm);
+}
+
+
+/* This is the default implementation of allgatherv. The algorithm is:
+
+   Algorithm: MPI_Allgatherv
+
+   For short messages and non-power-of-two no. of processes, we use
+   the algorithm from the Jehoshua Bruck et al IEEE TPDS Nov 97
+   paper. It is a variant of the dissemination algorithm for
+   barrier. It takes ceiling(lg p) steps.
+
+   Cost = lgp.alpha + n.((p-1)/p).beta
+   where n is total size of data gathered on each process.
+
+   For short or medium-size messages and power-of-two no. of
+   processes, we use the recursive doubling algorithm.
+
+   Cost = lgp.alpha + n.((p-1)/p).beta
+
+   TODO: On TCP, we may want to use recursive doubling instead of the Bruck
+   algorithm in all cases because of the pairwise-exchange property of
+   recursive doubling (see Benson et al paper in Euro PVM/MPI
+   2003).
+
+   For long messages or medium-size messages and non-power-of-two
+   no. of processes, we use a ring algorithm. In the first step, each
+   process i sends its contribution to process i+1 and receives
+   the contribution from process i-1 (with wrap-around). From the
+   second step onwards, each process i forwards to process i+1 the
+   data it received from process i-1 in the previous step. This takes
+   a total of p-1 steps.
+
+   Cost = (p-1).alpha + n.((p-1)/p).beta
+
+   Possible improvements:
+
+   End Algorithm: MPI_Allgatherv
+*/
+int smpi_coll_tuned_allgatherv_mpich(void *sbuf, int scount,
+                                     MPI_Datatype sdtype,
+                                     void* rbuf, int *rcounts,
+                                     int *rdispls,
+                                     MPI_Datatype rdtype,
+                                     MPI_Comm  comm
+                                     )
+{
+    int communicator_size, pow2_size;
+    size_t dsize, total_dsize;
+
+    communicator_size = smpi_comm_size(comm);
+
+    /* Determine complete data size */
+    dsize=smpi_datatype_size(sdtype);
+    total_dsize = dsize * scount * communicator_size;
+
+    for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<=1);
+
+    if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
+        return smpi_coll_tuned_allgatherv_mpich_rdb(sbuf, scount, sdtype,
+                                                    rbuf, rcounts, rdispls, rdtype,
+                                                    comm);
+    } else if (total_dsize <= 81920) {
+        return smpi_coll_tuned_allgatherv_ompi_bruck(sbuf, scount, sdtype,
+                                                     rbuf, rcounts, rdispls, rdtype,
+                                                     comm);
+    }
+    return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
+                                           rbuf, rcounts, rdispls, rdtype,
+                                           comm);
+}
+
+/* This is the default implementation of gather. The algorithm is:
+
+   Algorithm: MPI_Gather
+
+   We use a binomial tree algorithm for both short and long
+   messages. At nodes other than leaf nodes we need to allocate a
+   temporary buffer to store the incoming message. If the root is not
+   rank 0, for very small messages, we pack it into a temporary
+   contiguous buffer and reorder it to be placed in the right
+   order. For small (but not very small) messages, we use a derived
+   datatype to unpack the incoming data into non-contiguous buffers in
+   the right order. In the heterogeneous case we first pack the
+   buffers by using MPI_Pack and then do the gather.
+
+   Cost = lgp.alpha + n.((p-1)/p).beta
+   where n is the total size of the data gathered at the root.
+
+   Possible improvements:
+
+   End Algorithm: MPI_Gather
+*/
+
+int smpi_coll_tuned_gather_mpich(void *sbuf, int scount,
+                                 MPI_Datatype sdtype,
+                                 void* rbuf, int rcount,
+                                 MPI_Datatype rdtype,
+                                 int root,
+                                 MPI_Comm comm
+                                 )
+{
+    return smpi_coll_tuned_gather_ompi_binomial (sbuf, scount, sdtype,
+                                                 rbuf, rcount, rdtype,
+                                                 root, comm);
+}
+
+/* This is the default implementation of scatter. The algorithm is:
+
+   Algorithm: MPI_Scatter
+
+   We use a binomial tree algorithm for both short and
+   long messages. At nodes other than leaf nodes we need to allocate
+   a temporary buffer to store the incoming message. If the root is
+   not rank 0, we reorder the sendbuf in order of relative ranks by
+   copying it into a temporary buffer, so that all the sends from the
+   root are contiguous and in the right order. In the heterogeneous
+   case, we first pack the buffer by using MPI_Pack and then do the
+   scatter.
+
+   Cost = lgp.alpha + n.((p-1)/p).beta
+   where n is the total size of the data to be scattered from the root.
+
+   Possible improvements:
+
+   End Algorithm: MPI_Scatter
+*/
+
+
+int smpi_coll_tuned_scatter_mpich(void *sbuf, int scount,
+                                  MPI_Datatype sdtype,
+                                  void* rbuf, int rcount,
+                                  MPI_Datatype rdtype,
+                                  int root, MPI_Comm comm
+                                  )
+{
+    return smpi_coll_tuned_scatter_ompi_binomial (sbuf, scount, sdtype,
+                                                  rbuf, rcount, rdtype,
+                                                  root, comm);
+}
+
diff --git a/src/smpi/smpi_mpi_dt.c b/src/smpi/smpi_mpi_dt.c
index e42513b15f..b233c413d4 100644
--- a/src/smpi/smpi_mpi_dt.c
+++ b/src/smpi/smpi_mpi_dt.c
@@ -424,7 +424,7 @@ int smpi_datatype_contiguous(int count, MPI_Datatype old_type, MPI_Datatype* new
 int smpi_datatype_vector(int count, int blocklen, int stride, MPI_Datatype old_type, MPI_Datatype* new_type)
 {
   int retval;
-  if (blocklen<=0) return MPI_ERR_ARG;
+  if (blocklen<0) return MPI_ERR_ARG;
   MPI_Aint lb = 0;
   MPI_Aint ub = 0;
   if(count>0){
@@ -565,7 +565,7 @@ void free_hvector(MPI_Datatype* d){
 int smpi_datatype_hvector(int count, int blocklen, MPI_Aint stride, MPI_Datatype old_type, MPI_Datatype* new_type)
 {
   int retval;
-  if (blocklen<=0) return MPI_ERR_ARG;
+  if (blocklen<0) return MPI_ERR_ARG;
   MPI_Aint lb = 0;
   MPI_Aint ub = 0;
   if(count>0){
@@ -726,7 +726,7 @@ int smpi_datatype_indexed(int count, int* blocklens, int* indices, MPI_Datatype
   }
 
   for(i=0; i< count; i++){
-    if   (blocklens[i]<=0)
+    if   (blocklens[i]<0)
       return MPI_ERR_ARG;
 
     size += blocklens[i];
@@ -886,7 +886,7 @@ int smpi_datatype_hindexed(int count, int* blocklens, MPI_Aint* indices, MPI_Dat
     ub=indices[0] + blocklens[0]*smpi_datatype_ub(old_type);
   }
   for(i=0; i< count; i++){
-    if   (blocklens[i]<=0)
+    if   (blocklens[i]<0)
       return MPI_ERR_ARG;
 
     size += blocklens[i];
@@ -1054,7 +1054,7 @@ int smpi_datatype_struct(int count, int* blocklens, MPI_Aint* indices, MPI_Datat
   int forced_lb=0;
   int forced_ub=0;
   for(i=0; i< count; i++){
-    if   (blocklens[i]<=0)
+    if   (blocklens[i]<0)
       return MPI_ERR_ARG;
     if (old_types[i]->has_subtype == 1)
       contiguous=0;
diff --git a/teshsuite/smpi/barrier_coll.tesh b/teshsuite/smpi/barrier_coll.tesh
index a65b0de18c..f5d2c79901 100644
--- a/teshsuite/smpi/barrier_coll.tesh
+++ b/teshsuite/smpi/barrier_coll.tesh
@@ -3,8 +3,41 @@
 ! output sort
 
 p Test scatter
-$ ../../bin/smpirun -map -hostfile ${srcdir:=.}/hostfile -platform ${srcdir:=.}/../../examples/msg/small_platform.xml -np 16 --log=xbt_cfg.thres:critical ./barrier_coll
-> ... Barrier ....
+$ ../../bin/smpirun -map -hostfile ${srcdir:=.}/hostfile -platform ${srcdir:=.}/../../examples/msg/small_platform.xml -np 16 --log=xbt_cfg.thres:critical ./scatter
+> [0] ok.
+> [0] ok.
+> [10] ok.
+> [10] ok.
+> [11] ok.
+> [11] ok.
+> [12] ok.
+> [12] ok.
+> [13] ok.
+> [13] ok.
+> [14] ok.
+> [14] ok.
+> [15] ok.
+> [15] ok.
+> [1] ok.
+> [1] ok.
+> [2] ok.
+> [2] ok.
+> [3] ok.
+> [3] ok.
+> [4] ok.
+> [4] ok.
+> [5] ok.
+> [5] ok.
+> [6] ok.
+> [6] ok.
+> [7] ok.
+> [7] ok.
+> [8] ok.
+> [8] ok.
+> [9] ok.
+> [9] ok.
+> ** IBM Test Result: ...
+> ** Small Test Result: ...
 > You requested to use 16 processes, but there is only 5 processes in your hostfile...
 > [0.000000] [surf_config/INFO] Switching workstation model to compound since you changed the network and/or cpu model(s)
 > [rank 0] -> Tremblay
@@ -23,4 +56,3 @@ $ ../../bin/smpirun -map -hostfile ${srcdir:=.}/hostfile -platform ${srcdir:=.}/
 > [rank 7] -> Fafard
 > [rank 8] -> Ginette
 > [rank 9] -> Bourassa
-
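
A minimal sketch of how the options added by this commit are meant to be used at run time, assuming an application binary ./my_app (hypothetical) and the hostfile and platform file from the tesh examples above:

$ smpirun -hostfile ./hostfile -platform ./small_platform.xml -np 16 --cfg=smpi/coll_selector:mpich ./my_app
$ smpirun -hostfile ./hostfile -platform ./small_platform.xml -np 16 --cfg=smpi/reduce_scatter:mpich_rdb ./my_app

The first command routes every collective through the MPICH-style decision functions in smpi_mpich_selector.c; the second forces one specific algorithm for a single collective, exactly as the AddTests.cmake loops above do for each tested algorithm.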