Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Add last collectives from mvapich selector : bcast reduce reduce_scatter scatter
author: Augustin Degomme <augustin.degomme@imag.fr>
Thu, 24 Jul 2014 00:04:40 +0000 (02:04 +0200)
committer: Augustin Degomme <augustin.degomme@imag.fr>
Thu, 24 Jul 2014 13:23:03 +0000 (15:23 +0200)
bcast still defaults to the mpich one, as the mvapich2 bcast algorithms need SMP support

buildtools/Cmake/AddTests.cmake
src/smpi/colls/colls.h
src/smpi/colls/smpi_mvapich2_selector.c
src/smpi/colls/smpi_mvapich2_selector_stampede.h

index bb58212..fd3768c 100644 (file)
@@ -402,16 +402,16 @@ IF(NOT enable_memcheck)
     ENDFOREACH()
     FOREACH (BCAST_COLL default arrival_pattern_aware arrival_pattern_aware_wait arrival_scatter
                         binomial_tree flattree flattree_pipeline NTSB NTSL NTSL_Isend scatter_LR_allgather
-                        scatter_rdb_allgather SMP_binary SMP_binomial SMP_linear ompi mpich ompi_split_bintree ompi_pipeline)
+                        scatter_rdb_allgather SMP_binary SMP_binomial SMP_linear ompi mpich ompi_split_bintree ompi_pipeline mvapich2)
       ADD_TESH(tesh-smpi-bcast-coll-${BCAST_COLL} --cfg smpi/bcast:${BCAST_COLL} --setenv bindir=${CMAKE_BINARY_DIR}/teshsuite/smpi/bcast --cd ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/bcast bcast_coll.tesh)
     ENDFOREACH()
-    FOREACH (REDUCE_COLL default arrival_pattern_aware binomial flat_tree NTSL scatter_gather ompi mpich ompi_chain ompi_binary ompi_basic_linear ompi_binomial ompi_in_order_binary)
+    FOREACH (REDUCE_COLL default arrival_pattern_aware binomial flat_tree NTSL scatter_gather ompi mpich ompi_chain ompi_binary ompi_basic_linear ompi_binomial ompi_in_order_binary mvapich2)
       ADD_TESH(tesh-smpi-reduce-coll-${REDUCE_COLL} --cfg smpi/reduce:${REDUCE_COLL} --setenv bindir=${CMAKE_BINARY_DIR}/teshsuite/smpi/reduce --cd ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/reduce reduce_coll.tesh)
     ENDFOREACH()
-    FOREACH (REDUCE_SCATTER_COLL default  ompi mpich ompi_basic_recursivehalving ompi_ring mpich_noncomm mpich_pair mpich_rdb)
+    FOREACH (REDUCE_SCATTER_COLL default  ompi mpich ompi_basic_recursivehalving ompi_ring mpich_noncomm mpich_pair mvapich2 mpich_rdb)
       ADD_TESH(tesh-smpi-reduce-scatter-coll-${REDUCE_SCATTER_COLL} --cfg smpi/reduce_scatter:${REDUCE_SCATTER_COLL} --setenv bindir=${CMAKE_BINARY_DIR}/teshsuite/smpi/reduce --cd ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/reduce reduce_scatter_coll.tesh)
     ENDFOREACH()
-    FOREACH (SCATTER_COLL default  ompi mpich ompi_basic_linear ompi_binomial)
+    FOREACH (SCATTER_COLL default  ompi mpich ompi_basic_linear ompi_binomial mvapich2)
       ADD_TESH(tesh-smpi-scatter-coll-${SCATTER_COLL} --cfg smpi/scatter:${SCATTER_COLL} --setenv bindir=${CMAKE_BINARY_DIR}/teshsuite/smpi/scatter --cd ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/scatter scatter_coll.tesh)
     ENDFOREACH()
     FOREACH (BARRIER_COLL default  ompi mpich ompi_basic_linear ompi_tree ompi_bruck ompi_recursivedoubling ompi_doublering mvapich2_pair mvapich2)
@@ -422,7 +422,8 @@ IF(NOT enable_memcheck)
       ADD_TEST(test-smpi-mpich3-coll-thread      ${CMAKE_COMMAND} -E chdir ${CMAKE_BINARY_DIR}/teshsuite/smpi/mpich3-test/coll perl ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/runtests -mpiexec=${CMAKE_BINARY_DIR}/smpi_script/bin/smpirun -srcdir=${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/coll -tests=testlist -execarg=--cfg=contexts/factory:thread -execarg=--cfg=smpi/privatize_global_variables:yes)
       ADD_TEST(test-smpi-mpich3-coll-ompi-thread ${CMAKE_COMMAND} -E chdir ${CMAKE_BINARY_DIR}/teshsuite/smpi/mpich3-test/coll perl ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/runtests -mpiexec=${CMAKE_BINARY_DIR}/smpi_script/bin/smpirun -srcdir=${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/coll -tests=testlist -execarg=--cfg=contexts/factory:thread -execarg=--cfg=smpi/coll_selector:ompi -execarg=--cfg=smpi/send_is_detached_thres:0 -execarg=--cfg=smpi/privatize_global_variables:yes -execarg=--cfg=smpi/bcast:binomial_tree)
       ADD_TEST(test-smpi-mpich3-coll-mpich-thread ${CMAKE_COMMAND} -E chdir ${CMAKE_BINARY_DIR}/teshsuite/smpi/mpich3-test/coll perl ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/runtests -mpiexec=${CMAKE_BINARY_DIR}/smpi_script/bin/smpirun -srcdir=${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/coll -tests=testlist -execarg=--cfg=contexts/factory:thread -execarg=--cfg=smpi/coll_selector:mpich -execarg=--cfg=smpi/privatize_global_variables:yes)
-      SET_TESTS_PROPERTIES(test-smpi-mpich3-coll-thread test-smpi-mpich3-coll-ompi-thread test-smpi-mpich3-coll-mpich-thread PROPERTIES PASS_REGULAR_EXPRESSION "tests passed!")
+      ADD_TEST(test-smpi-mpich3-coll-mvapich2-thread ${CMAKE_COMMAND} -E chdir ${CMAKE_BINARY_DIR}/teshsuite/smpi/mpich3-test/coll perl ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/runtests -mpiexec=${CMAKE_BINARY_DIR}/smpi_script/bin/smpirun -srcdir=${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/coll -tests=testlist -execarg=--cfg=contexts/factory:thread -execarg=--cfg=smpi/coll_selector:mvapich2 -execarg=--cfg=smpi/privatize_global_variables:yes)
+      SET_TESTS_PROPERTIES(test-smpi-mpich3-coll-thread test-smpi-mpich3-coll-ompi-thread test-smpi-mpich3-coll-mpich-thread test-smpi-mpich3-coll-mvapich2-thread   PROPERTIES PASS_REGULAR_EXPRESSION "tests passed!")
       IF(CONTEXT_UCONTEXT)
         ADD_TEST(test-smpi-mpich3-coll-ompi-ucontext ${CMAKE_COMMAND} -E chdir ${CMAKE_BINARY_DIR}/teshsuite/smpi/mpich3-test/coll perl ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/runtests -mpiexec=${CMAKE_BINARY_DIR}/smpi_script/bin/smpirun -srcdir=${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/coll -tests=testlist -execarg=--cfg=contexts/factory:ucontext -execarg=--cfg=smpi/coll_selector:ompi -execarg=--cfg=smpi/send_is_detached_thres:0 -execarg=--cfg=smpi/privatize_global_variables:yes -execarg=--cfg=smpi/bcast:binomial_tree)
         SET_TESTS_PROPERTIES(test-smpi-mpich3-coll-ompi-ucontext PROPERTIES PASS_REGULAR_EXPRESSION "tests passed!")
index 1fbd98b..30d34cd 100644 (file)
@@ -212,6 +212,7 @@ COLL_APPLY(action, COLL_BCAST_SIG, ompi) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, ompi_split_bintree) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, ompi_pipeline) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, mpich) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, mvapich2)   COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, automatic)
 
 COLL_BCASTS(COLL_PROTO, COLL_NOsep)
@@ -238,6 +239,7 @@ COLL_APPLY(action, COLL_REDUCE_SIG, ompi_in_order_binary) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SIG, ompi_binary) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SIG, ompi_binomial) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SIG, mpich) COLL_sep \
+COLL_APPLY(action, COLL_REDUCE_SIG, mvapich2) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SIG, automatic)
 
 COLL_REDUCES(COLL_PROTO, COLL_NOsep)
@@ -257,6 +259,7 @@ COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich_pair) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich_rdb) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mpich_noncomm) COLL_sep \
+COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, mvapich2) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SCATTER_SIG, automatic)
 
 
@@ -277,6 +280,7 @@ COLL_APPLY(action, COLL_SCATTER_SIG, ompi) COLL_sep \
 COLL_APPLY(action, COLL_SCATTER_SIG, ompi_basic_linear) COLL_sep \
 COLL_APPLY(action, COLL_SCATTER_SIG, ompi_binomial)  COLL_sep \
 COLL_APPLY(action, COLL_SCATTER_SIG, mpich)   COLL_sep \
+COLL_APPLY(action, COLL_SCATTER_SIG, mvapich2)   COLL_sep \
 COLL_APPLY(action, COLL_SCATTER_SIG, automatic)
 
 COLL_SCATTERS(COLL_PROTO, COLL_NOsep)
index 1442a5c..49906af 100644 (file)
@@ -1191,4 +1191,1101 @@ int smpi_coll_tuned_barrier_mvapich2(MPI_Comm  comm)
 }
 
 
+/*
+static void init_mv2_bcast_tables_stampede(){
+ //Stampede,
+        mv2_size_bcast_tuning_table=8;
+        mv2_bcast_thresholds_table = malloc(mv2_size_bcast_tuning_table *
+                                                 sizeof (mv2_bcast_tuning_table));
+
+       mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
+         {
+            16,
+            8192, 4, 4,
+            {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+            11,
+            {
+              {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
+              {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
+              {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
+              {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
+              {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
+              {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
+              {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
+              {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
+              {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
+              {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
+              {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
+            },
+            11,
+            {
+              {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
+              {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
+              {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
+              {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
+              {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
+              {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
+              {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
+              {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
+              {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
+              {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
+              {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
+            }
+         },
+         {
+            32,
+            8192, 4, 4,
+            {1, 1, 1, 1, 1, 1, 1, 1},
+            8,
+            {
+              {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
+              {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
+              {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
+              {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
+              {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
+              {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
+              {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
+              {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
+            },
+            8,
+            {
+              {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
+              {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
+              {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
+              {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
+              {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
+              {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
+              {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
+              {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
+            }
+         },
+         {
+            64,
+            8192, 4, 4,
+            {1, 1, 1, 1, 1, 1, 1, 1, 1},
+            9,
+            {
+              {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
+              {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
+              {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
+              {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
+              {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
+              {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
+              {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
+              {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
+              {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
+            },
+            9,
+            {
+              {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
+              {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
+              {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
+              {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
+              {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
+              {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
+              {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
+              {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
+              {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
+            }
+         },
+         {
+            128,
+            8192, 4, 4,
+            {1, 1, 1, 0},
+            4,
+            {
+              {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
+              {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
+              {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
+              {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
+            },
+            4,
+            {
+              {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
+              {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
+              {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
+              {524288, -1, NULL, -1}
+            }
+         },
+         {
+            256,
+            8192, 4, 4,
+            {1, 1, 1, 1, 1},
+            5,
+            {
+              {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
+              {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
+              {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
+              {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
+              {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
+            },
+            5,
+            {
+              {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
+              {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
+              {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
+              {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
+              {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
+            }
+         },
+         {
+            512,
+            8192, 4, 4,
+            {1, 1, 1, 1, 1},
+            5,
+            {
+              {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
+              {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
+              {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
+              {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
+              {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
+            },
+            5,
+            {
+              {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
+              {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
+              {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
+              {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
+              {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
+            }
+         },
+         {
+            1024,
+            8192, 4, 4,
+            {1, 1, 1, 1, 1},
+            5,
+            {
+              {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
+              {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
+              {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
+              {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
+              {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
+            },
+            5,
+            {
+              {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
+              {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
+              {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
+              {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
+              {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
+            }
+         },
+         {
+            2048,
+            8192, 4, 4,
+            {1, 1, 1, 1, 1, 1, 1},
+            7,
+            {
+              {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
+              {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
+              {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
+              {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
+              {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
+              {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
+              {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
+            },
+            7,
+            {
+              {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
+              {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
+              {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
+              {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
+              {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
+              {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
+              {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
+            }
+         }
+       };
+
+        memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
+                    mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
+}*/
+
+
+/*
+ * MVAPICH2-selector entry point for broadcast.
+ *
+ * TODO: the MVAPICH2 broadcast really needs separate intra/inter phases.
+ * Until those are available, this simply delegates to the MPICH selector
+ * and hands back whatever it returns.
+ */
+int smpi_coll_tuned_bcast_mvapich2(void *buffer,
+                              int count,
+                              MPI_Datatype datatype,
+                              int root, MPI_Comm comm)
+{
+  int mpi_errno = smpi_coll_tuned_bcast_mpich(buffer, count, datatype, root, comm);
+
+  return mpi_errno;
+}
+
+/*
+ * Build the reduce tuning table (Stampede values).
+ *
+ * Sets mv2_size_reduce_tuning_table to 8, allocates
+ * mv2_reduce_thresholds_table with malloc, and fills it by memcpy from the
+ * local initializer array below. smpi_coll_tuned_reduce_mvapich2 calls this
+ * lazily when mv2_reduce_thresholds_table is still NULL.
+ *
+ * Entries are initialised positionally; the struct itself is declared
+ * elsewhere. Judging from how smpi_coll_tuned_reduce_mvapich2 reads the
+ * table, each entry presumably holds, in order: numproc, the two k-nomial
+ * degrees (inter/intra -- confirm the order against the struct declaration),
+ * the is_two_level_reduce flags, size_inter_table, inter_leader[],
+ * size_intra_table, intra_node[]. Each sub-entry looks like
+ * {min, max, function}; the lookup loops stop on max == -1, so -1 acts as
+ * an open upper bound.
+ *
+ * NOTE(review): the malloc result is not checked before the memcpy.
+ */
+static void init_mv2_reduce_tables_stampede(){
+ /* Stampede tuning values */
+        mv2_size_reduce_tuning_table = 8;
+        mv2_reduce_thresholds_table = malloc(mv2_size_reduce_tuning_table *
+                                                  sizeof (mv2_reduce_tuning_table));
+        mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
+         {
+           16,
+           4,
+           4,
+           {1, 0, 0},
+           3,
+           {
+             {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
+             {262144, 1048576, &MPIR_Reduce_binomial_MV2},
+             {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
+           },
+           2,
+           {
+             {0, 65536, &MPIR_Reduce_shmem_MV2},
+             {65536,-1,  &MPIR_Reduce_binomial_MV2},
+           },
+         },
+         {
+           32,
+           4,
+           4,
+           {1, 1, 1, 1, 0, 0, 0},
+           7,
+           {
+             {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
+             {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
+             {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
+             {32768, 65536, &MPIR_Reduce_binomial_MV2},
+             {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
+             {262144, 1048576, &MPIR_Reduce_binomial_MV2},
+             {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
+           },
+           6,
+           {
+             {0, 8192, &MPIR_Reduce_shmem_MV2},
+             {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
+             {16384, 32768, &MPIR_Reduce_shmem_MV2},
+             {32768, 65536, &MPIR_Reduce_shmem_MV2},
+             {65536, 262144, &MPIR_Reduce_shmem_MV2},
+             {262144,-1,  &MPIR_Reduce_binomial_MV2},
+           },
+         },
+         {
+           64,
+           4,
+           4,
+           {1, 1, 1, 1, 0},
+           5,
+           {
+             {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
+             {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
+             {16384, 65536, &MPIR_Reduce_binomial_MV2},
+             {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
+             {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
+           },
+           5,
+           {
+             {0, 8192, &MPIR_Reduce_shmem_MV2},
+             {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
+             {16384, 65536, &MPIR_Reduce_shmem_MV2},
+             {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
+             {262144, -1, &MPIR_Reduce_binomial_MV2},
+           },
+         },
+         {
+           128,
+           4,
+           4,
+           {1, 0, 1, 0, 1, 0},
+           6,
+           {
+             {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
+             {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
+             {16384, 65536, &MPIR_Reduce_binomial_MV2},
+             {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
+             {262144, 1048576, &MPIR_Reduce_binomial_MV2},
+             {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
+           },
+           5,
+           {
+             {0, 8192, &MPIR_Reduce_shmem_MV2},
+             {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
+             {16384, 65536, &MPIR_Reduce_shmem_MV2},
+             {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
+             {262144, -1, &MPIR_Reduce_binomial_MV2},
+           },
+         },
+         {
+           256,
+           4,
+           4,
+           {1, 1, 1, 0, 1, 1, 0},
+           7,
+           {
+             {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
+             {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
+             {16384, 32768, &MPIR_Reduce_binomial_MV2},
+             {32768, 65536, &MPIR_Reduce_binomial_MV2},
+             {65536, 262144, &MPIR_Reduce_binomial_MV2},
+             {262144, 1048576, &MPIR_Reduce_binomial_MV2},
+             {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
+           },
+           6,
+           {
+             {0, 8192, &MPIR_Reduce_shmem_MV2},
+             {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
+             {16384, 32768, &MPIR_Reduce_shmem_MV2},
+             {32768, 65536, &MPIR_Reduce_shmem_MV2},
+             {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
+             {262144, -1, &MPIR_Reduce_binomial_MV2},
+           },
+         },
+         {
+           512,
+           4,
+           4,
+           {1, 0, 1, 1, 1, 0},
+           6,
+           {
+             {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
+             {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
+             {16384, 65536, &MPIR_Reduce_binomial_MV2},
+             {65536, 262144, &MPIR_Reduce_binomial_MV2},
+             {262144, 1048576, &MPIR_Reduce_binomial_MV2},
+             {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
+           },
+           5,
+           {
+             {0, 8192, &MPIR_Reduce_shmem_MV2},
+             {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
+             {16384, 65536, &MPIR_Reduce_shmem_MV2},
+             {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
+             {262144, -1, &MPIR_Reduce_binomial_MV2},
+           },
+         },
+         {
+           1024,
+           4,
+           4,
+           {1, 0, 1, 1, 1},
+           5,
+           {
+             {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
+             {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
+             {16384, 65536, &MPIR_Reduce_binomial_MV2},
+             {65536, 262144, &MPIR_Reduce_binomial_MV2},
+             {262144, -1, &MPIR_Reduce_binomial_MV2},
+           },
+           5,
+           {
+             {0, 8192, &MPIR_Reduce_shmem_MV2},
+             {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
+             {16384, 65536, &MPIR_Reduce_shmem_MV2},
+             {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
+             {262144, -1, &MPIR_Reduce_binomial_MV2},
+           },
+         },
+         {
+           2048,
+           4,
+           4,
+           {1, 0, 1, 1, 1,1},
+           6,
+           {
+             {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
+             {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
+             {4096, 16384, &MPIR_Reduce_binomial_MV2},
+             {16384, 65536, &MPIR_Reduce_binomial_MV2},
+             {65536, 131072, &MPIR_Reduce_binomial_MV2},
+             {131072, -1, &MPIR_Reduce_binomial_MV2},
+           },
+           6,
+           {
+             {0, 2048, &MPIR_Reduce_shmem_MV2},
+             {2048, 4096, &MPIR_Reduce_shmem_MV2},
+             {4096, 16384, &MPIR_Reduce_shmem_MV2},
+             {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
+             {65536, 131072, &MPIR_Reduce_binomial_MV2},
+             {131072, -1, &MPIR_Reduce_shmem_MV2},
+           },
+         },
+
+        }; 
+        /* Publish the table: copy all 8 entries into the heap block. */
+        memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
+                   mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
+}
+
+
+
+/*
+ * MVAPICH2-selector entry point for reduce.
+ *
+ * Lazily builds the Stampede tuning table, picks the table row matching the
+ * communicator size and the inter-leader / intra-node sub-rows matching the
+ * message size in bytes, records the selected function pointers in
+ * MV2_Reduce_function / MV2_Reduce_intra_function, then runs either the
+ * selected inter-leader algorithm or the binomial fallback.
+ *
+ * Returns MPI_SUCCESS right away when count is 0, otherwise the return value
+ * of the algorithm that was run.
+ */
+int smpi_coll_tuned_reduce_mvapich2( void *sendbuf,
+                    void *recvbuf,
+                    int count,
+                    MPI_Datatype datatype,
+                    MPI_Op op, int root, MPI_Comm comm)
+{
+    if (mv2_reduce_thresholds_table == NULL) {
+        init_mv2_reduce_tables_stampede();
+    }
+
+    int row = 0;        /* row selected by communicator size */
+    int inter_idx = 0;  /* inter-leader sub-row selected by message size */
+    int intra_idx = 0;  /* intra-node sub-row selected by message size */
+    int pow2_floor = 1;
+    int commutes;
+    int use_binomial_fallback;
+
+    int nprocs = smpi_comm_size(comm);
+    int elem_size = smpi_datatype_size(datatype);
+    int msg_bytes = count * elem_size;
+
+    if (count == 0) {
+        return MPI_SUCCESS;
+    }
+
+    commutes = smpi_op_is_commute(op);
+
+    /* Largest power of two that is <= nprocs. */
+    while (pow2_floor <= nprocs) {
+        pow2_floor <<= 1;
+    }
+    pow2_floor >>= 1;
+
+    /* Row lookup: first row whose numproc is >= nprocs, else the last row. */
+    for (row = 0; row < mv2_size_reduce_tuning_table - 1; row++) {
+        if (nprocs <= mv2_reduce_thresholds_table[row].numproc) {
+            break;
+        }
+    }
+
+    /* Inter-leader lookup: stop on the first sub-row that covers msg_bytes
+     * or that has an open upper bound (max == -1). */
+    for (inter_idx = 0;
+         inter_idx < mv2_reduce_thresholds_table[row].size_inter_table - 1;
+         inter_idx++) {
+        if (msg_bytes <= mv2_reduce_thresholds_table[row].inter_leader[inter_idx].max
+            || mv2_reduce_thresholds_table[row].inter_leader[inter_idx].max == -1) {
+            break;
+        }
+    }
+
+    /* Intra-node lookup: same rule on the intra-node sub-table. */
+    for (intra_idx = 0;
+         intra_idx < mv2_reduce_thresholds_table[row].size_intra_table - 1;
+         intra_idx++) {
+        if (msg_bytes <= mv2_reduce_thresholds_table[row].intra_node[intra_idx].max
+            || mv2_reduce_thresholds_table[row].intra_node[intra_idx].max == -1) {
+            break;
+        }
+    }
+
+    /* Record both selections in the file-level function pointers. */
+    MV2_Reduce_intra_function =
+        mv2_reduce_thresholds_table[row].intra_node[intra_idx].MV2_pt_Reduce_function;
+    MV2_Reduce_function =
+        mv2_reduce_thresholds_table[row].inter_leader[inter_idx].MV2_pt_Reduce_function;
+
+    /* The k-nomial factors are only taken from the table while still negative. */
+    if (mv2_reduce_intra_knomial_factor < 0) {
+        mv2_reduce_intra_knomial_factor = mv2_reduce_thresholds_table[row].intra_k_degree;
+    }
+    if (mv2_reduce_inter_knomial_factor < 0) {
+        mv2_reduce_inter_knomial_factor = mv2_reduce_thresholds_table[row].inter_k_degree;
+    }
+
+    /* Decide whether the selected algorithm may run or binomial must be used. */
+    if (mv2_reduce_thresholds_table[row].is_two_level_reduce[inter_idx] == 1) {
+        /* The two-level helper (MPIR_Reduce_two_level_helper_MV2, guarded by
+         * comm->ch.shmem_coll_ok and commutativity) is disabled here, so a
+         * two-level selection always runs binomial. */
+        use_binomial_fallback = 1;
+    } else if (MV2_Reduce_function == &MPIR_Reduce_inter_knomial_wrapper_MV2) {
+        /* The k-nomial wrapper is only used for commutative operations. */
+        use_binomial_fallback = (commutes != 1);
+    } else if (MV2_Reduce_function == &MPIR_Reduce_redscat_gather_MV2) {
+        /* Needs count >= pow2_floor (the HANDLE_GET_KIND(op) builtin check
+         * is disabled here). */
+        use_binomial_fallback = (count < pow2_floor);
+    } else {
+        use_binomial_fallback = 0;
+    }
+
+    if (use_binomial_fallback) {
+        return MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
+                                        datatype, op, root, comm);
+    }
+    return MV2_Reduce_function(sendbuf, recvbuf, count,
+                               datatype, op, root, comm);
+}
+
+
+
+/*
+ * Build the reduce_scatter tuning table (Stampede values).
+ *
+ * Sets mv2_size_red_scat_tuning_table to 6, allocates
+ * mv2_red_scat_thresholds_table with malloc, and fills it by memcpy from the
+ * local initializer array below. smpi_coll_tuned_reduce_scatter_mvapich2
+ * calls this lazily when mv2_red_scat_thresholds_table is still NULL.
+ *
+ * Entries are initialised positionally; the struct itself is declared
+ * elsewhere. Judging from how smpi_coll_tuned_reduce_scatter_mvapich2 reads
+ * the table, each entry presumably holds numproc, size_inter_table and
+ * inter_leader[], with sub-entries of the form {min, max, function}
+ * (confirm against the struct declaration). The lookup loop stops on
+ * max == -1, so -1 acts as an open upper bound.
+ *
+ * NOTE(review): the malloc result is not checked before the memcpy.
+ */
+static void init_mv2_reduce_scatter_tables_stampede(){
+        mv2_size_red_scat_tuning_table = 6;
+        mv2_red_scat_thresholds_table = malloc(mv2_size_red_scat_tuning_table *
+                                                  sizeof (mv2_red_scat_tuning_table));
+        mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
+            {
+                16,
+                3,
+                {
+                    {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
+                    {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
+                    {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
+                },
+            },
+            {
+                32,
+                3,
+                {
+                    {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
+                    {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
+                    {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
+                },
+            },
+            {
+                64,
+                3,
+                {
+                    {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
+                    {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
+                    {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
+                },
+            },
+            {
+                128,
+                2,
+                {
+                    {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
+                    {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
+                },
+            },
+            {
+                256,
+                2,
+                {
+                    {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
+                    {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
+                },
+            },
+            {
+                512,
+                2,
+                {
+                    {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
+                    {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
+                },
+            },
+
+        }; 
+        /* Publish the table: copy all 6 entries into the heap block. */
+        memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
+                  mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
+}
+
+/*
+ * MVAPICH2-selector entry point for reduce_scatter.
+ *
+ * Lazily builds the Stampede tuning table. For a commutative operation it
+ * picks the table row matching the communicator size and the sub-row
+ * matching the total message size in bytes (sum of recvcnts times the
+ * datatype size), stores the selected function in MV2_Red_scat_function and
+ * runs it. For a non-commutative operation it always runs
+ * MPIR_Reduce_scatter_non_comm_MV2.
+ *
+ * recvcnts must hold one count per rank of comm (comm_size entries are read).
+ * Returns the return value of the algorithm that was run.
+ *
+ * No heap memory is allocated here: only the total element count is needed
+ * to choose an algorithm, so no per-rank displacement array is built.
+ */
+int smpi_coll_tuned_reduce_scatter_mvapich2(void *sendbuf, void *recvbuf, int *recvcnts,
+                                                       MPI_Datatype datatype, MPI_Op op,
+                                                       MPI_Comm comm)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int i = 0, comm_size = smpi_comm_size(comm), total_count = 0, type_size =
+        0, nbytes = 0;
+    int range = 0;
+    int range_threshold = 0;
+    int is_commutative = 0;
+
+    if(mv2_red_scat_thresholds_table==NULL)
+      init_mv2_reduce_scatter_tables_stampede();
+
+    is_commutative=smpi_op_is_commute(op);
+
+    /* Total number of elements scattered over all ranks. */
+    for (i = 0; i < comm_size; i++) {
+        total_count += recvcnts[i];
+    }
+
+    type_size=smpi_datatype_size(datatype);
+    nbytes = total_count * type_size;
+
+    if (is_commutative) {
+
+        /* Search for the corresponding system size inside the tuning table */
+        while ((range < (mv2_size_red_scat_tuning_table - 1)) &&
+               (comm_size > mv2_red_scat_thresholds_table[range].numproc)) {
+            range++;
+        }
+        /* Search for corresponding inter-leader function; a max of -1 is an
+         * open upper bound and ends the search. */
+        while ((range_threshold < (mv2_red_scat_thresholds_table[range].size_inter_table - 1))
+               && (nbytes >
+                   mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].max)
+               && (mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].max !=
+                   -1)) {
+            range_threshold++;
+        }
+
+        /* Set inter-leader pt */
+        MV2_Red_scat_function =
+                              mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].
+                              MV2_pt_Red_scat_function;
+
+        mpi_errno = MV2_Red_scat_function(sendbuf, recvbuf,
+                                          recvcnts, datatype,
+                                          op, comm);
+    } else {
+        mpi_errno = MPIR_Reduce_scatter_non_comm_MV2(sendbuf, recvbuf,
+                                                     recvcnts, datatype,
+                                                     op, comm);
+    }
+
+    return mpi_errno;
+
+}
+
+
+
+static void init_mv2_scatter_tables_stampede(){ /* Builds the scatter tuning tables used by the scatter selector:
+     * three processes-per-node configurations (1, 2 and 16 ppn), each holding one
+     * entry per communicator-size step. Every entry is laid out as
+     * {numproc, size_inter_table, inter_leader[], size_intra_table, intra_node[]}
+     * and every inter/intra element as {min, max, function}. Results are stored in
+     * the file-level mv2_scatter_* globals; nothing is returned. The malloc
+     * results are used without being checked. */
+{ /* redundant nested block; its closing brace is the one right before the function's final brace */
+    int agg_table_sum = 0;
+    int i;
+    mv2_scatter_tuning_table **table_ptrs = NULL;
+     mv2_scatter_num_ppn_conf = 3; /* number of ppn configurations filled in below */
+        mv2_scatter_thresholds_table /* one table pointer per ppn configuration */
+         = malloc(sizeof(mv2_scatter_tuning_table *)
+                       * mv2_scatter_num_ppn_conf);
+        table_ptrs = malloc(sizeof(mv2_scatter_tuning_table *) /* temporary pointers to the local tables below; freed at the end */
+                                 * mv2_scatter_num_ppn_conf);
+        mv2_size_scatter_tuning_table = malloc(sizeof(int) * /* number of entries of each configuration's table */
+                                                   mv2_scatter_num_ppn_conf);
+        mv2_scatter_table_ppn_conf 
+         = malloc(mv2_scatter_num_ppn_conf * sizeof(int));
+        mv2_scatter_table_ppn_conf[0] = 1; /* configuration 0: 1 process per node, 6 entries */
+        mv2_size_scatter_tuning_table[0] = 6;
+        mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
+         {2,
+          1, 
+          {
+            {0, -1, &MPIR_Scatter_MV2_Binomial},
+          },
+          1,
+          {
+            {0, -1, &MPIR_Scatter_MV2_Binomial},
+          },
+         },
+
+         {4,
+          1, 
+          {
+            {0, -1, &MPIR_Scatter_MV2_Direct},
+          },
+          1,
+          {
+            {0, -1, &MPIR_Scatter_MV2_Direct},
+          },
+         },
+  
+         {8,
+          1, 
+          {
+            {0, -1, &MPIR_Scatter_MV2_Direct},
+          },
+          1,
+          {
+            {0, -1, &MPIR_Scatter_MV2_Direct},
+          },
+         },
+  
+         {16,
+          1, 
+          {
+            {0, -1, &MPIR_Scatter_MV2_Direct},
+          },
+          1,
+          {
+            {0, -1, &MPIR_Scatter_MV2_Direct},
+          },
+         },
+  
+         {32,
+          1, 
+          {
+            {0, -1, &MPIR_Scatter_MV2_Direct},
+          },
+          1,
+          {
+            {0, -1, &MPIR_Scatter_MV2_Direct},
+          },
+         },
+  
+         {64,
+          2, 
+          {
+            {0, 32, &MPIR_Scatter_MV2_Binomial},
+            {32, -1, &MPIR_Scatter_MV2_Direct},
+          },
+          1,
+          {
+            {0, -1, &MPIR_Scatter_MV2_Binomial},
+          },
+         },
+        };
+        table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
+        mv2_scatter_table_ppn_conf[1] = 2; /* configuration 1: 2 processes per node, 6 entries */
+        mv2_size_scatter_tuning_table[1] = 6;
+        mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
+         {4,
+          2, 
+          {
+            {0, 4096, &MPIR_Scatter_MV2_Binomial},
+            {4096, -1, &MPIR_Scatter_MV2_Direct},
+          },
+          1,
+          {
+            {0, -1, &MPIR_Scatter_MV2_Direct},
+          },
+         },
+  
+         {8,
+          2, 
+          {
+            {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
+            {512, -1, &MPIR_Scatter_MV2_Direct},
+          },
+          1,
+          {
+            {0, -1, &MPIR_Scatter_MV2_Binomial},
+          },
+         },
+  
+         {16,
+          2, 
+          {
+            {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
+            {2048, -1, &MPIR_Scatter_MV2_Direct},
+          },
+          1,
+          {
+            {0, -1, &MPIR_Scatter_MV2_Binomial},
+          },
+         },
+  
+         {32,
+          2, 
+          {
+            {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
+            {2048, -1, &MPIR_Scatter_MV2_Direct},
+          },
+          1,
+          {
+            {0, -1, &MPIR_Scatter_MV2_Binomial},
+          },
+         },
+  
+         {64,
+          2, 
+          {
+            {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
+            {8192, -1, &MPIR_Scatter_MV2_Direct},
+          },
+          1,
+          {
+            {0, -1, &MPIR_Scatter_MV2_Binomial},
+          },
+         },
+  
+         {128,
+          4, 
+          {
+            {0, 16, &MPIR_Scatter_MV2_Binomial},
+            {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
+            {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
+            {16384, -1, &MPIR_Scatter_MV2_Direct},
+          },
+          1, /* NOTE(review): size_intra_table is 1 although two intra_node elements follow - confirm which is intended */
+          {
+            {0, 128, &MPIR_Scatter_MV2_Direct},
+            {128, -1, &MPIR_Scatter_MV2_Binomial},
+          },
+         },
+        };
+        table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
+        mv2_scatter_table_ppn_conf[2] = 16; /* configuration 2: 16 processes per node, 8 entries */
+        mv2_size_scatter_tuning_table[2] = 8;
+        mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
+         {
+           16,
+           2,
+           { 
+             {0, 256, &MPIR_Scatter_MV2_Binomial}, 
+             {256, -1, &MPIR_Scatter_MV2_Direct},
+           },
+           1, 
+           { 
+             { 0, -1, &MPIR_Scatter_MV2_Direct},
+           },
+         },
+
+         {
+           32,
+           2,
+           {
+             {0, 512, &MPIR_Scatter_MV2_Binomial}, 
+             {512, -1, &MPIR_Scatter_MV2_Direct},
+           },
+           1, 
+           { 
+             { 0, -1, &MPIR_Scatter_MV2_Direct},
+           },
+         },
+
+         {
+           64,
+           2,
+           {
+             {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
+             {1024, -1, &MPIR_Scatter_MV2_Direct},
+           },
+           1,
+           {
+             { 0, -1, &MPIR_Scatter_MV2_Direct},
+           },
+         },
+
+         {
+           128,
+           4,
+           {
+             {0, 16, &MPIR_Scatter_mcst_wrap_MV2}, /* multicast placeholder: the selector replaces it with the next element when multicast is not used */
+             {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
+             {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
+             {2048, -1, &MPIR_Scatter_MV2_Direct},
+           },
+           1,
+           {
+             { 0, -1, &MPIR_Scatter_MV2_Direct},
+           },
+         },
+
+         {
+           256,
+           4,
+           {
+             {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
+             {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
+             {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
+             {2048, -1,  &MPIR_Scatter_MV2_Direct},
+           },
+           1,
+           {
+             { 0, -1, &MPIR_Scatter_MV2_Direct},
+           },
+         },
+
+         {
+           512,
+           4,
+           {
+             {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
+             {16, 16, &MPIR_Scatter_MV2_two_level_Direct}, /* NOTE(review): min is 16 here but 0 in the sibling entries; the scatter selector only reads max, so this looks harmless - confirm */
+             {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
+             {4096, -1, &MPIR_Scatter_MV2_Direct},
+           },
+           1,
+           {
+             { 0, -1, &MPIR_Scatter_MV2_Binomial},
+           }, 
+         },  
+         {
+           1024,
+           5,
+           {
+             {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
+             {0, 16,  &MPIR_Scatter_MV2_Binomial},
+             {16, 32, &MPIR_Scatter_MV2_Binomial},
+             {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
+             {4096, -1, &MPIR_Scatter_MV2_Direct},
+           },
+           1,
+           {
+             { 0, -1, &MPIR_Scatter_MV2_Binomial},
+           },  
+         },  
+         {
+           2048,
+           7,
+           {
+             {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
+             {0, 16,  &MPIR_Scatter_MV2_two_level_Binomial},
+             {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
+             {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
+             {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
+             {16384, 65536, &MPIR_Scatter_MV2_Direct},
+             {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
+           },
+           6,
+           {
+             {0, 16, &MPIR_Scatter_MV2_Binomial},
+             {16, 128, &MPIR_Scatter_MV2_Binomial},
+             {128, 1024, &MPIR_Scatter_MV2_Binomial},
+             {1024, 16384, &MPIR_Scatter_MV2_Direct},
+             {16384, 65536, &MPIR_Scatter_MV2_Direct},
+             {65536, -1, &MPIR_Scatter_MV2_Direct},
+           },
+         }, 
+        };
+        table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn;
+        agg_table_sum = 0; /* total number of entries over all configurations */
+        for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
+         agg_table_sum += mv2_size_scatter_tuning_table[i];
+        }
+        mv2_scatter_thresholds_table[0] = /* a single allocation holds the entries of every configuration back to back */
+         malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table));
+        memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
+                   (sizeof(mv2_scatter_tuning_table)
+                     * mv2_size_scatter_tuning_table[0]));
+        for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
+         mv2_scatter_thresholds_table[i] = /* configuration i starts right after the entries of configuration i-1 in that allocation */
+            mv2_scatter_thresholds_table[i - 1]
+            + mv2_size_scatter_tuning_table[i - 1];
+         memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
+                      (sizeof(mv2_scatter_tuning_table)
+                       * mv2_size_scatter_tuning_table[i]));
+        }
+        free(table_ptrs); /* only the temporary pointer array is released; the copied tables stay allocated */
+   }
+}
+
+int smpi_coll_tuned_scatter_mvapich2(void *sendbuf,
+                           int sendcnt,
+                           MPI_Datatype sendtype,
+                           void *recvbuf,
+                           int recvcnt,
+                           MPI_Datatype recvtype,
+                           int root, MPI_Comm comm_ptr)
+{
+    int range = 0, range_threshold = 0, range_threshold_intra = 0;
+    int mpi_errno = MPI_SUCCESS;
+ //   int mpi_errno_ret = MPI_SUCCESS;
+    int rank, nbytes, comm_size;
+    int recvtype_size, sendtype_size;
+    int partial_sub_ok = 0;
+    int conf_index = 0;
+  //  int local_size = -1;
+  //  int i;
+ //   MPI_Comm shmem_comm;
+//    MPID_Comm *shmem_commptr=NULL;
+    if(mv2_scatter_thresholds_table==NULL)
+      init_mv2_scatter_tables_stampede();
+
+    comm_size = smpi_comm_size(comm_ptr);
+
+    rank = smpi_comm_rank(comm_ptr);
+
+    if (rank == root) {
+        sendtype_size=smpi_datatype_size(sendtype);
+        nbytes = sendcnt * sendtype_size;
+    } else {
+        recvtype_size=smpi_datatype_size(recvtype);
+        nbytes = recvcnt * recvtype_size;
+    }
+/*
+    // check if safe to use partial subscription mode 
+    if (comm_ptr->ch.shmem_coll_ok == 1 && comm_ptr->ch.is_uniform) {
+    
+        shmem_comm = comm_ptr->ch.shmem_comm;
+        MPID_Comm_get_ptr(shmem_comm, shmem_commptr);
+        local_size = shmem_commptr->local_size;
+        i = 0;
+        if (mv2_scatter_table_ppn_conf[0] == -1) {
+            // Indicating user defined tuning 
+            conf_index = 0;
+            goto conf_check_end;
+        }
+        do {
+            if (local_size == mv2_scatter_table_ppn_conf[i]) {
+                conf_index = i;
+                partial_sub_ok = 1;
+                break;
+            }
+            i++;
+        } while(i < mv2_scatter_num_ppn_conf);
+    }
+    */
+    if (partial_sub_ok != 1) {
+        conf_index = 0;
+    }
+
+    /* Search for the corresponding system size inside the tuning table */
+    while ((range < (mv2_size_scatter_tuning_table[conf_index] - 1)) &&
+           (comm_size > mv2_scatter_thresholds_table[conf_index][range].numproc)) {
+        range++;
+    }
+    /* Search for corresponding inter-leader function */
+    while ((range_threshold < (mv2_scatter_thresholds_table[conf_index][range].size_inter_table - 1))
+           && (nbytes >
+           mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max)
+           && (mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max != -1)) {
+           range_threshold++;
+    }
+
+    /* Search for corresponding intra-node function */
+    while ((range_threshold_intra <
+           (mv2_scatter_thresholds_table[conf_index][range].size_intra_table - 1))
+            && (nbytes >
+                mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max)
+            && (mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max !=
+            -1)) {
+            range_threshold_intra++;
+    }
+
+    MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold]
+                            .MV2_pt_Scatter_function;
+
+    if(MV2_Scatter_function == &MPIR_Scatter_mcst_wrap_MV2) { 
+#if defined(_MCST_SUPPORT_)
+        if(comm_ptr->ch.is_mcast_ok == 1 
+           && mv2_use_mcast_scatter == 1 
+           && comm_ptr->ch.shmem_coll_ok == 1) {
+            MV2_Scatter_function = &MPIR_Scatter_mcst_MV2; 
+        } else
+#endif /*#if defined(_MCST_SUPPORT_) */
+        {
+            if(mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1].
+               MV2_pt_Scatter_function != NULL) { 
+                  MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1]
+                                                                          .MV2_pt_Scatter_function;
+            } else { 
+                  /* Fallback! */ 
+                  MV2_Scatter_function = &MPIR_Scatter_MV2_Binomial; 
+            }  
+        } 
+    } 
+    if( (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Direct) || 
+        (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Binomial)) { 
+        /* if( comm_ptr->ch.shmem_coll_ok == 1 && 
+             comm_ptr->ch.is_global_block == 1 ) {
+             MV2_Scatter_intra_function = mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra]
+                                .MV2_pt_Scatter_function;
+
+             mpi_errno =
+                   MV2_Scatter_function(sendbuf, sendcnt, sendtype,
+                                        recvbuf, recvcnt, recvtype, root,
+                                        comm_ptr);
+         } else {*/
+             mpi_errno = MPIR_Scatter_MV2_Binomial(sendbuf, sendcnt, sendtype,
+                                        recvbuf, recvcnt, recvtype, root,
+                                        comm_ptr);
+
+         //}
+    } else { 
+         mpi_errno = MV2_Scatter_function(sendbuf, sendcnt, sendtype,
+                                    recvbuf, recvcnt, recvtype, root,
+                                    comm_ptr);
+    } 
+    return (mpi_errno);
+}
 
index 1c29af6..abfc786 100644 (file)
@@ -268,3 +268,210 @@ static  int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
 #define MPIR_Allreduce_pt2pt_rd_MV2 smpi_coll_tuned_allreduce_rdb
 #define MPIR_Allreduce_pt2pt_rs_MV2 smpi_coll_tuned_allreduce_rab1
 
+
+
+/*
+Bcast tuning is deactivated for now: the selector defaults to the MPICH bcast, because the MVAPICH2 bcast algorithms need SMP support.
+typedef struct {
+    int min;
+    int max;
+    int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
+                                  int root, MPI_Comm comm_ptr);
+    int zcpy_pipelined_knomial_factor;
+} mv2_bcast_tuning_element;
+
+typedef struct {
+    int numproc;
+    int bcast_segment_size;
+    int intra_node_knomial_factor;
+    int inter_node_knomial_factor;
+    int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS];
+    int size_inter_table;
+    mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
+    int size_intra_table;
+    mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
+} mv2_bcast_tuning_table;
+
+extern int mv2_use_pipelined_bcast;
+extern int mv2_pipelined_knomial_factor; 
+extern int mv2_pipelined_zcpy_knomial_factor; 
+extern int zcpy_knomial_factor;
+extern int bcast_segment_size;
+
+extern int mv2_size_bcast_tuning_table;
+extern mv2_bcast_tuning_table *mv2_bcast_thresholds_table;
+extern int mv2_use_old_bcast;
+
+int mv2_size_bcast_tuning_table = 0;
+mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
+
+
+int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
+                           int root, MPI_Comm comm_ptr) = NULL;
+
+int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
+                                      int root, MPI_Comm comm_ptr) = NULL;
+                                      
+                                      
+*/
+
+typedef struct { /* One element of a reduce tuning table: a message-size interval and the algorithm chosen for it. */
+    int min; /* lower bound of the interval (presumably in bytes, as in the scatter selector - confirm) */
+    int max; /* upper bound of the interval; presumably -1 means "no upper bound", as in the other selectors - confirm */
+    int (*MV2_pt_Reduce_function)(void *sendbuf, /* algorithm to run; it takes the MPI_Reduce argument list */
+                                 void *recvbuf,
+                                 int count,
+                                 MPI_Datatype datatype,
+                                 MPI_Op op,
+                                 int root,
+                                 MPI_Comm  comm_ptr);
+} mv2_reduce_tuning_element;
+
+typedef struct { /* One row of the reduce tuning table; rows are presumably keyed by communicator size, as in the scatter tables - confirm against the reduce selector. */
+    int numproc; 
+    int inter_k_degree; /* presumably the k-nomial degree between node leaders - confirm */
+    int intra_k_degree; /* presumably the k-nomial degree inside a node - confirm */
+    int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS]; /* one flag per threshold slot */
+    int size_inter_table; /* presumably the number of used elements of inter_leader - confirm */
+    mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
+    int size_intra_table; /* presumably the number of used elements of intra_node - confirm */
+    mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
+} mv2_reduce_tuning_table;
+
+extern int mv2_size_reduce_tuning_table; /* the three externs are declarations; the first two are defined just below, mv2_use_old_reduce is not defined in this part of the file */
+extern mv2_reduce_tuning_table *mv2_reduce_thresholds_table;
+extern int mv2_use_old_reduce;
+
+int mv2_size_reduce_tuning_table = 0; /* NOTE(review): objects are defined in this header, so it can only be included by one translation unit - confirm */
+mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL; /* NULL until the reduce tables are built (presumably lazily, like the scatter ones - confirm) */
+
+
+int mv2_reduce_intra_knomial_factor = -1; /* both factors start at -1; how -1 is interpreted is decided by the reduce selector, not shown here */
+int mv2_reduce_inter_knomial_factor = -1;
+
+int (*MV2_Reduce_function)( void *sendbuf, /* currently selected reduce algorithm; NULL until a selection is made */
+                           void *recvbuf,
+                           int count,
+                           MPI_Datatype datatype,
+                           MPI_Op op,
+                           int root,
+                           MPI_Comm  comm_ptr)=NULL;
+
+int (*MV2_Reduce_intra_function)( void *sendbuf, /* currently selected intra-node reduce algorithm; NULL until a selection is made */
+                                 void *recvbuf,
+                                 int count,
+                                 MPI_Datatype datatype,
+                                 MPI_Op op,
+                                 int root,
+                                 MPI_Comm  comm_ptr)=NULL;
+                                 
+                                 
+#define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_ompi_binomial /* MVAPICH2 reduce algorithm names mapped onto existing SMPI reduce implementations */
+#define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_ompi_binomial /* same target as the inter-node wrapper: the two names compare equal as function pointers */
+#define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_ompi_binomial /* third alias of the same function */
+#define MPIR_Reduce_redscat_gather_MV2 smpi_coll_tuned_reduce_scatter_gather
+#define MPIR_Reduce_shmem_MV2 smpi_coll_tuned_reduce_ompi_basic_linear
+
+typedef struct { /* One element of a reduce_scatter tuning row: a message-size interval and the algorithm chosen for it. */
+    int min; /* lower bound of the interval; not read by the reduce_scatter selector */
+    int max; /* upper bound in bytes, compared with the total message size; -1 means no upper bound */
+    int (*MV2_pt_Red_scat_function)(void *sendbuf, /* algorithm to run; it takes the MPI_Reduce_scatter argument list */
+                                    void *recvbuf,
+                                    int *recvcnts,
+                                    MPI_Datatype datatype,
+                                    MPI_Op op,
+                                    MPI_Comm comm_ptr);
+} mv2_red_scat_tuning_element;
+
+typedef struct { /* One row of the reduce_scatter tuning table. */
+    int numproc; /* the selector advances to the next row while the communicator size exceeds this value */
+    int size_inter_table; /* number of elements of inter_leader that the selector may visit */
+    mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
+} mv2_red_scat_tuning_table;
+
+/* Reduce_scatter tuning state: row count, table, and the currently selected algorithm. */
+extern int mv2_size_red_scat_tuning_table;
+extern mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table;
+
+/* Number of rows of mv2_red_scat_thresholds_table. */
+int mv2_size_red_scat_tuning_table = 0;
+/* NULL until the selector builds the table on its first call. */
+mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL;
+
+
+/* Algorithm picked by the reduce_scatter selector. It is explicitly
+ * initialised to NULL, like the other MV2_*_function dispatch pointers of
+ * this file, instead of relying on implicit zero-initialisation. */
+int (*MV2_Red_scat_function)(void *sendbuf,
+                             void *recvbuf,
+                             int *recvcnts,
+                             MPI_Datatype datatype,
+                             MPI_Op op,
+                             MPI_Comm comm_ptr)=NULL;
+
+#define MPIR_Reduce_Scatter_Basic_MV2 smpi_coll_tuned_reduce_scatter_mpich_noncomm /* MVAPICH2 reduce_scatter algorithm names mapped onto existing SMPI implementations */
+#define MPIR_Reduce_scatter_non_comm_MV2 smpi_coll_tuned_reduce_scatter_mpich_noncomm /* same target as the Basic alias above */
+#define MPIR_Reduce_scatter_Rec_Halving_MV2 smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving
+#define MPIR_Reduce_scatter_Pair_Wise_MV2 smpi_coll_tuned_reduce_scatter_mpich_pair
+
+
+
+/* Processes-per-node value of each scatter tuning configuration (one int per configuration) */
+extern int *mv2_scatter_table_ppn_conf;
+/* Total number of scatter tuning configurations */
+extern int mv2_scatter_num_ppn_conf;
+
+typedef struct { /* One element of a scatter tuning row: a message-size interval and the algorithm chosen for it. */
+    int min; /* lower bound of the interval; not read by the scatter selector */
+    int max; /* upper bound in bytes, compared with the message size; -1 means no upper bound */
+    int (*MV2_pt_Scatter_function)(void *sendbuf, /* algorithm to run; it takes the MPI_Scatter argument list */
+                                   int sendcnt,
+                                   MPI_Datatype sendtype,
+                                   void *recvbuf,
+                                   int recvcnt,
+                                   MPI_Datatype recvtype,
+                                   int root, MPI_Comm comm);
+} mv2_scatter_tuning_element;
+
+typedef struct { /* One row of a scatter tuning table. */
+    int numproc; /* the selector advances to the next row while the communicator size exceeds this value */
+    int size_inter_table; /* number of elements of inter_leader that the selector may visit */
+    mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
+    int size_intra_table; /* number of elements of intra_node that the selector may visit */
+    mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
+} mv2_scatter_tuning_table;
+
+extern int *mv2_size_scatter_tuning_table; /* declarations of the two objects defined a few lines below */
+extern mv2_scatter_tuning_table **mv2_scatter_thresholds_table;
+
+
+int *mv2_scatter_table_ppn_conf = NULL; /* allocated by the scatter table initialisation */
+int mv2_scatter_num_ppn_conf = 1; /* default value; the scatter table initialisation overwrites it */
+int *mv2_size_scatter_tuning_table = NULL; /* number of rows of each configuration's table */
+mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL; /* one table per configuration; NULL means "not initialised yet" to the scatter selector */
+
+int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, /* algorithm picked by the scatter selector */
+                             void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                             int root, MPI_Comm comm)=NULL;
+
+int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, /* intra-node algorithm slot; only referenced from commented-out code in the scatter selector */
+                             void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                             int root, MPI_Comm comm)=NULL;
+/*
+ * Placeholder for the multicast scatter wrapper. The tuning tables store its
+ * address as a marker and the scatter selector compares against that address;
+ * the function itself does nothing and returns 0.
+ */
+int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf, int sendcnt, MPI_Datatype sendtype,
+                               void *recvbuf, int recvcnt, MPI_Datatype recvtype,
+                               int root, MPI_Comm comm_ptr);
+
+int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf, int sendcnt, MPI_Datatype sendtype,
+                               void *recvbuf, int recvcnt, MPI_Datatype recvtype,
+                               int root, MPI_Comm comm_ptr)
+{
+    /* No argument is used: this is a marker, not an implementation. */
+    const int status = 0;
+
+    return status;
+}
+
+#define MPIR_Scatter_MV2_Binomial smpi_coll_tuned_scatter_ompi_binomial /* MVAPICH2 scatter algorithm names mapped onto existing SMPI scatter implementations */
+#define MPIR_Scatter_MV2_Direct smpi_coll_tuned_scatter_ompi_basic_linear
+#define MPIR_Scatter_MV2_two_level_Binomial smpi_coll_tuned_scatter_ompi_binomial /* same target as MPIR_Scatter_MV2_Binomial: the two names cannot be told apart by pointer comparison */
+#define MPIR_Scatter_MV2_two_level_Direct smpi_coll_tuned_scatter_ompi_basic_linear /* same target as MPIR_Scatter_MV2_Direct: the two names cannot be told apart by pointer comparison */
+