Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
add ompi selector for smpi collectives (use --cfg=smpi/coll_selector="ompi" to activate)
author: degomme <degomme@debian.localdomain>
Fri, 7 Jun 2013 12:42:01 +0000 (14:42 +0200)
committer: degomme <degomme@debian.localdomain>
Fri, 7 Jun 2013 12:42:01 +0000 (14:42 +0200)
buildtools/Cmake/DefinePackages.cmake
src/include/smpi/smpi_interface.h
src/simgrid/sg_config.c
src/smpi/colls/colls.h
src/smpi/colls/smpi_openmpi_selector.c [new file with mode: 0644]
src/smpi/smpi_coll.c

index 001cfd4..ceb0014 100644 (file)
@@ -110,6 +110,7 @@ set(SMPI_SRC
   src/smpi/smpi_mpi_dt.c
   src/smpi/smpi_pmpi.c
   src/smpi/smpi_replay.c
   src/smpi/smpi_mpi_dt.c
   src/smpi/smpi_pmpi.c
   src/smpi/smpi_replay.c
+  src/smpi/colls/smpi_openmpi_selector.c
   src/smpi/colls/colls_global.c
   src/smpi/colls/allgather-2dmesh.c
   src/smpi/colls/allgather-3dmesh.c
   src/smpi/colls/colls_global.c
   src/smpi/colls/allgather-2dmesh.c
   src/smpi/colls/allgather-3dmesh.c
index d172618..9c11e29 100644 (file)
@@ -86,7 +86,5 @@ XBT_PUBLIC_DATA(int (*mpi_coll_reduce_fun)
 XBT_PUBLIC(void) coll_help(const char *category,
                            s_mpi_coll_description_t * table);
 XBT_PUBLIC(int) find_coll_description(s_mpi_coll_description_t * table,
 XBT_PUBLIC(void) coll_help(const char *category,
                            s_mpi_coll_description_t * table);
 XBT_PUBLIC(int) find_coll_description(s_mpi_coll_description_t * table,
-                                      const char *name);
-
-
+                                      char *name);
 #endif                          /* _SMPI_INTERFAC_H */
 #endif                          /* _SMPI_INTERFAC_H */
index 27390bb..f0ea976 100644 (file)
@@ -754,46 +754,43 @@ void sg_config_init(int *argc, char **argv)
                      xbt_cfgelm_double, &default_iprobe_time, 1, 1, NULL,
                      NULL);
     default_value = xbt_strdup("default");
                      xbt_cfgelm_double, &default_iprobe_time, 1, 1, NULL,
                      NULL);
     default_value = xbt_strdup("default");
+    xbt_cfg_register(&_sg_cfg_set, "smpi/coll_selector",
+                    "Which collective selector to use",
+                    xbt_cfgelm_string, &default_value, 1, 1, NULL,
+                    NULL);
     xbt_cfg_register(&_sg_cfg_set, "smpi/allgather",
                     "Which collective to use for allgather",
     xbt_cfg_register(&_sg_cfg_set, "smpi/allgather",
                     "Which collective to use for allgather",
-                    xbt_cfgelm_string, &default_value, 1, 1, &_sg_cfg_cb__coll_allgather,
+                    xbt_cfgelm_string, NULL, 1, 1, &_sg_cfg_cb__coll_allgather,
                     NULL);
 
                     NULL);
 
-    default_value = xbt_strdup("default");
     xbt_cfg_register(&_sg_cfg_set, "smpi/allgatherv",
                     "Which collective to use for allgatherv",
     xbt_cfg_register(&_sg_cfg_set, "smpi/allgatherv",
                     "Which collective to use for allgatherv",
-                    xbt_cfgelm_string, &default_value, 1, 1, &_sg_cfg_cb__coll_allgatherv,
+                    xbt_cfgelm_string, NULL, 1, 1, &_sg_cfg_cb__coll_allgatherv,
                     NULL);
 
                     NULL);
 
-    default_value = xbt_strdup("default");
     xbt_cfg_register(&_sg_cfg_set, "smpi/allreduce",
                     "Which collective to use for allreduce",
     xbt_cfg_register(&_sg_cfg_set, "smpi/allreduce",
                     "Which collective to use for allreduce",
-                    xbt_cfgelm_string, &default_value, 1, 1, &_sg_cfg_cb__coll_allreduce,
+                    xbt_cfgelm_string, NULL, 1, 1, &_sg_cfg_cb__coll_allreduce,
                     NULL);
 
                     NULL);
 
-    default_value = xbt_strdup("ompi");
     xbt_cfg_register(&_sg_cfg_set, "smpi/alltoall",
                     "Which collective to use for alltoall",
     xbt_cfg_register(&_sg_cfg_set, "smpi/alltoall",
                     "Which collective to use for alltoall",
-                    xbt_cfgelm_string, &default_value, 1, 1, &_sg_cfg_cb__coll_alltoall,
+                    xbt_cfgelm_string, NULL, 1, 1, &_sg_cfg_cb__coll_alltoall,
                     NULL);
 
                     NULL);
 
-    default_value = xbt_strdup("default");
     xbt_cfg_register(&_sg_cfg_set, "smpi/alltoallv",
                     "Which collective to use for alltoallv",
     xbt_cfg_register(&_sg_cfg_set, "smpi/alltoallv",
                     "Which collective to use for alltoallv",
-                    xbt_cfgelm_string, &default_value, 1, 1, &_sg_cfg_cb__coll_alltoallv,
+                    xbt_cfgelm_string, NULL, 1, 1, &_sg_cfg_cb__coll_alltoallv,
                     NULL);
 
                     NULL);
 
-    default_value = xbt_strdup("default");
     xbt_cfg_register(&_sg_cfg_set, "smpi/bcast",
                     "Which collective to use for bcast",
     xbt_cfg_register(&_sg_cfg_set, "smpi/bcast",
                     "Which collective to use for bcast",
-                    xbt_cfgelm_string, &default_value, 1, 1, &_sg_cfg_cb__coll_bcast,
+                    xbt_cfgelm_string, NULL, 1, 1, &_sg_cfg_cb__coll_bcast,
                     NULL);
 
                     NULL);
 
-
-    default_value = xbt_strdup("default");
     xbt_cfg_register(&_sg_cfg_set, "smpi/reduce",
                     "Which collective to use for reduce",
     xbt_cfg_register(&_sg_cfg_set, "smpi/reduce",
                     "Which collective to use for reduce",
-                    xbt_cfgelm_string, &default_value, 1, 1, &_sg_cfg_cb__coll_reduce,
+                    xbt_cfgelm_string, NULL, 1, 1, &_sg_cfg_cb__coll_reduce,
                     NULL);
 #endif // HAVE_SMPI
 
                     NULL);
 #endif // HAVE_SMPI
 
index 17c245d..2e124f6 100644 (file)
@@ -44,7 +44,8 @@ COLL_APPLY(action, COLL_ALLGATHER_SIG, rhv) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHER_SIG, ring) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHER_SIG, SMP_NTS) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHER_SIG, smp_simple) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHER_SIG, ring) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHER_SIG, SMP_NTS) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHER_SIG, smp_simple) COLL_sep \
-COLL_APPLY(action, COLL_ALLGATHER_SIG, spreading_simple)
+COLL_APPLY(action, COLL_ALLGATHER_SIG, spreading_simple) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHER_SIG, ompi)
 
 COLL_ALLGATHERS(COLL_PROTO, COLL_NOsep)
 
 
 COLL_ALLGATHERS(COLL_PROTO, COLL_NOsep)
 
@@ -59,7 +60,8 @@ COLL_ALLGATHERS(COLL_PROTO, COLL_NOsep)
 #define COLL_ALLGATHERVS(action, COLL_sep) \
 COLL_APPLY(action, COLL_ALLGATHERV_SIG, GB) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHERV_SIG, pair) COLL_sep \
 #define COLL_ALLGATHERVS(action, COLL_sep) \
 COLL_APPLY(action, COLL_ALLGATHERV_SIG, GB) COLL_sep \
 COLL_APPLY(action, COLL_ALLGATHERV_SIG, pair) COLL_sep \
-COLL_APPLY(action, COLL_ALLGATHERV_SIG, ring)
+COLL_APPLY(action, COLL_ALLGATHERV_SIG, ring) COLL_sep \
+COLL_APPLY(action, COLL_ALLGATHERV_SIG, ompi)
 
 COLL_ALLGATHERVS(COLL_PROTO, COLL_NOsep)
 
 
 COLL_ALLGATHERVS(COLL_PROTO, COLL_NOsep)
 
@@ -85,7 +87,8 @@ COLL_APPLY(action, COLL_ALLREDUCE_SIG, smp_rdb) COLL_sep \
 COLL_APPLY(action, COLL_ALLREDUCE_SIG, smp_rsag) COLL_sep \
 COLL_APPLY(action, COLL_ALLREDUCE_SIG, smp_rsag_lr) COLL_sep \
 COLL_APPLY(action, COLL_ALLREDUCE_SIG, smp_rsag_rab) COLL_sep \
 COLL_APPLY(action, COLL_ALLREDUCE_SIG, smp_rsag) COLL_sep \
 COLL_APPLY(action, COLL_ALLREDUCE_SIG, smp_rsag_lr) COLL_sep \
 COLL_APPLY(action, COLL_ALLREDUCE_SIG, smp_rsag_rab) COLL_sep \
-COLL_APPLY(action, COLL_ALLREDUCE_SIG, redbcast)
+COLL_APPLY(action, COLL_ALLREDUCE_SIG, redbcast) COLL_sep \
+COLL_APPLY(action, COLL_ALLREDUCE_SIG, ompi)
 
 COLL_ALLREDUCES(COLL_PROTO, COLL_NOsep)
 
 
 COLL_ALLREDUCES(COLL_PROTO, COLL_NOsep)
 
@@ -111,7 +114,8 @@ COLL_APPLY(action, COLL_ALLTOALL_SIG, ring) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALL_SIG, ring_light_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALL_SIG, ring_mpi_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALL_SIG, ring_one_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALL_SIG, ring_light_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALL_SIG, ring_mpi_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALL_SIG, ring_one_barrier) COLL_sep \
-COLL_APPLY(action, COLL_ALLTOALL_SIG, simple)
+COLL_APPLY(action, COLL_ALLTOALL_SIG, simple) COLL_sep \
+COLL_APPLY(action, COLL_ALLTOALL_SIG, ompi)
 
 COLL_ALLTOALLS(COLL_PROTO, COLL_NOsep)
 
 
 COLL_ALLTOALLS(COLL_PROTO, COLL_NOsep)
 
@@ -132,7 +136,8 @@ COLL_APPLY(action, COLL_ALLTOALLV_SIG, pair_one_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring_light_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring_mpi_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring_light_barrier) COLL_sep \
 COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring_mpi_barrier) COLL_sep \
-COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring_one_barrier)
+COLL_APPLY(action, COLL_ALLTOALLV_SIG, ring_one_barrier) COLL_sep \
+COLL_APPLY(action, COLL_ALLTOALLV_SIG, ompi)
 
 COLL_ALLTOALLVS(COLL_PROTO, COLL_NOsep)
 
 
 COLL_ALLTOALLVS(COLL_PROTO, COLL_NOsep)
 
@@ -158,7 +163,8 @@ COLL_APPLY(action, COLL_BCAST_SIG, scatter_LR_allgather) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, scatter_rdb_allgather) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, SMP_binary) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, SMP_binomial) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, scatter_rdb_allgather) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, SMP_binary) COLL_sep \
 COLL_APPLY(action, COLL_BCAST_SIG, SMP_binomial) COLL_sep \
-COLL_APPLY(action, COLL_BCAST_SIG, SMP_linear)
+COLL_APPLY(action, COLL_BCAST_SIG, SMP_linear) COLL_sep \
+COLL_APPLY(action, COLL_BCAST_SIG, ompi)
 
 COLL_BCASTS(COLL_PROTO, COLL_NOsep)
 
 
 COLL_BCASTS(COLL_PROTO, COLL_NOsep)
 
@@ -175,7 +181,8 @@ COLL_APPLY(action, COLL_REDUCE_SIG, arrival_pattern_aware) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SIG, binomial) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SIG, flat_tree) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SIG, NTSL) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SIG, binomial) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SIG, flat_tree) COLL_sep \
 COLL_APPLY(action, COLL_REDUCE_SIG, NTSL) COLL_sep \
-COLL_APPLY(action, COLL_REDUCE_SIG, scatter_gather)
+COLL_APPLY(action, COLL_REDUCE_SIG, scatter_gather) COLL_sep \
+COLL_APPLY(action, COLL_REDUCE_SIG, ompi)
 
 COLL_REDUCES(COLL_PROTO, COLL_NOsep)
 
 
 COLL_REDUCES(COLL_PROTO, COLL_NOsep)
 
diff --git a/src/smpi/colls/smpi_openmpi_selector.c b/src/smpi/colls/smpi_openmpi_selector.c
new file mode 100644 (file)
index 0000000..c0e948b
--- /dev/null
@@ -0,0 +1,621 @@
+/* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */
+
+/* Copyright (c) 2009, 2010. The SimGrid Team.
+ * All rights reserved.                                                     */
+
+/* This program is free software; you can redistribute it and/or modify it
+ * under the terms of the license (GNU LGPL) which comes with this package. */
+
+#include "colls_private.h"
+
+
+/* Allreduce selector mimicking Open MPI's coll_tuned_decision_fixed:
+ * chooses an algorithm from the total payload size (dsize * count) and
+ * the communicator size. Thresholds come from MX measurements on the
+ * UTK Grig cluster (see comment below). */
+int smpi_coll_tuned_allreduce_ompi(void *sbuf, void *rbuf, int count,
+                        MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
+{
+    size_t dsize, block_dsize;
+    int comm_size = smpi_comm_size(comm);
+    const size_t intermediate_message = 10000;
+
+    /**
+     * Decision function based on MX results from the Grig cluster at UTK.
+     * 
+     * Currently, linear, recursive doubling, and nonoverlapping algorithms 
+     * can handle both commutative and non-commutative operations.
+     * Ring algorithm does not support non-commutative operations.
+     */
+    dsize = smpi_datatype_size(dtype);
+    block_dsize = dsize * count;
+
+    /* Small payload (< 10 kB total): recursive doubling. */
+    if (block_dsize < intermediate_message) {
+        return (smpi_coll_tuned_allreduce_rdb (sbuf, rbuf, 
+                                                                   count, dtype,
+                                                                   op, comm));
+    } 
+
+    /* NOTE(review): the commutativity test is disabled here, so this branch
+     * is taken for any op when count > comm_size — confirm intended. */
+    if( /*smpi_op_is_commute(op) && */(count > comm_size) ) {
+        const size_t segment_size = 1 << 20; /* 1 MB */
+        if ((comm_size * segment_size >= block_dsize)) {
+            //return (smpi_coll_tuned_allreduce_intra_ring (sbuf, rbuf, count, dtype, 
+            //FIXME: ok, these are not the right algorithms, try to find closer ones
+            return smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype,
+                                              op, comm);
+        } else {
+           // return (smpi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf, 
+           return (smpi_coll_tuned_allreduce_rab2 (sbuf, rbuf,
+                                                                    count, dtype, 
+                                                                    op, comm 
+                                                                    /*segment_size*/));
+        }
+    }
+
+    /* Fallback for large payloads with count <= comm_size. */
+    return (smpi_coll_tuned_allreduce_redbcast(sbuf, rbuf, count, 
+                                                            dtype, op, comm));
+}
+
+
+
+/* Alltoall selector: picks bruck / simple / pair from the per-rank send
+ * block size (dsize * scount) and the communicator size, following Open
+ * MPI's fixed decision rules (Grig cluster, 2GB MX, up to 64 nodes). */
+int smpi_coll_tuned_alltoall_ompi( void *sbuf, int scount, 
+                                             MPI_Datatype sdtype,
+                                             void* rbuf, int rcount, 
+                                             MPI_Datatype rdtype, 
+                                             MPI_Comm comm)
+{
+    int communicator_size;
+    size_t dsize, block_dsize;
+    communicator_size = smpi_comm_size(comm);
+
+    /* Decision function based on measurement on Grig cluster at 
+       the University of Tennessee (2GB MX) up to 64 nodes.
+       Has better performance for messages of intermediate sizes than the old one */
+    /* determine block size */
+    dsize = smpi_datatype_size(sdtype);
+    block_dsize = dsize * scount;
+
+    /* Tiny blocks on larger communicators: bruck minimizes latency. */
+    if ((block_dsize < 200) && (communicator_size > 12)) {
+        return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype, 
+                                                    rbuf, rcount, rdtype,
+                                                    comm);
+
+    } else if (block_dsize < 3000) {
+        return smpi_coll_tuned_alltoall_simple(sbuf, scount, sdtype, 
+                                                           rbuf, rcount, rdtype, 
+                                                           comm);
+    }
+
+    /* Large blocks: pairwise exchange. */
+    return smpi_coll_tuned_alltoall_pair (sbuf, scount, sdtype, 
+                                                    rbuf, rcount, rdtype,
+                                                    comm);
+}
+
+/* Alltoallv selector: no size-based decision yet — unconditionally
+ * delegates to the bruck variant. */
+int smpi_coll_tuned_alltoallv_ompi(void *sbuf, int *scounts, int *sdisps,
+                                              MPI_Datatype sdtype,
+                                              void *rbuf, int *rcounts, int *rdisps,
+                                              MPI_Datatype rdtype,
+                                              MPI_Comm  comm
+                                              )
+{
+    /* For starters, just keep the original algorithm. */
+    return smpi_coll_tuned_alltoallv_bruck(sbuf, scounts, sdisps, sdtype, 
+                                                        rbuf, rcounts, rdisps,rdtype,
+                                                        comm);
+}
+
+/*
+void smpi_coll_tuned_barrier_ompi(MPI_Comm  comm)
+{    int communicator_size = smpi_comm_size(comm);
+
+    if( 2 == communicator_size )
+        return smpi_coll_tuned_barrier_intra_two_procs(comm, module);
+     * Basic optimisation. If we have a power of 2 number of nodes
+     * the use the recursive doubling algorithm, otherwise
+     * bruck is the one we want.
+    {
+        bool has_one = false;
+        for( ; communicator_size > 0; communicator_size >>= 1 ) {
+            if( communicator_size & 0x1 ) {
+                if( has_one )
+                    return smpi_coll_tuned_barrier_intra_bruck(comm, module);
+                has_one = true;
+            }
+        }
+    }
+    return smpi_coll_tuned_barrier_intra_recursivedoubling(comm, module);
+}*/
+
+/* Bcast selector: binomial tree for messages below ~370 kB (or count<=1),
+ * flat-tree pipeline otherwise. Open MPI's finer-grained size/segment
+ * decisions are kept below as disabled code because the matching
+ * segmented SMPI implementations do not exist yet. */
+int smpi_coll_tuned_bcast_ompi(void *buff, int count,
+                                          MPI_Datatype datatype, int root,
+                                          MPI_Comm  comm
+                                          )
+{
+    /* Decision function based on MX results for 
+       messages up to 36MB and communicator sizes up to 64 nodes */
+    //const size_t small_message_size = 2048;
+    const size_t intermediate_message_size = 370728;
+    //const double a_p16  = 3.2118e-6; /* [1 / byte] */
+    //const double b_p16  = 8.7936;   
+    //const double a_p64  = 2.3679e-6; /* [1 / byte] */
+    //const double b_p64  = 1.1787;     
+    //const double a_p128 = 1.6134e-6; /* [1 / byte] */
+    //const double b_p128 = 2.1102;
+
+    int communicator_size;
+    /* segsize is only meaningful for the disabled segmented variants. */
+    int segsize = 0;
+    size_t message_size, dsize;
+
+    communicator_size = smpi_comm_size(comm);
+
+    /* else we need data size for decision function */
+    dsize = smpi_datatype_size(datatype);
+    message_size = dsize * (unsigned long)count;   /* needed for decision */
+
+    /* Handle messages of small and intermediate size, and 
+       single-element broadcasts */
+    if ((message_size < /*small_message_size*/intermediate_message_size) || (count <= 1)) {
+        /* Binomial without segmentation */
+        segsize = 0;
+        return  smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype, 
+                                                      root, comm/*
+                                                      segsize*/);
+
+    } /*else if (message_size < intermediate_message_size) {
+        // SplittedBinary with 1KB segments
+        segsize = 1024;
+        return smpi_coll_tuned_bcast_split_bintree(buff, count, datatype, 
+                                                         root, comm
+                                                         segsize);
+
+    } 
+     Handle large message sizes 
+    else if (communicator_size < (a_p128 * message_size + b_p128)) {
+         Pipeline with 128KB segments 
+        segsize = 1024  << 7;
+        return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype, 
+                                                     root, comm, module,
+                                                     segsize);
+
+    } else if (communicator_size < 13) {
+        // Split Binary with 8KB segments 
+        segsize = 1024 << 3;
+        return smpi_coll_tuned_bcast_intra_split_bintree(buff, count, datatype, 
+                                                         root, comm, module,
+                                                         segsize);
+       
+    } else if (communicator_size < (a_p64 * message_size + b_p64)) {
+        // Pipeline with 64KB segments 
+        segsize = 1024 << 6;
+        return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, 
+                                                     root, comm, module,
+                                                     segsize);
+
+    } else if (communicator_size < (a_p16 * message_size + b_p16)) {
+         Pipeline with 16KB segments 
+        //segsize = 1024 << 4;
+        return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype, 
+                                                     root, comm, module,
+                                                     segsize);
+
+    }*/
+
+    /* Pipeline with 8KB segments */
+    //segsize = 1024 << 3;
+    return smpi_coll_tuned_bcast_flattree_pipeline (buff, count, datatype, 
+                                                 root, comm
+                                                 /*segsize*/);
+#if 0
+    /* this is based on gige measurements */
+
+    if (communicator_size  < 4) {
+        return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
+    }
+    if (communicator_size == 4) {
+        if (message_size < 524288) segsize = 0;
+        else segsize = 16384;
+        return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
+    }
+    if (communicator_size <= 8 && message_size < 4096) {
+        return smpi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
+    }
+    if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
+        segsize = 16384;
+        return  smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
+    }
+    if (message_size >= 524288) {
+        segsize = 16384;
+        return smpi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, module, segsize);
+    }
+    segsize = 0;
+    /* once tested can swap this back in */
+    /* return smpi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
+    return smpi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
+#endif  /* 0 */
+}
+
+/* Reduce selector: flat tree for very small communicators+messages,
+ * binomial for small/intermediate messages, NTSL pipeline otherwise.
+ * Open MPI's linear-model thresholds (a1..b4) are retained as disabled
+ * code pending matching segmented SMPI implementations. */
+int smpi_coll_tuned_reduce_ompi( void *sendbuf, void *recvbuf,
+                                            int count, MPI_Datatype  datatype,
+                                            MPI_Op   op, int root,
+                                            MPI_Comm   comm
+                                            )
+{
+    int communicator_size=0;
+    //int segsize = 0;
+    size_t message_size, dsize;
+    //const double a1 =  0.6016 / 1024.0; /* [1/B] */
+    //const double b1 =  1.3496;
+    //const double a2 =  0.0410 / 1024.0; /* [1/B] */
+    //const double b2 =  9.7128;
+    //const double a3 =  0.0422 / 1024.0; /* [1/B] */
+    //const double b3 =  1.1614;
+    //const double a4 =  0.0033 / 1024.0; /* [1/B] */
+    //const double b4 =  1.6761;
+
+    //const int max_requests = 0; /* no limit on # of outstanding requests */
+
+    communicator_size = smpi_comm_size(comm);
+
+    /* need data size for decision function */
+    dsize=smpi_datatype_size(datatype);
+    message_size = dsize * count;   /* needed for decision */
+
+    /**
+     * If the operation is non commutative we currently have choice of linear 
+     * or in-order binary tree algorithm.
+     */
+/*    if( !ompi_op_is_commute(op) ) {
+        if ((communicator_size < 12) && (message_size < 2048)) {
+            return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module); 
+        } 
+        return smpi_coll_tuned_reduce_intra_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm, module,
+                                                             0, max_requests); 
+    }*/
+
+    if ((communicator_size < 8) && (message_size < 512)){
+        /* Linear_0K */
+        return smpi_coll_tuned_reduce_flat_tree (sendbuf, recvbuf, count, datatype, op, root, comm); 
+    } else if (((communicator_size < 8) && (message_size < 20480)) ||
+               (message_size < 2048) || (count <= 1)) {
+        /* Binomial_0K */
+        //segsize = 0;
+        return smpi_coll_tuned_reduce_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
+                                                     segsize, max_requests*/);
+    } /*else if (communicator_size > (a1 * message_size + b1)) {
+        // Binomial_1K 
+        segsize = 1024;
+        return smpi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
+                                                     segsize, max_requests);
+    } else if (communicator_size > (a2 * message_size + b2)) {
+        // Pipeline_1K 
+        segsize = 1024;
+        return smpi_coll_tuned_reduce_NTSL (sendbuf, recvbuf, count, datatype, op, root, comm, module, 
+                                                      segsize, max_requests);
+    } else if (communicator_size > (a3 * message_size + b3)) {
+        // Binary_32K 
+        segsize = 32*1024;
+        return smpi_coll_tuned_reduce_intra_binary( sendbuf, recvbuf, count, datatype, op, root,
+                                                    comm, module, segsize, max_requests);
+    }
+    if (communicator_size > (a4 * message_size + b4)) {
+        // Pipeline_32K 
+        segsize = 32*1024;
+    } else {
+        // Pipeline_64K 
+        segsize = 64*1024;
+    }*/
+    /* Default for everything larger: NTSL (unsegmented here). */
+    return smpi_coll_tuned_reduce_NTSL (sendbuf, recvbuf, count, datatype, op, root, comm/*, module, 
+                                                  segsize, max_requests*/);
+
+#if 0
+    /* for small messages use linear algorithm */
+    if (message_size <= 4096) {
+        segsize = 0;
+        fanout = communicator_size - 1;
+        /* when linear implemented or taken from basic put here, right now using chain as a linear system */
+        /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
+        return smpi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module); 
+        /*        return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
+    }
+    if (message_size < 524288) {
+        if (message_size <= 65536 ) {
+            segsize = 32768;
+            fanout = 8;
+        } else {
+            segsize = 1024;
+            fanout = communicator_size/2;
+        }
+        /* later swap this for a binary tree */
+        /*         fanout = 2; */
+        return smpi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
+                                                   segsize, fanout, max_requests);
+    }
+    segsize = 1024;
+    return smpi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
+                                                  segsize, max_requests);
+#endif  /* 0 */
+}
+
+/*int smpi_coll_tuned_reduce_scatter_ompi( void *sbuf, void *rbuf,
+                                                    int *rcounts,
+                                                    MPI_Datatype dtype,
+                                                    MPI_Op  op,
+                                                    MPI_Comm  comm,
+                                                    )
+{
+    int comm_size, i, pow2;
+    size_t total_message_size, dsize;
+    const double a = 0.0012;
+    const double b = 8.0;
+    const size_t small_message_size = 12 * 1024;
+    const size_t large_message_size = 256 * 1024;
+    bool zerocounts = false;
+
+    OPAL_OUTPUT((smpi_coll_tuned_stream, "smpi_coll_tuned_reduce_scatter_ompi"));
+
+    comm_size = smpi_comm_size(comm);
+    // We need data size for decision function 
+    ompi_datatype_type_size(dtype, &dsize);
+    total_message_size = 0;
+    for (i = 0; i < comm_size; i++) { 
+        total_message_size += rcounts[i];
+        if (0 == rcounts[i]) {
+            zerocounts = true;
+        }
+    }
+
+    if( !ompi_op_is_commute(op) || (zerocounts)) {
+        return smpi_coll_tuned_reduce_scatter_intra_nonoverlapping (sbuf, rbuf, rcounts, 
+                                                                    dtype, op, 
+                                                                    comm, module); 
+    }
+   
+    total_message_size *= dsize;
+
+    // compute the nearest power of 2 
+    for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);
+
+    if ((total_message_size <= small_message_size) ||
+        ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
+        (comm_size >= a * total_message_size + b)) {
+        return 
+            smpi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
+                                                                        dtype, op,
+                                                                        comm, module);
+    } 
+    return smpi_coll_tuned_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
+                                                     dtype, op,
+                                                     comm, module);
+
+  
+    return smpi_coll_tuned_reduce_scatter(sbuf, rbuf, rcounts,
+                                                     dtype, op,
+                                                     comm;
+
+}*/
+
+/* Allgather selector: pair for 2 ranks; below 50 kB total, recursive
+ * doubling on power-of-two communicators and bruck otherwise; ring for
+ * everything larger. The MPICH-2 decision block at the bottom is dead
+ * code unless USE_MPICH2_DECISION is defined (and it is unreachable
+ * anyway — every path above it returns). */
+int smpi_coll_tuned_allgather_ompi(void *sbuf, int scount, 
+                                              MPI_Datatype sdtype,
+                                              void* rbuf, int rcount, 
+                                              MPI_Datatype rdtype, 
+                                              MPI_Comm  comm
+                                              )
+{
+    int communicator_size, pow2_size;
+    size_t dsize, total_dsize;
+
+    communicator_size = smpi_comm_size(comm);
+
+    /* Special case for 2 processes */
+    if (communicator_size == 2) {
+        return smpi_coll_tuned_allgather_pair (sbuf, scount, sdtype, 
+                                                          rbuf, rcount, rdtype, 
+                                                          comm/*, module*/);
+    }
+
+    /* Determine complete data size */
+    dsize=smpi_datatype_size(sdtype);
+    total_dsize = dsize * scount * communicator_size;   
+   
+    /* Smallest power of two >= communicator_size. */
+    for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1); 
+
+    /* Decision based on MX 2Gb results from Grig cluster at 
+       The University of Tennesse, Knoxville 
+       - if total message size is less than 50KB use either bruck or 
+       recursive doubling for non-power of two and power of two nodes, 
+       respectively.
+       - else use ring and neighbor exchange algorithms for odd and even 
+       number of nodes, respectively.
+    */
+    if (total_dsize < 50000) {
+        if (pow2_size == communicator_size) {
+            return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype, 
+                                                                     rbuf, rcount, rdtype,
+                                                                     comm);
+        } else {
+            return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype, 
+                                                         rbuf, rcount, rdtype, 
+                                                         comm);
+        }
+    } else {
+        //if (communicator_size % 2) {
+            return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype, 
+                                                        rbuf, rcount, rdtype, 
+                                                        comm);
+        /*} else {
+            return  smpi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype,
+                                                                     rbuf, rcount, rdtype,
+                                                                     comm, module);
+        }*/
+    }
+   
+#if defined(USE_MPICH2_DECISION)
+    /* Decision as in MPICH-2 
+       presented in Thakur et.al. "Optimization of Collective Communication 
+       Operations in MPICH", International Journal of High Performance Computing 
+       Applications, Vol. 19, No. 1, 49-66 (2005)
+       - for power-of-two processes and small and medium size messages 
+       (up to 512KB) use recursive doubling
+       - for non-power-of-two processes and small messages (80KB) use bruck,
+       - for everything else use ring.
+    */
+    if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
+        return smpi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype, 
+                                                                 rbuf, rcount, rdtype, 
+                                                                 comm, module);
+    } else if (total_dsize <= 81920) { 
+        return smpi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype, 
+                                                     rbuf, rcount, rdtype,
+                                                     comm, module);
+    } 
+    return smpi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype, 
+                                                rbuf, rcount, rdtype,
+                                                comm, module);
+#endif  /* defined(USE_MPICH2_DECISION) */
+}
+
+/* Open MPI-style selector for MPI_Allgatherv: picks an implementation from
+ * the communicator size and the total number of bytes gathered.
+ * sbuf, scount, sdtype          : local contribution of the calling rank.
+ * rbuf, rcounts, rdispls, rdtype: per-rank receive layout.
+ * Returns the status code of the selected collective implementation. */
+int smpi_coll_tuned_allgatherv_ompi(void *sbuf, int scount, 
+                                               MPI_Datatype sdtype,
+                                               void* rbuf, int *rcounts, 
+                                               int *rdispls,
+                                               MPI_Datatype rdtype, 
+                                               MPI_Comm  comm
+                                               )
+{
+    int i;
+    int communicator_size;
+    size_t dsize, total_dsize;
+    
+    communicator_size = smpi_comm_size(comm);
+    
+    /* Special case for 2 processes */
+    if (communicator_size == 2) {
+        return smpi_coll_tuned_allgatherv_pair(sbuf, scount, sdtype,
+                                                           rbuf, rcounts, rdispls, rdtype, 
+                                                           comm);
+    }
+    
+    /* Determine complete data size.
+     * NOTE(review): the element size is taken from the *send* datatype while
+     * the counts summed are the *receive* counts -- this assumes sdtype and
+     * rdtype have the same size (true for well-formed allgatherv calls and
+     * matching the upstream Open MPI decision code); confirm. */
+    dsize=smpi_datatype_size(sdtype);
+    total_dsize = 0;
+    for (i = 0; i < communicator_size; i++) {
+        total_dsize += dsize * rcounts[i];
+    }
+    
+    /* Decision based on allgather decision.   */
+    if (total_dsize < 50000) {
+        /* bruck variant not ported here; small messages fall back to ring */
+/*        return smpi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype, 
+                                                      rbuf, rcounts, rdispls, rdtype, 
+                                                      comm, module);*/
+    return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype, 
+                                                      rbuf, rcounts, rdispls, rdtype, 
+                                                      comm);
+
+    } else {
+        /* neighborexchange variant not ported here: ring is used regardless
+         * of communicator-size parity (even-size branch kept for reference) */
+//        if (communicator_size % 2) {
+            return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype, 
+                                                         rbuf, rcounts, rdispls, rdtype, 
+                                                         comm);
+/*        } else {
+            return  smpi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
+                                                                      rbuf, rcounts, rdispls, rdtype, 
+                                                                      comm, module);
+        }*/
+    }
+}
+/*
+int smpi_coll_tuned_gather_ompi(void *sbuf, int scount, 
+                                           MPI_Datatype sdtype,
+                                           void* rbuf, int rcount, 
+                                           MPI_Datatype rdtype, 
+                                           int root,
+                                           MPI_Comm  comm,
+                                           )
+{
+    const int large_segment_size = 32768;
+    const int small_segment_size = 1024;
+
+    const size_t large_block_size = 92160;
+    const size_t intermediate_block_size = 6000;
+    const size_t small_block_size = 1024;
+
+    const int large_communicator_size = 60;
+    const int small_communicator_size = 10;
+
+    int communicator_size, rank;
+    size_t dsize, block_size;
+
+    OPAL_OUTPUT((smpi_coll_tuned_stream, 
+                 "smpi_coll_tuned_gather_ompi"));
+
+    communicator_size = smpi_comm_size(comm);
+    rank = ompi_comm_rank(comm);
+
+    // Determine block size 
+    if (rank == root) {
+        ompi_datatype_type_size(rdtype, &dsize);
+        block_size = dsize * rcount;
+    } else {
+        ompi_datatype_type_size(sdtype, &dsize);
+        block_size = dsize * scount;
+    }
+
+    if (block_size > large_block_size) {
+        return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype, 
+                                                         rbuf, rcount, rdtype, 
+                                                         root, comm, module,
+                                                         large_segment_size);
+
+    } else if (block_size > intermediate_block_size) {
+        return smpi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype, 
+                                                         rbuf, rcount, rdtype, 
+                                                         root, comm, module,
+                                                         small_segment_size);
+
+    } else if ((communicator_size > large_communicator_size) ||
+               ((communicator_size > small_communicator_size) &&
+                (block_size < small_block_size))) {
+        return smpi_coll_tuned_gather_intra_binomial (sbuf, scount, sdtype, 
+                                                      rbuf, rcount, rdtype, 
+                                                      root, comm, module);
+
+    }
+    // Otherwise, use basic linear 
+    return smpi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype, 
+                                                      rbuf, rcount, rdtype, 
+                                                      root, comm, module);
+}*/
+/*
+int smpi_coll_tuned_scatter_ompi(void *sbuf, int scount, 
+                                            MPI_Datatype sdtype,
+                                            void* rbuf, int rcount, 
+                                            MPI_Datatype rdtype, 
+                                            int root, MPI_Comm  comm,
+                                            )
+{
+    const size_t small_block_size = 300;
+    const int small_comm_size = 10;
+    int communicator_size, rank;
+    size_t dsize, block_size;
+
+    OPAL_OUTPUT((smpi_coll_tuned_stream, 
+                 "smpi_coll_tuned_scatter_ompi"));
+
+    communicator_size = smpi_comm_size(comm);
+    rank = ompi_comm_rank(comm);
+    // Determine block size 
+    if (root == rank) {
+        ompi_datatype_type_size(sdtype, &dsize);
+        block_size = dsize * scount;
+    } else {
+        ompi_datatype_type_size(rdtype, &dsize);
+        block_size = dsize * rcount;
+    } 
+
+    if ((communicator_size > small_comm_size) &&
+        (block_size < small_block_size)) {
+        return smpi_coll_tuned_scatter_intra_binomial (sbuf, scount, sdtype, 
+                                                       rbuf, rcount, rdtype, 
+                                                       root, comm, module);
+    }
+    return smpi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype, 
+                                                       rbuf, rcount, rdtype, 
+                                                       root, comm, module);
+}*/
+
index 96ccc0d..29e3aa9 100644 (file)
@@ -12,6 +12,7 @@
 
 #include "private.h"
 #include "colls/colls.h"
 
 #include "private.h"
 #include "colls/colls.h"
+#include "simgrid/sg_config.h"
 
 s_mpi_coll_description_t mpi_coll_allgather_description[] = {
   {"default",
 
 s_mpi_coll_description_t mpi_coll_allgather_description[] = {
   {"default",
@@ -88,15 +89,28 @@ void coll_help(const char *category, s_mpi_coll_description_t * table)
 }
 
 int find_coll_description(s_mpi_coll_description_t * table,
 }
 
 int find_coll_description(s_mpi_coll_description_t * table,
-                           const char *name)
+                           char *name)
 {
   int i;
   char *name_list = NULL;
 {
   int i;
   char *name_list = NULL;
-
+  int selector_on=0;
+  if(name==NULL){//no argument provided, use active selector's algorithm
+    name=(char*)sg_cfg_get_string("smpi/coll_selector");
+    selector_on=1;
+  }
   for (i = 0; table[i].name; i++)
     if (!strcmp(name, table[i].name)) {
       return i;
     }
   for (i = 0; table[i].name; i++)
     if (!strcmp(name, table[i].name)) {
       return i;
     }
+
+  if(selector_on){
+    // collective seems not handled by the active selector, try with default one
+    name=(char*)"default";
+    for (i = 0; table[i].name; i++)
+      if (!strcmp(name, table[i].name)) {
+        return i;
+    }
+  }
   name_list = strdup(table[0].name);
   for (i = 1; table[i].name; i++) {
     name_list =
   name_list = strdup(table[0].name);
   for (i = 1; table[i].name; i++) {
     name_list =
@@ -109,8 +123,6 @@ int find_coll_description(s_mpi_coll_description_t * table,
   return -1;
 }
 
   return -1;
 }
 
-
-
 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_coll, smpi,
                                 "Logging specific to SMPI (coll)");
 
 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_coll, smpi,
                                 "Logging specific to SMPI (coll)");