From 38ea8eebef9013b0666d21d82493a559b5a27ac6 Mon Sep 17 00:00:00 2001
From: Augustin Degomme <augustin.degomme@imag.fr>
Date: Wed, 20 Aug 2014 16:42:44 +0200
Subject: [PATCH] change the way the old SMP-aware algos work, to be closer to
 the ones from mvapich

---
 src/smpi/colls/allgather-SMP-NTS.c            | 14 ++++++-------
 src/smpi/colls/allgather-loosely-lr.c         | 15 +++++++------
 src/smpi/colls/allgather-smp-simple.c         | 14 ++++++-------
 .../colls/allreduce-smp-binomial-pipeline.c   | 17 +++++++--------
 src/smpi/colls/allreduce-smp-binomial.c       | 17 +++++++--------
 src/smpi/colls/allreduce-smp-rdb.c            | 16 +++++++-------
 src/smpi/colls/allreduce-smp-rsag-lr.c        | 17 +++++++--------
 src/smpi/colls/allreduce-smp-rsag-rab.c       | 16 +++++++-------
 src/smpi/colls/allreduce-smp-rsag.c           | 17 +++++++--------
 src/smpi/colls/bcast-SMP-binary.c             | 19 ++++++++++-------
 src/smpi/colls/bcast-SMP-binomial.c           | 21 ++++++++++---------
 src/smpi/colls/bcast-SMP-linear.c             | 18 +++++++++-------
 12 files changed, 98 insertions(+), 103 deletions(-)

diff --git a/src/smpi/colls/allgather-SMP-NTS.c b/src/smpi/colls/allgather-SMP-NTS.c
index 81c1449a8c..d4838ed4a6 100644
--- a/src/smpi/colls/allgather-SMP-NTS.c
+++ b/src/smpi/colls/allgather-SMP-NTS.c
@@ -5,9 +5,6 @@
  * under the terms of the license (GNU LGPL) which comes with this package. */
 
 #include "colls_private.h"
-#ifndef NUM_CORE
-#define NUM_CORE 8
-#endif
 
 int smpi_coll_tuned_allgather_SMP_NTS(void *sbuf, int scount,
                                       MPI_Datatype stype, void *rbuf,
@@ -25,10 +22,13 @@ int smpi_coll_tuned_allgather_SMP_NTS(void *sbuf, int scount,
   int i, send_offset, recv_offset;
   int intra_rank, inter_rank;
 
-  int num_core = simcall_host_get_core(SIMIX_host_self());
-  // do we use the default one or the number of cores in the platform ?
-  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
-  if (num_core == 1) num_core = NUM_CORE;
+  if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
+    smpi_comm_init_smp(comm);
+  }
+  int num_core=1;
+  if (smpi_comm_is_uniform(comm)){
+    num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm));
+  }
 
 
   intra_rank = rank % num_core;
diff --git a/src/smpi/colls/allgather-loosely-lr.c b/src/smpi/colls/allgather-loosely-lr.c
index bed1abf68c..74df884b27 100644
--- a/src/smpi/colls/allgather-loosely-lr.c
+++ b/src/smpi/colls/allgather-loosely-lr.c
@@ -6,10 +6,6 @@
 
 #include "colls_private.h"
 
-#ifndef NUM_CORE
-#define NUM_CORE 4
-#endif
-
 int smpi_coll_tuned_allgather_loosely_lr(void *sbuf, int scount,
                                          MPI_Datatype stype, void *rbuf,
                                          int rcount, MPI_Datatype rtype,
@@ -23,10 +19,13 @@ int smpi_coll_tuned_allgather_loosely_lr(void *sbuf, int scount,
 
   comm_size = smpi_comm_size(comm);
 
-  int num_core = simcall_host_get_core(SIMIX_host_self());
-  // do we use the default one or the number of cores in the platform ?
-  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
-  if (num_core == 1) num_core = NUM_CORE;
+if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
+    smpi_comm_init_smp(comm);
+  }
+  int num_core=1;
+  if (smpi_comm_is_uniform(comm)){
+    num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm));
+  }
 
   if(comm_size%num_core)
     THROWF(arg_error,0, "allgather loosely lr algorithm can't be used with non multiple of NUM_CORE=%d number of processes ! ",num_core);
diff --git a/src/smpi/colls/allgather-smp-simple.c b/src/smpi/colls/allgather-smp-simple.c
index 1a38857619..e54c75b5b9 100644
--- a/src/smpi/colls/allgather-smp-simple.c
+++ b/src/smpi/colls/allgather-smp-simple.c
@@ -5,9 +5,6 @@
  * under the terms of the license (GNU LGPL) which comes with this package. */
 
 #include "colls_private.h"
-#ifndef NUM_CORE
-#define NUM_CORE 8
-#endif
 
 int smpi_coll_tuned_allgather_smp_simple(void *send_buf, int scount,
                                          MPI_Datatype stype, void *recv_buf,
@@ -17,10 +14,13 @@ int smpi_coll_tuned_allgather_smp_simple(void *send_buf, int scount,
   int src, dst, comm_size, rank;
   comm_size = smpi_comm_size(comm);
 
-  int num_core = simcall_host_get_core(SIMIX_host_self());
-  // do we use the default one or the number of cores in the platform ?
-  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
-  if (num_core == 1) num_core = NUM_CORE;
+  if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
+    smpi_comm_init_smp(comm);
+  }
+  int num_core=1;
+  if (smpi_comm_is_uniform(comm)){
+    num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm));
+  }
 
   if(comm_size%num_core)
      THROWF(arg_error,0, "allgather SMP simple algorithm can't be used with non multiple of NUM_CORE=%d number of processes ! ", num_core);
diff --git a/src/smpi/colls/allreduce-smp-binomial-pipeline.c b/src/smpi/colls/allreduce-smp-binomial-pipeline.c
index 44dfcb77fe..e831be95fd 100644
--- a/src/smpi/colls/allreduce-smp-binomial-pipeline.c
+++ b/src/smpi/colls/allreduce-smp-binomial-pipeline.c
@@ -12,11 +12,7 @@
    inter-communication
    The communication are done in a pipeline fashion */
 
-/* change number of core per smp-node
-   we assume that number of core per process will be the same for all implementations */
-#ifndef NUM_CORE
-#define NUM_CORE 8
-#endif
+
 
 /* this is a default segment size for pipelining, 
    but it is typically passed as a command line argument */
@@ -52,10 +48,13 @@ int smpi_coll_tuned_allreduce_smp_binomial_pipeline(void *send_buf,
   int tag = COLL_TAG_ALLREDUCE;
   int mask, src, dst;
   MPI_Status status;
-  int num_core = simcall_host_get_core(SIMIX_host_self());
-  // do we use the default one or the number of cores in the platform ?
-  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
-  if (num_core == 1) num_core = NUM_CORE;
+  if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
+    smpi_comm_init_smp(comm);
+  }
+  int num_core=1;
+  if (smpi_comm_is_uniform(comm)){
+    num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm));
+  }
 
   comm_size = smpi_comm_size(comm);
   rank = smpi_comm_rank(comm);
diff --git a/src/smpi/colls/allreduce-smp-binomial.c b/src/smpi/colls/allreduce-smp-binomial.c
index 97183fb5d4..25f9837321 100644
--- a/src/smpi/colls/allreduce-smp-binomial.c
+++ b/src/smpi/colls/allreduce-smp-binomial.c
@@ -11,11 +11,6 @@
    It uses 2-layer communication: binomial for both intra-communication 
    inter-communication*/
 
-/* change number of core per smp-node
-   we assume that number of core per process will be the same for all implementations */
-#ifndef NUM_CORE
-#define NUM_CORE 8
-#endif
 
 /* ** NOTE **
    Use -DMPICH2 if this code does not compile.
@@ -41,11 +36,13 @@ int smpi_coll_tuned_allreduce_smp_binomial(void *send_buf, void *recv_buf,
   int tag = COLL_TAG_ALLREDUCE;
   int mask, src, dst;
 
-
-  int num_core = simcall_host_get_core(SIMIX_host_self());
-  // do we use the default one or the number of cores in the platform ?
-  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
-  if (num_core == 1) num_core = NUM_CORE;
+  if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
+    smpi_comm_init_smp(comm);
+  }
+  int num_core=1;
+  if (smpi_comm_is_uniform(comm)){
+    num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm));
+  }
   MPI_Status status;
 
   comm_size=smpi_comm_size(comm);
diff --git a/src/smpi/colls/allreduce-smp-rdb.c b/src/smpi/colls/allreduce-smp-rdb.c
index 5b67e5a975..635258c397 100644
--- a/src/smpi/colls/allreduce-smp-rdb.c
+++ b/src/smpi/colls/allreduce-smp-rdb.c
@@ -11,11 +11,6 @@
    It uses 2-layer communication: binomial for intra-communication 
    and rdb for inter-communication*/
 
-/* change number of core per smp-node
-   we assume that number of core per process will be the same for all implementations */
-#ifndef NUM_CORE
-#define NUM_CORE 8
-#endif
 
 /* ** NOTE **
    Use -DMPICH2 if this code does not compile.
@@ -40,10 +35,13 @@ int smpi_coll_tuned_allreduce_smp_rdb(void *send_buf, void *recv_buf, int count,
   int tag = COLL_TAG_ALLREDUCE;
   int mask, src, dst;
   MPI_Status status;
-  int num_core = simcall_host_get_core(SIMIX_host_self());
-  // do we use the default one or the number of cores in the platform ?
-  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
-  if (num_core == 1) num_core = NUM_CORE;
+  if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
+    smpi_comm_init_smp(comm);
+  }
+  int num_core=1;
+  if (smpi_comm_is_uniform(comm)){
+    num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm));
+  }
   /*
      #ifdef MPICH2_REDUCTION
      MPI_User_function * uop = MPIR_Op_table[op % 16 - 1];
diff --git a/src/smpi/colls/allreduce-smp-rsag-lr.c b/src/smpi/colls/allreduce-smp-rsag-lr.c
index 746eecb33e..daa2b416de 100644
--- a/src/smpi/colls/allreduce-smp-rsag-lr.c
+++ b/src/smpi/colls/allreduce-smp-rsag-lr.c
@@ -7,12 +7,6 @@
 #include "colls_private.h"
 //#include <star-reduction.c>
 
-/* change number of core per smp-node
-   we assume that number of core per process will be the same for all implementations */
-#ifndef NUM_CORE
-#define NUM_CORE 8
-#endif
-
 /*
 This fucntion performs all-reduce operation as follow.
 1) binomial_tree reduce inside each SMP node
@@ -29,10 +23,13 @@ int smpi_coll_tuned_allreduce_smp_rsag_lr(void *send_buf, void *recv_buf,
   int tag = COLL_TAG_ALLREDUCE;
   int mask, src, dst;
   MPI_Status status;
-  int num_core = simcall_host_get_core(SIMIX_host_self());
-  // do we use the default one or the number of cores in the platform ?
-  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
-  if (num_core == 1) num_core = NUM_CORE;
+  if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
+    smpi_comm_init_smp(comm);
+  }
+  int num_core=1;
+  if (smpi_comm_is_uniform(comm)){
+    num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm));
+  }
   /*
      #ifdef MPICH2_REDUCTION
      MPI_User_function * uop = MPIR_Op_table[op % 16 - 1];
diff --git a/src/smpi/colls/allreduce-smp-rsag-rab.c b/src/smpi/colls/allreduce-smp-rsag-rab.c
index ff921155ab..a3a11309b8 100644
--- a/src/smpi/colls/allreduce-smp-rsag-rab.c
+++ b/src/smpi/colls/allreduce-smp-rsag-rab.c
@@ -10,11 +10,6 @@
 #include "colls_private.h"
 //#include <star-reduction.c>
 
-/* change number of core per smp-node
-   we assume that number of core per process will be the same for all implementations */
-#ifndef NUM_CORE
-#define NUM_CORE 8
-#endif
 
 /*
 This fucntion performs all-reduce operation as follow.
@@ -32,10 +27,13 @@ int smpi_coll_tuned_allreduce_smp_rsag_rab(void *sbuf, void *rbuf, int count,
   int tag = COLL_TAG_ALLREDUCE;
   int mask, src, dst;
   MPI_Status status;
-  int num_core = simcall_host_get_core(SIMIX_host_self());
-  // do we use the default one or the number of cores in the platform ?
-  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
-  if (num_core == 1) num_core = NUM_CORE;
+  if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
+    smpi_comm_init_smp(comm);
+  }
+  int num_core=1;
+  if (smpi_comm_is_uniform(comm)){
+    num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm));
+  }
 
   comm_size = smpi_comm_size(comm);
 
diff --git a/src/smpi/colls/allreduce-smp-rsag.c b/src/smpi/colls/allreduce-smp-rsag.c
index 8d751c88da..1bc921ccc3 100644
--- a/src/smpi/colls/allreduce-smp-rsag.c
+++ b/src/smpi/colls/allreduce-smp-rsag.c
@@ -6,12 +6,6 @@
 
 #include "colls_private.h"
 
-/* change number of core per smp-node
-   we assume that number of core per process will be the same for all implementations */
-#ifndef NUM_CORE
-#define NUM_CORE 8
-#endif
-
 /*
 This fucntion performs all-reduce operation as follow.
 1) binomial_tree reduce inside each SMP node
@@ -28,10 +22,13 @@ int smpi_coll_tuned_allreduce_smp_rsag(void *send_buf, void *recv_buf,
   int tag = COLL_TAG_ALLREDUCE;
   int mask, src, dst;
   MPI_Status status;
-  int num_core = simcall_host_get_core(SIMIX_host_self());
-  // do we use the default one or the number of cores in the platform ?
-  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
-  if (num_core == 1) num_core = NUM_CORE;
+  if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
+    smpi_comm_init_smp(comm);
+  }
+  int num_core=1;
+  if (smpi_comm_is_uniform(comm)){
+    num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm));
+  }
   /*
      #ifdef MPICH2_REDUCTION
      MPI_User_function * uop = MPIR_Op_table[op % 16 - 1];
diff --git a/src/smpi/colls/bcast-SMP-binary.c b/src/smpi/colls/bcast-SMP-binary.c
index 04b40b2240..5264ca3648 100644
--- a/src/smpi/colls/bcast-SMP-binary.c
+++ b/src/smpi/colls/bcast-SMP-binary.c
@@ -5,9 +5,7 @@
  * under the terms of the license (GNU LGPL) which comes with this package. */
 
 #include "colls_private.h"
-#ifndef NUM_CORE
-#define NUM_CORE 8
-#endif
+
 
 int bcast_SMP_binary_segment_byte = 8192;
 
@@ -27,10 +25,17 @@ int smpi_coll_tuned_bcast_SMP_binary(void *buf, int count,
 
   rank = smpi_comm_rank(comm);
   size = smpi_comm_size(comm);
-  int host_num_core = simcall_host_get_core(SIMIX_host_self());
-  // do we use the default one or the number of cores in the platform ?
-  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
-  if (host_num_core == 1) host_num_core = NUM_CORE;
+  if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
+    smpi_comm_init_smp(comm);
+  }
+  int host_num_core=1;
+  if (smpi_comm_is_uniform(comm)){
+    host_num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm));
+  }else{
+    //this SMP implementation is buggy for non-uniform communicators: fall back to the mpich bcast
+    return smpi_coll_tuned_bcast_mpich( buf , count, datatype,
+              root, comm);
+  }
 
   int segment = bcast_SMP_binary_segment_byte / extent;
   int pipe_length = count / segment;
diff --git a/src/smpi/colls/bcast-SMP-binomial.c b/src/smpi/colls/bcast-SMP-binomial.c
index a45dca5fe4..6053e49c7a 100644
--- a/src/smpi/colls/bcast-SMP-binomial.c
+++ b/src/smpi/colls/bcast-SMP-binomial.c
@@ -5,9 +5,6 @@
  * under the terms of the license (GNU LGPL) which comes with this package. */
 
 #include "colls_private.h"
-#ifndef NUM_CORE
-#define NUM_CORE 8
-#endif
 
 int smpi_coll_tuned_bcast_SMP_binomial(void *buf, int count,
                                        MPI_Datatype datatype, int root,
@@ -22,13 +19,17 @@ int smpi_coll_tuned_bcast_SMP_binomial(void *buf, int count,
   size = smpi_comm_size(comm);
   rank = smpi_comm_rank(comm);
 
-  int num_core = simcall_host_get_core(SIMIX_host_self());
-  // do we use the default one or the number of cores in the platform ?
-  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
-  if (num_core == 1) num_core = NUM_CORE;
-
-  if(size%num_core)
-    THROWF(arg_error,0, "bcast SMP binomial can't be used with non multiple of NUM_CORE=%d number of processes ! ",num_core);
+  if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
+    smpi_comm_init_smp(comm);
+  }
+  int num_core=1;
+  if (smpi_comm_is_uniform(comm)){
+    num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm));
+  }else{
+    //this SMP implementation is buggy for non-uniform communicators: fall back to the mpich bcast
+    return smpi_coll_tuned_bcast_mpich( buf , count, datatype,
+              root, comm);
+  }
 
   int to_intra, to_inter;
   int from_intra, from_inter;
diff --git a/src/smpi/colls/bcast-SMP-linear.c b/src/smpi/colls/bcast-SMP-linear.c
index ee7e3d2a66..b3f9b6a630 100644
--- a/src/smpi/colls/bcast-SMP-linear.c
+++ b/src/smpi/colls/bcast-SMP-linear.c
@@ -5,9 +5,6 @@
  * under the terms of the license (GNU LGPL) which comes with this package. */
 
 #include "colls_private.h"
-#ifndef NUM_CORE
-#define NUM_CORE 8
-#endif
 
 int bcast_SMP_linear_segment_byte = 8192;
 
@@ -27,10 +24,17 @@ int smpi_coll_tuned_bcast_SMP_linear(void *buf, int count,
 
   rank = smpi_comm_rank(comm);
   size = smpi_comm_size(comm);
-  int num_core = simcall_host_get_core(SIMIX_host_self());
-  // do we use the default one or the number of cores in the platform ?
-  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
-  if (num_core == 1) num_core = NUM_CORE;
+  if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
+    smpi_comm_init_smp(comm);
+  }
+  int num_core=1;
+  if (smpi_comm_is_uniform(comm)){
+    num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm));
+  }else{
+    //this SMP implementation is buggy for non-uniform communicators: fall back to the mpich bcast
+    return smpi_coll_tuned_bcast_mpich( buf , count, datatype,
+              root, comm);
+  }
 
   int segment = bcast_SMP_linear_segment_byte / extent;
   segment =  segment == 0 ? 1 :segment; 
-- 
2.20.1