have smp-aware algorithms use number of cores on the node as basis for their computation

author Augustin Degomme <degomme@idpann.imag.fr>

Fri, 24 Jan 2014 09:23:47 +0000 (10:23 +0100)

committer Augustin Degomme <degomme@idpann.imag.fr>

Fri, 24 Jan 2014 10:31:30 +0000 (11:31 +0100)
author Augustin Degomme <degomme@idpann.imag.fr>
Fri, 24 Jan 2014 09:23:47 +0000 (10:23 +0100)
committer Augustin Degomme <degomme@idpann.imag.fr>
Fri, 24 Jan 2014 10:31:30 +0000 (11:31 +0100)
diff --git a/src/smpi/colls/allgather-SMP-NTS.c b/src/smpi/colls/allgather-SMP-NTS.c

index 7ad059a..4e4dfd1 100644 (file)
--- a/src/smpi/colls/allgather-SMP-NTS.c
+++ b/src/smpi/colls/allgather-SMP-NTS.c
@@ -18,16 +18,23 @@ int smpi_coll_tuned_allgather_SMP_NTS(void *sbuf, int scount,
  
    int i, send_offset, recv_offset;
    int intra_rank, inter_rank;
-  intra_rank = rank % NUM_CORE;
-  inter_rank = rank / NUM_CORE;
-  int inter_comm_size = (comm_size + NUM_CORE - 1) / NUM_CORE;
-  int num_core_in_current_smp = NUM_CORE;
  
-  if(comm_size%NUM_CORE)
-    THROWF(arg_error,0, "allgather SMP NTS algorithm can't be used with non multiple of NUM_CORE=%d number of processes ! ", NUM_CORE);
+  int num_core = simcall_host_get_core(SIMIX_host_self());
+  // do we use the default one or the number of cores in the platform ?
+  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
+  if (num_core == 1) num_core = NUM_CORE;
+
+
+  intra_rank = rank % num_core;
+  inter_rank = rank / num_core;
+  int inter_comm_size = (comm_size + num_core - 1) / num_core;
+  int num_core_in_current_smp = num_core;
+
+  if(comm_size%num_core)
+    THROWF(arg_error,0, "allgather SMP NTS algorithm can't be used with non multiple of NUM_CORE=%d number of processes ! ", num_core);
  
    /* for too small number of processes, use default implementation */
-  if (comm_size <= NUM_CORE) {
+  if (comm_size <= num_core) {
      XBT_WARN("MPI_allgather_SMP_NTS use default MPI_allgather.");        
      smpi_mpi_allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
      return MPI_SUCCESS;    
@@ -35,7 +42,7 @@ int smpi_coll_tuned_allgather_SMP_NTS(void *sbuf, int scount,
  
    // the last SMP node may have fewer number of running processes than all others
    if (inter_rank == (inter_comm_size - 1)) {
-    num_core_in_current_smp = comm_size - (inter_rank * NUM_CORE);
+    num_core_in_current_smp = comm_size - (inter_rank * num_core);
    }
    //copy corresponding message from sbuf to rbuf
    recv_offset = rank * rextent * rcount;
@@ -48,9 +55,9 @@ int smpi_coll_tuned_allgather_SMP_NTS(void *sbuf, int scount,
    for (i = 1; i < num_core_in_current_smp; i++) {
  
      dst =
-        (inter_rank * NUM_CORE) + (intra_rank + i) % (num_core_in_current_smp);
+        (inter_rank * num_core) + (intra_rank + i) % (num_core_in_current_smp);
      src =
-        (inter_rank * NUM_CORE) + (intra_rank - i +
+        (inter_rank * num_core) + (intra_rank - i +
                                     num_core_in_current_smp) %
          (num_core_in_current_smp);
      recv_offset = src * rextent * rcount;
@@ -70,35 +77,35 @@ int smpi_coll_tuned_allgather_SMP_NTS(void *sbuf, int scount,
      MPI_Request *rrequest_array = xbt_new(MPI_Request, inter_comm_size - 1);
      MPI_Request *srequest_array = xbt_new(MPI_Request, inter_comm_size - 1);
  
-    src = ((inter_rank - 1 + inter_comm_size) % inter_comm_size) * NUM_CORE;
-    dst = ((inter_rank + 1) % inter_comm_size) * NUM_CORE;
+    src = ((inter_rank - 1 + inter_comm_size) % inter_comm_size) * num_core;
+    dst = ((inter_rank + 1) % inter_comm_size) * num_core;
  
      // post all inter Irecv
      for (i = 0; i < inter_comm_size - 1; i++) {
        recv_offset =
            ((inter_rank - i - 1 +
-            inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
-      rrequest_array[i] = smpi_mpi_irecv((char *)rbuf + recv_offset, rcount * NUM_CORE,
+            inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
+      rrequest_array[i] = smpi_mpi_irecv((char *)rbuf + recv_offset, rcount * num_core,
                                           rtype, src, tag + i, comm);
      }
  
      // send first message
      send_offset =
          ((inter_rank +
-          inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
-    srequest_array[0] = smpi_mpi_isend((char *)rbuf + send_offset, scount * NUM_CORE,
+          inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
+    srequest_array[0] = smpi_mpi_isend((char *)rbuf + send_offset, scount * num_core,
                                         stype, dst, tag, comm);
  
      // loop : recv-inter , send-inter, send-intra (linear-bcast)
      for (i = 0; i < inter_comm_size - 2; i++) {
        recv_offset =
            ((inter_rank - i - 1 +
-            inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
+            inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
        smpi_mpi_wait(&rrequest_array[i], MPI_STATUS_IGNORE);
-      srequest_array[i + 1] = smpi_mpi_isend((char *)rbuf + recv_offset, scount * NUM_CORE,
+      srequest_array[i + 1] = smpi_mpi_isend((char *)rbuf + recv_offset, scount * num_core,
                                               stype, dst, tag + i + 1, comm);
        if (num_core_in_current_smp > 1) {
-        smpi_mpi_send((char *)rbuf + recv_offset, scount * NUM_CORE,
+        smpi_mpi_send((char *)rbuf + recv_offset, scount * num_core,
                        stype, (rank + 1), tag + i + 1, comm);
        }
      }
@@ -106,12 +113,12 @@ int smpi_coll_tuned_allgather_SMP_NTS(void *sbuf, int scount,
      // recv last message and send_intra
      recv_offset =
          ((inter_rank - i - 1 +
-          inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
-    //recv_offset = ((inter_rank + 1) % inter_comm_size) * NUM_CORE * sextent * scount;
+          inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
+    //recv_offset = ((inter_rank + 1) % inter_comm_size) * num_core * sextent * scount;
      //i=inter_comm_size-2;
      smpi_mpi_wait(&rrequest_array[i], MPI_STATUS_IGNORE);
      if (num_core_in_current_smp > 1) {
-      smpi_mpi_send((char *)rbuf + recv_offset, scount * NUM_CORE,
+      smpi_mpi_send((char *)rbuf + recv_offset, scount * num_core,
                                    stype, (rank + 1), tag + i + 1, comm);
      }
  
@@ -124,8 +131,8 @@ int smpi_coll_tuned_allgather_SMP_NTS(void *sbuf, int scount,
      for (i = 0; i < inter_comm_size - 1; i++) {
        recv_offset =
            ((inter_rank - i - 1 +
-            inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
-      smpi_mpi_recv((char *) rbuf + recv_offset, (rcount * NUM_CORE), rtype,
+            inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
+      smpi_mpi_recv((char *) rbuf + recv_offset, (rcount * num_core), rtype,
                      rank - 1, tag + i + 1, comm, MPI_STATUS_IGNORE);
      }
    }
@@ -134,10 +141,10 @@ int smpi_coll_tuned_allgather_SMP_NTS(void *sbuf, int scount,
      for (i = 0; i < inter_comm_size - 1; i++) {
        recv_offset =
            ((inter_rank - i - 1 +
-            inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
-      smpi_mpi_recv((char *) rbuf + recv_offset, (rcount * NUM_CORE), rtype,
+            inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
+      smpi_mpi_recv((char *) rbuf + recv_offset, (rcount * num_core), rtype,
                      rank - 1, tag + i + 1, comm, MPI_STATUS_IGNORE);
-      smpi_mpi_send((char *) rbuf + recv_offset, (scount * NUM_CORE), stype,
+      smpi_mpi_send((char *) rbuf + recv_offset, (scount * num_core), stype,
                      (rank + 1), tag + i + 1, comm);
      }
    }
diff --git a/src/smpi/colls/allgather-loosely-lr.c b/src/smpi/colls/allgather-loosely-lr.c

index 7abe6e0..242990f 100644 (file)
--- a/src/smpi/colls/allgather-loosely-lr.c
+++ b/src/smpi/colls/allgather-loosely-lr.c
@@ -17,8 +17,13 @@ int smpi_coll_tuned_allgather_loosely_lr(void *sbuf, int scount,
  
    comm_size = smpi_comm_size(comm);
  
-  if(comm_size%4)
-    THROWF(arg_error,0, "allgather loosely lr algorithm can't be used with non multiple of NUM_CORE=4 number of processes ! ");
+  int num_core = simcall_host_get_core(SIMIX_host_self());
+  // do we use the default one or the number of cores in the platform ?
+  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
+  if (num_core == 1) num_core = NUM_CORE;
+
+  if(comm_size%num_core)
+    THROWF(arg_error,0, "allgather loosely lr algorithm can't be used with non multiple of NUM_CORE=%d number of processes ! ",num_core);
  
    rank = smpi_comm_rank(comm);
    MPI_Aint rextent, sextent;
@@ -36,10 +41,10 @@ int smpi_coll_tuned_allgather_loosely_lr(void *sbuf, int scount,
  
    MPI_Status status;
  
-  intra_rank = rank % NUM_CORE;
-  inter_rank = rank / NUM_CORE;
-  inter_comm_size = (comm_size + NUM_CORE - 1) / NUM_CORE;
-  intra_comm_size = NUM_CORE;
+  intra_rank = rank % num_core;
+  inter_rank = rank / num_core;
+  inter_comm_size = (comm_size + num_core - 1) / num_core;
+  intra_comm_size = num_core;
  
    int src_seg, dst_seg;
  
@@ -108,7 +113,7 @@ int smpi_coll_tuned_allgather_loosely_lr(void *sbuf, int scount,
      }                           // intra loop
  
  
-    // wait for inter communication to finish for these rounds (# of round equals NUM_CORE)
+    // wait for inter communication to finish for these rounds (# of round equals num_core)
      if (i != inter_comm_size - 1) {
        smpi_mpi_wait(&inter_rrequest, &status);
      }
diff --git a/src/smpi/colls/allgather-smp-simple.c b/src/smpi/colls/allgather-smp-simple.c

index f1c25d0..bd1206b 100644 (file)
--- a/src/smpi/colls/allgather-smp-simple.c
+++ b/src/smpi/colls/allgather-smp-simple.c
@@ -11,8 +11,13 @@ int smpi_coll_tuned_allgather_smp_simple(void *send_buf, int scount,
    int src, dst, comm_size, rank;
    comm_size = smpi_comm_size(comm);
  
-  if(comm_size%NUM_CORE)
-     THROWF(arg_error,0, "allgather SMP simple algorithm can't be used with non multiple of NUM_CORE=%d number of processes ! ", NUM_CORE);
+  int num_core = simcall_host_get_core(SIMIX_host_self());
+  // do we use the default one or the number of cores in the platform ?
+  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
+  if (num_core == 1) num_core = NUM_CORE;
+
+  if(comm_size%num_core)
+     THROWF(arg_error,0, "allgather SMP simple algorithm can't be used with non multiple of NUM_CORE=%d number of processes ! ", num_core);
  
    rank = smpi_comm_rank(comm);
    MPI_Aint rextent, sextent;
@@ -22,7 +27,6 @@ int smpi_coll_tuned_allgather_smp_simple(void *send_buf, int scount,
    MPI_Status status;
    int i, send_offset, recv_offset;
    int intra_rank, inter_rank;
-  int num_core = NUM_CORE;
    intra_rank = rank % num_core;
    inter_rank = rank / num_core;
    int inter_comm_size = (comm_size + num_core - 1) / num_core;
diff --git a/src/smpi/colls/allreduce-smp-binomial-pipeline.c b/src/smpi/colls/allreduce-smp-binomial-pipeline.c

index 68c99f5..6e829f8 100644 (file)
--- a/src/smpi/colls/allreduce-smp-binomial-pipeline.c
+++ b/src/smpi/colls/allreduce-smp-binomial-pipeline.c
@@ -46,7 +46,10 @@ int smpi_coll_tuned_allreduce_smp_binomial_pipeline(void *send_buf,
    int tag = COLL_TAG_ALLREDUCE;
    int mask, src, dst;
    MPI_Status status;
-  int num_core = NUM_CORE;
+  int num_core = simcall_host_get_core(SIMIX_host_self());
+  // do we use the default one or the number of cores in the platform ?
+  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
+  if (num_core == 1) num_core = NUM_CORE;
  
    comm_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);
diff --git a/src/smpi/colls/allreduce-smp-binomial.c b/src/smpi/colls/allreduce-smp-binomial.c

index 77a5abc..99cf0b0 100644 (file)
--- a/src/smpi/colls/allreduce-smp-binomial.c
+++ b/src/smpi/colls/allreduce-smp-binomial.c
@@ -34,7 +34,12 @@ int smpi_coll_tuned_allreduce_smp_binomial(void *send_buf, void *recv_buf,
    void *tmp_buf;
    int tag = COLL_TAG_ALLREDUCE;
    int mask, src, dst;
-  int num_core = NUM_CORE;
+
+
+  int num_core = simcall_host_get_core(SIMIX_host_self());
+  // do we use the default one or the number of cores in the platform ?
+  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
+  if (num_core == 1) num_core = NUM_CORE;
    MPI_Status status;
  
    comm_size=smpi_comm_size(comm);
diff --git a/src/smpi/colls/allreduce-smp-rdb.c b/src/smpi/colls/allreduce-smp-rdb.c

index b6def2e..08d2324 100644 (file)
--- a/src/smpi/colls/allreduce-smp-rdb.c
+++ b/src/smpi/colls/allreduce-smp-rdb.c
@@ -34,7 +34,10 @@ int smpi_coll_tuned_allreduce_smp_rdb(void *send_buf, void *recv_buf, int count,
    int tag = COLL_TAG_ALLREDUCE;
    int mask, src, dst;
    MPI_Status status;
-  int num_core = NUM_CORE;
+  int num_core = simcall_host_get_core(SIMIX_host_self());
+  // do we use the default one or the number of cores in the platform ?
+  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
+  if (num_core == 1) num_core = NUM_CORE;
    /*
       #ifdef MPICH2_REDUCTION
       MPI_User_function * uop = MPIR_Op_table[op % 16 - 1];
diff --git a/src/smpi/colls/allreduce-smp-rsag-lr.c b/src/smpi/colls/allreduce-smp-rsag-lr.c

index ce90781..bc72027 100644 (file)
--- a/src/smpi/colls/allreduce-smp-rsag-lr.c
+++ b/src/smpi/colls/allreduce-smp-rsag-lr.c
@@ -23,7 +23,10 @@ int smpi_coll_tuned_allreduce_smp_rsag_lr(void *send_buf, void *recv_buf,
    int tag = COLL_TAG_ALLREDUCE;
    int mask, src, dst;
    MPI_Status status;
-  int num_core = NUM_CORE;
+  int num_core = simcall_host_get_core(SIMIX_host_self());
+  // do we use the default one or the number of cores in the platform ?
+  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
+  if (num_core == 1) num_core = NUM_CORE;
    /*
       #ifdef MPICH2_REDUCTION
       MPI_User_function * uop = MPIR_Op_table[op % 16 - 1];
diff --git a/src/smpi/colls/allreduce-smp-rsag-rab.c b/src/smpi/colls/allreduce-smp-rsag-rab.c

index ced01ab..0a3d12a 100644 (file)
--- a/src/smpi/colls/allreduce-smp-rsag-rab.c
+++ b/src/smpi/colls/allreduce-smp-rsag-rab.c
@@ -26,7 +26,10 @@ int smpi_coll_tuned_allreduce_smp_rsag_rab(void *sbuf, void *rbuf, int count,
    int tag = COLL_TAG_ALLREDUCE;
    int mask, src, dst;
    MPI_Status status;
-  int num_core = NUM_CORE;
+  int num_core = simcall_host_get_core(SIMIX_host_self());
+  // do we use the default one or the number of cores in the platform ?
+  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
+  if (num_core == 1) num_core = NUM_CORE;
  
    comm_size = smpi_comm_size(comm);
  
diff --git a/src/smpi/colls/allreduce-smp-rsag.c b/src/smpi/colls/allreduce-smp-rsag.c

index 573f5c6..9de2bcf 100644 (file)
--- a/src/smpi/colls/allreduce-smp-rsag.c
+++ b/src/smpi/colls/allreduce-smp-rsag.c
@@ -22,7 +22,10 @@ int smpi_coll_tuned_allreduce_smp_rsag(void *send_buf, void *recv_buf,
    int tag = COLL_TAG_ALLREDUCE;
    int mask, src, dst;
    MPI_Status status;
-  int num_core = NUM_CORE;
+  int num_core = simcall_host_get_core(SIMIX_host_self());
+  // do we use the default one or the number of cores in the platform ?
+  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
+  if (num_core == 1) num_core = NUM_CORE;
    /*
       #ifdef MPICH2_REDUCTION
       MPI_User_function * uop = MPIR_Op_table[op % 16 - 1];
diff --git a/src/smpi/colls/bcast-SMP-binary.c b/src/smpi/colls/bcast-SMP-binary.c

index c09d703..979aebc 100644 (file)
--- a/src/smpi/colls/bcast-SMP-binary.c
+++ b/src/smpi/colls/bcast-SMP-binary.c
@@ -21,26 +21,30 @@ int smpi_coll_tuned_bcast_SMP_binary(void *buf, int count,
  
    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);
+  int host_num_core = simcall_host_get_core(SIMIX_host_self());
+  // do we use the default one or the number of cores in the platform ?
+  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
+  if (host_num_core == 1) host_num_core = NUM_CORE;
  
-  if(size%NUM_CORE)
-    THROWF(arg_error,0, "bcast SMP binary can't be used with non multiple of NUM_CORE=%d number of processes ! ",NUM_CORE);
+  if(size%host_num_core)
+    THROWF(arg_error,0, "bcast SMP binary can't be used with non multiple of NUM_CORE=%d number of processes ! ",host_num_core);
  
    int segment = bcast_SMP_binary_segment_byte / extent;
    int pipe_length = count / segment;
    int remainder = count % segment;
  
-  int to_intra_left = (rank / NUM_CORE) * NUM_CORE + (rank % NUM_CORE) * 2 + 1;
-  int to_intra_right = (rank / NUM_CORE) * NUM_CORE + (rank % NUM_CORE) * 2 + 2;
-  int to_inter_left = ((rank / NUM_CORE) * 2 + 1) * NUM_CORE;
-  int to_inter_right = ((rank / NUM_CORE) * 2 + 2) * NUM_CORE;
-  int from_inter = (((rank / NUM_CORE) - 1) / 2) * NUM_CORE;
-  int from_intra = (rank / NUM_CORE) * NUM_CORE + ((rank % NUM_CORE) - 1) / 2;
+  int to_intra_left = (rank / host_num_core) * host_num_core + (rank % host_num_core) * 2 + 1;
+  int to_intra_right = (rank / host_num_core) * host_num_core + (rank % host_num_core) * 2 + 2;
+  int to_inter_left = ((rank / host_num_core) * 2 + 1) * host_num_core;
+  int to_inter_right = ((rank / host_num_core) * 2 + 2) * host_num_core;
+  int from_inter = (((rank / host_num_core) - 1) / 2) * host_num_core;
+  int from_intra = (rank / host_num_core) * host_num_core + ((rank % host_num_core) - 1) / 2;
    int increment = segment * extent;
  
-  int base = (rank / NUM_CORE) * NUM_CORE;
-  int num_core = NUM_CORE;
-  if (((rank / NUM_CORE) * NUM_CORE) == ((size / NUM_CORE) * NUM_CORE))
-    num_core = size - (rank / NUM_CORE) * NUM_CORE;
+  int base = (rank / host_num_core) * host_num_core;
+  int num_core = host_num_core;
+  if (((rank / host_num_core) * host_num_core) == ((size / host_num_core) * host_num_core))
+    num_core = size - (rank / host_num_core) * host_num_core;
  
    // if root is not zero send to rank zero first
    if (root != 0) {
@@ -52,7 +56,7 @@ int smpi_coll_tuned_bcast_SMP_binary(void *buf, int count,
    // when a message is smaller than a block size => no pipeline 
    if (count <= segment) {
      // case ROOT-of-each-SMP
-    if (rank % NUM_CORE == 0) {
+    if (rank % host_num_core == 0) {
        // case ROOT
        if (rank == 0) {
          //printf("node %d left %d right %d\n",rank,to_inter_left,to_inter_right);
@@ -117,7 +121,7 @@ int smpi_coll_tuned_bcast_SMP_binary(void *buf, int count,
          (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));
  
      // case ROOT-of-each-SMP
-    if (rank % NUM_CORE == 0) {
+    if (rank % host_num_core == 0) {
        // case ROOT
        if (rank == 0) {
          for (i = 0; i < pipe_length; i++) {
diff --git a/src/smpi/colls/bcast-SMP-binomial.c b/src/smpi/colls/bcast-SMP-binomial.c

index 2239960..ee13085 100644 (file)
--- a/src/smpi/colls/bcast-SMP-binomial.c
+++ b/src/smpi/colls/bcast-SMP-binomial.c
@@ -16,17 +16,22 @@ int smpi_coll_tuned_bcast_SMP_binomial(void *buf, int count,
    size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);
  
-  if(size%NUM_CORE)
-    THROWF(arg_error,0, "bcast SMP binomial can't be used with non multiple of NUM_CORE=%d number of processes ! ",NUM_CORE);
+  int num_core = simcall_host_get_core(SIMIX_host_self());
+  // do we use the default one or the number of cores in the platform ?
+  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
+  if (num_core == 1) num_core = NUM_CORE;
+
+  if(size%num_core)
+    THROWF(arg_error,0, "bcast SMP binomial can't be used with non multiple of NUM_CORE=%d number of processes ! ",num_core);
  
    int to_intra, to_inter;
    int from_intra, from_inter;
-  int inter_rank = rank / NUM_CORE;
-  int inter_size = (size - 1) / NUM_CORE + 1;
-  int intra_rank = rank % NUM_CORE;
-  int intra_size = NUM_CORE;
-  if (((rank / NUM_CORE) * NUM_CORE) == ((size / NUM_CORE) * NUM_CORE))
-    intra_size = size - (rank / NUM_CORE) * NUM_CORE;
+  int inter_rank = rank / num_core;
+  int inter_size = (size - 1) / num_core + 1;
+  int intra_rank = rank % num_core;
+  int intra_size = num_core;
+  if (((rank / num_core) * num_core) == ((size / num_core) * num_core))
+    intra_size = size - (rank / num_core) * num_core;
  
    // if root is not zero send to rank zero first
    if (root != 0) {
@@ -43,7 +48,7 @@ int smpi_coll_tuned_bcast_SMP_binomial(void *buf, int count,
      mask = 1;
      while (mask < inter_size) {
        if (inter_rank & mask) {
-        from_inter = (inter_rank - mask) * NUM_CORE;
+        from_inter = (inter_rank - mask) * num_core;
          //printf("Node %d recv from node %d when mask is %d\n", rank, from_inter, mask);
          smpi_mpi_recv(buf, count, datatype, from_inter, tag, comm, &status);
          break;
@@ -56,7 +61,7 @@ int smpi_coll_tuned_bcast_SMP_binomial(void *buf, int count,
  
      while (mask > 0) {
        if (inter_rank < inter_size) {
-        to_inter = (inter_rank + mask) * NUM_CORE;
+        to_inter = (inter_rank + mask) * num_core;
          if (to_inter < size) {
            //printf("Node %d send to node %d when mask is %d\n", rank, to_inter, mask);
            smpi_mpi_send(buf, count, datatype, to_inter, tag, comm);
@@ -67,7 +72,7 @@ int smpi_coll_tuned_bcast_SMP_binomial(void *buf, int count,
    }
    // SECOND STEP every root-of-each-SMP send to all children with binomial tree
    // base is a rank of root-of-each-SMP
-  int base = (rank / NUM_CORE) * NUM_CORE;
+  int base = (rank / num_core) * num_core;
    mask = 1;
    while (mask < intra_size) {
      if (intra_rank & mask) {
diff --git a/src/smpi/colls/bcast-SMP-linear.c b/src/smpi/colls/bcast-SMP-linear.c

index 092ab26..3ef441e 100644 (file)
--- a/src/smpi/colls/bcast-SMP-linear.c
+++ b/src/smpi/colls/bcast-SMP-linear.c
@@ -21,9 +21,13 @@ int smpi_coll_tuned_bcast_SMP_linear(void *buf, int count,
  
    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);
+  int num_core = simcall_host_get_core(SIMIX_host_self());
+  // do we use the default one or the number of cores in the platform ?
+  // if the number of cores is one, the platform may be simulated with 1 node = 1 core
+  if (num_core == 1) num_core = NUM_CORE;
  
-  if(size%NUM_CORE)
-    THROWF(arg_error,0, "bcast SMP linear can't be used with non multiple of NUM_CORE=%d number of processes ! ",NUM_CORE);
+  if(size%num_core)
+    THROWF(arg_error,0, "bcast SMP linear can't be used with non multiple of num_core=%d number of processes!",num_core);
  
    int segment = bcast_SMP_linear_segment_byte / extent;
    int pipe_length = count / segment;
@@ -33,13 +37,13 @@ int smpi_coll_tuned_bcast_SMP_linear(void *buf, int count,
  
    /* leader of each SMP do inter-communication
       and act as a root for intra-communication */
-  int to_inter = (rank + NUM_CORE) % size;
+  int to_inter = (rank + num_core) % size;
    int to_intra = (rank + 1) % size;
-  int from_inter = (rank - NUM_CORE + size) % size;
+  int from_inter = (rank - num_core + size) % size;
    int from_intra = (rank + size - 1) % size;
  
    // call native when MPI communication size is too small
-  if (size <= NUM_CORE) {
+  if (size <= num_core) {
      XBT_WARN("MPI_bcast_SMP_linear use default MPI_bcast.");             
      smpi_mpi_bcast(buf, count, datatype, root, comm);
      return MPI_SUCCESS;            
@@ -59,20 +63,20 @@ int smpi_coll_tuned_bcast_SMP_linear(void *buf, int count,
        smpi_mpi_send(buf, count, datatype, to_intra, tag, comm);
      }
      // case last ROOT of each SMP
-    else if (rank == (((size - 1) / NUM_CORE) * NUM_CORE)) {
+    else if (rank == (((size - 1) / num_core) * num_core)) {
        request = smpi_mpi_irecv(buf, count, datatype, from_inter, tag, comm);
        smpi_mpi_wait(&request, &status);
        smpi_mpi_send(buf, count, datatype, to_intra, tag, comm);
      }
      // case intermediate ROOT of each SMP
-    else if (rank % NUM_CORE == 0) {
+    else if (rank % num_core == 0) {
        request = smpi_mpi_irecv(buf, count, datatype, from_inter, tag, comm);
        smpi_mpi_wait(&request, &status);
        smpi_mpi_send(buf, count, datatype, to_inter, tag, comm);
        smpi_mpi_send(buf, count, datatype, to_intra, tag, comm);
      }
      // case last non-ROOT of each SMP
-    else if (((rank + 1) % NUM_CORE == 0) || (rank == (size - 1))) {
+    else if (((rank + 1) % num_core == 0) || (rank == (size - 1))) {
        request = smpi_mpi_irecv(buf, count, datatype, from_intra, tag, comm);
        smpi_mpi_wait(&request, &status);
      }
@@ -92,7 +96,7 @@ int smpi_coll_tuned_bcast_SMP_linear(void *buf, int count,
          (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));
  
      // case ROOT of each SMP
-    if (rank % NUM_CORE == 0) {
+    if (rank % num_core == 0) {
        // case real root
        if (rank == 0) {
          for (i = 0; i < pipe_length; i++) {
@@ -103,7 +107,7 @@ int smpi_coll_tuned_bcast_SMP_linear(void *buf, int count,
          }
        }
        // case last ROOT of each SMP
-      else if (rank == (((size - 1) / NUM_CORE) * NUM_CORE)) {
+      else if (rank == (((size - 1) / num_core) * num_core)) {
          for (i = 0; i < pipe_length; i++) {
            request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype,
                      from_inter, (tag + i), comm);
@@ -129,7 +133,7 @@ int smpi_coll_tuned_bcast_SMP_linear(void *buf, int count,
          }
        }
      } else {                    // case last non-ROOT of each SMP
-      if (((rank + 1) % NUM_CORE == 0) || (rank == (size - 1))) {
+      if (((rank + 1) % num_core == 0) || (rank == (size - 1))) {
          for (i = 0; i < pipe_length; i++) {
            request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype,
                      from_intra, (tag + i), comm);
author	Augustin Degomme <degomme@idpann.imag.fr>
	Fri, 24 Jan 2014 09:23:47 +0000 (10:23 +0100)
committer	Augustin Degomme <degomme@idpann.imag.fr>
	Fri, 24 Jan 2014 10:31:30 +0000 (11:31 +0100)
src/smpi/colls/allgather-SMP-NTS.c		patch \| blob \| history
src/smpi/colls/allgather-loosely-lr.c		patch \| blob \| history
src/smpi/colls/allgather-smp-simple.c		patch \| blob \| history
src/smpi/colls/allreduce-smp-binomial-pipeline.c		patch \| blob \| history
src/smpi/colls/allreduce-smp-binomial.c		patch \| blob \| history
src/smpi/colls/allreduce-smp-rdb.c		patch \| blob \| history
src/smpi/colls/allreduce-smp-rsag-lr.c		patch \| blob \| history
src/smpi/colls/allreduce-smp-rsag-rab.c		patch \| blob \| history
src/smpi/colls/allreduce-smp-rsag.c		patch \| blob \| history
src/smpi/colls/bcast-SMP-binary.c		patch \| blob \| history
src/smpi/colls/bcast-SMP-binomial.c		patch \| blob \| history
src/smpi/colls/bcast-SMP-linear.c		patch \| blob \| history