avoid looping forever on iprobe

[simgrid.git] / src / smpi / colls / allreduce-lr.c
diff --git a/src/smpi/colls/allreduce-lr.c b/src/smpi/colls/allreduce-lr.c

index d4bf82a..a7f7336 100644 (file)
--- a/src/smpi/colls/allreduce-lr.c
+++ b/src/smpi/colls/allreduce-lr.c
@@ -1,4 +1,4 @@
-#include "colls.h"
+#include "colls_private.h"
  
  /* IMPLEMENTED BY PITCH PATARASUK 
     Non-topoloty-specific all-reduce operation designed bandwidth optimally 
@@ -23,16 +23,18 @@ smpi_coll_tuned_allreduce_lr(void *sbuf, void *rbuf, int rcount,
    int send_offset, recv_offset;
    int remainder, remainder_flag, remainder_offset;
  
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &size);
+  rank = smpi_comm_rank(MPI_COMM_WORLD);
+  size = smpi_comm_size(MPI_COMM_WORLD);
  
    /* make it compatible with all data type */
    MPI_Aint extent;
-  MPI_Type_extent(dtype, &extent);
+  extent = smpi_datatype_get_extent(dtype);
  
    /* when communication size is smaller than number of process (not support) */
    if (rcount < size) {
-    return MPI_Allreduce(sbuf, rbuf, rcount, dtype, op, comm);
+    XBT_WARN("MPI_allreduce_lr use default MPI_allreduce.");     
+    smpi_mpi_allreduce(sbuf, rbuf, rcount, dtype, op, comm);
+    return MPI_SUCCESS; 
    }
  
    /* when communication size is not divisible by number of process: 
@@ -59,7 +61,7 @@ smpi_coll_tuned_allreduce_lr(void *sbuf, void *rbuf, int rcount,
    // copy partial data
    send_offset = ((rank - 1 + size) % size) * count * extent;
    recv_offset = ((rank - 1 + size) % size) * count * extent;
-  MPI_Sendrecv((char *) sbuf + send_offset, count, dtype, rank, tag - 1,
+  smpi_mpi_sendrecv((char *) sbuf + send_offset, count, dtype, rank, tag - 1,
                 (char *) rbuf + recv_offset, count, dtype, rank, tag - 1, comm,
                 &status);
  
@@ -68,7 +70,7 @@ smpi_coll_tuned_allreduce_lr(void *sbuf, void *rbuf, int rcount,
      send_offset = ((rank - 1 - i + 2 * size) % size) * count * extent;
      recv_offset = ((rank - 2 - i + 2 * size) % size) * count * extent;
      //    recv_offset = ((rank-i+2*size)%size)*count*extent;
-    MPI_Sendrecv((char *) rbuf + send_offset, count, dtype, ((rank + 1) % size),
+    smpi_mpi_sendrecv((char *) rbuf + send_offset, count, dtype, ((rank + 1) % size),
                   tag + i, (char *) rbuf + recv_offset, count, dtype,
                   ((rank + size - 1) % size), tag + i, comm, &status);
  
@@ -81,7 +83,7 @@ smpi_coll_tuned_allreduce_lr(void *sbuf, void *rbuf, int rcount,
    for (i = 0; i < (size - 1); i++) {
      send_offset = ((rank - i + 2 * size) % size) * count * extent;
      recv_offset = ((rank - 1 - i + 2 * size) % size) * count * extent;
-    MPI_Sendrecv((char *) rbuf + send_offset, count, dtype, ((rank + 1) % size),
+    smpi_mpi_sendrecv((char *) rbuf + send_offset, count, dtype, ((rank + 1) % size),
                   tag + i, (char *) rbuf + recv_offset, count, dtype,
                   ((rank + size - 1) % size), tag + i, comm, &status);
    }
@@ -89,7 +91,7 @@ smpi_coll_tuned_allreduce_lr(void *sbuf, void *rbuf, int rcount,
    /* when communication size is not divisible by number of process: 
       call the native implementation for the remain chunk at the end of the operation */
    if (remainder_flag) {
-    return MPI_Allreduce((char *) sbuf + remainder_offset,
+    return mpi_coll_allreduce_fun((char *) sbuf + remainder_offset,
                           (char *) rbuf + remainder_offset, remainder, dtype, op,
                           comm);
    }