X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/blobdiff_plain/727c068cd59669e79c6779f9ff93ec4ac91f7522..a2f1b23687f04169144f4ffb4f20dc4fc5c28395:/src/smpi/colls/allreduce-lr.c diff --git a/src/smpi/colls/allreduce-lr.c b/src/smpi/colls/allreduce-lr.c new file mode 100644 index 0000000000..d4bf82aae8 --- /dev/null +++ b/src/smpi/colls/allreduce-lr.c @@ -0,0 +1,98 @@ +#include "colls.h" + +/* IMPLEMENTED BY PITCH PATARASUK + Non-topoloty-specific all-reduce operation designed bandwidth optimally + Bug fixing by Xin Yuan, 04/04/2008 +*/ + +/* ** NOTE ** + Use -DMPICH2_REDUCTION if this code does not compile. + MPICH1 code also work on MPICH2 on our cluster and the performance are similar. + This code assume commutative and associative reduce operator (MPI_SUM, MPI_MAX, etc). +*/ + +//#include + +int +smpi_coll_tuned_allreduce_lr(void *sbuf, void *rbuf, int rcount, + MPI_Datatype dtype, MPI_Op op, MPI_Comm comm) +{ + int tag = 5000; + MPI_Status status; + int rank, i, size, count; + int send_offset, recv_offset; + int remainder, remainder_flag, remainder_offset; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + /* make it compatible with all data type */ + MPI_Aint extent; + MPI_Type_extent(dtype, &extent); + + /* when communication size is smaller than number of process (not support) */ + if (rcount < size) { + return MPI_Allreduce(sbuf, rbuf, rcount, dtype, op, comm); + } + + /* when communication size is not divisible by number of process: + call the native implementation for the remain chunk at the end of the operation */ + else if (rcount % size != 0) { + remainder = rcount % size; + remainder_flag = 1; + remainder_offset = (rcount / size) * size * extent; + } else { + remainder_flag = remainder_offset = 0; + } + + /* size of each point-to-point communication is equal to the size of the whole message + divided by number of processes + */ + count = rcount / size; + + /* our ALL-REDUCE implementation + 1. copy (partial of)send_buf to recv_buf + 2. use logical ring reduce-scatter + 3. use logical ring all-gather + */ + + // copy partial data + send_offset = ((rank - 1 + size) % size) * count * extent; + recv_offset = ((rank - 1 + size) % size) * count * extent; + MPI_Sendrecv((char *) sbuf + send_offset, count, dtype, rank, tag - 1, + (char *) rbuf + recv_offset, count, dtype, rank, tag - 1, comm, + &status); + + // reduce-scatter + for (i = 0; i < (size - 1); i++) { + send_offset = ((rank - 1 - i + 2 * size) % size) * count * extent; + recv_offset = ((rank - 2 - i + 2 * size) % size) * count * extent; + // recv_offset = ((rank-i+2*size)%size)*count*extent; + MPI_Sendrecv((char *) rbuf + send_offset, count, dtype, ((rank + 1) % size), + tag + i, (char *) rbuf + recv_offset, count, dtype, + ((rank + size - 1) % size), tag + i, comm, &status); + + // compute result to rbuf+recv_offset + star_reduction(op, (char *) sbuf + recv_offset, (char *) rbuf + recv_offset, + &count, &dtype); + } + + // all-gather + for (i = 0; i < (size - 1); i++) { + send_offset = ((rank - i + 2 * size) % size) * count * extent; + recv_offset = ((rank - 1 - i + 2 * size) % size) * count * extent; + MPI_Sendrecv((char *) rbuf + send_offset, count, dtype, ((rank + 1) % size), + tag + i, (char *) rbuf + recv_offset, count, dtype, + ((rank + size - 1) % size), tag + i, comm, &status); + } + + /* when communication size is not divisible by number of process: + call the native implementation for the remain chunk at the end of the operation */ + if (remainder_flag) { + return MPI_Allreduce((char *) sbuf + remainder_offset, + (char *) rbuf + remainder_offset, remainder, dtype, op, + comm); + } + + return 0; +}