1 /* Copyright (c) 2013-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "colls_private.h"
9 /* IMPLEMENTED BY PITCH PATARASUK
10 Non-topology-specific all-reduce operation designed bandwidth optimally
11 Bug fixing by Xin Yuan, 04/04/2008
15 Use -DMPICH2_REDUCTION if this code does not compile.
16 MPICH1 code also works on MPICH2 on our cluster and the performance is similar.
17 This code assumes a commutative and associative reduce operator (MPI_SUM, MPI_MAX, etc).
20 //#include <star-reduction.c>
23 smpi_coll_tuned_allreduce_lr(void *sbuf, void *rbuf, int rcount,
24 MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
26 int tag = COLL_TAG_ALLREDUCE;
28 int rank, i, size, count;
29 int send_offset, recv_offset;
30 int remainder, remainder_flag, remainder_offset;
32 rank = smpi_comm_rank(comm);
33 size = smpi_comm_size(comm);
35 /* make it compatible with all data type */
37 extent = smpi_datatype_get_extent(dtype);
39 /* when communication size is smaller than number of process (not support) */
41 XBT_WARN("MPI_allreduce_lr use default MPI_allreduce.");
42 smpi_mpi_allreduce(sbuf, rbuf, rcount, dtype, op, comm);
46 /* when communication size is not divisible by number of process:
47 call the native implementation for the remain chunk at the end of the operation */
48 if (rcount % size != 0) {
49 remainder = rcount % size;
51 remainder_offset = (rcount / size) * size * extent;
53 remainder = remainder_flag = remainder_offset = 0;
56 /* size of each point-to-point communication is equal to the size of the whole message
57 divided by number of processes
59 count = rcount / size;
61 /* our ALL-REDUCE implementation
62 1. copy (partial of)send_buf to recv_buf
63 2. use logical ring reduce-scatter
64 3. use logical ring all-gather
68 send_offset = ((rank - 1 + size) % size) * count * extent;
69 recv_offset = ((rank - 1 + size) % size) * count * extent;
70 smpi_mpi_sendrecv((char *) sbuf + send_offset, count, dtype, rank, tag - 1,
71 (char *) rbuf + recv_offset, count, dtype, rank, tag - 1, comm,
75 for (i = 0; i < (size - 1); i++) {
76 send_offset = ((rank - 1 - i + 2 * size) % size) * count * extent;
77 recv_offset = ((rank - 2 - i + 2 * size) % size) * count * extent;
78 // recv_offset = ((rank-i+2*size)%size)*count*extent;
79 smpi_mpi_sendrecv((char *) rbuf + send_offset, count, dtype, ((rank + 1) % size),
80 tag + i, (char *) rbuf + recv_offset, count, dtype,
81 ((rank + size - 1) % size), tag + i, comm, &status);
83 // compute result to rbuf+recv_offset
84 smpi_op_apply(op, (char *) sbuf + recv_offset, (char *) rbuf + recv_offset,
89 for (i = 0; i < (size - 1); i++) {
90 send_offset = ((rank - i + 2 * size) % size) * count * extent;
91 recv_offset = ((rank - 1 - i + 2 * size) % size) * count * extent;
92 smpi_mpi_sendrecv((char *) rbuf + send_offset, count, dtype, ((rank + 1) % size),
93 tag + i, (char *) rbuf + recv_offset, count, dtype,
94 ((rank + size - 1) % size), tag + i, comm, &status);
97 /* when communication size is not divisible by number of process:
98 call the native implementation for the remain chunk at the end of the operation */
100 return mpi_coll_allreduce_fun((char *) sbuf + remainder_offset,
101 (char *) rbuf + remainder_offset, remainder, dtype, op,