X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/blobdiff_plain/076aada113aa0566c059211416cd9214a54d763d..8293a833f9c68986f8bd174d5bf3d04eb62918d7:/src/smpi/colls/reduce_scatter-mpich.c diff --git a/src/smpi/colls/reduce_scatter-mpich.c b/src/smpi/colls/reduce_scatter-mpich.c new file mode 100644 index 0000000000..77eaef5bf8 --- /dev/null +++ b/src/smpi/colls/reduce_scatter-mpich.c @@ -0,0 +1,493 @@ +#include "colls_private.h" +#define MPIR_REDUCE_SCATTER_TAG 222 +/* Returns x with its low-order bits reversed: for each i below the bits argument, bit i of x is placed at position bits-1-i, while every bit at position bits or above is kept as-is. */ +static inline int MPIU_Mirror_permutation(unsigned int x, int bits) +{ + /* a mask for the high order bits that should be copied as-is */ + int high_mask = ~((0x1 << bits) - 1); + int retval = x & high_mask; + int i; + + for (i = 0; i < bits; ++i) { + unsigned int bitval = (x & (0x1 << i)) >> i; /* 0x1 or 0x0 */ + retval |= bitval << ((bits - i) - 1); + } + + return retval; +} + +/* Reduce-scatter entry point. Only its prologue is visible in this text: it reads the communicator size and rank, queries the datatype extents, and allocates one int per rank for disps. NOTE(review): the remainder of the body is cut off further down. */ +int smpi_coll_tuned_reduce_scatter_mpich_pair(void *sendbuf, void *recvbuf, int recvcounts[], + MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) +{ + int rank, comm_size, i; + MPI_Aint extent, true_extent, true_lb; + int *disps; + void *tmp_recvbuf; + int mpi_errno = MPI_SUCCESS; + int type_size, total_count, nbytes, dst, src; + int is_commutative; + comm_size = smpi_comm_size(comm); + rank = smpi_comm_rank(comm); + + extent =smpi_datatype_get_extent(datatype); + smpi_datatype_extent(datatype, &true_lb, &true_extent); + + if (smpi_op_is_commute(op)) { + is_commutative = 1; + } +/* NOTE(review): is_commutative has no initializer and is assigned only when smpi_op_is_commute(op) is true, so it is indeterminate for a non-commutative op unless code not visible here sets it. */ + disps = (int*)xbt_malloc( comm_size * sizeof(int)); + + total_count = 0; + for (i=0; i/* NOTE(review): the text looks truncated at this point, as if everything from a less-than sign up to a later greater-than sign was dropped; the rest of this function and the opening of the following one cannot be seen. */ peer) { + /* we have the higher rank: send top half, recv bottom half */ + recv_offset += size; + } + else { + /* we have the lower rank: recv top half, send bottom half */ + send_offset += size; + } + + smpi_mpi_sendrecv(outgoing_data + send_offset*true_extent, + size, datatype, peer, MPIR_REDUCE_SCATTER_TAG, + incoming_data + recv_offset*true_extent, + size, datatype, peer, MPIR_REDUCE_SCATTER_TAG, + comm, MPI_STATUS_IGNORE); + /* always 
perform the reduction at recv_offset, the data at send_offset + is now our peer's responsibility */ + if (rank > peer) { + /* higher ranked value so need to call op(received_data, my_data) */ + smpi_op_apply(op, + incoming_data + recv_offset*true_extent, + outgoing_data + recv_offset*true_extent, + &size, &datatype ); + buf0_was_inout = buf0_was_inout; + } + else { + /* lower ranked value so need to call op(my_data, received_data) */ + smpi_op_apply( op, + outgoing_data + recv_offset*true_extent, + incoming_data + recv_offset*true_extent, + &size, &datatype); + buf0_was_inout = !buf0_was_inout; + } + + /* the next round of send/recv needs to happen within the block (of size + "size") that we just received and reduced */ + send_offset = recv_offset; + } + + xbt_assert(size == recvcounts[rank]); + + /* copy the reduced data to the recvbuf */ + result_ptr = (char *)(buf0_was_inout ? tmp_buf0 : tmp_buf1) + recv_offset * true_extent; + mpi_errno = smpi_datatype_copy(result_ptr, size, datatype, + recvbuf, size, datatype); + if (mpi_errno) return(mpi_errno); + return MPI_SUCCESS; +} + + + +int smpi_coll_tuned_reduce_scatter_mpich_rdb(void *sendbuf, void *recvbuf, int recvcounts[], + MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) +{ + int rank, comm_size, i; + MPI_Aint extent, true_extent, true_lb; + int *disps; + void *tmp_recvbuf, *tmp_results; + int mpi_errno = MPI_SUCCESS; + int type_size, dis[2], blklens[2], total_count, nbytes, dst; + int mask, dst_tree_root, my_tree_root, j, k; + int received; + MPI_Datatype sendtype, recvtype; + int nprocs_completed, tmp_mask, tree_root, is_commutative; + comm_size = smpi_comm_size(comm); + rank = smpi_comm_rank(comm); + + extent =smpi_datatype_get_extent(datatype); + smpi_datatype_extent(datatype, &true_lb, &true_extent); + + if (smpi_op_is_commute(op)) { + is_commutative = 1; + } + + disps = (int*)xbt_malloc( comm_size * sizeof(int)); + + total_count = 0; + for (i=0; i> i; + dst_tree_root <<= i; + + my_tree_root = rank >> i; + 
my_tree_root <<= i; + + /* At step 1, processes exchange (n-n/p) amount of + data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p) + amount of data, and so forth. We use derived datatypes for this. + + At each step, a process does not need to send data + indexed from my_tree_root to + my_tree_root+mask-1. Similarly, a process won't receive + data indexed from dst_tree_root to dst_tree_root+mask-1. */ +/* NOTE(review): presumably still inside smpi_coll_tuned_reduce_scatter_mpich_rdb. Its is_commutative local has no initializer and is assigned only when smpi_op_is_commute(op) is true, yet it is tested in the reduction step further down; confirm it is set to 0 somewhere for non-commutative ops. */ + /* calculate sendtype */ + blklens[0] = blklens[1] = 0; + for (j=0; j/* NOTE(review): the text looks truncated here as well; the code between this loop header and the comparison against comm_size is not visible. */ comm_size) { + nprocs_completed = comm_size - my_tree_root - mask; + /* nprocs_completed is the number of processes in this + subtree that have all the data. Send data to others + in a tree fashion. First find root of current tree + that is being divided into two. k is the number of + least-significant bits in this process's rank that + must be zeroed out to find the rank of the root */ + j = mask; + k = 0; + while (j) { + j >>= 1; + k++; + } + k--; + + tmp_mask = mask >> 1; + while (tmp_mask) { + dst = rank ^ tmp_mask; + + tree_root = rank >> k; + tree_root <<= k; + + /* send only if this proc has data and destination + doesn't have data. at any step, multiple processes + can send if they have the data */ + if ((dst > rank) && + (rank < tree_root + nprocs_completed) + && (dst >= tree_root + nprocs_completed)) { + /* send the current result */ + smpi_mpi_send(tmp_recvbuf, 1, recvtype, + dst, MPIR_REDUCE_SCATTER_TAG, + comm); + } + /* recv only if this proc. doesn't have data and sender + has data */ + else if ((dst < rank) && + (dst < tree_root + nprocs_completed) && + (rank >= tree_root + nprocs_completed)) { + smpi_mpi_recv(tmp_recvbuf, 1, recvtype, dst, + MPIR_REDUCE_SCATTER_TAG, + comm, MPI_STATUS_IGNORE); + received = 1; + } + tmp_mask >>= 1; + k--; + } + } + + /* The following reduction is done here instead of after + the MPIC_Sendrecv_ft or MPIC_Recv_ft above. 
This is + because to do it above, in the noncommutative + case, we would need an extra temp buffer so as not to + overwrite temp_recvbuf, because temp_recvbuf may have + to be communicated to other processes in the + non-power-of-two case. To avoid that extra allocation, + we do the reduce here. The reduction below is applied to two sub-blocks: blklens[0] elements at offset 0 and blklens[1] elements at offset dis[1]*extent. */ + if (received) { + if (is_commutative || (dst_tree_root < my_tree_root)) { + { + smpi_op_apply(op, + tmp_recvbuf, tmp_results, &blklens[0], + &datatype); + smpi_op_apply(op, + ((char *)tmp_recvbuf + dis[1]*extent), + ((char *)tmp_results + dis[1]*extent), + &blklens[1], &datatype); + } + } + else { + { + smpi_op_apply(op, + tmp_results, tmp_recvbuf, &blklens[0], + &datatype); + smpi_op_apply(op, + ((char *)tmp_results + dis[1]*extent), + ((char *)tmp_recvbuf + dis[1]*extent), + &blklens[1], &datatype); + } + /* copy result back into tmp_results */ + mpi_errno = smpi_datatype_copy(tmp_recvbuf, 1, recvtype, + tmp_results, 1, recvtype); + if (mpi_errno) return(mpi_errno); + } + } + + //smpi_datatype_free(&sendtype); + //smpi_datatype_free(&recvtype); + + mask <<= 1; + i++; + } + + /* now copy final results from tmp_results to recvbuf */ + mpi_errno = smpi_datatype_copy(((char *)tmp_results+disps[rank]*extent), + recvcounts[rank], datatype, recvbuf, + recvcounts[rank], datatype); + if (mpi_errno) return(mpi_errno); +/* NOTE(review): no release of disps (obtained from xbt_malloc at function entry) is visible before this return or before the early error returns; confirm whether it and the temporary buffers are freed in code not shown. */ + return MPI_SUCCESS; + } + + 