ADD_TEST(smpi-gather-coll-${GATHER_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/gather:${GATHER_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/gather_coll.tesh)
ENDFOREACH()
- FOREACH (ALLGATHER_COLL default 2dmesh 3dmesh bruck GB loosely_lr lr
+ FOREACH (ALLGATHER_COLL default 2dmesh 3dmesh bruck GB loosely_lr
NTSLR NTSLR_NB pair rdb rhv ring SMP_NTS
smp_simple spreading_simple ompi mpich ompi_neighborexchange)
ADD_TEST(smpi-allgather-coll-${ALLGATHER_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/allgather:${ALLGATHER_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/allgather_coll.tesh)
ADD_TEST(smpi-allgatherv-coll-${ALLGATHERV_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/allgatherv:${ALLGATHERV_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/allgatherv_coll.tesh)
ENDFOREACH()
- FOREACH (ALLREDUCE_COLL default lr NTS rab1 rab2 rab_rdb
+ FOREACH (ALLREDUCE_COLL default lr rab1 rab2 rab_rdb
rab_rsag rdb smp_binomial smp_binomial_pipeline
smp_rdb smp_rsag smp_rsag_lr smp_rsag_rab redbcast ompi mpich ompi_ring_segmented)
ADD_TEST(smpi-allreduce-coll-${ALLREDUCE_COLL} ${CMAKE_BINARY_DIR}/bin/tesh ${TESH_OPTION} --cfg smpi/allreduce:${ALLREDUCE_COLL} --cd ${CMAKE_BINARY_DIR}/teshsuite/smpi ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/allreduce_coll.tesh)
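Each generated test runs the matching *_coll.tesh scenario under tesh, with the collective implementation forced from the outside through the --cfg smpi/allreduce option shown above; a tesh scenario records a command line and its expected output and fails on any mismatch, so the MPI code being exercised never changes, only the selected algorithm does. As a rough illustration (hypothetical code, not the actual teshsuite source), a minimal allreduce correctness check of the kind such a scenario drives could look like this:

/* Hypothetical minimal allreduce check, illustrating what an allreduce_coll-style
 * scenario exercises; this is not the actual teshsuite source. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
  int rank, size, errors = 0;
  int sbuf[16], rbuf[16];

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  for (int i = 0; i < 16; i++)
    sbuf[i] = rank + i;

  /* The algorithm under test is selected externally via --cfg=smpi/allreduce:<name>;
   * the application code itself stays the same. */
  MPI_Allreduce(sbuf, rbuf, 16, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

  /* The sum over all ranks of (rank + i) is size*(size-1)/2 + size*i. */
  for (int i = 0; i < 16; i++)
    if (rbuf[i] != size * (size - 1) / 2 + size * i)
      errors++;

  if (rank == 0)
    printf("allreduce check: %s\n", errors == 0 ? "OK" : "FAILED");

  MPI_Finalize();
  return errors != 0;
}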
+++ /dev/null
-#include "colls_private.h"
-/* IMPLEMENTED BY PITCH PATARASUK
- Non-topology-specific all-reduce operation, designed to be bandwidth-optimal */
-
-/* ** NOTE **
- Use -DMPICH2_REDUCTION if this code does not compile.
- The MPICH1 code also works on MPICH2 on our cluster, and the performance is similar.
- This code assumes a commutative and associative reduction operator (MPI_SUM, MPI_MAX, etc.).
-*/
-
-//#include <star-reduction.c>
-
-int
-smpi_coll_tuned_allreduce_NTS(void *sbuf, void *rbuf, int rcount,
- MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
-{
- int tag = COLL_TAG_ALLREDUCE;
- MPI_Status status;
- int rank, i, size, count;
- int send_offset, recv_offset;
- int remainder, remainder_flag, remainder_offset;
-
- rank = smpi_comm_rank(comm);
- size = smpi_comm_size(comm);
-
- /* make it compatible with all data type */
- MPI_Aint extent;
- extent = smpi_datatype_get_extent(dtype);
-
- /* when the communication size is smaller than the number of processes (not supported) */
- if (rcount < size) {
- return mpi_coll_allreduce_fun(sbuf, rbuf, rcount, dtype, op, comm);
- }
-
- /* when the communication size is not divisible by the number of processes:
- call the native implementation for the remaining chunk at the end of the operation */
- if (rcount % size != 0) {
- remainder = rcount % size;
- remainder_flag = 1;
- remainder_offset = (rcount / size) * size * extent;
- } else {
- remainder = remainder_flag = remainder_offset = 0;
- }
-
- /* the size of each point-to-point communication equals the size of the whole message
- divided by the number of processes
- */
- count = rcount / size;
-
- /* our ALL-REDUCE implementation:
- 1. copy (part of) send_buf to recv_buf
- 2. use a logical-ring reduce-scatter
- 3. use a logical-ring all-gather
- */
-
- // copy partial data
- send_offset = ((rank - 1 + size) % size) * count * extent;
- recv_offset = ((rank - 1 + size) % size) * count * extent;
- smpi_mpi_sendrecv((char *) sbuf + send_offset, count, dtype, rank, tag - 1,
- (char *) rbuf + recv_offset, count, dtype, rank, tag - 1, comm,
- &status);
-
- // reduce-scatter
- for (i = 0; i < (size - 1); i++) {
- send_offset = ((rank - 1 - i + size) % size) * count * extent;
- recv_offset = ((rank - 2 - i + size) % size) * count * extent;
- smpi_mpi_sendrecv((char *) rbuf + send_offset, count, dtype, ((rank + 1) % size),
- tag + i, (char *) rbuf + recv_offset, count, dtype,
- ((rank + size - 1) % size), tag + i, comm, &status);
-
- // accumulate the result into rbuf + recv_offset
- smpi_op_apply(op, (char *)sbuf + recv_offset, (char *)rbuf + recv_offset, &count, &dtype);
- }
-
- // all-gather
- for (i = 0; i < (size - 1); i++) {
- send_offset = ((rank - i + size) % size) * count * extent;
- recv_offset = ((rank - 1 - i + size) % size) * count * extent;
- smpi_mpi_sendrecv((char *) rbuf + send_offset, count, dtype, ((rank + 1) % size),
- tag + i, (char *) rbuf + recv_offset, count, dtype,
- ((rank + size - 1) % size), tag + i, comm, &status);
- }
-
- /* when the communication size is not divisible by the number of processes:
- call the native implementation for the remaining chunk at the end of the operation */
- if (remainder_flag) {
- XBT_WARN("MPI_allreduce_NTS use default MPI_allreduce.");
- smpi_mpi_allreduce((char *) sbuf + remainder_offset,
- (char *) rbuf + remainder_offset, remainder, dtype, op,
- comm);
- return MPI_SUCCESS;
- }
-
- return MPI_SUCCESS;
-}
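The file removed above implements exactly what its header comment describes: split the message into one chunk per rank (for example, rcount = 10 on 4 processes gives count = 2, with the trailing remainder = 2 elements at offset 8 * extent handed off to the default allreduce), run a reduce-scatter around a logical ring, then an all-gather around the same ring. For reference, here is a standalone, hypothetical sketch of that scheme written against plain MPI instead of the internal smpi_* helpers; it is restricted to doubles, assumes a commutative and associative operator, and skips the remainder path by deferring entirely to MPI_Allreduce when the count is not divisible by the communicator size.

/* Hypothetical standalone sketch of the removed logical-ring allreduce:
 * a reduce-scatter ring followed by an all-gather ring. Not the removed
 * file itself; plain MPI, doubles only, remainder handling omitted. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int ring_allreduce_double(const double *sbuf, double *rbuf, int rcount,
                                 MPI_Op op, MPI_Comm comm)
{
  int rank, size;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);

  if (rcount % size != 0)           /* fall back instead of handling a remainder */
    return MPI_Allreduce(sbuf, rbuf, rcount, MPI_DOUBLE, op, comm);

  const int count = rcount / size;  /* one chunk of the message per rank */
  const int right = (rank + 1) % size;
  const int left  = (rank + size - 1) % size;
  MPI_Status status;

  /* Seed rbuf with our own copy of chunk (rank - 1), as the removed code did. */
  int seed = (rank - 1 + size) % size;
  memcpy(rbuf + seed * count, sbuf + seed * count, count * sizeof(double));

  /* Reduce-scatter ring: after size - 1 steps, chunk `rank` of rbuf is complete. */
  for (int i = 0; i < size - 1; i++) {
    int send_chunk = (rank - 1 - i + 2 * size) % size;
    int recv_chunk = (rank - 2 - i + 2 * size) % size;
    MPI_Sendrecv(rbuf + send_chunk * count, count, MPI_DOUBLE, right, i,
                 rbuf + recv_chunk * count, count, MPI_DOUBLE, left, i,
                 comm, &status);
    /* Fold our own input chunk into the partial result just received. */
    MPI_Reduce_local(sbuf + recv_chunk * count, rbuf + recv_chunk * count,
                     count, MPI_DOUBLE, op);
  }

  /* All-gather ring: circulate the finished chunks around the ring. */
  for (int i = 0; i < size - 1; i++) {
    int send_chunk = (rank - i + 2 * size) % size;
    int recv_chunk = (rank - 1 - i + 2 * size) % size;
    MPI_Sendrecv(rbuf + send_chunk * count, count, MPI_DOUBLE, right, size + i,
                 rbuf + recv_chunk * count, count, MPI_DOUBLE, left, size + i,
                 comm, &status);
  }
  return MPI_SUCCESS;
}

int main(int argc, char *argv[])
{
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  const int n = 4 * size;           /* divisible by the number of ranks */
  double *sbuf = malloc(n * sizeof(double));
  double *ring = malloc(n * sizeof(double));
  double *ref  = malloc(n * sizeof(double));
  for (int i = 0; i < n; i++)
    sbuf[i] = rank + i;

  ring_allreduce_double(sbuf, ring, n, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(sbuf, ref, n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

  int errors = 0;
  for (int i = 0; i < n; i++)       /* integer-valued sums compare exactly */
    if (ring[i] != ref[i])
      errors++;
  if (rank == 0)
    printf("ring allreduce: %s\n", errors == 0 ? "OK" : "FAILED");

  free(sbuf); free(ring); free(ref);
  MPI_Finalize();
  return errors != 0;
}

The small main checks the sketch against the library MPI_Allreduce; it can be built with any MPI's mpicc and run under mpirun (or, in the SMPI context, compiled with smpicc and launched through smpirun).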