From c741788e9455cf69f04ed6ffeda08d674a1c3895 Mon Sep 17 00:00:00 2001 From: Augustin Degomme Date: Wed, 23 Feb 2022 00:25:21 +0100 Subject: [PATCH] Update OMPI selector. It was last updated from version 3.1.2, which was released in 2018, and the selection logic was modified since. Few algorithms will be added soon to match better. --- src/smpi/colls/smpi_openmpi_selector.cpp | 1586 ++++++++++++++++------ 1 file changed, 1176 insertions(+), 410 deletions(-) diff --git a/src/smpi/colls/smpi_openmpi_selector.cpp b/src/smpi/colls/smpi_openmpi_selector.cpp index 27ba787629..2809babd76 100644 --- a/src/smpi/colls/smpi_openmpi_selector.cpp +++ b/src/smpi/colls/smpi_openmpi_selector.cpp @@ -1,4 +1,5 @@ -/* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */ +/* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector + * Updated 02/2022 */ /* Copyright (c) 2009-2022. The SimGrid Team. * All rights reserved. */ @@ -10,42 +11,180 @@ #include +/* FIXME +add algos: +allreduce nonoverlapping, basic linear +alltoall linear_sync +bcast chain +reduce_scatter butterfly +scatter linear_nb +*/ + namespace simgrid { namespace smpi { int allreduce__ompi(const void *sbuf, void *rbuf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm) { - size_t dsize, block_dsize; - int comm_size = comm->size(); - const size_t intermediate_message = 10000; - - /** - * Decision function based on MX results from the Grig cluster at UTK. + size_t total_dsize = dtype->size() * (ptrdiff_t)count; + int communicator_size = comm->size(); + int alg = 1; + int(*funcs[]) (const void*, void*, int, MPI_Datatype, MPI_Op, MPI_Comm)={ + &allreduce__lr, + &allreduce__lr, + &allreduce__rdb, + &allreduce__lr, + &allreduce__ompi_ring_segmented, + &allreduce__rab_rdb + }; + + /** Algorithms: + * {1, "basic_linear"}, + * {2, "nonoverlapping"}, + * {3, "recursive_doubling"}, + * {4, "ring"}, + * {5, "segmented_ring"}, + * {6, "rabenseifner" * - * Currently, linear, recursive doubling, and nonoverlapping algorithms - * can handle both commutative and non-commutative operations. - * Ring algorithm does not support non-commutative operations. + * Currently, ring, segmented ring, and rabenseifner do not support + * non-commutative operations. */ - dsize = dtype->size(); - block_dsize = dsize * count; - - if (block_dsize < intermediate_message) { - return allreduce__rdb(sbuf, rbuf, count, dtype, op, comm); - } - - if( ((op==MPI_OP_NULL) || op->is_commutative()) && (count > comm_size) ) { - const size_t segment_size = 1 << 20; /* 1 MB */ - if ((comm_size * segment_size >= block_dsize)) { - //FIXME: ok, these are not the right algorithms, try to find closer ones - // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv) - return allreduce__lr(sbuf, rbuf, count, dtype, op, comm); + if ((op != MPI_OP_NULL) && not op->is_commutative()) { + if (communicator_size < 4) { + if (total_dsize < 131072) { + alg = 3; + } else { + alg = 1; + } + } else if (communicator_size < 8) { + alg = 3; + } else if (communicator_size < 16) { + if (total_dsize < 1048576) { + alg = 3; + } else { + alg = 2; + } + } else if (communicator_size < 128) { + alg = 3; + } else if (communicator_size < 256) { + if (total_dsize < 131072) { + alg = 2; + } else if (total_dsize < 524288) { + alg = 3; + } else { + alg = 2; + } + } else if (communicator_size < 512) { + if (total_dsize < 4096) { + alg = 2; + } else if (total_dsize < 524288) { + alg = 3; + } else { + alg = 2; + } + } else { + if (total_dsize < 2048) { + alg = 2; + } else { + alg = 3; + } + } + } else { + if (communicator_size < 4) { + if (total_dsize < 8) { + alg = 4; + } else if (total_dsize < 4096) { + alg = 3; + } else if (total_dsize < 8192) { + alg = 4; + } else if (total_dsize < 16384) { + alg = 3; + } else if (total_dsize < 65536) { + alg = 4; + } else if (total_dsize < 262144) { + alg = 5; + } else { + alg = 6; + } + } else if (communicator_size < 8) { + if (total_dsize < 16) { + alg = 4; + } else if (total_dsize < 8192) { + alg = 3; + } else { + alg = 6; + } + } else if (communicator_size < 16) { + if (total_dsize < 8192) { + alg = 3; + } else { + alg = 6; + } + } else if (communicator_size < 32) { + if (total_dsize < 64) { + alg = 5; + } else if (total_dsize < 4096) { + alg = 3; + } else { + alg = 6; + } + } else if (communicator_size < 64) { + if (total_dsize < 128) { + alg = 5; + } else { + alg = 6; + } + } else if (communicator_size < 128) { + if (total_dsize < 262144) { + alg = 3; + } else { + alg = 6; + } + } else if (communicator_size < 256) { + if (total_dsize < 131072) { + alg = 2; + } else if (total_dsize < 262144) { + alg = 3; + } else { + alg = 6; + } + } else if (communicator_size < 512) { + if (total_dsize < 4096) { + alg = 2; + } else { + alg = 6; + } + } else if (communicator_size < 2048) { + if (total_dsize < 2048) { + alg = 2; + } else if (total_dsize < 16384) { + alg = 3; + } else { + alg = 6; + } + } else if (communicator_size < 4096) { + if (total_dsize < 2048) { + alg = 2; + } else if (total_dsize < 4096) { + alg = 5; + } else if (total_dsize < 16384) { + alg = 3; + } else { + alg = 6; + } } else { - return allreduce__ompi_ring_segmented(sbuf, rbuf, count, dtype, op, comm /*segment_size*/); + if (total_dsize < 2048) { + alg = 2; + } else if (total_dsize < 16384) { + alg = 5; + } else if (total_dsize < 32768) { + alg = 3; + } else { + alg = 6; + } } } - - return allreduce__redbcast(sbuf, rbuf, count, dtype, op, comm); + return funcs[alg-1](sbuf, rbuf, count, dtype, op, comm); } @@ -56,27 +195,185 @@ int alltoall__ompi(const void *sbuf, int scount, MPI_Datatype rdtype, MPI_Comm comm) { - int communicator_size; - size_t dsize, block_dsize; - communicator_size = comm->size(); - - /* Decision function based on measurement on Grig cluster at - the University of Tennessee (2GB MX) up to 64 nodes. - Has better performance for messages of intermediate sizes than the old one */ - /* determine block size */ - dsize = sdtype->size(); - block_dsize = dsize * scount; - - if ((block_dsize < 200) && (communicator_size > 12)) { - return alltoall__bruck(sbuf, scount, sdtype, - rbuf, rcount, rdtype, comm); + int alg = 1; + size_t dsize, total_dsize; + int communicator_size = comm->size(); - } else if (block_dsize < 3000) { - return alltoall__basic_linear(sbuf, scount, sdtype, - rbuf, rcount, rdtype, comm); + if (MPI_IN_PLACE != sbuf) { + dsize = sdtype->size(); + } else { + dsize = rdtype->size(); + } + total_dsize = dsize * (ptrdiff_t)scount; + int (*funcs[])(const void *, int, MPI_Datatype, void*, int, MPI_Datatype, MPI_Comm) = { + &alltoall__basic_linear, + &alltoall__pair, + &alltoall__bruck, + &alltoall__basic_linear, + &alltoall__basic_linear + }; + /** Algorithms: + * {1, "linear"}, + * {2, "pairwise"}, + * {3, "modified_bruck"}, + * {4, "linear_sync"}, + * {5, "two_proc"}, + */ + if (communicator_size == 2) { + if (total_dsize < 2) { + alg = 2; + } else if (total_dsize < 4) { + alg = 5; + } else if (total_dsize < 16) { + alg = 2; + } else if (total_dsize < 64) { + alg = 5; + } else if (total_dsize < 256) { + alg = 2; + } else if (total_dsize < 4096) { + alg = 5; + } else if (total_dsize < 32768) { + alg = 2; + } else if (total_dsize < 262144) { + alg = 4; + } else if (total_dsize < 1048576) { + alg = 5; + } else { + alg = 2; + } + } else if (communicator_size < 8) { + if (total_dsize < 8192) { + alg = 4; + } else if (total_dsize < 16384) { + alg = 1; + } else if (total_dsize < 65536) { + alg = 4; + } else if (total_dsize < 524288) { + alg = 1; + } else if (total_dsize < 1048576) { + alg = 2; + } else { + alg = 1; + } + } else if (communicator_size < 16) { + if (total_dsize < 262144) { + alg = 4; + } else { + alg = 1; + } + } else if (communicator_size < 32) { + if (total_dsize < 4) { + alg = 4; + } else if (total_dsize < 512) { + alg = 3; + } else if (total_dsize < 8192) { + alg = 4; + } else if (total_dsize < 32768) { + alg = 1; + } else if (total_dsize < 262144) { + alg = 4; + } else if (total_dsize < 524288) { + alg = 1; + } else { + alg = 4; + } + } else if (communicator_size < 64) { + if (total_dsize < 512) { + alg = 3; + } else if (total_dsize < 524288) { + alg = 1; + } else { + alg = 4; + } + } else if (communicator_size < 128) { + if (total_dsize < 1024) { + alg = 3; + } else if (total_dsize < 2048) { + alg = 1; + } else if (total_dsize < 4096) { + alg = 4; + } else if (total_dsize < 262144) { + alg = 1; + } else { + alg = 2; + } + } else if (communicator_size < 256) { + if (total_dsize < 1024) { + alg = 3; + } else if (total_dsize < 2048) { + alg = 4; + } else if (total_dsize < 262144) { + alg = 1; + } else { + alg = 2; + } + } else if (communicator_size < 512) { + if (total_dsize < 1024) { + alg = 3; + } else if (total_dsize < 8192) { + alg = 4; + } else if (total_dsize < 32768) { + alg = 1; + } else { + alg = 2; + } + } else if (communicator_size < 1024) { + if (total_dsize < 512) { + alg = 3; + } else if (total_dsize < 8192) { + alg = 4; + } else if (total_dsize < 16384) { + alg = 1; + } else if (total_dsize < 131072) { + alg = 4; + } else if (total_dsize < 262144) { + alg = 1; + } else { + alg = 2; + } + } else if (communicator_size < 2048) { + if (total_dsize < 512) { + alg = 3; + } else if (total_dsize < 1024) { + alg = 4; + } else if (total_dsize < 2048) { + alg = 1; + } else if (total_dsize < 16384) { + alg = 4; + } else if (total_dsize < 262144) { + alg = 1; + } else { + alg = 4; + } + } else if (communicator_size < 4096) { + if (total_dsize < 1024) { + alg = 3; + } else if (total_dsize < 4096) { + alg = 4; + } else if (total_dsize < 8192) { + alg = 1; + } else if (total_dsize < 131072) { + alg = 4; + } else { + alg = 1; + } + } else { + if (total_dsize < 2048) { + alg = 3; + } else if (total_dsize < 8192) { + alg = 4; + } else if (total_dsize < 16384) { + alg = 1; + } else if (total_dsize < 32768) { + alg = 4; + } else if (total_dsize < 65536) { + alg = 1; + } else { + alg = 4; + } } - return alltoall__ring(sbuf, scount, sdtype, + return funcs[alg-1](sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); } @@ -87,121 +384,220 @@ int alltoallv__ompi(const void *sbuf, const int *scounts, const int *sdisps, MPI_Comm comm ) { - /* For starters, just keep the original algorithm. */ - return alltoallv__ring(sbuf, scounts, sdisps, sdtype, + int communicator_size = comm->size(); + int alg = 1; + int (*funcs[])(const void *, const int*, const int*, MPI_Datatype, void*, const int*, const int*, MPI_Datatype, MPI_Comm) = { + &alltoallv__ompi_basic_linear, + &alltoallv__pair + }; + /** Algorithms: + * {1, "basic_linear"}, + * {2, "pairwise"}, + * + * We can only optimize based on com size + */ + if (communicator_size < 4) { + alg = 2; + } else if (communicator_size < 64) { + alg = 1; + } else if (communicator_size < 128) { + alg = 2; + } else if (communicator_size < 256) { + alg = 1; + } else if (communicator_size < 1024) { + alg = 2; + } else { + alg = 1; + } + return funcs[alg-1](sbuf, scounts, sdisps, sdtype, rbuf, rcounts, rdisps,rdtype, comm); } int barrier__ompi(MPI_Comm comm) -{ int communicator_size = comm->size(); - - if( 2 == communicator_size ) - return barrier__ompi_two_procs(comm); -/* * Basic optimisation. If we have a power of 2 number of nodes*/ -/* * the use the recursive doubling algorithm, otherwise*/ -/* * bruck is the one we want.*/ - { - bool has_one = false; - for( ; communicator_size > 0; communicator_size >>= 1 ) { - if( communicator_size & 0x1 ) { - if( has_one ) - return barrier__ompi_bruck(comm); - has_one = true; - } - } +{ + int communicator_size = comm->size(); + int alg = 1; + int (*funcs[])(MPI_Comm) = { + &barrier__ompi_basic_linear, + &barrier__ompi_basic_linear, + &barrier__ompi_recursivedoubling, + &barrier__ompi_bruck, + &barrier__ompi_two_procs, + &barrier__ompi_tree + }; + /** Algorithms: + * {1, "linear"}, + * {2, "double_ring"}, + * {3, "recursive_doubling"}, + * {4, "bruck"}, + * {5, "two_proc"}, + * {6, "tree"}, + * + * We can only optimize based on com size + */ + if (communicator_size < 4) { + alg = 3; + } else if (communicator_size < 8) { + alg = 1; + } else if (communicator_size < 64) { + alg = 3; + } else if (communicator_size < 256) { + alg = 4; + } else if (communicator_size < 512) { + alg = 6; + } else if (communicator_size < 1024) { + alg = 4; + } else if (communicator_size < 4096) { + alg = 6; + } else { + alg = 4; } - return barrier__ompi_recursivedoubling(comm); + + return funcs[alg-1](comm); } int bcast__ompi(void *buff, int count, MPI_Datatype datatype, int root, MPI_Comm comm) { - /* Decision function based on MX results for - messages up to 36MB and communicator sizes up to 64 nodes */ - const size_t small_message_size = 2048; - const size_t intermediate_message_size = 370728; - const double a_p16 = 3.2118e-6; /* [1 / byte] */ - const double b_p16 = 8.7936; - const double a_p64 = 2.3679e-6; /* [1 / byte] */ - const double b_p64 = 1.1787; - const double a_p128 = 1.6134e-6; /* [1 / byte] */ - const double b_p128 = 2.1102; - - int communicator_size; - //int segsize = 0; - size_t message_size, dsize; + int alg = 1; + size_t total_dsize, dsize; - communicator_size = comm->size(); + int communicator_size = comm->size(); - /* else we need data size for decision function */ dsize = datatype->size(); - message_size = dsize * (unsigned long)count; /* needed for decision */ - - /* Handle messages of small and intermediate size, and - single-element broadcasts */ - if ((message_size < small_message_size) || (count <= 1)) { - /* Binomial without segmentation */ - return bcast__binomial_tree(buff, count, datatype, root, comm); - - } else if (message_size < intermediate_message_size) { - // SplittedBinary with 1KB segments - return bcast__ompi_split_bintree(buff, count, datatype, root, comm); - - } - //Handle large message sizes - else if (communicator_size < (a_p128 * message_size + b_p128)) { - //Pipeline with 128KB segments - //segsize = 1024 << 7; - return bcast__ompi_pipeline(buff, count, datatype, root, comm); - - - } else if (communicator_size < 13) { - // Split Binary with 8KB segments - return bcast__ompi_split_bintree(buff, count, datatype, root, comm); - - } else if (communicator_size < (a_p64 * message_size + b_p64)) { - // Pipeline with 64KB segments - //segsize = 1024 << 6; - return bcast__ompi_pipeline(buff, count, datatype, root, comm); - - - } else if (communicator_size < (a_p16 * message_size + b_p16)) { - //Pipeline with 16KB segments - //segsize = 1024 << 4; - return bcast__ompi_pipeline(buff, count, datatype, root, comm); - - - } - /* Pipeline with 8KB segments */ - //segsize = 1024 << 3; - return bcast__flattree_pipeline(buff, count, datatype, root, comm /*segsize*/); -#if 0 - /* this is based on gige measurements */ - - if (communicator_size < 4) { - return bcast__intra_basic_linear(buff, count, datatype, root, comm, module); - } - if (communicator_size == 4) { - if (message_size < 524288) segsize = 0; - else segsize = 16384; - return bcast__intra_bintree(buff, count, datatype, root, comm, module, segsize); - } - if (communicator_size <= 8 && message_size < 4096) { - return bcast__intra_basic_linear(buff, count, datatype, root, comm, module); - } - if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) { - segsize = 16384; - return bcast__intra_bintree(buff, count, datatype, root, comm, module, segsize); - } - if (message_size >= 524288) { - segsize = 16384; - return bcast__intra_pipeline(buff, count, datatype, root, comm, module, segsize); + total_dsize = dsize * (unsigned long)count; + int (*funcs[])(void*, int, MPI_Datatype, int, MPI_Comm) = { + &bcast__NTSL, + &bcast__ompi_pipeline, + &bcast__ompi_pipeline, + &bcast__ompi_split_bintree, + &bcast__NTSB, + &bcast__binomial_tree, + &bcast__mvapich2_knomial_intra_node, + &bcast__scatter_rdb_allgather, + &bcast__scatter_LR_allgather, + }; + /** Algorithms: + * {1, "basic_linear"}, + * {2, "chain"}, + * {3, "pipeline"}, + * {4, "split_binary_tree"}, + * {5, "binary_tree"}, + * {6, "binomial"}, + * {7, "knomial"}, + * {8, "scatter_allgather"}, + * {9, "scatter_allgather_ring"}, + */ + if (communicator_size < 4) { + if (total_dsize < 32) { + alg = 3; + } else if (total_dsize < 256) { + alg = 5; + } else if (total_dsize < 512) { + alg = 3; + } else if (total_dsize < 1024) { + alg = 7; + } else if (total_dsize < 32768) { + alg = 1; + } else if (total_dsize < 131072) { + alg = 5; + } else if (total_dsize < 262144) { + alg = 2; + } else if (total_dsize < 524288) { + alg = 1; + } else if (total_dsize < 1048576) { + alg = 6; + } else { + alg = 5; + } + } else if (communicator_size < 8) { + if (total_dsize < 64) { + alg = 5; + } else if (total_dsize < 128) { + alg = 6; + } else if (total_dsize < 2048) { + alg = 5; + } else if (total_dsize < 8192) { + alg = 6; + } else if (total_dsize < 1048576) { + alg = 1; + } else { + alg = 2; + } + } else if (communicator_size < 16) { + if (total_dsize < 8) { + alg = 7; + } else if (total_dsize < 64) { + alg = 5; + } else if (total_dsize < 4096) { + alg = 7; + } else if (total_dsize < 16384) { + alg = 5; + } else if (total_dsize < 32768) { + alg = 6; + } else { + alg = 1; + } + } else if (communicator_size < 32) { + if (total_dsize < 4096) { + alg = 7; + } else if (total_dsize < 1048576) { + alg = 6; + } else { + alg = 8; + } + } else if (communicator_size < 64) { + if (total_dsize < 2048) { + alg = 6; + } else { + alg = 7; + } + } else if (communicator_size < 128) { + alg = 7; + } else if (communicator_size < 256) { + if (total_dsize < 2) { + alg = 6; + } else if (total_dsize < 16384) { + alg = 5; + } else if (total_dsize < 32768) { + alg = 1; + } else if (total_dsize < 65536) { + alg = 5; + } else { + alg = 7; + } + } else if (communicator_size < 1024) { + if (total_dsize < 16384) { + alg = 7; + } else if (total_dsize < 32768) { + alg = 4; + } else { + alg = 7; + } + } else if (communicator_size < 2048) { + if (total_dsize < 524288) { + alg = 7; + } else { + alg = 8; + } + } else if (communicator_size < 4096) { + if (total_dsize < 262144) { + alg = 7; + } else { + alg = 8; + } + } else { + if (total_dsize < 8192) { + alg = 7; + } else if (total_dsize < 16384) { + alg = 5; + } else if (total_dsize < 262144) { + alg = 7; + } else { + alg = 8; + } } - segsize = 0; - /* once tested can swap this back in */ - /* return bcast__intra_bmtree(buff, count, datatype, root, comm, segsize); */ - return bcast__intra_bintree(buff, count, datatype, root, comm, module, segsize); -#endif /* 0 */ + return funcs[alg-1](buff, count, datatype, root, comm); } int reduce__ompi(const void *sendbuf, void *recvbuf, @@ -209,100 +605,155 @@ int reduce__ompi(const void *sendbuf, void *recvbuf, MPI_Op op, int root, MPI_Comm comm) { - int communicator_size=0; - //int segsize = 0; - size_t message_size, dsize; - const double a1 = 0.6016 / 1024.0; /* [1/B] */ - const double b1 = 1.3496; - const double a2 = 0.0410 / 1024.0; /* [1/B] */ - const double b2 = 9.7128; - const double a3 = 0.0422 / 1024.0; /* [1/B] */ - const double b3 = 1.1614; - //const double a4 = 0.0033 / 1024.0; [1/B] - //const double b4 = 1.6761; - - /* no limit on # of outstanding requests */ - //const int max_requests = 0; - - communicator_size = comm->size(); + size_t total_dsize, dsize; + int alg = 1; + int communicator_size = comm->size(); - /* need data size for decision function */ dsize=datatype->size(); - message_size = dsize * count; /* needed for decision */ - - /** - * If the operation is non commutative we currently have choice of linear - * or in-order binary tree algorithm. + total_dsize = dsize * count; + int (*funcs[])(const void*, void*, int, MPI_Datatype, MPI_Op, int, MPI_Comm) = { + &reduce__ompi_basic_linear, + &reduce__ompi_chain, + &reduce__ompi_pipeline, + &reduce__ompi_binary, + &reduce__ompi_binomial, + &reduce__ompi_in_order_binary, + //&reduce__rab our rab can't be used with all datatypes + &reduce__ompi_basic_linear + }; + /** Algorithms: + * {1, "linear"}, + * {2, "chain"}, + * {3, "pipeline"}, + * {4, "binary"}, + * {5, "binomial"}, + * {6, "in-order_binary"}, + * {7, "rabenseifner"}, + * + * Currently, only linear and in-order binary tree algorithms are + * capable of non commutative ops. */ - if ((op != MPI_OP_NULL) && not op->is_commutative()) { - if ((communicator_size < 12) && (message_size < 2048)) { - return reduce__ompi_basic_linear(sendbuf, recvbuf, count, datatype, op, root, comm /*, module*/); - } - return reduce__ompi_in_order_binary(sendbuf, recvbuf, count, datatype, op, root, comm /*, module, - 0, max_requests*/); + if ((op != MPI_OP_NULL) && not op->is_commutative()) { + if (communicator_size < 4) { + if (total_dsize < 8) { + alg = 6; + } else { + alg = 1; + } + } else if (communicator_size < 8) { + alg = 1; + } else if (communicator_size < 16) { + if (total_dsize < 1024) { + alg = 6; + } else if (total_dsize < 8192) { + alg = 1; + } else if (total_dsize < 16384) { + alg = 6; + } else if (total_dsize < 262144) { + alg = 1; + } else { + alg = 6; + } + } else if (communicator_size < 128) { + alg = 6; + } else if (communicator_size < 256) { + if (total_dsize < 512) { + alg = 6; + } else if (total_dsize < 1024) { + alg = 1; + } else { + alg = 6; + } + } else { + alg = 6; + } + } else { + if (communicator_size < 4) { + if (total_dsize < 8) { + alg = 7; + } else if (total_dsize < 16) { + alg = 4; + } else if (total_dsize < 32) { + alg = 3; + } else if (total_dsize < 262144) { + alg = 1; + } else if (total_dsize < 524288) { + alg = 3; + } else if (total_dsize < 1048576) { + alg = 2; + } else { + alg = 3; + } + } else if (communicator_size < 8) { + if (total_dsize < 4096) { + alg = 4; + } else if (total_dsize < 65536) { + alg = 2; + } else if (total_dsize < 262144) { + alg = 5; + } else if (total_dsize < 524288) { + alg = 1; + } else if (total_dsize < 1048576) { + alg = 5; + } else { + alg = 1; + } + } else if (communicator_size < 16) { + if (total_dsize < 8192) { + alg = 4; + } else { + alg = 5; + } + } else if (communicator_size < 32) { + if (total_dsize < 4096) { + alg = 4; + } else { + alg = 5; + } + } else if (communicator_size < 256) { + alg = 5; + } else if (communicator_size < 512) { + if (total_dsize < 8192) { + alg = 5; + } else if (total_dsize < 16384) { + alg = 6; + } else { + alg = 5; + } + } else if (communicator_size < 2048) { + alg = 5; + } else if (communicator_size < 4096) { + if (total_dsize < 512) { + alg = 5; + } else if (total_dsize < 1024) { + alg = 6; + } else if (total_dsize < 8192) { + alg = 5; + } else if (total_dsize < 16384) { + alg = 6; + } else { + alg = 5; + } + } else { + if (total_dsize < 16) { + alg = 5; + } else if (total_dsize < 32) { + alg = 6; + } else if (total_dsize < 1024) { + alg = 5; + } else if (total_dsize < 2048) { + alg = 6; + } else if (total_dsize < 8192) { + alg = 5; + } else if (total_dsize < 16384) { + alg = 6; + } else { + alg = 5; + } + } } - if ((communicator_size < 8) && (message_size < 512)){ - /* Linear_0K */ - return reduce__ompi_basic_linear(sendbuf, recvbuf, count, datatype, op, root, comm); - } else if (((communicator_size < 8) && (message_size < 20480)) || - (message_size < 2048) || (count <= 1)) { - /* Binomial_0K */ - //segsize = 0; - return reduce__ompi_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module, segsize, max_requests*/); - } else if (communicator_size > (a1 * message_size + b1)) { - // Binomial_1K - //segsize = 1024; - return reduce__ompi_binomial(sendbuf, recvbuf, count, datatype, op, root, comm/*, module, - segsize, max_requests*/); - } else if (communicator_size > (a2 * message_size + b2)) { - // Pipeline_1K - //segsize = 1024; - return reduce__ompi_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm/*, module, - segsize, max_requests*/); - } else if (communicator_size > (a3 * message_size + b3)) { - // Binary_32K - //segsize = 32*1024; - return reduce__ompi_binary( sendbuf, recvbuf, count, datatype, op, root, - comm/*, module, segsize, max_requests*/); - } -// if (communicator_size > (a4 * message_size + b4)) { - // Pipeline_32K -// segsize = 32*1024; -// } else { - // Pipeline_64K -// segsize = 64*1024; -// } - return reduce__ompi_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm/*, module, - segsize, max_requests*/); - -#if 0 - /* for small messages use linear algorithm */ - if (message_size <= 4096) { - segsize = 0; - fanout = communicator_size - 1; - /* when linear implemented or taken from basic put here, right now using chain as a linear system */ - /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */ - return reduce__intra_basic_linear(sendbuf, recvbuf, count, datatype, op, root, comm, module); - /* return reduce__intra_chain(sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */ - } - if (message_size < 524288) { - if (message_size <= 65536 ) { - segsize = 32768; - fanout = 8; - } else { - segsize = 1024; - fanout = communicator_size/2; - } - /* later swap this for a binary tree */ - /* fanout = 2; */ - return reduce__intra_chain(sendbuf, recvbuf, count, datatype, op, root, comm, module, - segsize, fanout, max_requests); - } - segsize = 1024; - return reduce__intra_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm, module, - segsize, max_requests); -#endif /* 0 */ + return funcs[alg-1] (sendbuf, recvbuf, count, datatype, op, root, comm); } int reduce_scatter__ompi(const void *sbuf, void *rbuf, @@ -312,43 +763,141 @@ int reduce_scatter__ompi(const void *sbuf, void *rbuf, MPI_Comm comm ) { - int comm_size, i, pow2; - size_t total_message_size, dsize; - const double a = 0.0012; - const double b = 8.0; - const size_t small_message_size = 12 * 1024; - const size_t large_message_size = 256 * 1024; + size_t total_dsize, dsize; + int communicator_size = comm->size(); + int alg = 1; int zerocounts = 0; - - XBT_DEBUG("reduce_scatter__ompi"); - - comm_size = comm->size(); - // We need data size for decision function dsize=dtype->size(); - total_message_size = 0; - for (i = 0; i < comm_size; i++) { - total_message_size += rcounts[i]; - if (0 == rcounts[i]) { - zerocounts = 1; - } + total_dsize = 0; + for (int i = 0; i < communicator_size; i++) { + total_dsize += rcounts[i]; + // if (0 == rcounts[i]) { + // zerocounts = 1; + //} } - + total_dsize *= dsize; + int (*funcs[])(const void*, void*, const int*, MPI_Datatype, MPI_Op, MPI_Comm) = { + &reduce_scatter__default, + &reduce_scatter__ompi_basic_recursivehalving, + &reduce_scatter__ompi_ring, + &reduce_scatter__ompi_ring, + }; + /** Algorithms: + * {1, "non-overlapping"}, + * {2, "recursive_halving"}, + * {3, "ring"}, + * {4, "butterfly"}, + * + * Non commutative algorithm capability needs re-investigation. + * Defaulting to non overlapping for non commutative ops. + */ if (((op != MPI_OP_NULL) && not op->is_commutative()) || (zerocounts)) { - reduce_scatter__default(sbuf, rbuf, rcounts, dtype, op, comm); - return MPI_SUCCESS; + alg = 1; + } else { + if (communicator_size < 4) { + if (total_dsize < 65536) { + alg = 3; + } else if (total_dsize < 131072) { + alg = 4; + } else { + alg = 3; + } + } else if (communicator_size < 8) { + if (total_dsize < 8) { + alg = 1; + } else if (total_dsize < 262144) { + alg = 2; + } else { + alg = 3; + } + } else if (communicator_size < 32) { + if (total_dsize < 262144) { + alg = 2; + } else { + alg = 3; + } + } else if (communicator_size < 64) { + if (total_dsize < 64) { + alg = 1; + } else if (total_dsize < 2048) { + alg = 2; + } else if (total_dsize < 524288) { + alg = 4; + } else { + alg = 3; + } + } else if (communicator_size < 128) { + if (total_dsize < 256) { + alg = 1; + } else if (total_dsize < 512) { + alg = 2; + } else if (total_dsize < 2048) { + alg = 4; + } else if (total_dsize < 4096) { + alg = 2; + } else { + alg = 4; + } + } else if (communicator_size < 256) { + if (total_dsize < 256) { + alg = 1; + } else if (total_dsize < 512) { + alg = 2; + } else { + alg = 4; + } + } else if (communicator_size < 512) { + if (total_dsize < 256) { + alg = 1; + } else if (total_dsize < 1024) { + alg = 2; + } else { + alg = 4; + } + } else if (communicator_size < 1024) { + if (total_dsize < 512) { + alg = 1; + } else if (total_dsize < 2048) { + alg = 2; + } else if (total_dsize < 8192) { + alg = 4; + } else if (total_dsize < 16384) { + alg = 2; + } else { + alg = 4; + } + } else if (communicator_size < 2048) { + if (total_dsize < 512) { + alg = 1; + } else if (total_dsize < 4096) { + alg = 2; + } else if (total_dsize < 16384) { + alg = 4; + } else if (total_dsize < 32768) { + alg = 2; + } else { + alg = 4; + } + } else if (communicator_size < 4096) { + if (total_dsize < 512) { + alg = 1; + } else if (total_dsize < 4096) { + alg = 2; + } else { + alg = 4; + } + } else { + if (total_dsize < 1024) { + alg = 1; + } else if (total_dsize < 8192) { + alg = 2; + } else { + alg = 4; + } + } } - total_message_size *= dsize; - - // compute the nearest power of 2 - for (pow2 = 1; pow2 < comm_size; pow2 <<= 1); - - if ((total_message_size <= small_message_size) || - ((total_message_size <= large_message_size) && (pow2 == comm_size)) || - (comm_size >= a * total_message_size + b)) { - return reduce_scatter__ompi_basic_recursivehalving(sbuf, rbuf, rcounts, dtype, op, comm); - } - return reduce_scatter__ompi_ring(sbuf, rbuf, rcounts, dtype, op, comm); + return funcs[alg-1] (sbuf, rbuf, rcounts, dtype, op, comm); } int allgather__ompi(const void *sbuf, int scount, @@ -358,77 +907,138 @@ int allgather__ompi(const void *sbuf, int scount, MPI_Comm comm ) { - int communicator_size, pow2_size; + int communicator_size; size_t dsize, total_dsize; - + int alg = 1; communicator_size = comm->size(); - - /* Special case for 2 processes */ - if (communicator_size == 2) { - return allgather__pair(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm/*, module*/); + if (MPI_IN_PLACE != sbuf) { + dsize = sdtype->size(); + } else { + dsize = rdtype->size(); } - - /* Determine complete data size */ - dsize=sdtype->size(); - total_dsize = dsize * scount * communicator_size; - - for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<=1); - - /* Decision based on MX 2Gb results from Grig cluster at - The University of Tennesse, Knoxville - - if total message size is less than 50KB use either bruck or - recursive doubling for non-power of two and power of two nodes, - respectively. - - else use ring and neighbor exchange algorithms for odd and even - number of nodes, respectively. - */ - if (total_dsize < 50000) { - if (pow2_size == communicator_size) { - return allgather__rdb(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm); - } else { - return allgather__bruck(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm); + total_dsize = dsize * (ptrdiff_t)scount; + int (*funcs[])(const void*, int, MPI_Datatype, void*, int, MPI_Datatype, MPI_Comm) = { + &allgather__NTSLR_NB, + &allgather__bruck, + &allgather__rdb, + &allgather__ring, + &allgather__ompi_neighborexchange, + &allgather__pair + }; + /** Algorithms: + * {1, "linear"}, + * {2, "bruck"}, + * {3, "recursive_doubling"}, + * {4, "ring"}, + * {5, "neighbor"}, + * {6, "two_proc"} + */ + if (communicator_size == 2) { + alg = 6; + } else if (communicator_size < 32) { + alg = 3; + } else if (communicator_size < 64) { + if (total_dsize < 1024) { + alg = 3; + } else if (total_dsize < 65536) { + alg = 5; + } else { + alg = 4; + } + } else if (communicator_size < 128) { + if (total_dsize < 512) { + alg = 3; + } else if (total_dsize < 65536) { + alg = 5; + } else { + alg = 4; + } + } else if (communicator_size < 256) { + if (total_dsize < 512) { + alg = 3; + } else if (total_dsize < 131072) { + alg = 5; + } else if (total_dsize < 524288) { + alg = 4; + } else if (total_dsize < 1048576) { + alg = 5; + } else { + alg = 4; + } + } else if (communicator_size < 512) { + if (total_dsize < 32) { + alg = 3; + } else if (total_dsize < 128) { + alg = 2; + } else if (total_dsize < 1024) { + alg = 3; + } else if (total_dsize < 131072) { + alg = 5; + } else if (total_dsize < 524288) { + alg = 4; + } else if (total_dsize < 1048576) { + alg = 5; + } else { + alg = 4; + } + } else if (communicator_size < 1024) { + if (total_dsize < 64) { + alg = 3; + } else if (total_dsize < 256) { + alg = 2; + } else if (total_dsize < 2048) { + alg = 3; + } else { + alg = 5; + } + } else if (communicator_size < 2048) { + if (total_dsize < 4) { + alg = 3; + } else if (total_dsize < 8) { + alg = 2; + } else if (total_dsize < 16) { + alg = 3; + } else if (total_dsize < 32) { + alg = 2; + } else if (total_dsize < 256) { + alg = 3; + } else if (total_dsize < 512) { + alg = 2; + } else if (total_dsize < 4096) { + alg = 3; + } else { + alg = 5; + } + } else if (communicator_size < 4096) { + if (total_dsize < 32) { + alg = 2; + } else if (total_dsize < 128) { + alg = 3; + } else if (total_dsize < 512) { + alg = 2; + } else if (total_dsize < 4096) { + alg = 3; + } else { + alg = 5; } } else { - if (communicator_size % 2) { - return allgather__ring(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm); + if (total_dsize < 2) { + alg = 3; + } else if (total_dsize < 8) { + alg = 2; + } else if (total_dsize < 16) { + alg = 3; + } else if (total_dsize < 512) { + alg = 2; + } else if (total_dsize < 4096) { + alg = 3; } else { - return allgather__ompi_neighborexchange(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm); + alg = 5; } } -#if defined(USE_MPICH2_DECISION) - /* Decision as in MPICH-2 - presented in Thakur et.al. "Optimization of Collective Communication - Operations in MPICH", International Journal of High Performance Computing - Applications, Vol. 19, No. 1, 49-66 (2005) - - for power-of-two processes and small and medium size messages - (up to 512KB) use recursive doubling - - for non-power-of-two processes and small messages (80KB) use bruck, - - for everything else use ring. - */ - if ((pow2_size == communicator_size) && (total_dsize < 524288)) { - return allgather__rdb(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm); - } else if (total_dsize <= 81920) { - return allgather__bruck(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm); - } - return allgather__ring(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm); -#endif /* defined(USE_MPICH2_DECISION) */ + return funcs[alg-1](sbuf, scount, sdtype, rbuf, rcount, rdtype, comm); + } int allgatherv__ompi(const void *sbuf, int scount, @@ -442,40 +1052,125 @@ int allgatherv__ompi(const void *sbuf, int scount, int i; int communicator_size; size_t dsize, total_dsize; - + int alg = 1; communicator_size = comm->size(); - - /* Special case for 2 processes */ - if (communicator_size == 2) { - return allgatherv__pair(sbuf, scount, sdtype, - rbuf, rcounts, rdispls, rdtype, - comm); + if (MPI_IN_PLACE != sbuf) { + dsize = sdtype->size(); + } else { + dsize = rdtype->size(); } - /* Determine complete data size */ - dsize=sdtype->size(); total_dsize = 0; for (i = 0; i < communicator_size; i++) { total_dsize += dsize * rcounts[i]; } - /* Decision based on allgather decision. */ - if (total_dsize < 50000) { - return allgatherv__ompi_bruck(sbuf, scount, sdtype, - rbuf, rcounts, rdispls, rdtype, - comm); - + /* use the per-rank data size as basis, similar to allgather */ + size_t per_rank_dsize = total_dsize / communicator_size; + + int (*funcs[])(const void*, int, MPI_Datatype, void*, const int*, const int*, MPI_Datatype, MPI_Comm) = { + &allgatherv__GB, + &allgatherv__ompi_bruck, + &allgatherv__mpich_ring, + &allgatherv__ompi_neighborexchange, + &allgatherv__pair + }; + /** Algorithms: + * {1, "default"}, + * {2, "bruck"}, + * {3, "ring"}, + * {4, "neighbor"}, + * {5, "two_proc"}, + */ + if (communicator_size == 2) { + if (per_rank_dsize < 2048) { + alg = 3; + } else if (per_rank_dsize < 4096) { + alg = 5; + } else if (per_rank_dsize < 8192) { + alg = 3; + } else { + alg = 5; + } + } else if (communicator_size < 8) { + if (per_rank_dsize < 256) { + alg = 1; + } else if (per_rank_dsize < 4096) { + alg = 4; + } else if (per_rank_dsize < 8192) { + alg = 3; + } else if (per_rank_dsize < 16384) { + alg = 4; + } else if (per_rank_dsize < 262144) { + alg = 2; + } else { + alg = 4; + } + } else if (communicator_size < 16) { + if (per_rank_dsize < 1024) { + alg = 1; + } else { + alg = 2; + } + } else if (communicator_size < 32) { + if (per_rank_dsize < 128) { + alg = 1; + } else if (per_rank_dsize < 262144) { + alg = 2; + } else { + alg = 3; + } + } else if (communicator_size < 64) { + if (per_rank_dsize < 256) { + alg = 1; + } else if (per_rank_dsize < 8192) { + alg = 2; + } else { + alg = 3; + } + } else if (communicator_size < 128) { + if (per_rank_dsize < 256) { + alg = 1; + } else if (per_rank_dsize < 4096) { + alg = 2; + } else { + alg = 3; + } + } else if (communicator_size < 256) { + if (per_rank_dsize < 1024) { + alg = 2; + } else if (per_rank_dsize < 65536) { + alg = 4; + } else { + alg = 3; + } + } else if (communicator_size < 512) { + if (per_rank_dsize < 1024) { + alg = 2; + } else { + alg = 3; + } + } else if (communicator_size < 1024) { + if (per_rank_dsize < 512) { + alg = 2; + } else if (per_rank_dsize < 1024) { + alg = 1; + } else if (per_rank_dsize < 4096) { + alg = 2; + } else if (per_rank_dsize < 1048576) { + alg = 4; + } else { + alg = 3; + } } else { - if (communicator_size % 2) { - return allgatherv__ring(sbuf, scount, sdtype, - rbuf, rcounts, rdispls, rdtype, - comm); + if (per_rank_dsize < 4096) { + alg = 2; } else { - return allgatherv__ompi_neighborexchange(sbuf, scount, sdtype, - rbuf, rcounts, rdispls, rdtype, - comm); + alg = 4; } } + + return funcs[alg-1](sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, comm); } int gather__ompi(const void *sbuf, int scount, @@ -486,55 +1181,73 @@ int gather__ompi(const void *sbuf, int scount, MPI_Comm comm ) { - //const int large_segment_size = 32768; - //const int small_segment_size = 1024; - - //const size_t large_block_size = 92160; - const size_t intermediate_block_size = 6000; - const size_t small_block_size = 1024; - - const int large_communicator_size = 60; - const int small_communicator_size = 10; - int communicator_size, rank; - size_t dsize, block_size; - - XBT_DEBUG("smpi_coll_tuned_gather_ompi"); - + size_t dsize, total_dsize; + int alg = 1; communicator_size = comm->size(); rank = comm->rank(); - // Determine block size if (rank == root) { dsize = rdtype->size(); - block_size = dsize * rcount; + total_dsize = dsize * rcount; } else { dsize = sdtype->size(); - block_size = dsize * scount; + total_dsize = dsize * scount; } - -/* if (block_size > large_block_size) {*/ -/* return smpi_coll_tuned_gather_ompi_linear_sync (sbuf, scount, sdtype, */ -/* rbuf, rcount, rdtype, */ -/* root, comm);*/ - -/* } else*/ if (block_size > intermediate_block_size) { - return gather__ompi_linear_sync(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm); - - } else if ((communicator_size > large_communicator_size) || - ((communicator_size > small_communicator_size) && - (block_size < small_block_size))) { - return gather__ompi_binomial(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm); - + int (*funcs[])(const void*, int, MPI_Datatype, void*, int, MPI_Datatype, int, MPI_Comm) = { + &gather__ompi_basic_linear, + &gather__ompi_binomial, + &gather__ompi_linear_sync + }; + /** Algorithms: + * {1, "basic_linear"}, + * {2, "binomial"}, + * {3, "linear_sync"}, + * + * We do not make any rank specific checks since the params + * should be uniform across ranks. + */ + if (communicator_size < 4) { + if (total_dsize < 2) { + alg = 3; + } else if (total_dsize < 4) { + alg = 1; + } else if (total_dsize < 32768) { + alg = 2; + } else if (total_dsize < 65536) { + alg = 1; + } else if (total_dsize < 131072) { + alg = 2; + } else { + alg = 3; + } + } else if (communicator_size < 8) { + if (total_dsize < 1024) { + alg = 2; + } else if (total_dsize < 8192) { + alg = 1; + } else if (total_dsize < 32768) { + alg = 2; + } else if (total_dsize < 262144) { + alg = 1; + } else { + alg = 3; + } + } else if (communicator_size < 256) { + alg = 2; + } else if (communicator_size < 512) { + if (total_dsize < 2048) { + alg = 2; + } else if (total_dsize < 8192) { + alg = 1; + } else { + alg = 2; + } + } else { + alg = 2; } - // Otherwise, use basic linear - return gather__ompi_basic_linear(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm); + + return funcs[alg-1](sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm); } @@ -545,38 +1258,91 @@ int scatter__ompi(const void *sbuf, int scount, int root, MPI_Comm comm ) { - const size_t small_block_size = 300; - const int small_comm_size = 10; int communicator_size, rank; - size_t dsize, block_size; - - XBT_DEBUG("Coll_scatter_ompi::scatter"); + size_t dsize, total_dsize; + int alg = 1; communicator_size = comm->size(); rank = comm->rank(); - // Determine block size if (root == rank) { dsize=sdtype->size(); - block_size = dsize * scount; + total_dsize = dsize * scount; } else { dsize=rdtype->size(); - block_size = dsize * rcount; + total_dsize = dsize * rcount; } - - if ((communicator_size > small_comm_size) && - (block_size < small_block_size)) { - std::unique_ptr tmp_buf; - if (rank != root) { - tmp_buf = std::make_unique(rcount * rdtype->get_extent()); - sbuf = tmp_buf.get(); - scount = rcount; - sdtype = rdtype; - } - return scatter__ompi_binomial(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm); + int (*funcs[])(const void*, int, MPI_Datatype, void*, int, MPI_Datatype, int, MPI_Comm) = { + &scatter__ompi_basic_linear, + &scatter__ompi_binomial, + &scatter__ompi_basic_linear + }; + /** Algorithms: + * {1, "basic_linear"}, + * {2, "binomial"}, + * {3, "linear_nb"}, + * + * We do not make any rank specific checks since the params + * should be uniform across ranks. + */ + if (communicator_size < 4) { + if (total_dsize < 2) { + alg = 3; + } else if (total_dsize < 131072) { + alg = 1; + } else if (total_dsize < 262144) { + alg = 3; + } else { + alg = 1; + } + } else if (communicator_size < 8) { + if (total_dsize < 2048) { + alg = 2; + } else if (total_dsize < 4096) { + alg = 1; + } else if (total_dsize < 8192) { + alg = 2; + } else if (total_dsize < 32768) { + alg = 1; + } else if (total_dsize < 1048576) { + alg = 3; + } else { + alg = 1; + } + } else if (communicator_size < 16) { + if (total_dsize < 16384) { + alg = 2; + } else if (total_dsize < 1048576) { + alg = 3; + } else { + alg = 1; + } + } else if (communicator_size < 32) { + if (total_dsize < 16384) { + alg = 2; + } else if (total_dsize < 32768) { + alg = 1; + } else { + alg = 3; + } + } else if (communicator_size < 64) { + if (total_dsize < 512) { + alg = 2; + } else if (total_dsize < 8192) { + alg = 3; + } else if (total_dsize < 16384) { + alg = 2; + } else { + alg = 3; + } + } else { + if (total_dsize < 512) { + alg = 2; + } else { + alg = 3; + } } - return scatter__ompi_basic_linear(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm); + + return funcs[alg-1](sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm); } } -- 2.20.1