1 /* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */
3 /* Copyright (c) 2009-2010, 2013-2017. The SimGrid Team.
4 * All rights reserved. */
6 /* This program is free software; you can redistribute it and/or modify it
7 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include "colls_private.hpp"
11 // This selector is based on information gathered on the Stampede cluster, with Intel MPI 4.1.3.049, and from the intel reference manual. The data was gathered launching runs with 1,2,4,8,16 processes per node.
13 #define INTEL_MAX_NB_THRESHOLDS 32
14 #define INTEL_MAX_NB_NUMPROCS 12
15 #define INTEL_MAX_NB_PPN 5 /* 1 2 4 8 16 ppn */
17 struct intel_tuning_table_size_element {
18 unsigned int max_size;
22 struct intel_tuning_table_numproc_element {
25 intel_tuning_table_size_element elems[INTEL_MAX_NB_THRESHOLDS];
28 struct intel_tuning_table_element {
30 intel_tuning_table_numproc_element elems[INTEL_MAX_NB_NUMPROCS];
34 I_MPI_ADJUST_ALLREDUCE
38 1 - Recursive doubling algorithm
39 2 - Rabenseifner's algorithm
40 3 - Reduce + Bcast algorithm
41 4 - Topology aware Reduce + Bcast algorithm
42 5 - Binomial gather + scatter algorithm
43 6 - Topology aware binomial gather + scatter algorithm
44 7 - Shumilin's ring algorithm
47 as Shumilin's ring algorithm is unknown, default to ring
53 int (*intel_allreduce_functions_table[])(void *sendbuf,
56 MPI_Datatype datatype,
57 MPI_Op op, MPI_Comm comm) ={
58 Coll_allreduce_rdb::allreduce,
59 Coll_allreduce_rab1::allreduce,
60 Coll_allreduce_redbcast::allreduce,
61 Coll_allreduce_mvapich2_two_level::allreduce,
62 Coll_allreduce_smp_binomial::allreduce,
63 Coll_allreduce_mvapich2_two_level::allreduce,
64 Coll_allreduce_ompi_ring_segmented::allreduce,
65 Coll_allreduce_ompi_ring_segmented::allreduce
68 intel_tuning_table_element intel_allreduce_table[] =
385 /*I_MPI_ADJUST_ALLTOALL
390 2. Isend/Irecv + waitall algorithm
391 3. Pair wise exchange algorithm
397 intel_tuning_table_element intel_alltoall_table[] =
637 int (*intel_alltoall_functions_table[])(void *sbuf, int scount,
639 void* rbuf, int rcount,
642 Coll_alltoall_bruck::alltoall,
643 Coll_alltoall_mvapich2_scatter_dest::alltoall,
644 Coll_alltoall_pair::alltoall,
645 Coll_alltoall_mvapich2::alltoall//Plum is proprietary ? (and super efficient)
648 /*I_MPI_ADJUST_BARRIER
652 1. Dissemination algorithm
653 2. Recursive doubling algorithm
654 3. Topology aware dissemination algorithm
655 4. Topology aware recursive doubling algorithm
656 5. Binomial gather + scatter algorithm
657 6. Topology aware binomial gather + scatter algorithm
660 static int intel_barrier_gather_scatter(MPI_Comm comm){
661 //our default barrier performs an antibcast/bcast
662 Coll_barrier_default::barrier(comm);
666 int (*intel_barrier_functions_table[])(MPI_Comm comm) ={
667 Coll_barrier_ompi_basic_linear::barrier,
668 Coll_barrier_ompi_recursivedoubling::barrier,
669 Coll_barrier_ompi_basic_linear::barrier,
670 Coll_barrier_ompi_recursivedoubling::barrier,
671 intel_barrier_gather_scatter,
672 intel_barrier_gather_scatter
675 intel_tuning_table_element intel_barrier_table[] =
792 1. Binomial algorithm
793 2. Recursive doubling algorithm
795 4. Topology aware binomial algorithm
796 5. Topology aware recursive doubling algorithm
797 6. Topology aware ring algorithm
798 7. Shumilin's bcast algorithm
801 int (*intel_bcast_functions_table[])(void *buff, int count,
802 MPI_Datatype datatype, int root,
804 Coll_bcast_binomial_tree::bcast,
805 //Coll_bcast_scatter_rdb_allgather::bcast,
806 Coll_bcast_NTSL::bcast,
807 Coll_bcast_NTSL::bcast,
808 Coll_bcast_SMP_binomial::bcast,
809 //Coll_bcast_scatter_rdb_allgather::bcast,
810 Coll_bcast_NTSL::bcast,
811 Coll_bcast_SMP_linear::bcast,
812 Coll_bcast_mvapich2::bcast,//we don't know shumilin's algo'
815 intel_tuning_table_element intel_bcast_table[] =
957 /*I_MPI_ADJUST_REDUCE
961 1. Shumilin's algorithm
962 2. Binomial algorithm
963 3. Topology aware Shumilin's algorithm
964 4. Topology aware binomial algorithm
965 5. Rabenseifner's algorithm
966 6. Topology aware Rabenseifner's algorithm
970 int (*intel_reduce_functions_table[])(void *sendbuf, void *recvbuf,
971 int count, MPI_Datatype datatype,
974 Coll_reduce_mvapich2::reduce,
975 Coll_reduce_binomial::reduce,
976 Coll_reduce_mvapich2::reduce,
977 Coll_reduce_mvapich2_two_level::reduce,
978 Coll_reduce_rab::reduce,
979 Coll_reduce_rab::reduce
982 intel_tuning_table_element intel_reduce_table[] =
1047 /* I_MPI_ADJUST_REDUCE_SCATTER
1051 1. Recursive halving algorithm
1052 2. Pair wise exchange algorithm
1053 3. Recursive doubling algorithm
1054 4. Reduce + Scatterv algorithm
1055 5. Topology aware Reduce + Scatterv algorithm
1058 static int intel_reduce_scatter_reduce_scatterv(void *sbuf, void *rbuf,
1064 Coll_reduce_scatter_default::reduce_scatter(sbuf, rbuf, rcounts,dtype, op,comm);
1068 static int intel_reduce_scatter_recursivehalving(void *sbuf, void *rbuf,
1074 if(op==MPI_OP_NULL || op->is_commutative())
1075 return Coll_reduce_scatter_ompi_basic_recursivehalving::reduce_scatter(sbuf, rbuf, rcounts,dtype, op,comm);
1077 return Coll_reduce_scatter_mvapich2::reduce_scatter(sbuf, rbuf, rcounts,dtype, op,comm);
1080 int (*intel_reduce_scatter_functions_table[])( void *sbuf, void *rbuf,
1086 intel_reduce_scatter_recursivehalving,
1087 Coll_reduce_scatter_mpich_pair::reduce_scatter,
1088 Coll_reduce_scatter_mpich_rdb::reduce_scatter,
1089 intel_reduce_scatter_reduce_scatterv,
1090 intel_reduce_scatter_reduce_scatterv
1093 intel_tuning_table_element intel_reduce_scatter_table[] =
1479 /* I_MPI_ADJUST_ALLGATHER
1483 1. Recursive doubling algorithm
1484 2. Bruck's algorithm
1486 4. Topology aware Gatherv + Bcast algorithm
1490 int (*intel_allgather_functions_table[])(void *sbuf, int scount,
1491 MPI_Datatype sdtype,
1492 void* rbuf, int rcount,
1493 MPI_Datatype rdtype,
1496 Coll_allgather_rdb::allgather,
1497 Coll_allgather_bruck::allgather,
1498 Coll_allgather_ring::allgather,
1499 Coll_allgather_GB::allgather
1502 intel_tuning_table_element intel_allgather_table[] =
1648 /* I_MPI_ADJUST_ALLGATHERV
1652 1. Recursive doubling algorithm
1653 2. Bruck's algorithm
1655 4. Topology aware Gatherv + Bcast algorithm
1659 int (*intel_allgatherv_functions_table[])(void *sbuf, int scount,
1660 MPI_Datatype sdtype,
1661 void* rbuf, int *rcounts,
1663 MPI_Datatype rdtype,
1666 Coll_allgatherv_mpich_rdb::allgatherv,
1667 Coll_allgatherv_ompi_bruck::allgatherv,
1668 Coll_allgatherv_ring::allgatherv,
1669 Coll_allgatherv_GB::allgatherv
1672 intel_tuning_table_element intel_allgatherv_table[] =
1860 /* I_MPI_ADJUST_GATHER
1864 1. Binomial algorithm
1865 2. Topology aware binomial algorithm
1866 3. Shumilin's algorithm
1870 int (*intel_gather_functions_table[])(void *sbuf, int scount,
1871 MPI_Datatype sdtype,
1872 void* rbuf, int rcount,
1873 MPI_Datatype rdtype,
1877 Coll_gather_ompi_binomial::gather,
1878 Coll_gather_ompi_binomial::gather,
1879 Coll_gather_mvapich2::gather
1882 intel_tuning_table_element intel_gather_table[] =
1964 /* I_MPI_ADJUST_SCATTER
1968 1. Binomial algorithm
1969 2. Topology aware binomial algorithm
1970 3. Shumilin's algorithm
1974 int (*intel_scatter_functions_table[])(void *sbuf, int scount,
1975 MPI_Datatype sdtype,
1976 void* rbuf, int rcount,
1977 MPI_Datatype rdtype,
1978 int root, MPI_Comm comm
1980 Coll_scatter_ompi_binomial::scatter,
1981 Coll_scatter_ompi_binomial::scatter,
1982 Coll_scatter_mvapich2::scatter
1985 intel_tuning_table_element intel_scatter_table[] =
2139 /* I_MPI_ADJUST_ALLTOALLV
2143 1. Isend/Irecv + waitall algorithm
2148 int (*intel_alltoallv_functions_table[])(void *sbuf, int *scounts, int *sdisps,
2149 MPI_Datatype sdtype,
2150 void *rbuf, int *rcounts, int *rdisps,
2151 MPI_Datatype rdtype,
2154 Coll_alltoallv_ompi_basic_linear::alltoallv,
2155 Coll_alltoallv_bruck::alltoallv
2158 intel_tuning_table_element intel_alltoallv_table[] =
2178 {2147483647,1}//weirdly, intel reports the use of algo 0 here
2194 {0,1},//weird again, only for 0-sized messages
2215 //These are collected from table 3.5-2 of the Intel MPI Reference Manual
2218 #define SIZECOMP_reduce_scatter\
2219 int total_message_size = 0;\
2220 for (i = 0; i < comm_size; i++) { \
2221 total_message_size += rcounts[i];\
2223 size_t block_dsize = total_message_size*dtype->size();\
/* Selector size key for allreduce: payload in bytes (rcount elements of dtype). */
2225 #define SIZECOMP_allreduce\
2226 size_t block_dsize =rcount * dtype->size();
/* Selector size key for alltoall: bytes sent per block (send_count elements of send_type). */
2228 #define SIZECOMP_alltoall\
2229 size_t block_dsize =send_count * send_type->size();
/* Selector size key for bcast: broadcast payload in bytes (count elements of datatype). */
2231 #define SIZECOMP_bcast\
2232 size_t block_dsize =count * datatype->size();
/* Selector size key for reduce: reduced payload in bytes (count elements of datatype). */
2234 #define SIZECOMP_reduce\
2235 size_t block_dsize =count * datatype->size();
/* Barrier carries no payload: a constant size of 1 makes the size thresholds
 * of the barrier tuning table effectively irrelevant. */
2237 #define SIZECOMP_barrier\
2238 size_t block_dsize = 1;
/* Selector size key for allgather: per-rank receive block in bytes
 * (recv_count elements of recv_type). */
2240 #define SIZECOMP_allgather\
2241 size_t block_dsize =recv_count * recv_type->size();
2243 #define SIZECOMP_allgatherv\
2244 int total_message_size = 0;\
2245 for (i = 0; i < comm_size; i++) { \
2246 total_message_size += recv_count[i];\
2248 size_t block_dsize = total_message_size*recv_type->size();
/* Selector size key for gather: the root (or a caller passing MPI_IN_PLACE as
 * the send buffer, which MPI allows only at the root) measures by its receive
 * parameters; every other rank measures by what it sends. */
2250 #define SIZECOMP_gather\
2251 int rank = comm->rank();\
2252 size_t block_dsize = (send_buff == MPI_IN_PLACE || rank ==root) ?\
2253 recv_count * recv_type->size() :\
2254 send_count * send_type->size();
/* Selector size key for scatter: non-root ranks measure by their receive
 * parameters, the root by what it sends.
 * NOTE(review): the MPI_IN_PLACE test is on sendbuf, but for MPI_Scatter the
 * standard passes MPI_IN_PLACE via the root's recvbuf — confirm this branch is
 * ever taken / intended. */
2256 #define SIZECOMP_scatter\
2257 int rank = comm->rank();\
2258 size_t block_dsize = (sendbuf == MPI_IN_PLACE || rank !=root ) ?\
2259 recvcount * recvtype->size() :\
2260 sendcount * sendtype->size();
/* alltoallv has per-rank counts, so no single message size is computed: a
 * constant 1 is used, presumably so only the table's smallest size threshold
 * ever matches — verify against intel_alltoallv_table's entries. */
2262 #define SIZECOMP_alltoallv\
2263 size_t block_dsize = 1;
2265 #define IMPI_COLL_SELECT(cat, ret, args, args2)\
2266 ret Coll_ ## cat ## _impi:: cat (COLL_UNPAREN args)\
2268 int comm_size = comm->size();\
2273 if(comm->get_leaders_comm()==MPI_COMM_NULL){\
2277 if (comm->is_uniform()) {\
2278 local_size = comm->get_intra_comm()->size();\
2280 while(i < INTEL_MAX_NB_PPN &&\
2281 local_size!=intel_ ## cat ## _table[i].ppn)\
2283 if(i==INTEL_MAX_NB_PPN) i=0;\
2284 while(comm_size>intel_ ## cat ## _table[i].elems[j].max_num_proc\
2285 && j < INTEL_MAX_NB_THRESHOLDS)\
2287 while(block_dsize >=intel_ ## cat ## _table[i].elems[j].elems[k].max_size\
2288 && k< intel_ ## cat ## _table[i].elems[j].num_elems)\
2290 return (intel_ ## cat ## _functions_table[intel_ ## cat ## _table[i].elems[j].elems[k].algo-1]\
/* Instantiate the impi selector entry points. Each COLL_APPLY expands
 * IMPI_COLL_SELECT with one collective's standard signature and argument list,
 * defining Coll_<coll>_impi::<coll>, which walks intel_<coll>_table (by ppn,
 * then communicator size, then the SIZECOMP_<coll> message size) and forwards
 * the call to the chosen entry of intel_<coll>_functions_table. */
2295 COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLGATHERV_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_disps, recv_type, comm));
2296 COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLREDUCE_SIG, (sbuf, rbuf, rcount, dtype, op, comm));
2297 COLL_APPLY(IMPI_COLL_SELECT, COLL_GATHER_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_type, root, comm));
2298 COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLGATHER_SIG, (send_buff,send_count,send_type,recv_buff,recv_count,recv_type,comm));
2299 COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLTOALL_SIG,(send_buff, send_count, send_type, recv_buff, recv_count, recv_type,comm));
2300 COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLTOALLV_SIG, (send_buff, send_counts, send_disps, send_type, recv_buff, recv_counts, recv_disps, recv_type, comm));
2301 COLL_APPLY(IMPI_COLL_SELECT, COLL_BCAST_SIG , (buf, count, datatype, root, comm));
2302 COLL_APPLY(IMPI_COLL_SELECT, COLL_REDUCE_SIG,(buf,rbuf, count, datatype, op, root, comm));
2303 COLL_APPLY(IMPI_COLL_SELECT, COLL_REDUCE_SCATTER_SIG ,(sbuf,rbuf, rcounts,dtype,op,comm));
2304 COLL_APPLY(IMPI_COLL_SELECT, COLL_SCATTER_SIG ,(sendbuf, sendcount, sendtype,recvbuf, recvcount, recvtype,root, comm));
2305 COLL_APPLY(IMPI_COLL_SELECT, COLL_BARRIER_SIG,(comm));