1 /* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */
3 /* Copyright (c) 2009-2023. The SimGrid Team.
4 * All rights reserved. */
6 /* This program is free software; you can redistribute it and/or modify it
7 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include "colls_private.hpp"
11 // This selector is based on information gathered on the Stampede cluster, with Intel MPI 4.1.3.049, and from the intel reference manual. The data was gathered launching runs with 1,2,4,8,16 processes per node.
#define INTEL_MAX_NB_THRESHOLDS 32 /* max message-size thresholds per process-count entry */
#define INTEL_MAX_NB_NUMPROCS 12 /* max process-count entries per ppn entry */
#define INTEL_MAX_NB_PPN 5 /* 1 2 4 8 16 ppn */
17 struct intel_tuning_table_size_element {
18 unsigned int max_size;
22 struct intel_tuning_table_numproc_element {
25 intel_tuning_table_size_element elems[INTEL_MAX_NB_THRESHOLDS];
28 struct intel_tuning_table_element {
30 intel_tuning_table_numproc_element elems[INTEL_MAX_NB_NUMPROCS];
34 I_MPI_ADJUST_ALLREDUCE
38 1 - Recursive doubling algorithm
39 2 - Rabenseifner's algorithm
40 3 - Reduce + Bcast algorithm
41 4 - Topology aware Reduce + Bcast algorithm
42 5 - Binomial gather + scatter algorithm
6 - Topology aware binomial gather + scatter algorithm
44 7 - Shumilin's ring algorithm
as Shumilin's ring algorithm is unknown, default to ring
50 namespace simgrid::smpi {
// Dispatch table for I_MPI_ADJUST_ALLREDUCE, indexed by (Intel algorithm id - 1).
// Shumilin's ring algorithm (id 7) is unknown, so ompi's segmented ring stands in for it.
int (*intel_allreduce_functions_table[])(const void *sendbuf,
      MPI_Datatype datatype,
      MPI_Op op, MPI_Comm comm) ={
    allreduce__mvapich2_two_level,
    allreduce__smp_binomial,
    allreduce__mvapich2_two_level,
    allreduce__ompi_ring_segmented,
    allreduce__ompi_ring_segmented
67 intel_tuning_table_element intel_allreduce_table[] =
384 /*I_MPI_ADJUST_ALLTOALL
389 2. Isend/Irecv + waitall algorithm
3. Pairwise exchange algorithm
396 intel_tuning_table_element intel_alltoall_table[] =
// Dispatch table for I_MPI_ADJUST_ALLTOALL, indexed by (Intel algorithm id - 1).
int (*intel_alltoall_functions_table[])(const void *sbuf, int scount,
    void* rbuf, int rcount,
    alltoall__mvapich2_scatter_dest,
    alltoall__mvapich2 // Plum's algorithm is proprietary (and reportedly very efficient); mvapich2 stands in for it
647 /*I_MPI_ADJUST_BARRIER
651 1. Dissemination algorithm
652 2. Recursive doubling algorithm
653 3. Topology aware dissemination algorithm
654 4. Topology aware recursive doubling algorithm
5. Binomial gather + scatter algorithm
6. Topology aware binomial gather + scatter algorithm
// Stand-in for Intel's "binomial gather + scatter" barrier (algorithm ids 5/6),
// implemented with SimGrid's default barrier.
static int intel_barrier_gather_scatter(MPI_Comm comm){
    // our default barrier performs an antibcast/bcast
    barrier__default(comm);
// Dispatch table for I_MPI_ADJUST_BARRIER, indexed by (Intel algorithm id - 1);
// ids follow the list in the comment above.
int (*intel_barrier_functions_table[])(MPI_Comm comm) ={
    barrier__ompi_basic_linear,      // 1. dissemination
    barrier__ompi_recursivedoubling, // 2. recursive doubling
    barrier__ompi_basic_linear,      // 3. topology aware dissemination
    barrier__ompi_recursivedoubling, // 4. topology aware recursive doubling
    intel_barrier_gather_scatter,    // 5. binomial gather + scatter
    intel_barrier_gather_scatter     // 6. topology aware binomial gather + scatter
674 intel_tuning_table_element intel_barrier_table[] =
791 1. Binomial algorithm
792 2. Recursive doubling algorithm
794 4. Topology aware binomial algorithm
795 5. Topology aware recursive doubling algorithm
796 6. Topology aware ring algorithm
797 7. Shumilin's bcast algorithm
// Dispatch table for I_MPI_ADJUST_BCAST, indexed by (Intel algorithm id - 1).
int (*intel_bcast_functions_table[])(void *buff, int count,
    MPI_Datatype datatype, int root,
    bcast__binomial_tree,
    //bcast__scatter_rdb_allgather,
    //bcast__scatter_rdb_allgather,
    bcast__mvapich2, // Shumilin's algorithm is unknown; mvapich2's bcast stands in for it
814 intel_tuning_table_element intel_bcast_table[] =
956 /*I_MPI_ADJUST_REDUCE
960 1. Shumilin's algorithm
961 2. Binomial algorithm
962 3. Topology aware Shumilin's algorithm
963 4. Topology aware binomial algorithm
964 5. Rabenseifner's algorithm
965 6. Topology aware Rabenseifner's algorithm
969 int (*intel_reduce_functions_table[])(const void *sendbuf, void *recvbuf,
970 int count, MPI_Datatype datatype,
976 reduce__mvapich2_two_level,
981 intel_tuning_table_element intel_reduce_table[] =
1046 /* I_MPI_ADJUST_REDUCE_SCATTER
1. Recursive halving algorithm
2. Pairwise exchange algorithm
1052 3. Recursive doubling algorithm
1053 4. Reduce + Scatterv algorithm
1054 5. Topology aware Reduce + Scatterv algorithm
1057 static int intel_reduce_scatter_reduce_scatterv(const void *sbuf, void *rbuf,
1063 reduce_scatter__default(sbuf, rbuf, rcounts,dtype, op,comm);
// Intel algorithm 1 ("recursive halving"): the ompi variant is only used when
// the reduction operation is commutative (or MPI_OP_NULL); everything else
// falls back to mvapich2's reduce_scatter.
static int intel_reduce_scatter_recursivehalving(const void *sbuf, void *rbuf,
    // MPI_OP_NULL is tested first, so a null op is never dereferenced
    if(op==MPI_OP_NULL || op->is_commutative())
      return reduce_scatter__ompi_basic_recursivehalving(sbuf, rbuf, rcounts,dtype, op,comm);
      return reduce_scatter__mvapich2(sbuf, rbuf, rcounts,dtype, op,comm);
// Dispatch table for I_MPI_ADJUST_REDUCE_SCATTER, indexed by (Intel algorithm id - 1).
int (*intel_reduce_scatter_functions_table[])( const void *sbuf, void *rbuf,
    intel_reduce_scatter_recursivehalving, // 1. recursive halving
    reduce_scatter__mpich_pair,            // 2. pairwise exchange
    reduce_scatter__mpich_rdb,             // 3. recursive doubling
    intel_reduce_scatter_reduce_scatterv,  // 4. Reduce + Scatterv
    intel_reduce_scatter_reduce_scatterv   // 5. topology aware Reduce + Scatterv
1092 intel_tuning_table_element intel_reduce_scatter_table[] =
1478 /* I_MPI_ADJUST_ALLGATHER
1482 1. Recursive doubling algorithm
1483 2. Bruck's algorithm
1485 4. Topology aware Gatherv + Bcast algorithm
1489 int (*intel_allgather_functions_table[])(const void *sbuf, int scount,
1490 MPI_Datatype sdtype,
1491 void* rbuf, int rcount,
1492 MPI_Datatype rdtype,
1501 intel_tuning_table_element intel_allgather_table[] =
1647 /* I_MPI_ADJUST_ALLGATHERV
1651 1. Recursive doubling algorithm
1652 2. Bruck's algorithm
1654 4. Topology aware Gatherv + Bcast algorithm
1658 int (*intel_allgatherv_functions_table[])(const void *sbuf, int scount,
1659 MPI_Datatype sdtype,
1660 void* rbuf, const int *rcounts,
1662 MPI_Datatype rdtype,
1665 allgatherv__mpich_rdb,
1666 allgatherv__ompi_bruck,
1671 intel_tuning_table_element intel_allgatherv_table[] =
1859 /* I_MPI_ADJUST_GATHER
1863 1. Binomial algorithm
1864 2. Topology aware binomial algorithm
1865 3. Shumilin's algorithm
1869 int (*intel_gather_functions_table[])(const void *sbuf, int scount,
1870 MPI_Datatype sdtype,
1871 void* rbuf, int rcount,
1872 MPI_Datatype rdtype,
1876 gather__ompi_binomial,
1877 gather__ompi_binomial,
1881 intel_tuning_table_element intel_gather_table[] =
1963 /* I_MPI_ADJUST_SCATTER
1967 1. Binomial algorithm
1968 2. Topology aware binomial algorithm
1969 3. Shumilin's algorithm
1973 int (*intel_scatter_functions_table[])(const void *sbuf, int scount,
1974 MPI_Datatype sdtype,
1975 void* rbuf, int rcount,
1976 MPI_Datatype rdtype,
1977 int root, MPI_Comm comm
1979 scatter__ompi_binomial,
1980 scatter__ompi_binomial,
1984 intel_tuning_table_element intel_scatter_table[] =
2138 /* I_MPI_ADJUST_ALLTOALLV
2142 1. Isend/Irecv + waitall algorithm
2147 int (*intel_alltoallv_functions_table[])(const void *sbuf, const int *scounts, const int *sdisps,
2148 MPI_Datatype sdtype,
2149 void *rbuf, const int *rcounts, const int *rdisps,
2150 MPI_Datatype rdtype,
2153 alltoallv__ompi_basic_linear,
2157 intel_tuning_table_element intel_alltoallv_table[] =
2177 {2147483647,1}//weirdly, intel reports the use of algo 0 here
2193 {0,1},//weird again, only for 0-sized messages
2214 //These are collected from table 3.5-2 of the Intel MPI Reference Manual
2217 #define SIZECOMP_reduce_scatter\
2218 int total_message_size = 0;\
2219 for (i = 0; i < comm_size; i++) { \
2220 total_message_size += rcounts[i];\
2222 size_t block_dsize = total_message_size*dtype->size();\
// Message size used for the allreduce decision: element count times datatype size.
#define SIZECOMP_allreduce\
  size_t block_dsize = dtype->size() * rcount;
// Message size used for the alltoall decision: send count times send datatype size.
#define SIZECOMP_alltoall\
  size_t block_dsize = send_type->size() * send_count;
// Message size used for the bcast decision: element count times datatype size.
#define SIZECOMP_bcast\
  size_t block_dsize = datatype->size() * count;
// Message size used for the reduce decision: element count times datatype size.
#define SIZECOMP_reduce\
  size_t block_dsize = datatype->size() * count;
// Barrier carries no payload; a constant placeholder size is enough for the lookup.
#define SIZECOMP_barrier\
  size_t block_dsize{1};
// Message size used for the allgather decision: receive count times receive datatype size.
#define SIZECOMP_allgather\
  size_t block_dsize = recv_type->size() * recv_count;
2242 #define SIZECOMP_allgatherv\
2243 int total_message_size = 0;\
2244 for (i = 0; i < comm_size; i++) { \
2245 total_message_size += recv_count[i];\
2247 size_t block_dsize = total_message_size*recv_type->size();
// Gather: non-root senders size the message by their send parameters; the root
// (or an MPI_IN_PLACE caller) sizes it by its receive parameters.
#define SIZECOMP_gather\
  int rank = comm->rank();\
  size_t block_dsize = (send_buff != MPI_IN_PLACE && rank != root) ?\
      send_count * send_type->size() :\
      recv_count * recv_type->size();
// Scatter: only the root (when not using MPI_IN_PLACE) sizes the message by its
// send parameters; every other rank sizes it by its receive parameters.
#define SIZECOMP_scatter\
  int rank = comm->rank();\
  size_t block_dsize = (sendbuf != MPI_IN_PLACE && rank == root) ?\
      sendcount * sendtype->size() :\
      recvcount * recvtype->size();
// Message size plays no role in the alltoallv decision; use a constant placeholder.
#define SIZECOMP_alltoallv\
  size_t block_dsize{1};
// Generic selector body for collective category `cat`:
//   i indexes the ppn entry, j the process-count entry, k the message-size
//   threshold; the matched entry's 1-based algorithm id picks the function
//   from intel_<cat>_functions_table.
// NOTE(review): in the j and k loops below, the table access appears before
// the bounds test in the && chain (unlike the i loop, which tests the bound
// first) — a table with no matching entry would read one element past the
// end. Verify against the full table definitions.
#define IMPI_COLL_SELECT(cat, ret, args, args2) \
  ret _XBT_CONCAT2(cat, __impi)(COLL_UNPAREN args) \
    int comm_size = comm->size(); \
    _XBT_CONCAT(SIZECOMP_, cat) \
    if (comm->get_leaders_comm() == MPI_COMM_NULL) { \
      int local_size = 1; \
      if (comm->is_uniform()) { \
        local_size = comm->get_intra_comm()->size(); \
      while (i < INTEL_MAX_NB_PPN && local_size != _XBT_CONCAT3(intel_, cat, _table)[i].ppn) \
      if (i == INTEL_MAX_NB_PPN) \
      while (comm_size > _XBT_CONCAT3(intel_, cat, _table)[i].elems[j].max_num_proc && j < INTEL_MAX_NB_THRESHOLDS) \
      while (block_dsize >= _XBT_CONCAT3(intel_, cat, _table)[i].elems[j].elems[k].max_size && \
             k < _XBT_CONCAT3(intel_, cat, _table)[i].elems[j].num_elems) \
      return (_XBT_CONCAT3(intel_, cat, \
                           _functions_table)[_XBT_CONCAT3(intel_, cat, _table)[i].elems[j].elems[k].algo - 1] args2); \
// Instantiate one <coll>__impi entry point per collective; the second argument
// list forwards the call's parameters to the implementation chosen by the table walk.
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLGATHERV_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_disps, recv_type, comm))
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLREDUCE_SIG, (sbuf, rbuf, rcount, dtype, op, comm))
COLL_APPLY(IMPI_COLL_SELECT, COLL_GATHER_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_type, root, comm))
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLGATHER_SIG, (send_buff,send_count,send_type,recv_buff,recv_count,recv_type,comm))
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLTOALL_SIG,(send_buff, send_count, send_type, recv_buff, recv_count, recv_type,comm))
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLTOALLV_SIG, (send_buff, send_counts, send_disps, send_type, recv_buff, recv_counts, recv_disps, recv_type, comm))
COLL_APPLY(IMPI_COLL_SELECT, COLL_BCAST_SIG , (buf, count, datatype, root, comm))
COLL_APPLY(IMPI_COLL_SELECT, COLL_REDUCE_SIG,(buf,rbuf, count, datatype, op, root, comm))
COLL_APPLY(IMPI_COLL_SELECT, COLL_REDUCE_SCATTER_SIG ,(sbuf,rbuf, rcounts,dtype,op,comm))
COLL_APPLY(IMPI_COLL_SELECT, COLL_SCATTER_SIG ,(sendbuf, sendcount, sendtype,recvbuf, recvcount, recvtype,root, comm))
COLL_APPLY(IMPI_COLL_SELECT, COLL_BARRIER_SIG,(comm))
2304 } // namespace simgrid::smpi