/* Selector for collective algorithms, based on OpenMPI's default coll_tuned_decision_fixed selector */

/* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
 * All rights reserved. */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

#include "colls_private.h"
// This selector is based on information gathered on the Stampede cluster with Intel MPI 4.1.3.049,
// and on the Intel MPI reference manual. The data was gathered by launching runs with 1, 2, 4, 8,
// and 16 processes per node.
#define INTEL_MAX_NB_THRESHOLDS 32
#define INTEL_MAX_NB_NUMPROCS 12
#define INTEL_MAX_NB_PPN 5 /* 1, 2, 4, 8, 16 processes per node */
} intel_tuning_table_size_element;

  intel_tuning_table_size_element elems[INTEL_MAX_NB_THRESHOLDS];
} intel_tuning_table_numproc_element;

  intel_tuning_table_numproc_element elems[INTEL_MAX_NB_NUMPROCS];
} intel_tuning_table_element;
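/* A hypothetical sketch (values invented, not taken from the Intel manual) of how one entry
 * of such a tuning table is assumed to be laid out, given the fields read by the
 * IMPI_COLL_SELECT macro below: a processes-per-node value, then blocks indexed by
 * communicator size, each holding (max message size, algorithm index) pairs. Field order
 * inside the structures is an assumption. */
#if 0
static intel_tuning_table_element example_table[] = {
  {1,                       /* ppn: 1 process per node                               */
   {{2, 3,                  /* communicators of up to 2 processes, 3 size thresholds */
     {{2048, 1},            /* messages below 2 KiB   -> algorithm 1                 */
      {524288, 2},          /* messages below 512 KiB -> algorithm 2                 */
      {2147483647, 3}}}}},  /* everything larger      -> algorithm 3                 */
};
#endif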
I_MPI_ADJUST_ALLREDUCE

1 - Recursive doubling algorithm
2 - Rabenseifner's algorithm
3 - Reduce + Bcast algorithm
4 - Topology aware Reduce + Bcast algorithm
5 - Binomial gather + scatter algorithm
6 - Topology aware binomial gather + scatter algorithm
7 - Shumilin's ring algorithm

As Shumilin's ring algorithm is unknown, default to the ring algorithm.
int (*intel_allreduce_functions_table[])(void *sendbuf,
      MPI_Datatype datatype,
      MPI_Op op, MPI_Comm comm) = {
    smpi_coll_tuned_allreduce_rdb,
    smpi_coll_tuned_allreduce_rab1,
    smpi_coll_tuned_allreduce_redbcast,
    smpi_coll_tuned_allreduce_mvapich2_two_level,
    smpi_coll_tuned_allreduce_smp_binomial,
    smpi_coll_tuned_allreduce_mvapich2_two_level,
    smpi_coll_tuned_allreduce_ompi_ring_segmented,
    smpi_coll_tuned_allreduce_ompi_ring_segmented

intel_tuning_table_element intel_allreduce_table[] =
/* I_MPI_ADJUST_ALLTOALL

2. Isend/Irecv + waitall algorithm
3. Pairwise exchange algorithm

intel_tuning_table_element intel_alltoall_table[] =
int (*intel_alltoall_functions_table[])(void *sbuf, int scount,
      void* rbuf, int rcount,
    smpi_coll_tuned_alltoall_bruck,
    smpi_coll_tuned_alltoall_mvapich2_scatter_dest,
    smpi_coll_tuned_alltoall_pair,
    smpi_coll_tuned_alltoall_mvapich2 // Plum's algorithm is apparently proprietary (and very efficient)
/* I_MPI_ADJUST_BARRIER

1. Dissemination algorithm
2. Recursive doubling algorithm
3. Topology aware dissemination algorithm
4. Topology aware recursive doubling algorithm
5. Binomial gather + scatter algorithm
6. Topology aware binomial gather + scatter algorithm
static int intel_barrier_gather_scatter(MPI_Comm comm) {
    // our default barrier performs an antibcast/bcast, used here in place of the gather + scatter algorithms
    smpi_mpi_barrier(comm);
int (*intel_barrier_functions_table[])(MPI_Comm comm) = {
    smpi_coll_tuned_barrier_ompi_basic_linear,
    smpi_coll_tuned_barrier_ompi_recursivedoubling,
    smpi_coll_tuned_barrier_ompi_basic_linear,
    smpi_coll_tuned_barrier_ompi_recursivedoubling,
    intel_barrier_gather_scatter,
    intel_barrier_gather_scatter

intel_tuning_table_element intel_barrier_table[] =
1. Binomial algorithm
2. Recursive doubling algorithm
4. Topology aware binomial algorithm
5. Topology aware recursive doubling algorithm
6. Topology aware ring algorithm
7. Shumilin's bcast algorithm
int (*intel_bcast_functions_table[])(void *buff, int count,
      MPI_Datatype datatype, int root,
    smpi_coll_tuned_bcast_binomial_tree,
    //smpi_coll_tuned_bcast_scatter_rdb_allgather,
    smpi_coll_tuned_bcast_NTSL,
    smpi_coll_tuned_bcast_NTSL,
    smpi_coll_tuned_bcast_SMP_binomial,
    //smpi_coll_tuned_bcast_scatter_rdb_allgather,
    smpi_coll_tuned_bcast_NTSL,
    smpi_coll_tuned_bcast_SMP_linear,
    smpi_coll_tuned_bcast_mvapich2, // we don't know Shumilin's algorithm

intel_tuning_table_element intel_bcast_table[] =
/* I_MPI_ADJUST_REDUCE

1. Shumilin's algorithm
2. Binomial algorithm
3. Topology aware Shumilin's algorithm
4. Topology aware binomial algorithm
5. Rabenseifner's algorithm
6. Topology aware Rabenseifner's algorithm
int (*intel_reduce_functions_table[])(void *sendbuf, void *recvbuf,
      int count, MPI_Datatype datatype,
    smpi_coll_tuned_reduce_mvapich2,
    smpi_coll_tuned_reduce_binomial,
    smpi_coll_tuned_reduce_mvapich2,
    smpi_coll_tuned_reduce_mvapich2_two_level,
    smpi_coll_tuned_reduce_rab,
    smpi_coll_tuned_reduce_rab

intel_tuning_table_element intel_reduce_table[] =
/* I_MPI_ADJUST_REDUCE_SCATTER

1. Recursive halving algorithm
2. Pairwise exchange algorithm
3. Recursive doubling algorithm
4. Reduce + Scatterv algorithm
5. Topology aware Reduce + Scatterv algorithm
static int intel_reduce_scatter_reduce_scatterv(void *sbuf, void *rbuf,
    // our default reduce_scatter is used in place of the Reduce + Scatterv algorithms
    smpi_mpi_reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
static int intel_reduce_scatter_recursivehalving(void *sbuf, void *rbuf,
    // OMPI's basic recursive halving requires a commutative operation;
    // non-commutative operations fall back to the mvapich2 implementation
    if (smpi_op_is_commute(op))
      return smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving(sbuf, rbuf, rcounts, dtype, op, comm);
      return smpi_coll_tuned_reduce_scatter_mvapich2(sbuf, rbuf, rcounts, dtype, op, comm);
int (*intel_reduce_scatter_functions_table[])(void *sbuf, void *rbuf,
    intel_reduce_scatter_recursivehalving,
    smpi_coll_tuned_reduce_scatter_mpich_pair,
    smpi_coll_tuned_reduce_scatter_mpich_rdb,
    intel_reduce_scatter_reduce_scatterv,
    intel_reduce_scatter_reduce_scatterv

intel_tuning_table_element intel_reduce_scatter_table[] =
/* I_MPI_ADJUST_ALLGATHER

1. Recursive doubling algorithm
2. Bruck's algorithm
4. Topology aware Gatherv + Bcast algorithm
int (*intel_allgather_functions_table[])(void *sbuf, int scount,
      MPI_Datatype sdtype,
      void* rbuf, int rcount,
      MPI_Datatype rdtype,
    smpi_coll_tuned_allgather_rdb,
    smpi_coll_tuned_allgather_bruck,
    smpi_coll_tuned_allgather_ring,
    smpi_coll_tuned_allgather_GB

intel_tuning_table_element intel_allgather_table[] =
/* I_MPI_ADJUST_ALLGATHERV

1. Recursive doubling algorithm
2. Bruck's algorithm
4. Topology aware Gatherv + Bcast algorithm
int (*intel_allgatherv_functions_table[])(void *sbuf, int scount,
      MPI_Datatype sdtype,
      void* rbuf, int *rcounts,
      MPI_Datatype rdtype,
    smpi_coll_tuned_allgatherv_mpich_rdb,
    smpi_coll_tuned_allgatherv_ompi_bruck,
    smpi_coll_tuned_allgatherv_ring,
    smpi_coll_tuned_allgatherv_GB

intel_tuning_table_element intel_allgatherv_table[] =
/* I_MPI_ADJUST_GATHER

1. Binomial algorithm
2. Topology aware binomial algorithm
3. Shumilin's algorithm
int (*intel_gather_functions_table[])(void *sbuf, int scount,
      MPI_Datatype sdtype,
      void* rbuf, int rcount,
      MPI_Datatype rdtype,
    smpi_coll_tuned_gather_ompi_binomial,
    smpi_coll_tuned_gather_ompi_binomial,
    smpi_coll_tuned_gather_mvapich2

intel_tuning_table_element intel_gather_table[] =
/* I_MPI_ADJUST_SCATTER

1. Binomial algorithm
2. Topology aware binomial algorithm
3. Shumilin's algorithm
int (*intel_scatter_functions_table[])(void *sbuf, int scount,
      MPI_Datatype sdtype,
      void* rbuf, int rcount,
      MPI_Datatype rdtype,
      int root, MPI_Comm comm
    smpi_coll_tuned_scatter_ompi_binomial,
    smpi_coll_tuned_scatter_ompi_binomial,
    smpi_coll_tuned_scatter_mvapich2

intel_tuning_table_element intel_scatter_table[] =
/* I_MPI_ADJUST_ALLTOALLV

1. Isend/Irecv + waitall algorithm
int (*intel_alltoallv_functions_table[])(void *sbuf, int *scounts, int *sdisps,
      MPI_Datatype sdtype,
      void *rbuf, int *rcounts, int *rdisps,
      MPI_Datatype rdtype,
    smpi_coll_tuned_alltoallv_ompi_basic_linear,
    smpi_coll_tuned_alltoallv_bruck
intel_tuning_table_element intel_alltoallv_table[] =

    {2147483647,1} // weirdly, Intel reports the use of algorithm 0 here
    {0,1}, // weird again: only for 0-sized messages
// These are collected from table 3.5-2 of the Intel MPI Reference Manual
#define SIZECOMP_reduce_scatter\
  int total_message_size = 0;\
  for (i = 0; i < comm_size; i++) { \
    total_message_size += rcounts[i];\
  size_t block_dsize = total_message_size * smpi_datatype_size(dtype);\

#define SIZECOMP_allreduce\
  size_t block_dsize = rcount * smpi_datatype_size(dtype);

#define SIZECOMP_alltoall\
  size_t block_dsize = send_count * smpi_datatype_size(send_type);

#define SIZECOMP_bcast\
  size_t block_dsize = count * smpi_datatype_size(datatype);

#define SIZECOMP_reduce\
  size_t block_dsize = count * smpi_datatype_size(datatype);

#define SIZECOMP_barrier\
  size_t block_dsize = 1;

#define SIZECOMP_allgather\
  size_t block_dsize = recv_count * smpi_datatype_size(recv_type);

#define SIZECOMP_allgatherv\
  int total_message_size = 0;\
  for (i = 0; i < comm_size; i++) { \
    total_message_size += recv_count[i];\
  size_t block_dsize = total_message_size * smpi_datatype_size(recv_type);

#define SIZECOMP_gather\
  int rank = smpi_comm_rank(comm);\
  size_t block_dsize = (send_buff == MPI_IN_PLACE || rank == root) ?\
    recv_count * smpi_datatype_size(recv_type) :\
    send_count * smpi_datatype_size(send_type);

#define SIZECOMP_scatter\
  int rank = smpi_comm_rank(comm);\
  size_t block_dsize = (sendbuf == MPI_IN_PLACE || rank != root) ?\
    recvcount * smpi_datatype_size(recvtype) :\
    sendcount * smpi_datatype_size(sendtype);

#define SIZECOMP_alltoallv\
  size_t block_dsize = 1;
#define IMPI_COLL_SELECT(cat, ret, args, args2)\
ret smpi_coll_tuned_ ## cat ## _impi (COLL_UNPAREN args)\
    int comm_size = smpi_comm_size(comm);\
    if (smpi_comm_get_leaders_comm(comm) == MPI_COMM_NULL) {\
      smpi_comm_init_smp(comm);\
    if (smpi_comm_is_uniform(comm)) {\
      local_size = smpi_comm_size(smpi_comm_get_intra_comm(comm));\
    while (i < INTEL_MAX_NB_PPN &&\
           local_size != intel_ ## cat ## _table[i].ppn)\
    if (i == INTEL_MAX_NB_PPN) i = 0;\
    while (comm_size > intel_ ## cat ## _table[i].elems[j].max_num_proc\
           && j < INTEL_MAX_NB_THRESHOLDS)\
    while (block_dsize >= intel_ ## cat ## _table[i].elems[j].elems[k].max_size\
           && k < intel_ ## cat ## _table[i].elems[j].num_elems)\
    return (intel_ ## cat ## _functions_table[intel_ ## cat ## _table[i].elems[j].elems[k].algo - 1]\
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLGATHERV_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_disps, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLREDUCE_SIG, (sbuf, rbuf, rcount, dtype, op, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_GATHER_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_type, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLGATHER_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLTOALL_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLTOALLV_SIG, (send_buff, send_counts, send_disps, send_type, recv_buff, recv_counts, recv_disps, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_BCAST_SIG, (buf, count, datatype, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_REDUCE_SIG, (buf, rbuf, count, datatype, op, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_REDUCE_SCATTER_SIG, (sbuf, rbuf, rcounts, dtype, op, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_SCATTER_SIG, (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_BARRIER_SIG, (comm));
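/* For illustration only: a rough, hand-expanded sketch of what the allreduce instantiation
 * above is assumed to generate (the COLL_*_SIG / COLL_UNPAREN plumbing and the SMP-comm
 * initialization are simplified away). It shows the three-level lookup performed by
 * IMPI_COLL_SELECT: processes per node -> communicator size -> message size -> algorithm. */
#if 0
int smpi_coll_tuned_allreduce_impi(void *sbuf, void *rbuf, int rcount,
                                   MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
{
  int comm_size = smpi_comm_size(comm);
  size_t block_dsize = rcount * smpi_datatype_size(dtype); /* SIZECOMP_allreduce */
  int i = 0, j = 0, k = 0;
  int local_size = 1;
  if (smpi_comm_is_uniform(comm))
    local_size = smpi_comm_size(smpi_comm_get_intra_comm(comm));
  /* 1) pick the table row matching the number of processes per node (default to row 0) */
  while (i < INTEL_MAX_NB_PPN && local_size != intel_allreduce_table[i].ppn)
    i++;
  if (i == INTEL_MAX_NB_PPN)
    i = 0;
  /* 2) pick the block matching the communicator size */
  while (comm_size > intel_allreduce_table[i].elems[j].max_num_proc && j < INTEL_MAX_NB_THRESHOLDS)
    j++;
  /* 3) pick the threshold matching the message size */
  while (block_dsize >= intel_allreduce_table[i].elems[j].elems[k].max_size &&
         k < intel_allreduce_table[i].elems[j].num_elems)
    k++;
  /* 4) dispatch to the selected implementation (Intel algorithm numbers start at 1) */
  return intel_allreduce_functions_table[intel_allreduce_table[i].elems[j].elems[k].algo - 1]
           (sbuf, rbuf, rcount, dtype, op, comm);
}
#endif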