/* Selector for collective algorithms, based on OpenMPI's default coll_tuned_decision_fixed selector */

/* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
 * All rights reserved. */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

#include "colls_private.h"
// This selector is based on information gathered on the Stampede cluster with Intel MPI 4.1.3.049, and from the Intel MPI Reference Manual. The data was gathered by launching runs with 1, 2, 4, 8, and 16 processes per node.
#define INTEL_MAX_NB_THRESHOLDS 32
#define INTEL_MAX_NB_NUMPROCS 12
#define INTEL_MAX_NB_PPN 5 /* 1 2 4 8 16 ppn */

/* A message-size threshold: use algorithm `algo` for messages up to max_size bytes */
typedef struct {
  unsigned int max_size;
  int algo;
} intel_tuning_table_size_element;

/* The size thresholds for one communicator-size bucket */
typedef struct {
  int max_num_proc;
  int num_elems;
  intel_tuning_table_size_element elems[INTEL_MAX_NB_THRESHOLDS];
} intel_tuning_table_numproc_element;

/* The full table for one processes-per-node (ppn) configuration */
typedef struct {
  int ppn;
  intel_tuning_table_numproc_element elems[INTEL_MAX_NB_NUMPROCS];
} intel_tuning_table_element;
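/* Purely illustrative example (hypothetical values, not taken from the Intel
 * manual): one ppn bucket covering every communicator size, with two message
 * size thresholds. Algorithm ids are 1-based, following the I_MPI_ADJUST_*
 * numbering documented below; the selection macro at the end of this file
 * subtracts 1 to index the per-collective function tables. */
intel_tuning_table_element example_tuning_table[] = {
    {1,                  /* applies when one process runs per node            */
     {{2147483647, 2,    /* covers any communicator size; two size thresholds */
       {{1024, 1},       /* messages under 1 KiB -> algorithm 1               */
        {2147483647, 2}} /* everything larger    -> algorithm 2               */
     }}}
};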
/*
I_MPI_ADJUST_ALLREDUCE

1 - Recursive doubling algorithm
2 - Rabenseifner's algorithm
3 - Reduce + Bcast algorithm
4 - Topology aware Reduce + Bcast algorithm
5 - Binomial gather + scatter algorithm
6 - Topology aware binomial gather + scatter algorithm
7 - Shumilin's ring algorithm
*/

// As Shumilin's ring algorithm is unknown, default to a ring algorithm
int (*intel_allreduce_functions_table[])(void *sendbuf,
      void *recvbuf,
      int count,
      MPI_Datatype datatype,
      MPI_Op op, MPI_Comm comm) = {
    smpi_coll_tuned_allreduce_rdb,
    smpi_coll_tuned_allreduce_rab1,
    smpi_coll_tuned_allreduce_redbcast,
    smpi_coll_tuned_allreduce_mvapich2_two_level,
    smpi_coll_tuned_allreduce_smp_binomial,
    smpi_coll_tuned_allreduce_mvapich2_two_level,
    smpi_coll_tuned_allreduce_ompi_ring_segmented,
    smpi_coll_tuned_allreduce_ompi_ring_segmented
};
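/* Example of the id-to-function mapping (ids are 1-based): under Intel MPI,
 * I_MPI_ADJUST_ALLREDUCE=2 selects Rabenseifner's algorithm; the selector
 * below reaches it as intel_allreduce_functions_table[2 - 1], i.e.
 * smpi_coll_tuned_allreduce_rab1. */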
intel_tuning_table_element intel_allreduce_table[] =
/*
I_MPI_ADJUST_ALLTOALL

1. Bruck's algorithm
2. Isend/Irecv + waitall algorithm
3. Pairwise exchange algorithm
4. Plum's algorithm
*/
intel_tuning_table_element intel_alltoall_table[] =
int (*intel_alltoall_functions_table[])(void *sbuf, int scount,
      MPI_Datatype sdtype,
      void* rbuf, int rcount,
      MPI_Datatype rdtype,
      MPI_Comm comm) = {
    smpi_coll_tuned_alltoall_bruck,
    smpi_coll_tuned_alltoall_mvapich2_scatter_dest,
    smpi_coll_tuned_alltoall_pair,
    smpi_coll_tuned_alltoall_mvapich2 // Plum's algorithm is proprietary (and reportedly very efficient); use mvapich2's as a stand-in
};
/*
I_MPI_ADJUST_BARRIER

1. Dissemination algorithm
2. Recursive doubling algorithm
3. Topology aware dissemination algorithm
4. Topology aware recursive doubling algorithm
5. Binomial gather + scatter algorithm
6. Topology aware binomial gather + scatter algorithm
*/
static int intel_barrier_gather_scatter(MPI_Comm comm){
    // Our default barrier performs an antibcast/bcast, close enough to a gather + scatter
    smpi_mpi_barrier(comm);
    return MPI_SUCCESS;
}
int (*intel_barrier_functions_table[])(MPI_Comm comm) = {
    smpi_coll_tuned_barrier_ompi_basic_linear,
    smpi_coll_tuned_barrier_ompi_recursivedoubling,
    smpi_coll_tuned_barrier_ompi_basic_linear,
    smpi_coll_tuned_barrier_ompi_recursivedoubling,
    intel_barrier_gather_scatter,
    intel_barrier_gather_scatter
};
intel_tuning_table_element intel_barrier_table[] =
/*
I_MPI_ADJUST_BCAST

1. Binomial algorithm
2. Recursive doubling algorithm
3. Ring algorithm
4. Topology aware binomial algorithm
5. Topology aware recursive doubling algorithm
6. Topology aware ring algorithm
7. Shumilin's bcast algorithm
*/
int (*intel_bcast_functions_table[])(void *buff, int count,
      MPI_Datatype datatype, int root,
      MPI_Comm comm) = {
    smpi_coll_tuned_bcast_binomial_tree,
    //smpi_coll_tuned_bcast_scatter_rdb_allgather,
    smpi_coll_tuned_bcast_NTSL,
    smpi_coll_tuned_bcast_NTSL,
    smpi_coll_tuned_bcast_SMP_binomial,
    //smpi_coll_tuned_bcast_scatter_rdb_allgather,
    smpi_coll_tuned_bcast_NTSL,
    smpi_coll_tuned_bcast_SMP_linear,
    smpi_coll_tuned_bcast_mvapich2 // Shumilin's algorithm is unknown; use mvapich2's instead
};
intel_tuning_table_element intel_bcast_table[] =
/*
I_MPI_ADJUST_REDUCE

1. Shumilin's algorithm
2. Binomial algorithm
3. Topology aware Shumilin's algorithm
4. Topology aware binomial algorithm
5. Rabenseifner's algorithm
6. Topology aware Rabenseifner's algorithm
*/
int (*intel_reduce_functions_table[])(void *sendbuf, void *recvbuf,
      int count, MPI_Datatype datatype,
      MPI_Op op, int root, MPI_Comm comm) = {
    smpi_coll_tuned_reduce_mvapich2,
    smpi_coll_tuned_reduce_binomial,
    smpi_coll_tuned_reduce_mvapich2,
    smpi_coll_tuned_reduce_mvapich2_two_level,
    smpi_coll_tuned_reduce_rab,
    smpi_coll_tuned_reduce_rab
};
intel_tuning_table_element intel_reduce_table[] =
/*
I_MPI_ADJUST_REDUCE_SCATTER

1. Recursive halving algorithm
2. Pairwise exchange algorithm
3. Recursive doubling algorithm
4. Reduce + Scatterv algorithm
5. Topology aware Reduce + Scatterv algorithm
*/
static int intel_reduce_scatter_reduce_scatterv(void *sbuf, void *rbuf,
                                                int *rcounts, MPI_Datatype dtype,
                                                MPI_Op op, MPI_Comm comm)
{
    // Our default reduce_scatter already performs a reduce followed by a scatterv
    smpi_mpi_reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
    return MPI_SUCCESS;
}

static int intel_reduce_scatter_recursivehalving(void *sbuf, void *rbuf,
                                                 int *rcounts, MPI_Datatype dtype,
                                                 MPI_Op op, MPI_Comm comm)
{
    // Use the recursive-halving variant only for commutative operations;
    // otherwise fall back to mvapich2's implementation
    if(smpi_op_is_commute(op))
        return smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving(sbuf, rbuf, rcounts, dtype, op, comm);
    else
        return smpi_coll_tuned_reduce_scatter_mvapich2(sbuf, rbuf, rcounts, dtype, op, comm);
}
int (*intel_reduce_scatter_functions_table[])(void *sbuf, void *rbuf,
      int *rcounts, MPI_Datatype dtype,
      MPI_Op op, MPI_Comm comm) = {
    intel_reduce_scatter_recursivehalving,
    smpi_coll_tuned_reduce_scatter_mpich_pair,
    smpi_coll_tuned_reduce_scatter_mpich_rdb,
    intel_reduce_scatter_reduce_scatterv,
    intel_reduce_scatter_reduce_scatterv
};
intel_tuning_table_element intel_reduce_scatter_table[] =
/*
I_MPI_ADJUST_ALLGATHER

1. Recursive doubling algorithm
2. Bruck's algorithm
3. Ring algorithm
4. Topology aware Gatherv + Bcast algorithm
*/
int (*intel_allgather_functions_table[])(void *sbuf, int scount,
      MPI_Datatype sdtype,
      void* rbuf, int rcount,
      MPI_Datatype rdtype,
      MPI_Comm comm) = {
    smpi_coll_tuned_allgather_rdb,
    smpi_coll_tuned_allgather_bruck,
    smpi_coll_tuned_allgather_ring,
    smpi_coll_tuned_allgather_GB
};
intel_tuning_table_element intel_allgather_table[] =
/*
I_MPI_ADJUST_ALLGATHERV

1. Recursive doubling algorithm
2. Bruck's algorithm
3. Ring algorithm
4. Topology aware Gatherv + Bcast algorithm
*/
int (*intel_allgatherv_functions_table[])(void *sbuf, int scount,
      MPI_Datatype sdtype,
      void* rbuf, int *rcounts,
      int *rdisps,
      MPI_Datatype rdtype,
      MPI_Comm comm) = {
    smpi_coll_tuned_allgatherv_mpich_rdb,
    smpi_coll_tuned_allgatherv_ompi_bruck,
    smpi_coll_tuned_allgatherv_ring,
    smpi_coll_tuned_allgatherv_GB
};
intel_tuning_table_element intel_allgatherv_table[] =
/*
I_MPI_ADJUST_GATHER

1. Binomial algorithm
2. Topology aware binomial algorithm
3. Shumilin's algorithm
*/
int (*intel_gather_functions_table[])(void *sbuf, int scount,
      MPI_Datatype sdtype,
      void* rbuf, int rcount,
      MPI_Datatype rdtype,
      int root, MPI_Comm comm) = {
    smpi_coll_tuned_gather_ompi_binomial,
    smpi_coll_tuned_gather_ompi_binomial,
    smpi_coll_tuned_gather_mvapich2
};
intel_tuning_table_element intel_gather_table[] =
/*
I_MPI_ADJUST_SCATTER

1. Binomial algorithm
2. Topology aware binomial algorithm
3. Shumilin's algorithm
*/
int (*intel_scatter_functions_table[])(void *sbuf, int scount,
      MPI_Datatype sdtype,
      void* rbuf, int rcount,
      MPI_Datatype rdtype,
      int root, MPI_Comm comm) = {
    smpi_coll_tuned_scatter_ompi_binomial,
    smpi_coll_tuned_scatter_ompi_binomial,
    smpi_coll_tuned_scatter_mvapich2
};
intel_tuning_table_element intel_scatter_table[] =
/*
I_MPI_ADJUST_ALLTOALLV

1. Isend/Irecv + waitall algorithm
2. Plum's algorithm
*/
int (*intel_alltoallv_functions_table[])(void *sbuf, int *scounts, int *sdisps,
      MPI_Datatype sdtype,
      void *rbuf, int *rcounts, int *rdisps,
      MPI_Datatype rdtype,
      MPI_Comm comm) = {
    smpi_coll_tuned_alltoallv_ompi_basic_linear,
    smpi_coll_tuned_alltoallv_bruck
};
intel_tuning_table_element intel_alltoallv_table[] =
    {2147483647,1} // oddly, Intel reports the use of algorithm 0 here
    {0,1}, // odd again: this entry only covers 0-sized messages
// These are collected from table 3.5-2 of the Intel MPI Reference Manual
#define SIZECOMP_reduce_scatter\
  int total_message_size = 0;\
  for (i = 0; i < comm_size; i++) { \
      total_message_size += rcounts[i];\
  }\
  size_t block_dsize = total_message_size*smpi_datatype_size(dtype);
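/* Worked example (hypothetical values): a reduce_scatter on comm_size = 4
 * with rcounts = {2, 2, 2, 2} and dtype = MPI_DOUBLE yields
 * total_message_size = 8 and block_dsize = 8 * 8 = 64 bytes, which is then
 * matched against the max_size thresholds of intel_reduce_scatter_table. */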
#define SIZECOMP_allreduce\
  size_t block_dsize = rcount * smpi_datatype_size(dtype);

#define SIZECOMP_alltoall\
  size_t block_dsize = send_count * smpi_datatype_size(send_type);

#define SIZECOMP_bcast\
  size_t block_dsize = count * smpi_datatype_size(datatype);

#define SIZECOMP_reduce\
  size_t block_dsize = count * smpi_datatype_size(datatype);

#define SIZECOMP_barrier\
  size_t block_dsize = 1;

#define SIZECOMP_allgather\
  size_t block_dsize = recv_count * smpi_datatype_size(recv_type);
#define SIZECOMP_allgatherv\
  int total_message_size = 0;\
  for (i = 0; i < comm_size; i++) { \
      total_message_size += recv_count[i];\
  }\
  size_t block_dsize = total_message_size*smpi_datatype_size(recv_type);
#define SIZECOMP_gather\
  int rank = smpi_comm_rank(comm);\
  size_t block_dsize = (send_buff == MPI_IN_PLACE || rank == root) ?\
                recv_count * smpi_datatype_size(recv_type) :\
                send_count * smpi_datatype_size(send_type);

#define SIZECOMP_scatter\
  int rank = smpi_comm_rank(comm);\
  size_t block_dsize = (sendbuf == MPI_IN_PLACE || rank != root) ?\
                recvcount * smpi_datatype_size(recvtype) :\
                sendcount * smpi_datatype_size(sendtype);

#define SIZECOMP_alltoallv\
  size_t block_dsize = 1;
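/* Note: for barrier there is no payload, and for alltoallv there is no single
 * message size, so block_dsize is pinned to 1 and the selection effectively
 * depends only on the ppn and communicator-size buckets. */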
#define IMPI_COLL_SELECT(cat, ret, args, args2)\
ret smpi_coll_tuned_ ## cat ## _impi (COLL_UNPAREN args)\
{\
    int comm_size = smpi_comm_size(comm);\
    int i = 0;\
    SIZECOMP_ ## cat\
    i = 0;\
    int j = 0, k = 0;\
    if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){\
        smpi_comm_init_smp(comm);\
    }\
    int local_size = 1;\
    if (smpi_comm_is_uniform(comm)) {\
        local_size = smpi_comm_size(smpi_comm_get_intra_comm(comm));\
    }\
    /* Find the ppn bucket matching the intra-node size; default to the first one */\
    while(i < INTEL_MAX_NB_PPN &&\
          local_size != intel_ ## cat ## _table[i].ppn)\
        i++;\
    if(i == INTEL_MAX_NB_PPN) i = 0;\
    /* Find the bucket for the communicator size */\
    while(comm_size > intel_ ## cat ## _table[i].elems[j].max_num_proc\
          && j < INTEL_MAX_NB_THRESHOLDS)\
        j++;\
    /* Find the bucket for the message size */\
    while(block_dsize >= intel_ ## cat ## _table[i].elems[j].elems[k].max_size\
          && k < intel_ ## cat ## _table[i].elems[j].num_elems)\
        k++;\
    /* Algorithm ids in the tables are 1-based */\
    return (intel_ ## cat ## _functions_table[intel_ ## cat ## _table[i].elems[j].elems[k].algo-1]\
        args2);\
}
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLGATHERV_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_disps, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLREDUCE_SIG, (sbuf, rbuf, rcount, dtype, op, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_GATHER_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_type, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLGATHER_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLTOALL_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLTOALLV_SIG, (send_buff, send_counts, send_disps, send_type, recv_buff, recv_counts, recv_disps, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_BCAST_SIG, (buf, count, datatype, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_REDUCE_SIG, (buf, rbuf, count, datatype, op, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_REDUCE_SCATTER_SIG, (sbuf, rbuf, rcounts, dtype, op, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_SCATTER_SIG, (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_BARRIER_SIG, (comm));
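/* For reference, a sketch of what the allreduce instantiation above expands to
 * (assuming COLL_ALLREDUCE_SIG provides the usual (sbuf, rbuf, rcount, dtype,
 * op, comm) signature; kept in a comment since the macro already generates the
 * real function):
 *
 *   int smpi_coll_tuned_allreduce_impi(void *sbuf, void *rbuf, int rcount,
 *                                      MPI_Datatype dtype, MPI_Op op,
 *                                      MPI_Comm comm)
 *   {
 *     int comm_size = smpi_comm_size(comm);
 *     size_t block_dsize = rcount * smpi_datatype_size(dtype); // SIZECOMP_allreduce
 *     // ... pick i (ppn bucket), j (communicator size) and k (message size)
 *     // in intel_allreduce_table, then dispatch:
 *     return intel_allreduce_functions_table
 *              [intel_allreduce_table[i].elems[j].elems[k].algo - 1]
 *              (sbuf, rbuf, rcount, dtype, op, comm);
 *   }
 */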