1 /* selector for collective algorithms based on openmpi's default coll_tuned_decision_fixed selector */
3 /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
4 * All rights reserved. */
6 /* This program is free software; you can redistribute it and/or modify it
7 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include "colls_private.h"
12 // This selector is based on information gathered on the Stampede cluster, with Intel MPI 4.1.3.049, and from the intel reference manual. The data was gathered launching one process/node. With other settings, selection will be different (more SMP aware algorithms, for instance)
15 #define INTEL_MAX_NB_THRESHOLDS 32
20 } intel_tuning_table_element_element;
25 intel_tuning_table_element_element elems[INTEL_MAX_NB_THRESHOLDS];
26 } intel_tuning_table_element;
29 I_MPI_ADJUST_ALLREDUCE
33 1 - Recursive doubling algorithm
34 2 - Rabenseifner's algorithm
35 3 - Reduce + Bcast algorithm
36 4 - Topology aware Reduce + Bcast algorithm
37 5 - Binomial gather + scatter algorithm
6 - Topology aware binomial gather + scatter algorithm
39 7 - Shumilin's ring algorithm
//as Shumilin's ring algorithm is unknown, default to ring
47 int (*intel_allreduce_functions_table[])(void *sendbuf,
50 MPI_Datatype datatype,
51 MPI_Op op, MPI_Comm comm) ={
52 smpi_coll_tuned_allreduce_rdb,
53 smpi_coll_tuned_allreduce_rab1,
54 smpi_coll_tuned_allreduce_redbcast,
55 smpi_coll_tuned_allreduce_redbcast,
56 smpi_coll_tuned_allreduce_smp_binomial,
57 smpi_coll_tuned_allreduce_smp_binomial,
58 smpi_coll_tuned_allreduce_ompi_ring_segmented,
59 smpi_coll_tuned_allreduce_ompi_ring_segmented
62 intel_tuning_table_element intel_allreduce_table[] =
126 /*I_MPI_ADJUST_ALLTOALL
131 2. Isend/Irecv + waitall algorithm
132 3. Pair wise exchange algorithm
138 intel_tuning_table_element intel_alltoall_table[] =
191 int (*intel_alltoall_functions_table[])(void *sbuf, int scount,
193 void* rbuf, int rcount,
196 smpi_coll_tuned_alltoall_bruck,
197 smpi_coll_tuned_alltoall_mvapich2_scatter_dest,
198 smpi_coll_tuned_alltoall_pair,
199 smpi_coll_tuned_alltoall_pair//Plum is proprietary ? (and super efficient)
202 /*I_MPI_ADJUST_BARRIER
206 1. Dissemination algorithm
207 2. Recursive doubling algorithm
208 3. Topology aware dissemination algorithm
209 4. Topology aware recursive doubling algorithm
5. Binomial gather + scatter algorithm
6. Topology aware binomial gather + scatter algorithm
214 static int intel_barrier_gather_scatter(MPI_Comm comm){
215 //our default barrier performs a antibcast/bcast
216 smpi_mpi_barrier(comm);
220 int (*intel_barrier_functions_table[])(MPI_Comm comm) ={
221 smpi_coll_tuned_barrier_ompi_basic_linear,
222 smpi_coll_tuned_barrier_ompi_recursivedoubling,
223 smpi_coll_tuned_barrier_ompi_basic_linear,
224 smpi_coll_tuned_barrier_ompi_recursivedoubling,
225 intel_barrier_gather_scatter,
226 intel_barrier_gather_scatter
229 intel_tuning_table_element intel_barrier_table[] =
263 1. Binomial algorithm
264 2. Recursive doubling algorithm
266 4. Topology aware binomial algorithm
267 5. Topology aware recursive doubling algorithm
268 6. Topology aware ring algorithm
269 7. Shumilin's bcast algorithm
272 int (*intel_bcast_functions_table[])(void *buff, int count,
273 MPI_Datatype datatype, int root,
275 smpi_coll_tuned_bcast_binomial_tree,
276 //smpi_coll_tuned_bcast_scatter_rdb_allgather,
277 smpi_coll_tuned_bcast_NTSL,
278 smpi_coll_tuned_bcast_NTSL,
279 smpi_coll_tuned_bcast_SMP_binomial,
280 //smpi_coll_tuned_bcast_scatter_rdb_allgather,
281 smpi_coll_tuned_bcast_NTSL,
282 smpi_coll_tuned_bcast_SMP_linear,
283 smpi_coll_tuned_bcast_mvapich2,//we don't know shumilin's algo'
286 intel_tuning_table_element intel_bcast_table[] =
329 /*I_MPI_ADJUST_REDUCE
333 1. Shumilin's algorithm
334 2. Binomial algorithm
335 3. Topology aware Shumilin's algorithm
336 4. Topology aware binomial algorithm
337 5. Rabenseifner's algorithm
338 6. Topology aware Rabenseifner's algorithm
342 int (*intel_reduce_functions_table[])(void *sendbuf, void *recvbuf,
343 int count, MPI_Datatype datatype,
346 smpi_coll_tuned_reduce_mvapich2,
347 smpi_coll_tuned_reduce_binomial,
348 smpi_coll_tuned_reduce_mvapich2,
349 smpi_coll_tuned_reduce_binomial,
350 smpi_coll_tuned_reduce_rab,
351 smpi_coll_tuned_reduce_rab
354 intel_tuning_table_element intel_reduce_table[] =
363 /* I_MPI_ADJUST_REDUCE_SCATTER
1. Recursive halving algorithm
368 2. Pair wise exchange algorithm
369 3. Recursive doubling algorithm
370 4. Reduce + Scatterv algorithm
371 5. Topology aware Reduce + Scatterv algorithm
374 static int intel_reduce_scatter_reduce_scatterv(void *sbuf, void *rbuf,
380 smpi_mpi_reduce_scatter(sbuf, rbuf, rcounts,dtype, op,comm);
384 static int intel_reduce_scatter_recursivehalving(void *sbuf, void *rbuf,
390 if(smpi_op_is_commute(op))
391 return smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving(sbuf, rbuf, rcounts,dtype, op,comm);
393 return smpi_coll_tuned_reduce_scatter_mvapich2(sbuf, rbuf, rcounts,dtype, op,comm);
396 int (*intel_reduce_scatter_functions_table[])( void *sbuf, void *rbuf,
402 intel_reduce_scatter_recursivehalving,
403 smpi_coll_tuned_reduce_scatter_mpich_pair,
404 smpi_coll_tuned_reduce_scatter_mpich_rdb,
405 intel_reduce_scatter_reduce_scatterv,
406 intel_reduce_scatter_reduce_scatterv
409 intel_tuning_table_element intel_reduce_scatter_table[] =
482 /* I_MPI_ADJUST_ALLGATHER
486 1. Recursive doubling algorithm
489 4. Topology aware Gatherv + Bcast algorithm
493 int (*intel_allgather_functions_table[])(void *sbuf, int scount,
495 void* rbuf, int rcount,
499 smpi_coll_tuned_allgather_rdb,
500 smpi_coll_tuned_allgather_bruck,
501 smpi_coll_tuned_allgather_ring,
502 smpi_coll_tuned_allgather_GB
505 intel_tuning_table_element intel_allgather_table[] =
564 /* I_MPI_ADJUST_ALLGATHERV
568 1. Recursive doubling algorithm
571 4. Topology aware Gatherv + Bcast algorithm
575 int (*intel_allgatherv_functions_table[])(void *sbuf, int scount,
577 void* rbuf, int *rcounts,
582 smpi_coll_tuned_allgatherv_mpich_rdb,
583 smpi_coll_tuned_allgatherv_ompi_bruck,
584 smpi_coll_tuned_allgatherv_ring,
585 smpi_coll_tuned_allgatherv_GB
588 intel_tuning_table_element intel_allgatherv_table[] =
632 /* I_MPI_ADJUST_GATHER
636 1. Binomial algorithm
637 2. Topology aware binomial algorithm
638 3. Shumilin's algorithm
642 int (*intel_gather_functions_table[])(void *sbuf, int scount,
644 void* rbuf, int rcount,
649 smpi_coll_tuned_gather_ompi_binomial,
650 smpi_coll_tuned_gather_ompi_binomial,
651 smpi_coll_tuned_gather_mvapich2
654 intel_tuning_table_element intel_gather_table[] =
685 /* I_MPI_ADJUST_SCATTER
689 1. Binomial algorithm
690 2. Topology aware binomial algorithm
691 3. Shumilin's algorithm
695 int (*intel_scatter_functions_table[])(void *sbuf, int scount,
697 void* rbuf, int rcount,
699 int root, MPI_Comm comm
701 smpi_coll_tuned_scatter_ompi_binomial,
702 smpi_coll_tuned_scatter_ompi_binomial,
703 smpi_coll_tuned_scatter_mvapich2
706 intel_tuning_table_element intel_scatter_table[] =
759 /* I_MPI_ADJUST_ALLTOALLV
763 1. Isend/Irecv + waitall algorithm
768 int (*intel_alltoallv_functions_table[])(void *sbuf, int *scounts, int *sdisps,
770 void *rbuf, int *rcounts, int *rdisps,
774 smpi_coll_tuned_alltoallv_ompi_basic_linear,
775 smpi_coll_tuned_alltoallv_bruck
778 intel_tuning_table_element intel_alltoallv_table[] =
788 //These are collected from table 3.5-2 of the Intel MPI Reference Manual
791 #define SIZECOMP_reduce_scatter\
792 int total_message_size = 0;\
793 for (i = 0; i < comm_size; i++) { \
794 total_message_size += rcounts[i];\
796 size_t block_dsize = total_message_size*smpi_datatype_size(dtype);\
/* Selection key for allreduce: payload size in bytes (rcount elements of dtype). */
#define SIZECOMP_allreduce\
    size_t block_dsize =rcount * smpi_datatype_size(dtype);
/* Selection key for alltoall: per-peer send size in bytes. */
#define SIZECOMP_alltoall\
    size_t block_dsize =send_count * smpi_datatype_size(send_type);
/* Selection key for bcast: broadcast payload size in bytes. */
#define SIZECOMP_bcast\
    size_t block_dsize =count * smpi_datatype_size(datatype);
/* Selection key for reduce: reduction payload size in bytes. */
#define SIZECOMP_reduce\
    size_t block_dsize =count * smpi_datatype_size(datatype);
/* Barrier carries no payload: use a constant dummy size so the generic
 * threshold lookup in IMPI_COLL_SELECT still works. */
#define SIZECOMP_barrier\
    size_t block_dsize = 1;
/* Selection key for allgather: per-peer receive size in bytes. */
#define SIZECOMP_allgather\
    size_t block_dsize =recv_count * smpi_datatype_size(recv_type);
816 #define SIZECOMP_allgatherv\
817 int total_message_size = 0;\
818 for (i = 0; i < comm_size; i++) { \
819 total_message_size += recv_count[i];\
821 size_t block_dsize = total_message_size*smpi_datatype_size(recv_type);
/* Selection key for gather: the root (or an MPI_IN_PLACE caller) sizes the
 * message by its receive parameters, every other rank by its send parameters.
 * NOTE(review): in MPI_Gather, MPI_IN_PLACE is normally supplied as the send
 * buffer at the root, which is consistent with the test below. */
#define SIZECOMP_gather\
    int rank = smpi_comm_rank(comm);\
    size_t block_dsize = (send_buff == MPI_IN_PLACE || rank ==root) ?\
                recv_count * smpi_datatype_size(recv_type) :\
                send_count * smpi_datatype_size(send_type);
/* Selection key for scatter: non-root ranks (or an MPI_IN_PLACE caller) size
 * the message by the receive parameters, the root by the send parameters.
 * NOTE(review): in MPI_Scatter, MPI_IN_PLACE is normally supplied as the
 * RECEIVE buffer at the root; testing sendbuf here looks suspicious but the
 * expansion context is not visible in this chunk — confirm against callers. */
#define SIZECOMP_scatter\
    int rank = smpi_comm_rank(comm);\
    size_t block_dsize = (sendbuf == MPI_IN_PLACE || rank !=root ) ?\
                recvcount * smpi_datatype_size(recvtype) :\
                sendcount * smpi_datatype_size(sendtype);
835 #define SIZECOMP_alltoallv\
836 size_t block_dsize = 1;
838 #define IMPI_COLL_SELECT(cat, ret, args, args2)\
839 ret smpi_coll_tuned_ ## cat ## _impi (COLL_UNPAREN args)\
841 int comm_size = smpi_comm_size(comm);\
846 while(comm_size>=intel_ ## cat ## _table[i].max_num_proc\
847 && i < INTEL_MAX_NB_THRESHOLDS)\
849 while(block_dsize >=intel_ ## cat ## _table[i].elems[j].max_size\
850 && j< intel_ ## cat ## _table[i].num_elems)\
852 return (intel_ ## cat ## _functions_table[intel_ ## cat ## _table[i].elems[j].algo-1]\
/* Instantiate one smpi_coll_tuned_<coll>_impi() selector per collective.
 * COLL_APPLY expands IMPI_COLL_SELECT with each collective's canonical
 * signature (COLL_*_SIG) and the argument list forwarded to the chosen
 * implementation from the corresponding intel_*_functions_table. */
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLGATHERV_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_disps, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLREDUCE_SIG, (sbuf, rbuf, rcount, dtype, op, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_GATHER_SIG, (send_buff, send_count, send_type, recv_buff, recv_count, recv_type, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLGATHER_SIG, (send_buff,send_count,send_type,recv_buff,recv_count,recv_type,comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLTOALL_SIG,(send_buff, send_count, send_type, recv_buff, recv_count, recv_type,comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_ALLTOALLV_SIG, (send_buff, send_counts, send_disps, send_type, recv_buff, recv_counts, recv_disps, recv_type, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_BCAST_SIG , (buf, count, datatype, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_REDUCE_SIG,(buf,rbuf, count, datatype, op, root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_REDUCE_SCATTER_SIG ,(sbuf,rbuf, rcounts,dtype,op,comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_SCATTER_SIG ,(sendbuf, sendcount, sendtype,recvbuf, recvcount, recvtype,root, comm));
COLL_APPLY(IMPI_COLL_SELECT, COLL_BARRIER_SIG,(comm));