1 /* Selector for collective algorithms based on the MVAPICH decision logic, calibrated on the Stampede cluster at TACC. */
3 /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
4 * All rights reserved. */
6 /* This is the tuning used by MVAPICH for Stampede platform based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR) */
10 /************ Alltoall variables and initializers */
/* Capacity of every fixed-size threshold array declared in this file. */
12 #define MV2_MAX_NB_THRESHOLDS 32
/* One alltoall threshold entry. Only the algorithm pointer member is visible
 * in this excerpt; the table initializers supply two integers before it. */
18 int (*MV2_pt_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
19 void *recvbuf, int recvcount, MPI_Datatype recvtype,
21 } mv2_alltoall_tuning_element;
/* One alltoall tuning record: an array of entries for the regular case and a
 * second array named for the in-place case. Any members declared before
 * algo_table are not visible in this excerpt. */
26 mv2_alltoall_tuning_element algo_table[MV2_MAX_NB_THRESHOLDS];
27 mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS];
28 } mv2_alltoall_tuning_table;
/* Pointer to an alltoall implementation; starts out NULL. The code that
 * assigns it is not part of this excerpt. */
30 int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL;
32 /* Indicates number of processes per node */
33 int *mv2_alltoall_table_ppn_conf = NULL;
34 /* Indicates total number of configurations */
35 int mv2_alltoall_num_ppn_conf = 1;
/* Number of tuning records available for each ppn configuration. */
36 int *mv2_size_alltoall_tuning_table = NULL;
/* One pointer per ppn configuration to that configuration's tuning records. */
37 mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL;
/* Map the MVAPICH2 algorithm names used in the tables onto SMPI functions.
 * Note that the "inplace" name resolves to the ring alltoall. */
40 #define MPIR_Alltoall_bruck_MV2 smpi_coll_tuned_alltoall_bruck
41 #define MPIR_Alltoall_RD_MV2 smpi_coll_tuned_alltoall_rdb
42 #define MPIR_Alltoall_Scatter_dest_MV2 smpi_coll_tuned_alltoall_mvapich2_scatter_dest
43 #define MPIR_Alltoall_pairwise_MV2 smpi_coll_tuned_alltoall_pair
44 #define MPIR_Alltoall_inplace_MV2 smpi_coll_tuned_alltoall_ring
/* Builds the alltoall tuning tables.
 *
 * Sets mv2_alltoall_num_ppn_conf to 3 and allocates the global pointer, size
 * and ppn arrays with xbt_malloc. Three local tables are defined (ppn 1, 2 and
 * 16, holding 6, 6 and 7 records) and then copied back-to-back into a single
 * xbt_malloc'd buffer; mv2_alltoall_thresholds_table[i] is set to the start of
 * configuration i inside that buffer.
 *
 * No release of previously assigned globals is visible here, and the loop
 * index i is declared on a line that is not part of this excerpt. */
47 static void init_mv2_alltoall_tables_stampede(){
49 int agg_table_sum = 0;
50 mv2_alltoall_tuning_table **table_ptrs = NULL;
51 mv2_alltoall_num_ppn_conf = 3;
52 mv2_alltoall_thresholds_table = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
53 * mv2_alltoall_num_ppn_conf);
/* Scratch array holding the address of each local table until it is copied. */
54 table_ptrs = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
55 * mv2_alltoall_num_ppn_conf);
56 mv2_size_alltoall_tuning_table = xbt_malloc(sizeof(int) *
57 mv2_alltoall_num_ppn_conf);
58 mv2_alltoall_table_ppn_conf = xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int));
/* Configuration 0: 1 process per node, 6 records. */
59 mv2_alltoall_table_ppn_conf[0] = 1;
60 mv2_size_alltoall_tuning_table[0] = 6;
/* Each visible entry is {integer, integer, &algorithm}; the integers look like
 * a lower and an upper bound with -1 presumably meaning "no upper bound" —
 * TODO confirm against the selection code. */
61 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
64 {{0, -1, &MPIR_Alltoall_pairwise_MV2},
67 {{0, -1, &MPIR_Alltoall_inplace_MV2},
73 {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
74 {262144, -1, &MPIR_Alltoall_pairwise_MV2},
77 {{0, -1, &MPIR_Alltoall_inplace_MV2},
83 {{0, 8, &MPIR_Alltoall_RD_MV2},
84 {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
87 {{0, -1, &MPIR_Alltoall_inplace_MV2},
93 {{0, 64, &MPIR_Alltoall_RD_MV2},
94 {64, 512, &MPIR_Alltoall_bruck_MV2},
95 {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
98 {{0,-1, &MPIR_Alltoall_inplace_MV2},
104 {{0, 32, &MPIR_Alltoall_RD_MV2},
105 {32, 2048, &MPIR_Alltoall_bruck_MV2},
106 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
109 {{0, -1, &MPIR_Alltoall_inplace_MV2},
115 {{0, 8, &MPIR_Alltoall_RD_MV2},
116 {8, 1024, &MPIR_Alltoall_bruck_MV2},
117 {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
120 {{0, -1, &MPIR_Alltoall_inplace_MV2},
124 table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
/* Configuration 1: 2 processes per node, 6 records. */
125 mv2_alltoall_table_ppn_conf[1] = 2;
126 mv2_size_alltoall_tuning_table[1] = 6;
127 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
130 {{0, 32, &MPIR_Alltoall_RD_MV2},
131 {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
134 {{0, -1, &MPIR_Alltoall_inplace_MV2},
140 {{0, 64, &MPIR_Alltoall_RD_MV2},
141 {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
144 {{0, -1, &MPIR_Alltoall_inplace_MV2},
150 {{0, 64, &MPIR_Alltoall_RD_MV2},
151 {64, 2048, &MPIR_Alltoall_bruck_MV2},
152 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
155 {{0,-1, &MPIR_Alltoall_inplace_MV2},
161 {{0, 16, &MPIR_Alltoall_RD_MV2},
162 {16, 2048, &MPIR_Alltoall_bruck_MV2},
163 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
166 {{0, -1, &MPIR_Alltoall_inplace_MV2},
172 {{0, 8, &MPIR_Alltoall_RD_MV2},
173 {8, 1024, &MPIR_Alltoall_bruck_MV2},
174 {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
177 {{0, -1, &MPIR_Alltoall_inplace_MV2},
183 {{0, 4, &MPIR_Alltoall_RD_MV2},
184 {4, 2048, &MPIR_Alltoall_bruck_MV2},
185 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
188 {{0, -1, &MPIR_Alltoall_inplace_MV2},
192 table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
/* Configuration 2: 16 processes per node, 7 records. */
193 mv2_alltoall_table_ppn_conf[2] = 16;
194 mv2_size_alltoall_tuning_table[2] = 7;
/* NOTE(review): unlike the 1-ppn and 2-ppn tables, the in-place rows below
 * start at 16384 or 32768 instead of 0, and several stop at 65536 or 131072
 * instead of -1. Confirm how the selection code treats sizes outside them. */
195 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
198 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
199 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
202 {{32768, -1, &MPIR_Alltoall_inplace_MV2},
208 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
209 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
212 {{16384, -1, &MPIR_Alltoall_inplace_MV2},
218 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
219 {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
220 {16384, -1, &MPIR_Alltoall_pairwise_MV2},
223 {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
229 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
230 {2048, -1, &MPIR_Alltoall_pairwise_MV2},
233 {{16384,65536, &MPIR_Alltoall_inplace_MV2},
239 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
240 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
243 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
249 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
250 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
253 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
258 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
259 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
262 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
267 table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
/* Total number of records across all configurations. */
269 for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
270 agg_table_sum += mv2_size_alltoall_tuning_table[i];
/* One buffer holds every record; slot 0 owns the allocation. */
272 mv2_alltoall_thresholds_table[0] =
273 xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table));
274 memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
275 (sizeof(mv2_alltoall_tuning_table)
276 * mv2_size_alltoall_tuning_table[0]));
/* Each later slot starts right after the previous configuration's records. */
277 for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
278 mv2_alltoall_thresholds_table[i] =
279 mv2_alltoall_thresholds_table[i - 1]
280 + mv2_size_alltoall_tuning_table[i - 1];
281 memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
282 (sizeof(mv2_alltoall_tuning_table)
283 * mv2_size_alltoall_tuning_table[i]));
/* Only the scratch pointer array is released; the tables it referenced are
 * locals of this function and have already been copied. */
285 xbt_free(table_ptrs);
291 /************ Allgather variables and initializers */
/* One allgather threshold entry. Only part of the algorithm pointer's
 * parameter list is visible in this excerpt. */
296 int (*MV2_pt_Allgather_function)(void *sendbuf,
298 MPI_Datatype sendtype,
301 MPI_Datatype recvtype, MPI_Comm comm_ptr);
302 } mv2_allgather_tuning_element;
/* One allgather tuning record: an int flag array named two_level, the count of
 * inter_leader entries, and the entries themselves. Members declared before
 * two_level are not visible in this excerpt. */
306 int two_level[MV2_MAX_NB_THRESHOLDS];
307 int size_inter_table;
308 mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
309 } mv2_allgather_tuning_table;
/* Pointer to an allgather implementation; no initializer is given here. */
311 int (*MV2_Allgather_function)(void *sendbuf,
313 MPI_Datatype sendtype,
316 MPI_Datatype recvtype, MPI_Comm comm);
/* Processes-per-node value of each configuration. */
318 int *mv2_allgather_table_ppn_conf = NULL;
/* Number of ppn configurations. */
319 int mv2_allgather_num_ppn_conf = 1;
/* Number of tuning records available for each ppn configuration. */
320 int *mv2_size_allgather_tuning_table = NULL;
/* One pointer per ppn configuration to that configuration's tuning records. */
321 mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL;
/* File-local allgather variant referenced by the 16-ppn allgather table.
 * Its body is not visible in this excerpt, so its behavior is undocumented
 * here — TODO describe once the full definition is at hand. */
323 static int MPIR_Allgather_RD_Allgather_Comm_MV2( void *sendbuf,
325 MPI_Datatype sendtype,
328 MPI_Datatype recvtype, MPI_Comm comm_ptr)
/* Map the MVAPICH2 allgather algorithm names onto SMPI functions. */
333 #define MPIR_Allgather_Bruck_MV2 smpi_coll_tuned_allgather_bruck
334 #define MPIR_Allgather_RD_MV2 smpi_coll_tuned_allgather_rdb
335 #define MPIR_Allgather_Ring_MV2 smpi_coll_tuned_allgather_ring
336 #define MPIR_2lvl_Allgather_MV2 smpi_coll_tuned_allgather_mvapich2_smp
/* Builds the allgather tuning tables.
 *
 * Sets mv2_allgather_num_ppn_conf to 3 and allocates the global pointer, size
 * and ppn arrays with xbt_malloc. Three local tables are defined (ppn 1, 2 and
 * 16, each declared as 6 records) and copied back-to-back into one
 * xbt_malloc'd buffer; mv2_allgather_thresholds_table[i] is set to the start
 * of configuration i inside that buffer.
 *
 * The loop index i is declared on a line that is not part of this excerpt. */
338 static void init_mv2_allgather_tables_stampede(){
340 int agg_table_sum = 0;
341 mv2_allgather_tuning_table **table_ptrs = NULL;
342 mv2_allgather_num_ppn_conf = 3;
343 mv2_allgather_thresholds_table
344 = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
345 * mv2_allgather_num_ppn_conf);
/* Scratch array holding the address of each local table until it is copied. */
346 table_ptrs = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
347 * mv2_allgather_num_ppn_conf);
348 mv2_size_allgather_tuning_table = xbt_malloc(sizeof(int) *
349 mv2_allgather_num_ppn_conf);
350 mv2_allgather_table_ppn_conf
351 = xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int));
/* Configuration 0: 1 process per node, 6 records. */
352 mv2_allgather_table_ppn_conf[0] = 1;
353 mv2_size_allgather_tuning_table[0] = 6;
/* Each visible entry is {integer, integer, &algorithm}; the integers look like
 * a lower and an upper bound with -1 presumably meaning "no upper bound" —
 * TODO confirm against the selection code. */
354 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
360 {0, -1, &MPIR_Allgather_Ring_MV2},
368 {0, 262144, &MPIR_Allgather_RD_MV2},
369 {262144, -1, &MPIR_Allgather_Ring_MV2},
377 {0, 131072, &MPIR_Allgather_RD_MV2},
378 {131072, -1, &MPIR_Allgather_Ring_MV2},
386 {0, 131072, &MPIR_Allgather_RD_MV2},
387 {131072, -1, &MPIR_Allgather_Ring_MV2},
395 {0, 65536, &MPIR_Allgather_RD_MV2},
396 {65536, -1, &MPIR_Allgather_Ring_MV2},
404 {0, 32768, &MPIR_Allgather_RD_MV2},
405 {32768, -1, &MPIR_Allgather_Ring_MV2},
409 table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
/* Configuration 1: 2 processes per node, 6 records. */
410 mv2_allgather_table_ppn_conf[1] = 2;
411 mv2_size_allgather_tuning_table[1] = 6;
/* In most groups below the last two rows name the same ring algorithm, so the
 * 524288 split does not change which function is chosen. */
412 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
418 {0, 524288, &MPIR_Allgather_RD_MV2},
419 {524288, -1, &MPIR_Allgather_Ring_MV2},
427 {0, 32768, &MPIR_Allgather_RD_MV2},
428 {32768, 524288, &MPIR_Allgather_Ring_MV2},
429 {524288, -1, &MPIR_Allgather_Ring_MV2},
437 {0, 16384, &MPIR_Allgather_RD_MV2},
438 {16384, 524288, &MPIR_Allgather_Ring_MV2},
439 {524288, -1, &MPIR_Allgather_Ring_MV2},
447 {0, 65536, &MPIR_Allgather_RD_MV2},
448 {65536, 524288, &MPIR_Allgather_Ring_MV2},
449 {524288, -1, &MPIR_Allgather_Ring_MV2},
457 {0, 32768, &MPIR_Allgather_RD_MV2},
458 {32768, 524288, &MPIR_Allgather_Ring_MV2},
459 {524288, -1, &MPIR_Allgather_Ring_MV2},
467 {0, 65536, &MPIR_Allgather_RD_MV2},
468 {65536, 524288, &MPIR_Allgather_Ring_MV2},
469 {524288, -1, &MPIR_Allgather_Ring_MV2},
473 table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
/* Configuration 2: 16 processes per node, 6 records. */
474 mv2_allgather_table_ppn_conf[2] = 16;
475 mv2_size_allgather_tuning_table[2] = 6;
476 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
482 {0, 1024, &MPIR_Allgather_RD_MV2},
483 {1024, -1, &MPIR_Allgather_Ring_MV2},
491 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
492 {1024, -1, &MPIR_Allgather_Ring_MV2},
500 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
501 {1024, -1, &MPIR_Allgather_Ring_MV2},
509 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
510 {1024, -1, &MPIR_Allgather_Ring_MV2},
518 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
519 {1024, -1, &MPIR_Allgather_Ring_MV2},
527 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
528 {1024, -1, &MPIR_Allgather_Ring_MV2},
533 table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
/* Total number of records across all configurations. */
535 for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
536 agg_table_sum += mv2_size_allgather_tuning_table[i];
/* One buffer holds every record; slot 0 owns the allocation. */
538 mv2_allgather_thresholds_table[0] =
539 xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table));
540 memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
541 (sizeof(mv2_allgather_tuning_table)
542 * mv2_size_allgather_tuning_table[0]));
/* Each later slot starts right after the previous configuration's records. */
543 for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
544 mv2_allgather_thresholds_table[i] =
545 mv2_allgather_thresholds_table[i - 1]
546 + mv2_size_allgather_tuning_table[i - 1];
547 memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
548 (sizeof(mv2_allgather_tuning_table)
549 * mv2_size_allgather_tuning_table[i]));
/* Only the scratch pointer array is released; the tables it referenced are
 * locals of this function and have already been copied. */
551 xbt_free(table_ptrs);
555 /************ Gather variables and initializers */
/* One gather threshold entry; members declared before the algorithm pointer
 * are not visible in this excerpt. */
560 int (*MV2_pt_Gather_function)(void *sendbuf, int sendcnt,
561 MPI_Datatype sendtype, void *recvbuf, int recvcnt,
562 MPI_Datatype recvtype, int root, MPI_Comm comm_ptr);
563 } mv2_gather_tuning_element;
/* One gather tuning record: a counted array of inter_leader entries followed
 * by a counted array of intra_node entries. Members declared before
 * size_inter_table are not visible in this excerpt. */
568 int size_inter_table;
569 mv2_gather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
570 int size_intra_table;
571 mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
572 } mv2_gather_tuning_table;
/* Number of gather tuning records; the Stampede initializer assigns the same
 * value again. */
574 int mv2_size_gather_tuning_table=7;
/* Heap copy of the gather tuning records; NULL until initialized. */
575 mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL;
/* Signature shared by the gather algorithm pointers declared below. */
577 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
579 MPI_Datatype sendtype,
582 MPI_Datatype recvtype,
583 int root, MPI_Comm comm);
/* Gather algorithm pointers, one named for the inter-leader step and one for
 * the intra-node step; both start out NULL. */
585 MV2_Gather_function_ptr MV2_Gather_inter_leader_function = NULL;
586 MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL;
/* Map the MVAPICH2 gather algorithm names onto SMPI functions. */
589 #define MPIR_Gather_MV2_Direct smpi_coll_tuned_gather_ompi_basic_linear
590 #define MPIR_Gather_MV2_two_level_Direct smpi_coll_tuned_gather_mvapich2_two_level
591 #define MPIR_Gather_intra smpi_coll_tuned_gather_mpich
/* Builds the gather tuning table.
 *
 * Sets mv2_size_gather_tuning_table to 7, allocates that many records with
 * xbt_malloc into mv2_gather_thresholds_table, and fills them by memcpy from a
 * local initializer. Each visible record lists a count followed by that many
 * inter-leader entries, then a count followed by the intra-node entries.
 * No release of a previously assigned table is visible here. */
594 static void init_mv2_gather_tables_stampede(){
596 mv2_size_gather_tuning_table=7;
597 mv2_gather_thresholds_table = xbt_malloc(mv2_size_gather_tuning_table*
598 sizeof (mv2_gather_tuning_table));
/* Each entry is {integer, integer, &algorithm}; the integers look like a lower
 * and an upper bound with -1 presumably meaning "no upper bound" —
 * TODO confirm against the selection code. */
599 mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
601 2,{{0, 524288, &MPIR_Gather_MV2_Direct},
602 {524288, -1, &MPIR_Gather_intra}},
603 1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
605 3,{{0, 16384, &MPIR_Gather_MV2_Direct},
606 {16384, 131072, &MPIR_Gather_intra},
607 {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
608 1,{{0, -1, &MPIR_Gather_intra}}},
610 3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct},
611 {256, 16384, &MPIR_Gather_MV2_Direct},
/* NOTE(review): this row starts at 256 although the previous row ends at
 * 16384; the other records continue from the previous upper bound. Looks like
 * it was meant to be 16384 — confirm against the upstream MVAPICH2 tuning. */
612 {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
613 1,{{0, -1, &MPIR_Gather_intra}}},
615 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
616 {512, 16384, &MPIR_Gather_MV2_Direct},
617 {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
618 1,{{0, -1, &MPIR_Gather_intra}}},
620 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
621 {512, 16384, &MPIR_Gather_MV2_Direct},
622 {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
623 1,{{0, -1, &MPIR_Gather_intra}}},
625 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
626 {512, 16384, &MPIR_Gather_MV2_Direct},
/* NOTE(review): 8196 overlaps the previous row (which ends at 16384) and is
 * not a power of two like the other bounds; possibly 8192 or 16384 was
 * intended — confirm against the upstream MVAPICH2 tuning. */
627 {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
628 1,{{0, -1, &MPIR_Gather_intra}}},
630 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
631 {512, 16384, &MPIR_Gather_MV2_Direct},
/* NOTE(review): same 8196 lower bound as the previous record — confirm. */
632 {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
633 1,{{0, -1, &MPIR_Gather_intra}}},
/* Copy all seven records from the local initializer into the heap table. */
636 memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
637 mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
642 /************ Allgatherv variables and initializers */
/* One allgatherv threshold entry. Only part of the algorithm pointer's
 * parameter list is visible in this excerpt. */
647 int (*MV2_pt_Allgatherv_function)(void *sendbuf,
649 MPI_Datatype sendtype,
653 MPI_Datatype recvtype,
655 } mv2_allgatherv_tuning_element;
/* One allgatherv tuning record: a counted array of inter_leader entries.
 * Members declared before size_inter_table are not visible in this excerpt. */
659 int size_inter_table;
660 mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
661 } mv2_allgatherv_tuning_table;
/* Pointer to an allgatherv implementation; the end of this declaration is not
 * visible in this excerpt. */
663 int (*MV2_Allgatherv_function)(void *sendbuf,
665 MPI_Datatype sendtype,
669 MPI_Datatype recvtype,
/* Number of allgatherv tuning records; 0 until initialized. */
672 int mv2_size_allgatherv_tuning_table = 0;
/* Heap copy of the allgatherv tuning records; NULL until initialized. */
673 mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL;
/* Map the MVAPICH2 allgatherv algorithm names onto SMPI functions. */
675 #define MPIR_Allgatherv_Rec_Doubling_MV2 smpi_coll_tuned_allgatherv_mpich_rdb
676 #define MPIR_Allgatherv_Bruck_MV2 smpi_coll_tuned_allgatherv_ompi_bruck
677 #define MPIR_Allgatherv_Ring_MV2 smpi_coll_tuned_allgatherv_mpich_ring
/* Builds the allgatherv tuning table.
 *
 * Sets mv2_size_allgatherv_tuning_table to 6, allocates that many records with
 * xbt_malloc into mv2_allgatherv_thresholds_table, and fills them by memcpy
 * from a local initializer. Every visible record selects recursive doubling
 * up to a threshold (512 or 256) and the ring algorithm above it. */
680 static void init_mv2_allgatherv_tables_stampede(){
681 mv2_size_allgatherv_tuning_table = 6;
682 mv2_allgatherv_thresholds_table = xbt_malloc(mv2_size_allgatherv_tuning_table *
683 sizeof (mv2_allgatherv_tuning_table));
/* Each visible entry is {integer, integer, &algorithm}; -1 presumably means
 * "no upper bound" — TODO confirm against the selection code. */
684 mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
689 {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
690 {512, -1, &MPIR_Allgatherv_Ring_MV2},
697 {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
698 {512, -1, &MPIR_Allgatherv_Ring_MV2},
705 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
706 {256, -1, &MPIR_Allgatherv_Ring_MV2},
713 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
714 {256, -1, &MPIR_Allgatherv_Ring_MV2},
721 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
722 {256, -1, &MPIR_Allgatherv_Ring_MV2},
729 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
730 {256, -1, &MPIR_Allgatherv_Ring_MV2},
/* Copy all six records from the local initializer into the heap table. */
735 memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
736 mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
740 /************ Allreduce variables and initializers */
/* One allreduce threshold entry. Only part of the algorithm pointer's
 * parameter list is visible in this excerpt. */
745 int (*MV2_pt_Allreduce_function)(void *sendbuf,
748 MPI_Datatype datatype,
749 MPI_Op op, MPI_Comm comm);
750 } mv2_allreduce_tuning_element;
/* One allreduce tuning record: an int flag array named
 * is_two_level_allreduce, a counted array of inter_leader entries and a
 * counted array of intra_node entries. Members declared before the flag array
 * are not visible in this excerpt. */
755 int is_two_level_allreduce[MV2_MAX_NB_THRESHOLDS];
756 int size_inter_table;
757 mv2_allreduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
758 int size_intra_table;
759 mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
760 } mv2_allreduce_tuning_table;
/* Pointer to an allreduce implementation; starts out NULL. */
763 int (*MV2_Allreduce_function)(void *sendbuf,
766 MPI_Datatype datatype,
767 MPI_Op op, MPI_Comm comm)=NULL;
/* Pointer to an allreduce implementation named for the intra-node step;
 * starts out NULL. */
770 int (*MV2_Allreduce_intra_function)( void *sendbuf,
773 MPI_Datatype datatype,
774 MPI_Op op, MPI_Comm comm)=NULL;
/* Number of allreduce tuning records; 0 until initialized. */
776 int mv2_size_allreduce_tuning_table = 0;
/* Heap copy of the allreduce tuning records; NULL until initialized. */
777 mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table = NULL;
/* File-local allreduce variant referenced by the last record of the allreduce
 * tuning table. Its body is not visible in this excerpt, so its behavior is
 * undocumented here — TODO describe once the full definition is at hand. */
783 static int MPIR_Allreduce_mcst_reduce_two_level_helper_MV2( void *sendbuf,
786 MPI_Datatype datatype,
787 MPI_Op op, MPI_Comm comm)
/* File-local allreduce variant. Its body is not visible in this excerpt and no
 * visible table row references it — TODO describe once the full definition is
 * at hand. */
792 static int MPIR_Allreduce_mcst_reduce_redscat_gather_MV2( void *sendbuf,
795 MPI_Datatype datatype,
796 MPI_Op op, MPI_Comm comm)
/* Intra-node allreduce step used by the tuning tables for larger sizes.
 * The visible body forwards every argument to mpi_coll_reduce_fun with a
 * literal 0 in the sixth position (presumably the root rank — confirm against
 * that function's prototype). The return statement is not visible here. */
801 static int MPIR_Allreduce_reduce_p2p_MV2( void *sendbuf,
804 MPI_Datatype datatype,
805 MPI_Op op, MPI_Comm comm)
807 mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
/* Intra-node allreduce step used by the tuning tables for small sizes.
 * The visible body makes the same mpi_coll_reduce_fun call as
 * MPIR_Allreduce_reduce_p2p_MV2, with a literal 0 in the sixth position
 * (presumably the root rank — confirm). The return statement is not visible
 * here. */
811 static int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
814 MPI_Datatype datatype,
815 MPI_Op op, MPI_Comm comm)
817 mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
/* Map the MVAPICH2 allreduce algorithm names onto SMPI functions. */
821 #define MPIR_Allreduce_pt2pt_rd_MV2 smpi_coll_tuned_allreduce_rdb
822 #define MPIR_Allreduce_pt2pt_rs_MV2 smpi_coll_tuned_allreduce_mvapich2_rs
823 #define MPIR_Allreduce_two_level_MV2 smpi_coll_tuned_allreduce_mvapich2_two_level
/* Builds the allreduce tuning table.
 *
 * Sets mv2_size_allreduce_tuning_table to 8, allocates that many records with
 * xbt_malloc into mv2_allreduce_thresholds_table, and fills them by memcpy
 * from a local initializer. In each visible record the first group of rows
 * names pt2pt algorithms and the second group names the file-local
 * reduce_shmem / reduce_p2p helpers. */
826 static void init_mv2_allreduce_tables_stampede(){
827 mv2_size_allreduce_tuning_table = 8;
828 mv2_allreduce_thresholds_table = xbt_malloc(mv2_size_allreduce_tuning_table *
829 sizeof (mv2_allreduce_tuning_table));
/* Each visible entry is {integer, integer, &algorithm}; -1 presumably means
 * "no upper bound" — TODO confirm against the selection code. Several
 * adjacent rows name the same algorithm, so those splits do not change which
 * function is chosen. */
830 mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
837 {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
838 {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
842 {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
843 {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
852 {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
853 {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
854 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
858 {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
859 {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
868 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
869 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
870 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
874 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
875 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
884 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
885 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
886 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
890 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
891 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
900 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
901 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
902 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
906 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
907 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
916 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
917 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
918 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
922 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
923 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
932 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
933 {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
934 {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
935 {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
939 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
940 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
949 {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
950 {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
951 {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
952 {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
953 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
957 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
958 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
/* Copy all eight records from the local initializer into the heap table. */
963 memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
964 mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
/* One bcast threshold entry: an algorithm pointer followed by an int named
 * zcpy_pipelined_knomial_factor. Members declared before the pointer are not
 * visible in this excerpt. */
973 int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
974 int root, MPI_Comm comm_ptr);
975 int zcpy_pipelined_knomial_factor;
976 } mv2_bcast_tuning_element;
/* One bcast tuning record: segment size, two knomial factors, an int flag
 * array named is_two_level_bcast, then counted inter_leader and intra_node
 * entry arrays. Members declared before bcast_segment_size are not visible in
 * this excerpt. */
980 int bcast_segment_size;
981 int intra_node_knomial_factor;
982 int inter_node_knomial_factor;
983 int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS];
984 int size_inter_table;
985 mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
986 int size_intra_table;
987 mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
988 } mv2_bcast_tuning_table;
/* Number of bcast tuning records; 0 until initialized. */
990 int mv2_size_bcast_tuning_table = 0;
/* Heap copy of the bcast tuning records; NULL until initialized. */
991 mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
/* Pointer to a bcast implementation; starts out NULL. */
994 int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
995 int root, MPI_Comm comm_ptr) = NULL;
/* Pointer to a bcast implementation named for the intra-node step; starts out
 * NULL. */
997 int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
998 int root, MPI_Comm comm_ptr) = NULL;
/* Default bcast parameters. How they are consumed is not visible in this
 * excerpt. */
1000 int zcpy_knomial_factor = 2;
1001 int mv2_pipelined_zcpy_knomial_factor = -1;
1002 int bcast_segment_size = 8192;
1003 int mv2_inter_node_knomial_factor = 4;
1004 int mv2_intra_node_knomial_factor = 4;
1005 #define INTRA_NODE_ROOT 0
/* Every MVAPICH2 bcast algorithm name below expands to the same SMPI function,
 * smpi_coll_tuned_bcast_mpich. */
1007 #define MPIR_Pipelined_Bcast_Zcpy_MV2 smpi_coll_tuned_bcast_mpich
1008 #define MPIR_Pipelined_Bcast_MV2 smpi_coll_tuned_bcast_mpich
1009 #define MPIR_Bcast_binomial_MV2 smpi_coll_tuned_bcast_mpich
1010 #define MPIR_Bcast_scatter_ring_allgather_shm_MV2 smpi_coll_tuned_bcast_mpich
1011 #define MPIR_Bcast_scatter_doubling_allgather_MV2 smpi_coll_tuned_bcast_mpich
1012 #define MPIR_Bcast_scatter_ring_allgather_MV2 smpi_coll_tuned_bcast_mpich
1013 #define MPIR_Shmem_Bcast_MV2 smpi_coll_tuned_bcast_mpich
1014 #define MPIR_Bcast_tune_inter_node_helper_MV2 smpi_coll_tuned_bcast_mpich
1015 #define MPIR_Knomial_Bcast_intra_node_MV2 smpi_coll_tuned_bcast_mpich
1016 #define MPIR_Bcast_intra_MV2 smpi_coll_tuned_bcast_mpich
/* Builds the bcast tuning table.
 *
 * Sets mv2_size_bcast_tuning_table to 8, allocates that many records with
 * xbt_malloc into mv2_bcast_thresholds_table, and fills them by memcpy from a
 * local initializer. In each visible record the first group of rows names
 * pipelined/scatter algorithms and the second group names
 * MPIR_Shmem_Bcast_MV2 with matching bounds. */
1018 static void init_mv2_bcast_tables_stampede(){
1020 mv2_size_bcast_tuning_table=8;
1021 mv2_bcast_thresholds_table = xbt_malloc(mv2_size_bcast_tuning_table *
1022 sizeof (mv2_bcast_tuning_table));
/* Each visible entry is {integer, integer, &algorithm, integer}; the first two
 * look like a lower and an upper bound with -1 presumably meaning "no upper
 * bound", and the last is the entry's zcpy_pipelined_knomial_factor —
 * TODO confirm the bound semantics against the selection code. */
1024 mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
1028 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
1031 {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1032 {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1033 {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1034 {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1035 {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1036 {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1037 {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1038 {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1039 {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1040 {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1041 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1045 {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1046 {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1047 {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1048 {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1049 {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1050 {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1051 {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1052 {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1053 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1054 {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1055 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1061 {1, 1, 1, 1, 1, 1, 1, 1},
1064 {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1065 {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1066 {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1067 {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1068 {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1069 {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1070 {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1071 {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1075 {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1076 {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1077 {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1078 {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1079 {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1080 {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1081 {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1082 {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
1088 {1, 1, 1, 1, 1, 1, 1, 1, 1},
1091 {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1092 {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1093 {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1094 {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1095 {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1096 {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1097 {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1098 {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1099 {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1103 {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1104 {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1105 {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1106 {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1107 {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1108 {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1109 {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1110 {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1111 {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
1120 {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1121 {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1122 {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1123 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1127 {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1128 {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1129 {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
/* NOTE(review): this is the only row with a NULL algorithm pointer. Verify
 * that the code consuming the table checks for NULL before calling it. */
1130 {524288, -1, NULL, -1}
1139 {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1140 {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1141 {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1142 {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1143 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1147 {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1148 {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1149 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1150 {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1151 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1160 {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1161 {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1162 {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1163 {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1164 {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1168 {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1169 {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1170 {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1171 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1172 {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
1181 {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1182 {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1183 {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1184 {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1185 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1189 {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1190 {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1191 {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1192 {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1193 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1199 {1, 1, 1, 1, 1, 1, 1},
1202 {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1203 {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1204 {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1205 {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1206 {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1207 {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1208 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1212 {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1213 {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1214 {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1215 {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1216 {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1217 {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1218 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
/* Copy all eight records from the local initializer into the heap table. */
1223 memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1224 mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1228 /************ Reduce variables and initializers */
/* One reduce threshold entry. Only part of the algorithm pointer's parameter
 * list is visible in this excerpt. */
1233 int (*MV2_pt_Reduce_function)(void *sendbuf,
1236 MPI_Datatype datatype,
1240 } mv2_reduce_tuning_element;
/* One reduce tuning record: an int flag array named is_two_level_reduce, a
 * counted array of inter_leader entries and a counted array of intra_node
 * entries. Members declared before the flag array are not visible in this
 * excerpt. */
1246 int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS];
1247 int size_inter_table;
1248 mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1249 int size_intra_table;
1250 mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1251 } mv2_reduce_tuning_table;
/* Number of reduce tuning records; 0 until initialized. */
1253 int mv2_size_reduce_tuning_table = 0;
/* Heap copy of the reduce tuning records; NULL until initialized. */
1254 mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL;
/* Default knomial factors for reduce. How they are consumed is not visible in
 * this excerpt. */
1257 int mv2_reduce_intra_knomial_factor = 2;
1258 int mv2_reduce_inter_knomial_factor = 2;
/* Pointer to a reduce implementation; starts out NULL. */
1260 int (*MV2_Reduce_function)( void *sendbuf,
1263 MPI_Datatype datatype,
1266 MPI_Comm comm_ptr)=NULL;
/* Pointer to a reduce implementation named for the intra-node step; starts out
 * NULL. */
1268 int (*MV2_Reduce_intra_function)( void *sendbuf,
1271 MPI_Datatype datatype,
1274 MPI_Comm comm_ptr)=NULL;
/* Map the MVAPICH2 reduce algorithm names onto SMPI functions. The inter and
 * intra knomial wrappers expand to the same function. */
1277 #define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1278 #define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1279 #define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_binomial
1280 #define MPIR_Reduce_redscat_gather_MV2 smpi_coll_tuned_reduce_scatter_gather
1281 #define MPIR_Reduce_shmem_MV2 smpi_coll_tuned_reduce_ompi_basic_linear
1282 #define MPIR_Reduce_two_level_helper_MV2 smpi_coll_tuned_reduce_mvapich2_two_level
/* Builds the reduce tuning table.
 *
 * Sets mv2_size_reduce_tuning_table to 8, allocates that many records with
 * xbt_malloc into mv2_reduce_thresholds_table, and fills them by memcpy from a
 * local initializer. In each visible record the first group of rows names the
 * inter-knomial, binomial and redscat-gather algorithms, and the second group
 * names the shmem, intra-knomial and binomial algorithms. */
1285 static void init_mv2_reduce_tables_stampede(){
1287 mv2_size_reduce_tuning_table = 8;
1288 mv2_reduce_thresholds_table = xbt_malloc(mv2_size_reduce_tuning_table *
1289 sizeof (mv2_reduce_tuning_table));
/* Each visible entry is {integer, integer, &algorithm}; the integers look like
 * a lower and an upper bound with -1 presumably meaning "no upper bound" —
 * TODO confirm against the selection code. The bare {1, 1, ...} rows are
 * presumably the is_two_level_reduce flags of their record — confirm. */
1290 mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1298 {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1299 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1300 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1304 {0, 65536, &MPIR_Reduce_shmem_MV2},
1305 {65536,-1, &MPIR_Reduce_binomial_MV2},
1312 {1, 1, 1, 1, 0, 0, 0},
1315 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1316 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1317 {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1318 {32768, 65536, &MPIR_Reduce_binomial_MV2},
1319 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1320 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1321 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1325 {0, 8192, &MPIR_Reduce_shmem_MV2},
1326 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1327 {16384, 32768, &MPIR_Reduce_shmem_MV2},
1328 {32768, 65536, &MPIR_Reduce_shmem_MV2},
1329 {65536, 262144, &MPIR_Reduce_shmem_MV2},
1330 {262144,-1, &MPIR_Reduce_binomial_MV2},
1340 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1341 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1342 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1343 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1344 {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1348 {0, 8192, &MPIR_Reduce_shmem_MV2},
1349 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1350 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1351 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1352 {262144, -1, &MPIR_Reduce_binomial_MV2},
1362 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1363 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1364 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1365 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1366 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1367 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1371 {0, 8192, &MPIR_Reduce_shmem_MV2},
1372 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1373 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1374 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1375 {262144, -1, &MPIR_Reduce_binomial_MV2},
1382 {1, 1, 1, 0, 1, 1, 0},
1385 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1386 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1387 {16384, 32768, &MPIR_Reduce_binomial_MV2},
1388 {32768, 65536, &MPIR_Reduce_binomial_MV2},
1389 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1390 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1391 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1395 {0, 8192, &MPIR_Reduce_shmem_MV2},
1396 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1397 {16384, 32768, &MPIR_Reduce_shmem_MV2},
1398 {32768, 65536, &MPIR_Reduce_shmem_MV2},
1399 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1400 {262144, -1, &MPIR_Reduce_binomial_MV2},
1410 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1411 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1412 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1413 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1414 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1415 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1419 {0, 8192, &MPIR_Reduce_shmem_MV2},
1420 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1421 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1422 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1423 {262144, -1, &MPIR_Reduce_binomial_MV2},
1433 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1434 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1435 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1436 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1437 {262144, -1, &MPIR_Reduce_binomial_MV2},
1441 {0, 8192, &MPIR_Reduce_shmem_MV2},
1442 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1443 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1444 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1445 {262144, -1, &MPIR_Reduce_binomial_MV2},
1455 {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1456 {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1457 {4096, 16384, &MPIR_Reduce_binomial_MV2},
1458 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1459 {65536, 131072, &MPIR_Reduce_binomial_MV2},
1460 {131072, -1, &MPIR_Reduce_binomial_MV2},
1464 {0, 2048, &MPIR_Reduce_shmem_MV2},
1465 {2048, 4096, &MPIR_Reduce_shmem_MV2},
1466 {4096, 16384, &MPIR_Reduce_shmem_MV2},
1467 {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1468 {65536, 131072, &MPIR_Reduce_binomial_MV2},
1469 {131072, -1, &MPIR_Reduce_shmem_MV2},
/* Copy all eight records from the local initializer into the heap table. */
1474 memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1475 mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1478 /************ Reduce scatter variables and initializers */
/* Reduce-scatter implementation attached to one threshold row. The rows
 * written by init_mv2_reduce_scatter_tables_stampede() are initialised as
 * {number, number, &function}; only this function-pointer member and a
 * fragment of its parameter list are visible in this excerpt. */
1483 int (*MV2_pt_Red_scat_function)(void *sendbuf,
1486 MPI_Datatype datatype,
1489 } mv2_red_scat_tuning_element;
/* Presumably the number of inter_leader[] rows actually in use; the code
 * that reads this field is not part of this excerpt (TODO confirm). */
1493 int size_inter_table;
/* Threshold rows stored inline, with a fixed capacity of
 * MV2_MAX_NB_THRESHOLDS elements. */
1494 mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1495 } mv2_red_scat_tuning_table;
/* Number of entries in mv2_red_scat_thresholds_table; stays 0 until
 * init_mv2_reduce_scatter_tables_stampede() assigns it. */
1497 int mv2_size_red_scat_tuning_table = 0;
/* Heap-allocated copy of the reduce-scatter tuning entries; stays NULL until
 * init_mv2_reduce_scatter_tables_stampede() allocates and fills it. */
1498 mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL;
/* File-level function pointer whose visible parameters match those of the
 * per-row MV2_pt_Red_scat_function member. Nothing in this excerpt assigns
 * it; presumably the selector stores the chosen algorithm here
 * (TODO confirm against the selector code). */
1501 int (*MV2_Red_scat_function)(void *sendbuf,
1504 MPI_Datatype datatype,
/* Adapter that lets the tuning rows take the address of a function with the
 * MVAPICH2-style name: it passes its six arguments (sendbuf, recvbuf,
 * recvcnts, datatype, op, comm) straight to smpi_mpi_reduce_scatter().
 * The return statement is not visible in this excerpt. */
1510 static int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf,
1513 MPI_Datatype datatype,
1517 smpi_mpi_reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm);
/* Alias the MVAPICH2 reduce-scatter algorithm names onto smpi_coll_tuned_*
 * functions so the tuning rows can take their addresses with '&'.
 * MPIR_Reduce_scatter_non_comm_MV2 is defined here but is not referenced by
 * any row visible in the Stampede reduce-scatter table. */
1520 #define MPIR_Reduce_scatter_non_comm_MV2 smpi_coll_tuned_reduce_scatter_mpich_noncomm
1521 #define MPIR_Reduce_scatter_Rec_Halving_MV2 smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving
1522 #define MPIR_Reduce_scatter_Pair_Wise_MV2 smpi_coll_tuned_reduce_scatter_mpich_pair
/* Builds the reduce-scatter tuning table for the Stampede calibration.
 *
 * Sets the global entry count to 6, allocates that many
 * mv2_red_scat_tuning_table entries with xbt_malloc(), describes them in a
 * local array initializer and copies the initializer into the heap block.
 * Takes no arguments and returns nothing; its effect is on the globals
 * mv2_size_red_scat_tuning_table and mv2_red_scat_thresholds_table.
 *
 * NOTE(review): every call allocates a fresh block and overwrites the global
 * pointer; no guard against repeated calls is visible here. Confirm the
 * caller invokes it only once. */
1527 static void init_mv2_reduce_scatter_tables_stampede(){
1528 mv2_size_red_scat_tuning_table = 6;
1529 mv2_red_scat_thresholds_table = xbt_malloc(mv2_size_red_scat_tuning_table *
1530 sizeof (mv2_red_scat_tuning_table));
1531 mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
/* Six groups of threshold rows follow, one group per table entry; the header
 * fields of each entry are not visible in this excerpt. Each row reads
 * {lower, upper, &algorithm}; -1 in the upper slot appears only on the last
 * row of a group, presumably meaning "no upper limit" (TODO confirm). */
/* Groups 1-3: Basic, then recursive halving, then pairwise exchange. */
1536 {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1537 {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1538 {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1545 {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1546 {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1547 {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1554 {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1555 {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1556 {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
/* Groups 4-6: Basic, then recursive halving for everything above the
 * first threshold; pairwise exchange is not used. */
1563 {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1564 {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1571 {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1572 {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1579 {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1580 {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
/* Copy exactly mv2_size_red_scat_tuning_table entries, so the initializer
 * above must contain at least that many. */
1585 memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1586 mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1589 /************ Scatter variables and initializers */
/* Scatter implementation attached to one threshold row. The rows written by
 * init_mv2_scatter_tables_stampede() are initialised as
 * {number, number, &function}; only this function-pointer member and part of
 * its parameter list (send/receive datatypes, root, communicator) are
 * visible in this excerpt. */
1594 int (*MV2_pt_Scatter_function)(void *sendbuf,
1596 MPI_Datatype sendtype,
1599 MPI_Datatype recvtype,
1600 int root, MPI_Comm comm);
1601 } mv2_scatter_tuning_element;
/* One scatter tuning entry holds two independent row lists, each stored
 * inline with a fixed capacity of MV2_MAX_NB_THRESHOLDS elements. The size_*
 * fields presumably give how many rows of the following array are in use;
 * the code that reads them is not part of this excerpt (TODO confirm). */
1605 int size_inter_table;
/* Rows for the inter-leader stage. */
1606 mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1607 int size_intra_table;
/* Rows for the intra-node stage. */
1608 mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1609 } mv2_scatter_tuning_table;
/* Per-configuration value (1, 2 and 16 once init_mv2_scatter_tables_stampede()
 * has run). The matching alltoall variable at the top of this file is
 * documented as the number of processes per node. */
1612 int *mv2_scatter_table_ppn_conf = NULL;
/* Number of configurations; 1 by default, set to 3 by
 * init_mv2_scatter_tables_stampede(). */
1613 int mv2_scatter_num_ppn_conf = 1;
/* Per-configuration count of tuning entries. */
1614 int *mv2_size_scatter_tuning_table = NULL;
/* One pointer per configuration. After initialisation, element 0 owns a
 * single allocation and the remaining elements point inside that block. */
1615 mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL;
/* File-level function pointer with the scatter signature used by the tuning
 * rows; starts as NULL. Nothing in this excerpt assigns it; presumably the
 * selector stores the chosen inter-leader algorithm here (TODO confirm). */
1617 int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1618 void *recvbuf, int recvcount, MPI_Datatype recvtype,
1619 int root, MPI_Comm comm)=NULL;
/* Companion of MV2_Scatter_function with the same signature; starts as NULL.
 * Presumably receives the algorithm chosen from the intra_node rows; the
 * assignment is not part of this excerpt (TODO confirm). */
1621 int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1622 void *recvbuf, int recvcount, MPI_Datatype recvtype,
1623 int root, MPI_Comm comm)=NULL;
/* Forward declaration of MPIR_Scatter_mcst_wrap_MV2, immediately followed by
 * its definition. The function is used as the first inter-leader row of
 * several entries of the 16-ppn scatter table. */
1624 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1626 MPI_Datatype sendtype,
1629 MPI_Datatype recvtype,
1630 int root, MPI_Comm comm_ptr);
/* Definition. The body is not visible in this excerpt, so what the wrapper
 * does cannot be stated here. */
1632 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1634 MPI_Datatype sendtype,
1637 MPI_Datatype recvtype,
1638 int root, MPI_Comm comm_ptr)
/* Alias the MVAPICH2 scatter algorithm names onto smpi_coll_tuned_scatter_*
 * functions so the tuning rows can take their addresses with '&'. */
1643 #define MPIR_Scatter_MV2_Binomial smpi_coll_tuned_scatter_ompi_binomial
1644 #define MPIR_Scatter_MV2_Direct smpi_coll_tuned_scatter_ompi_basic_linear
1645 #define MPIR_Scatter_MV2_two_level_Binomial smpi_coll_tuned_scatter_mvapich2_two_level_binomial
1646 #define MPIR_Scatter_MV2_two_level_Direct smpi_coll_tuned_scatter_mvapich2_two_level_direct
/* Builds the scatter tuning tables for the Stampede calibration.
 *
 * Three configurations are registered (values 1, 2 and 16 in
 * mv2_scatter_table_ppn_conf) with 6, 6 and 8 tuning entries respectively.
 * Each configuration is first written as a local array initializer; all of
 * them are then copied back to back into one heap block, and
 * mv2_scatter_thresholds_table[i] is pointed at the start of configuration i
 * inside that block.
 *
 * Takes no arguments and returns nothing; its effect is on the globals
 * mv2_scatter_num_ppn_conf, mv2_scatter_thresholds_table,
 * mv2_size_scatter_tuning_table and mv2_scatter_table_ppn_conf.
 *
 * Row layout: {lower, upper, &algorithm}; -1 in the upper slot appears only
 * on the last row of a list, presumably meaning "no upper limit" (TODO
 * confirm). By the field order of mv2_scatter_tuning_table, the first row
 * list of every entry is inter_leader[] and the second is intra_node[]. The
 * header fields of each entry are not visible in this excerpt. */
1651 static void init_mv2_scatter_tables_stampede(){
1653 int agg_table_sum = 0;
/* Scratch array remembering where each local initializer lives; released at
 * the end of the function. */
1655 mv2_scatter_tuning_table **table_ptrs = NULL;
1656 mv2_scatter_num_ppn_conf = 3;
1657 mv2_scatter_thresholds_table
1658 = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1659 * mv2_scatter_num_ppn_conf);
1660 table_ptrs = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1661 * mv2_scatter_num_ppn_conf);
1662 mv2_size_scatter_tuning_table = xbt_malloc(sizeof(int) *
1663 mv2_scatter_num_ppn_conf);
1664 mv2_scatter_table_ppn_conf
1665 = xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int));
/* Configuration 0: value 1, six entries. */
1666 mv2_scatter_table_ppn_conf[0] = 1;
1667 mv2_size_scatter_tuning_table[0] = 6;
1668 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1672 {0, -1, &MPIR_Scatter_MV2_Binomial},
1676 {0, -1, &MPIR_Scatter_MV2_Binomial},
1683 {0, -1, &MPIR_Scatter_MV2_Direct},
1687 {0, -1, &MPIR_Scatter_MV2_Direct},
1694 {0, -1, &MPIR_Scatter_MV2_Direct},
1698 {0, -1, &MPIR_Scatter_MV2_Direct},
1705 {0, -1, &MPIR_Scatter_MV2_Direct},
1709 {0, -1, &MPIR_Scatter_MV2_Direct},
1716 {0, -1, &MPIR_Scatter_MV2_Direct},
1720 {0, -1, &MPIR_Scatter_MV2_Direct},
1727 {0, 32, &MPIR_Scatter_MV2_Binomial},
1728 {32, -1, &MPIR_Scatter_MV2_Direct},
1732 {0, -1, &MPIR_Scatter_MV2_Binomial},
1736 table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
/* Configuration 1: value 2, six entries. */
1737 mv2_scatter_table_ppn_conf[1] = 2;
1738 mv2_size_scatter_tuning_table[1] = 6;
1739 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
1743 {0, 4096, &MPIR_Scatter_MV2_Binomial},
1744 {4096, -1, &MPIR_Scatter_MV2_Direct},
1748 {0, -1, &MPIR_Scatter_MV2_Direct},
1755 {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1756 {512, -1, &MPIR_Scatter_MV2_Direct},
1760 {0, -1, &MPIR_Scatter_MV2_Binomial},
1767 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1768 {2048, -1, &MPIR_Scatter_MV2_Direct},
1772 {0, -1, &MPIR_Scatter_MV2_Binomial},
1779 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1780 {2048, -1, &MPIR_Scatter_MV2_Direct},
1784 {0, -1, &MPIR_Scatter_MV2_Binomial},
1791 {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1792 {8192, -1, &MPIR_Scatter_MV2_Direct},
1796 {0, -1, &MPIR_Scatter_MV2_Binomial},
1803 {0, 16, &MPIR_Scatter_MV2_Binomial},
1804 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1805 {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1806 {16384, -1, &MPIR_Scatter_MV2_Direct},
1810 {0, 128, &MPIR_Scatter_MV2_Direct},
1811 {128, -1, &MPIR_Scatter_MV2_Binomial},
1815 table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
/* Configuration 2: value 16, eight entries. From the fourth entry on, the
 * inter-leader list starts with MPIR_Scatter_mcst_wrap_MV2 followed by a
 * second row that covers the same 0..16 range. */
1816 mv2_scatter_table_ppn_conf[2] = 16;
1817 mv2_size_scatter_tuning_table[2] = 8;
1818 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
1823 {0, 256, &MPIR_Scatter_MV2_Binomial},
1824 {256, -1, &MPIR_Scatter_MV2_Direct},
1828 { 0, -1, &MPIR_Scatter_MV2_Direct},
1836 {0, 512, &MPIR_Scatter_MV2_Binomial},
1837 {512, -1, &MPIR_Scatter_MV2_Direct},
1841 { 0, -1, &MPIR_Scatter_MV2_Direct},
1849 {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1850 {1024, -1, &MPIR_Scatter_MV2_Direct},
1854 { 0, -1, &MPIR_Scatter_MV2_Direct},
1862 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1863 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1864 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1865 {2048, -1, &MPIR_Scatter_MV2_Direct},
1869 { 0, -1, &MPIR_Scatter_MV2_Direct},
1877 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1878 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1879 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1880 {2048, -1, &MPIR_Scatter_MV2_Direct},
1884 { 0, -1, &MPIR_Scatter_MV2_Direct},
1892 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
/* NOTE(review): the next row has lower bound 16 (an empty 16..16 range),
 * whereas the row following the mcst wrapper in the four sibling entries of
 * this table uses {0, 16, ...}. Possibly a typo for 0; confirm against the
 * upstream MVAPICH2 tuning data before changing. */
1893 {16, 16, &MPIR_Scatter_MV2_two_level_Direct},
1894 {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1895 {4096, -1, &MPIR_Scatter_MV2_Direct},
1899 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1906 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1907 {0, 16, &MPIR_Scatter_MV2_Binomial},
1908 {16, 32, &MPIR_Scatter_MV2_Binomial},
1909 {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1910 {4096, -1, &MPIR_Scatter_MV2_Direct},
1914 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1921 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1922 {0, 16, &MPIR_Scatter_MV2_two_level_Binomial},
1923 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1924 {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1925 {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1926 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1927 {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
1931 {0, 16, &MPIR_Scatter_MV2_Binomial},
1932 {16, 128, &MPIR_Scatter_MV2_Binomial},
1933 {128, 1024, &MPIR_Scatter_MV2_Binomial},
1934 {1024, 16384, &MPIR_Scatter_MV2_Direct},
1935 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1936 {65536, -1, &MPIR_Scatter_MV2_Direct},
1940 table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn;
/* Total number of entries across all configurations. */
1942 for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
1943 agg_table_sum += mv2_size_scatter_tuning_table[i];
/* One allocation holds every configuration; element 0 of the pointer array
 * owns it and receives the first configuration's entries. */
1945 mv2_scatter_thresholds_table[0] =
1946 xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table));
1947 memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
1948 (sizeof(mv2_scatter_tuning_table)
1949 * mv2_size_scatter_tuning_table[0]));
/* Each later configuration starts right after the previous one inside the
 * same block (pointer arithmetic in units of whole tuning entries). */
1950 for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
1951 mv2_scatter_thresholds_table[i] =
1952 mv2_scatter_thresholds_table[i - 1]
1953 + mv2_size_scatter_tuning_table[i - 1];
1954 memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
1955 (sizeof(mv2_scatter_tuning_table)
1956 * mv2_size_scatter_tuning_table[i]));
/* Only the scratch pointer array is freed; the tables it pointed to are the
 * local initializers above and have already been copied. */
1958 xbt_free(table_ptrs);