1 /* Selector for collective algorithms based on the MVAPICH decision logic, with calibration from the Stampede cluster at TACC. */
3 /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
4 * All rights reserved. */
6 /* This is the tuning used by MVAPICH for the Stampede platform, based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR). */
10 /************ Alltoall variables and initializers */
/* Fixed capacity of every per-entry range array declared in the tuning-table structs of this file. */
12 #define MV2_MAX_NB_THRESHOLDS 32
/* Last member of mv2_alltoall_tuning_element: the alltoall implementation attached to one range.
 * The table initializers supply two integers before it (presumably the lower and upper
 * message-size bounds, -1 standing for "no upper bound" -- TODO confirm against the selector). */
18 int (*MV2_pt_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
19 void *recvbuf, int recvcount, MPI_Datatype recvtype,
21 } mv2_alltoall_tuning_element;
/* One tuning entry holds two range lists: one for regular calls and one for in-place calls. */
26 mv2_alltoall_tuning_element algo_table[MV2_MAX_NB_THRESHOLDS];
27 mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS];
28 } mv2_alltoall_tuning_table;
/* Alltoall function pointer, NULL until something assigns it (the assignment is not in this part of the file). */
30 int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL;
32 /* Processes-per-node value of each configuration (one int per configuration) */
33 int *mv2_alltoall_table_ppn_conf = NULL;
34 /* Total number of processes-per-node configurations */
35 int mv2_alltoall_num_ppn_conf = 1;
/* Number of tuning entries of each configuration (one int per configuration). */
36 int *mv2_size_alltoall_tuning_table = NULL;
/* Per-configuration pointer to that configuration's array of tuning entries. */
37 mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL;
/* Map the MVAPICH algorithm names used in the tables onto SMPI implementations.
 * Note that the "inplace" name is served by the SMPI ring alltoall. */
40 #define MPIR_Alltoall_bruck_MV2 smpi_coll_tuned_alltoall_bruck
41 #define MPIR_Alltoall_RD_MV2 smpi_coll_tuned_alltoall_rdb
42 #define MPIR_Alltoall_Scatter_dest_MV2 smpi_coll_tuned_alltoall_mvapich2_scatter_dest
43 #define MPIR_Alltoall_pairwise_MV2 smpi_coll_tuned_alltoall_pair
44 #define MPIR_Alltoall_inplace_MV2 smpi_coll_tuned_alltoall_ring
/*
 * Builds the Stampede alltoall tuning tables.
 *
 * Three processes-per-node configurations are described (1, 2 and 16) with 6, 6 and 7
 * entries respectively. The entries are first written into local arrays and then copied
 * into a single heap buffer: mv2_alltoall_thresholds_table[0] points at the start of that
 * buffer and mv2_alltoall_thresholds_table[i] (i > 0) points just past the entries of
 * configuration i-1 inside it. smpi_coll_cleanup_mvapich2 is registered as cleanup
 * callback when no callback is installed yet.
 */
47 static void init_mv2_alltoall_tables_stampede(){
49 int agg_table_sum = 0;
50 mv2_alltoall_tuning_table **table_ptrs = NULL;
51 mv2_alltoall_num_ppn_conf = 3;
/* Install the cleanup hook only if nobody registered one before. */
52 if(smpi_coll_cleanup_callback==NULL)
53 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
/* Per-configuration arrays, all sized by mv2_alltoall_num_ppn_conf. */
54 mv2_alltoall_thresholds_table = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
55 * mv2_alltoall_num_ppn_conf);
56 table_ptrs = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
57 * mv2_alltoall_num_ppn_conf);
58 mv2_size_alltoall_tuning_table = xbt_malloc(sizeof(int) *
59 mv2_alltoall_num_ppn_conf);
60 mv2_alltoall_table_ppn_conf = xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int));
/* Configuration 0: 1 process per node, 6 entries. */
61 mv2_alltoall_table_ppn_conf[0] = 1;
62 mv2_size_alltoall_tuning_table[0] = 6;
63 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
/* Each entry lists ranges made of two integers and a function pointer; the first list
 * presumably fills algo_table and the second in_place_algo_table (struct order) -- TODO confirm. */
66 {{0, -1, &MPIR_Alltoall_pairwise_MV2},
69 {{0, -1, &MPIR_Alltoall_inplace_MV2},
75 {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
76 {262144, -1, &MPIR_Alltoall_pairwise_MV2},
79 {{0, -1, &MPIR_Alltoall_inplace_MV2},
85 {{0, 8, &MPIR_Alltoall_RD_MV2},
86 {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
89 {{0, -1, &MPIR_Alltoall_inplace_MV2},
95 {{0, 64, &MPIR_Alltoall_RD_MV2},
96 {64, 512, &MPIR_Alltoall_bruck_MV2},
97 {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
100 {{0,-1, &MPIR_Alltoall_inplace_MV2},
106 {{0, 32, &MPIR_Alltoall_RD_MV2},
107 {32, 2048, &MPIR_Alltoall_bruck_MV2},
108 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
111 {{0, -1, &MPIR_Alltoall_inplace_MV2},
117 {{0, 8, &MPIR_Alltoall_RD_MV2},
118 {8, 1024, &MPIR_Alltoall_bruck_MV2},
119 {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
122 {{0, -1, &MPIR_Alltoall_inplace_MV2},
126 table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
/* Configuration 1: 2 processes per node, 6 entries. */
127 mv2_alltoall_table_ppn_conf[1] = 2;
128 mv2_size_alltoall_tuning_table[1] = 6;
129 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
132 {{0, 32, &MPIR_Alltoall_RD_MV2},
133 {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
136 {{0, -1, &MPIR_Alltoall_inplace_MV2},
142 {{0, 64, &MPIR_Alltoall_RD_MV2},
143 {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
146 {{0, -1, &MPIR_Alltoall_inplace_MV2},
152 {{0, 64, &MPIR_Alltoall_RD_MV2},
153 {64, 2048, &MPIR_Alltoall_bruck_MV2},
154 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
157 {{0,-1, &MPIR_Alltoall_inplace_MV2},
163 {{0, 16, &MPIR_Alltoall_RD_MV2},
164 {16, 2048, &MPIR_Alltoall_bruck_MV2},
165 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
168 {{0, -1, &MPIR_Alltoall_inplace_MV2},
174 {{0, 8, &MPIR_Alltoall_RD_MV2},
175 {8, 1024, &MPIR_Alltoall_bruck_MV2},
176 {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
179 {{0, -1, &MPIR_Alltoall_inplace_MV2},
185 {{0, 4, &MPIR_Alltoall_RD_MV2},
186 {4, 2048, &MPIR_Alltoall_bruck_MV2},
187 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
190 {{0, -1, &MPIR_Alltoall_inplace_MV2},
194 table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
/* Configuration 2: 16 processes per node, 7 entries. Here the second list of each entry
 * starts at a non-zero first value (16384 or 32768) instead of 0. */
195 mv2_alltoall_table_ppn_conf[2] = 16;
196 mv2_size_alltoall_tuning_table[2] = 7;
197 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
200 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
201 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
204 {{32768, -1, &MPIR_Alltoall_inplace_MV2},
210 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
211 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
214 {{16384, -1, &MPIR_Alltoall_inplace_MV2},
220 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
221 {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
222 {16384, -1, &MPIR_Alltoall_pairwise_MV2},
225 {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
231 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
232 {2048, -1, &MPIR_Alltoall_pairwise_MV2},
235 {{16384,65536, &MPIR_Alltoall_inplace_MV2},
241 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
242 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
245 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
251 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
252 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
255 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
260 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
261 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
264 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
269 table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
/* Total number of entries over all configurations. */
271 for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
272 agg_table_sum += mv2_size_alltoall_tuning_table[i];
/* One allocation holds every entry; slot 0 points at its start and receives configuration 0. */
274 mv2_alltoall_thresholds_table[0] =
275 xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table));
276 memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
277 (sizeof(mv2_alltoall_tuning_table)
278 * mv2_size_alltoall_tuning_table[0]));
/* Remaining slots point into the same buffer, right after the previous configuration's entries. */
279 for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
280 mv2_alltoall_thresholds_table[i] =
281 mv2_alltoall_thresholds_table[i - 1]
282 + mv2_size_alltoall_tuning_table[i - 1];
283 memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
284 (sizeof(mv2_alltoall_tuning_table)
285 * mv2_size_alltoall_tuning_table[i]));
/* Only the temporary pointer array is released here; the copied tables stay allocated. */
287 xbt_free(table_ptrs);
293 /************ Allgather variables and initializers */
/* Last member of mv2_allgather_tuning_element: the allgather implementation attached to one range. */
298 int (*MV2_pt_Allgather_function)(void *sendbuf,
300 MPI_Datatype sendtype,
303 MPI_Datatype recvtype, MPI_Comm comm_ptr);
304 } mv2_allgather_tuning_element;
/* Tail of mv2_allgather_tuning_table: an int array with MV2_MAX_NB_THRESHOLDS slots, the number
 * of ranges in use, and the ranges themselves. */
308 int two_level[MV2_MAX_NB_THRESHOLDS];
309 int size_inter_table;
310 mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
311 } mv2_allgather_tuning_table;
/* Allgather function pointer; declared here without an explicit initializer. */
313 int (*MV2_Allgather_function)(void *sendbuf,
315 MPI_Datatype sendtype,
318 MPI_Datatype recvtype, MPI_Comm comm);
/* Processes-per-node value of each configuration (one int per configuration). */
320 int *mv2_allgather_table_ppn_conf = NULL;
/* Number of processes-per-node configurations. */
321 int mv2_allgather_num_ppn_conf = 1;
/* Number of tuning entries of each configuration (one int per configuration). */
322 int *mv2_size_allgather_tuning_table = NULL;
/* Per-configuration pointer to that configuration's array of tuning entries. */
323 mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL;
/* File-local allgather routine with the common allgather signature; the 16-processes-per-node
 * table uses it for its first (0..1024) range. */
325 static int MPIR_Allgather_RD_Allgather_Comm_MV2( void *sendbuf,
327 MPI_Datatype sendtype,
330 MPI_Datatype recvtype, MPI_Comm comm_ptr)
/* Map the MVAPICH allgather names onto SMPI implementations. */
335 #define MPIR_Allgather_Bruck_MV2 smpi_coll_tuned_allgather_bruck
336 #define MPIR_Allgather_RD_MV2 smpi_coll_tuned_allgather_rdb
337 #define MPIR_Allgather_Ring_MV2 smpi_coll_tuned_allgather_ring
338 #define MPIR_2lvl_Allgather_MV2 smpi_coll_tuned_allgather_mvapich2_smp
/*
 * Builds the Stampede allgather tuning tables.
 *
 * Three processes-per-node configurations (1, 2 and 16) with 6 entries each are written
 * into local arrays and then copied into one heap buffer; mv2_allgather_thresholds_table[0]
 * points at the start of that buffer and each following slot points just past the
 * previous configuration's entries.
 *
 * NOTE(review): unlike the other initializers in this file, no registration of
 * smpi_coll_cleanup_callback is visible here -- confirm that the allocations below are
 * still released when allgather is the only collective that gets initialized.
 */
340 static void init_mv2_allgather_tables_stampede(){
342 int agg_table_sum = 0;
343 mv2_allgather_tuning_table **table_ptrs = NULL;
344 mv2_allgather_num_ppn_conf = 3;
/* Per-configuration arrays, all sized by mv2_allgather_num_ppn_conf. */
345 mv2_allgather_thresholds_table
346 = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
347 * mv2_allgather_num_ppn_conf);
348 table_ptrs = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
349 * mv2_allgather_num_ppn_conf);
350 mv2_size_allgather_tuning_table = xbt_malloc(sizeof(int) *
351 mv2_allgather_num_ppn_conf);
352 mv2_allgather_table_ppn_conf
353 = xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int));
/* Configuration 0: 1 process per node, 6 entries. */
354 mv2_allgather_table_ppn_conf[0] = 1;
355 mv2_size_allgather_tuning_table[0] = 6;
356 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
/* Each range is two integers followed by the allgather implementation to use. */
362 {0, -1, &MPIR_Allgather_Ring_MV2},
370 {0, 262144, &MPIR_Allgather_RD_MV2},
371 {262144, -1, &MPIR_Allgather_Ring_MV2},
379 {0, 131072, &MPIR_Allgather_RD_MV2},
380 {131072, -1, &MPIR_Allgather_Ring_MV2},
388 {0, 131072, &MPIR_Allgather_RD_MV2},
389 {131072, -1, &MPIR_Allgather_Ring_MV2},
397 {0, 65536, &MPIR_Allgather_RD_MV2},
398 {65536, -1, &MPIR_Allgather_Ring_MV2},
406 {0, 32768, &MPIR_Allgather_RD_MV2},
407 {32768, -1, &MPIR_Allgather_Ring_MV2},
411 table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
/* Configuration 1: 2 processes per node, 6 entries. The last two ranges of most entries
 * select the same ring implementation. */
412 mv2_allgather_table_ppn_conf[1] = 2;
413 mv2_size_allgather_tuning_table[1] = 6;
414 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
420 {0, 524288, &MPIR_Allgather_RD_MV2},
421 {524288, -1, &MPIR_Allgather_Ring_MV2},
429 {0, 32768, &MPIR_Allgather_RD_MV2},
430 {32768, 524288, &MPIR_Allgather_Ring_MV2},
431 {524288, -1, &MPIR_Allgather_Ring_MV2},
439 {0, 16384, &MPIR_Allgather_RD_MV2},
440 {16384, 524288, &MPIR_Allgather_Ring_MV2},
441 {524288, -1, &MPIR_Allgather_Ring_MV2},
449 {0, 65536, &MPIR_Allgather_RD_MV2},
450 {65536, 524288, &MPIR_Allgather_Ring_MV2},
451 {524288, -1, &MPIR_Allgather_Ring_MV2},
459 {0, 32768, &MPIR_Allgather_RD_MV2},
460 {32768, 524288, &MPIR_Allgather_Ring_MV2},
461 {524288, -1, &MPIR_Allgather_Ring_MV2},
469 {0, 65536, &MPIR_Allgather_RD_MV2},
470 {65536, 524288, &MPIR_Allgather_Ring_MV2},
471 {524288, -1, &MPIR_Allgather_Ring_MV2},
475 table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
/* Configuration 2: 16 processes per node, 6 entries. All but the first entry use the
 * file-local MPIR_Allgather_RD_Allgather_Comm_MV2 for their first range. */
476 mv2_allgather_table_ppn_conf[2] = 16;
477 mv2_size_allgather_tuning_table[2] = 6;
478 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
484 {0, 1024, &MPIR_Allgather_RD_MV2},
485 {1024, -1, &MPIR_Allgather_Ring_MV2},
493 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
494 {1024, -1, &MPIR_Allgather_Ring_MV2},
502 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
503 {1024, -1, &MPIR_Allgather_Ring_MV2},
511 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
512 {1024, -1, &MPIR_Allgather_Ring_MV2},
520 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
521 {1024, -1, &MPIR_Allgather_Ring_MV2},
529 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
530 {1024, -1, &MPIR_Allgather_Ring_MV2},
535 table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
/* Total number of entries over all configurations. */
537 for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
538 agg_table_sum += mv2_size_allgather_tuning_table[i];
/* One allocation holds every entry; slot 0 points at its start and receives configuration 0. */
540 mv2_allgather_thresholds_table[0] =
541 xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table));
542 memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
543 (sizeof(mv2_allgather_tuning_table)
544 * mv2_size_allgather_tuning_table[0]));
/* Remaining slots point into the same buffer, right after the previous configuration's entries. */
545 for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
546 mv2_allgather_thresholds_table[i] =
547 mv2_allgather_thresholds_table[i - 1]
548 + mv2_size_allgather_tuning_table[i - 1];
549 memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
550 (sizeof(mv2_allgather_tuning_table)
551 * mv2_size_allgather_tuning_table[i]));
/* Only the temporary pointer array is released here; the copied tables stay allocated. */
553 xbt_free(table_ptrs);
557 /************ Gather variables and initializers */
/* Last member of mv2_gather_tuning_element: the gather implementation attached to one range. */
562 int (*MV2_pt_Gather_function)(void *sendbuf, int sendcnt,
563 MPI_Datatype sendtype, void *recvbuf, int recvcnt,
564 MPI_Datatype recvtype, int root, MPI_Comm comm_ptr);
565 } mv2_gather_tuning_element;
/* Tail of mv2_gather_tuning_table: two range lists (inter_leader and intra_node), each
 * preceded by the number of ranges it actually uses. */
570 int size_inter_table;
571 mv2_gather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
572 int size_intra_table;
573 mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
574 } mv2_gather_tuning_table;
/* Number of entries in mv2_gather_thresholds_table; the initializer assigns 7 again. */
576 int mv2_size_gather_tuning_table=7;
/* Heap copy of the gather tuning entries, NULL until init_mv2_gather_tables_stampede runs. */
577 mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL;
/* Common signature of the gather implementations referenced by the tables. */
579 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
581 MPI_Datatype sendtype,
584 MPI_Datatype recvtype,
585 int root, MPI_Comm comm);
/* Gather function pointers, NULL until something assigns them (not in this part of the file). */
587 MV2_Gather_function_ptr MV2_Gather_inter_leader_function = NULL;
588 MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL;
/* Map the MVAPICH gather names onto SMPI implementations. */
591 #define MPIR_Gather_MV2_Direct smpi_coll_tuned_gather_ompi_basic_linear
592 #define MPIR_Gather_MV2_two_level_Direct smpi_coll_tuned_gather_mvapich2_two_level
593 #define MPIR_Gather_intra smpi_coll_tuned_gather_mpich
/*
 * Builds the Stampede gather tuning table: 7 entries are written into a local array and
 * copied into a freshly allocated mv2_gather_thresholds_table. Registers
 * smpi_coll_cleanup_mvapich2 as cleanup callback when no callback is installed yet.
 */
596 static void init_mv2_gather_tables_stampede(){
598 if(smpi_coll_cleanup_callback==NULL)
599 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
600 mv2_size_gather_tuning_table=7;
601 mv2_gather_thresholds_table = xbt_malloc(mv2_size_gather_tuning_table*
602 sizeof (mv2_gather_tuning_table));
603 mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
/* Per entry: a range count followed by that many ranges, twice. Following the struct's
 * member order these are size_inter_table/inter_leader and then size_intra_table/intra_node. */
605 2,{{0, 524288, &MPIR_Gather_MV2_Direct},
606 {524288, -1, &MPIR_Gather_intra}},
607 1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
609 3,{{0, 16384, &MPIR_Gather_MV2_Direct},
610 {16384, 131072, &MPIR_Gather_intra},
611 {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
612 1,{{0, -1, &MPIR_Gather_intra}}},
/* NOTE(review): the third range below starts at 256 although the previous range ends at
 * 16384; 16384 looks intended. Harmless only if the selector ignores the first value -- TODO confirm. */
614 3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct},
615 {256, 16384, &MPIR_Gather_MV2_Direct},
616 {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
617 1,{{0, -1, &MPIR_Gather_intra}}},
619 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
620 {512, 16384, &MPIR_Gather_MV2_Direct},
621 {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
622 1,{{0, -1, &MPIR_Gather_intra}}},
624 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
625 {512, 16384, &MPIR_Gather_MV2_Direct},
626 {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
627 1,{{0, -1, &MPIR_Gather_intra}}},
/* NOTE(review): in the last two entries the third range starts at 8196, which neither
 * matches the preceding end value (16384) nor is a power of two (8192) -- TODO confirm
 * against the upstream calibration data. */
629 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
630 {512, 16384, &MPIR_Gather_MV2_Direct},
631 {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
632 1,{{0, -1, &MPIR_Gather_intra}}},
634 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
635 {512, 16384, &MPIR_Gather_MV2_Direct},
636 {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
637 1,{{0, -1, &MPIR_Gather_intra}}},
/* Copy the local table into the heap buffer allocated above. */
640 memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
641 mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
646 /************ Allgatherv variables and initializers */
/* Last member of mv2_allgatherv_tuning_element: the allgatherv implementation attached to one range. */
651 int (*MV2_pt_Allgatherv_function)(void *sendbuf,
653 MPI_Datatype sendtype,
657 MPI_Datatype recvtype,
659 } mv2_allgatherv_tuning_element;
/* Tail of mv2_allgatherv_tuning_table: number of ranges in use, then the ranges themselves. */
663 int size_inter_table;
664 mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
665 } mv2_allgatherv_tuning_table;
/* Allgatherv function pointer (the end of this declaration is not part of these lines). */
667 int (*MV2_Allgatherv_function)(void *sendbuf,
669 MPI_Datatype sendtype,
673 MPI_Datatype recvtype,
/* Entry count and heap copy of the allgatherv tuning entries; 0 / NULL until
 * init_mv2_allgatherv_tables_stampede runs. */
676 int mv2_size_allgatherv_tuning_table = 0;
677 mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL;
/* Map the MVAPICH allgatherv names onto SMPI implementations. */
679 #define MPIR_Allgatherv_Rec_Doubling_MV2 smpi_coll_tuned_allgatherv_mpich_rdb
680 #define MPIR_Allgatherv_Bruck_MV2 smpi_coll_tuned_allgatherv_ompi_bruck
681 #define MPIR_Allgatherv_Ring_MV2 smpi_coll_tuned_allgatherv_mpich_ring
/*
 * Builds the Stampede allgatherv tuning table: 6 entries are written into a local array
 * and copied into a freshly allocated mv2_allgatherv_thresholds_table. Registers
 * smpi_coll_cleanup_mvapich2 as cleanup callback when no callback is installed yet.
 */
684 static void init_mv2_allgatherv_tables_stampede(){
685 if(smpi_coll_cleanup_callback==NULL)
686 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
687 mv2_size_allgatherv_tuning_table = 6;
688 mv2_allgatherv_thresholds_table = xbt_malloc(mv2_size_allgatherv_tuning_table *
689 sizeof (mv2_allgatherv_tuning_table));
690 mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
/* Every entry has the same shape: recursive doubling up to a switch point (512 for the
 * first two entries, 256 for the rest), the ring implementation beyond it. */
695 {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
696 {512, -1, &MPIR_Allgatherv_Ring_MV2},
703 {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
704 {512, -1, &MPIR_Allgatherv_Ring_MV2},
711 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
712 {256, -1, &MPIR_Allgatherv_Ring_MV2},
719 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
720 {256, -1, &MPIR_Allgatherv_Ring_MV2},
727 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
728 {256, -1, &MPIR_Allgatherv_Ring_MV2},
735 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
736 {256, -1, &MPIR_Allgatherv_Ring_MV2},
/* Copy the local table into the heap buffer allocated above. */
741 memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
742 mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
746 /************ Allreduce variables and initializers */
/* Last member of mv2_allreduce_tuning_element: the allreduce implementation attached to one range. */
751 int (*MV2_pt_Allreduce_function)(void *sendbuf,
754 MPI_Datatype datatype,
755 MPI_Op op, MPI_Comm comm);
756 } mv2_allreduce_tuning_element;
/* Tail of mv2_allreduce_tuning_table: an int array with MV2_MAX_NB_THRESHOLDS slots, then
 * two range lists (inter_leader and intra_node), each preceded by its number of ranges in use. */
761 int is_two_level_allreduce[MV2_MAX_NB_THRESHOLDS];
762 int size_inter_table;
763 mv2_allreduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
764 int size_intra_table;
765 mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
766 } mv2_allreduce_tuning_table;
/* Allreduce function pointers, NULL until something assigns them (not in this part of the file). */
769 int (*MV2_Allreduce_function)(void *sendbuf,
772 MPI_Datatype datatype,
773 MPI_Op op, MPI_Comm comm)=NULL;
776 int (*MV2_Allreduce_intra_function)( void *sendbuf,
779 MPI_Datatype datatype,
780 MPI_Op op, MPI_Comm comm)=NULL;
/* Entry count and heap copy of the allreduce tuning entries; 0 / NULL until
 * init_mv2_allreduce_tables_stampede runs. */
782 int mv2_size_allreduce_tuning_table = 0;
783 mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table = NULL;
/* File-local routine with the allreduce signature; the last table entry references it. */
789 static int MPIR_Allreduce_mcst_reduce_two_level_helper_MV2( void *sendbuf,
792 MPI_Datatype datatype,
793 MPI_Op op, MPI_Comm comm)
/* File-local routine with the allreduce signature. */
798 static int MPIR_Allreduce_mcst_reduce_redscat_gather_MV2( void *sendbuf,
801 MPI_Datatype datatype,
802 MPI_Op op, MPI_Comm comm)
/* Forwards the call to mpi_coll_reduce_fun, passing 0 in the root position. */
807 static int MPIR_Allreduce_reduce_p2p_MV2( void *sendbuf,
810 MPI_Datatype datatype,
811 MPI_Op op, MPI_Comm comm)
813 mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
/* Same forwarding as MPIR_Allreduce_reduce_p2p_MV2: mpi_coll_reduce_fun with 0 in the root position. */
817 static int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
820 MPI_Datatype datatype,
821 MPI_Op op, MPI_Comm comm)
823 mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
/* Map the MVAPICH allreduce names onto SMPI implementations. */
827 #define MPIR_Allreduce_pt2pt_rd_MV2 smpi_coll_tuned_allreduce_rdb
828 #define MPIR_Allreduce_pt2pt_rs_MV2 smpi_coll_tuned_allreduce_mvapich2_rs
829 #define MPIR_Allreduce_two_level_MV2 smpi_coll_tuned_allreduce_mvapich2_two_level
/*
 * Builds the Stampede allreduce tuning table: 8 entries are written into a local array
 * and copied into a freshly allocated mv2_allreduce_thresholds_table. Registers
 * smpi_coll_cleanup_mvapich2 as cleanup callback when no callback is installed yet.
 *
 * Each entry carries two range lists; by struct order the first is presumably
 * inter_leader and the second intra_node -- TODO confirm.
 * NOTE(review): several of the second lists end at 16384 rather than with an open-ended
 * -1 bound; how larger sizes are handled depends on the selector, which is not part of
 * this function.
 */
832 static void init_mv2_allreduce_tables_stampede(){
833 if(smpi_coll_cleanup_callback==NULL)
834 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
835 mv2_size_allreduce_tuning_table = 8;
836 mv2_allreduce_thresholds_table = xbt_malloc(mv2_size_allreduce_tuning_table *
837 sizeof (mv2_allreduce_tuning_table));
838 mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
/* Entry 1 */
845 {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
846 {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
850 {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
851 {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
/* Entry 2 */
860 {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
861 {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
862 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
866 {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
867 {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
/* Entry 3 */
876 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
877 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
878 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
882 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
883 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
/* Entry 4 */
892 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
893 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
894 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
898 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
899 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
/* Entry 5 */
908 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
909 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
910 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
914 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
915 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
/* Entry 6 */
924 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
925 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
926 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
930 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
931 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
/* Entry 7 */
940 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
941 {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
942 {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
943 {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
947 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
948 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
/* Entry 8: the only one that references the file-local reduce_p2p and two-level helper
 * routines in its first list. */
957 {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
958 {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
959 {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
960 {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
961 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
965 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
966 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
/* Copy the local table into the heap buffer allocated above. */
971 memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
972 mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
/* Tail of mv2_bcast_tuning_element: the broadcast implementation attached to one range,
 * followed by an integer factor (the fourth value of each range in the tables). */
981 int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
982 int root, MPI_Comm comm_ptr);
983 int zcpy_pipelined_knomial_factor;
984 } mv2_bcast_tuning_element;
/* Tail of mv2_bcast_tuning_table: three integer parameters, an int array with
 * MV2_MAX_NB_THRESHOLDS slots, then two range lists (inter_leader and intra_node), each
 * preceded by its number of ranges in use. */
988 int bcast_segment_size;
989 int intra_node_knomial_factor;
990 int inter_node_knomial_factor;
991 int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS];
992 int size_inter_table;
993 mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
994 int size_intra_table;
995 mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
996 } mv2_bcast_tuning_table;
/* Entry count and heap copy of the bcast tuning entries; 0 / NULL until
 * init_mv2_bcast_tables_stampede runs. */
998 int mv2_size_bcast_tuning_table = 0;
999 mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
/* Broadcast function pointers, NULL until something assigns them (not in this part of the file). */
1002 int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
1003 int root, MPI_Comm comm_ptr) = NULL;
1005 int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
1006 int root, MPI_Comm comm_ptr) = NULL;
/* Global broadcast parameters with their initial values. */
1008 int zcpy_knomial_factor = 2;
1009 int mv2_pipelined_zcpy_knomial_factor = -1;
1010 int bcast_segment_size = 8192;
1011 int mv2_inter_node_knomial_factor = 4;
1012 int mv2_intra_node_knomial_factor = 4;
/* Broadcast constants (presumably a process count and a message size -- TODO confirm units at the use sites). */
1013 #define mv2_bcast_two_level_system_size 64
1014 #define mv2_bcast_short_msg 16384
/* Large-message constant for broadcast, 512*1024 = 524288 (presumably bytes -- TODO confirm
 * at the use sites). The expansion is parenthesized so that it stays a single operand when
 * the macro appears next to '/', '%' or other operators that bind as tightly as '*'. */
#define mv2_bcast_large_msg (512*1024)
/* Constant 0 named INTRA_NODE_ROOT (presumably the root rank used inside a node -- TODO confirm at the use sites). */
1017 #define INTRA_NODE_ROOT 0
/* Map the MVAPICH broadcast names onto SMPI implementations. Several names share one
 * implementation: the two pipelined variants and the shmem variant all resolve to
 * smpi_coll_tuned_bcast_mpich, both scatter-ring-allgather names resolve to
 * smpi_coll_tuned_bcast_scatter_LR_allgather, and both inter-node helper names resolve to
 * smpi_coll_tuned_bcast_mvapich2_inter_node. */
1019 #define MPIR_Pipelined_Bcast_Zcpy_MV2 smpi_coll_tuned_bcast_mpich
1020 #define MPIR_Pipelined_Bcast_MV2 smpi_coll_tuned_bcast_mpich
1021 #define MPIR_Bcast_binomial_MV2 smpi_coll_tuned_bcast_binomial_tree
1022 #define MPIR_Bcast_scatter_ring_allgather_shm_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
1023 #define MPIR_Bcast_scatter_doubling_allgather_MV2 smpi_coll_tuned_bcast_scatter_rdb_allgather
1024 #define MPIR_Bcast_scatter_ring_allgather_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
1025 #define MPIR_Shmem_Bcast_MV2 smpi_coll_tuned_bcast_mpich
1026 #define MPIR_Bcast_tune_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
1027 #define MPIR_Bcast_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
1028 #define MPIR_Knomial_Bcast_intra_node_MV2 smpi_coll_tuned_bcast_mvapich2_knomial_intra_node
1029 #define MPIR_Bcast_intra_MV2 smpi_coll_tuned_bcast_mvapich2_intra_node
/*
 * Builds the Stampede broadcast tuning table: 8 entries are written into a local array
 * and copied into a freshly allocated mv2_bcast_thresholds_table. Registers
 * smpi_coll_cleanup_mvapich2 as cleanup callback when no callback is installed yet.
 *
 * Each range is {two integers, implementation, zcpy_pipelined_knomial_factor}. Every entry
 * carries two range lists with identical bounds; by struct order the first is presumably
 * inter_leader and the second intra_node -- TODO confirm. Where a row of 1s precedes the
 * lists it has one value per range (presumably is_two_level_bcast).
 */
1031 static void init_mv2_bcast_tables_stampede(){
1033 if(smpi_coll_cleanup_callback==NULL)
1034 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1035 mv2_size_bcast_tuning_table=8;
1036 mv2_bcast_thresholds_table = xbt_malloc(mv2_size_bcast_tuning_table *
1037 sizeof (mv2_bcast_tuning_table));
1039 mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
/* Entry 1: 11 ranges. */
1043 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
1046 {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1047 {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1048 {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1049 {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1050 {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1051 {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1052 {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1053 {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1054 {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1055 {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1056 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1060 {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1061 {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1062 {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1063 {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1064 {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1065 {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1066 {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1067 {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1068 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1069 {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1070 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
/* Entry 2: 8 ranges. */
1076 {1, 1, 1, 1, 1, 1, 1, 1},
1079 {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1080 {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1081 {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1082 {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1083 {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1084 {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1085 {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1086 {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1090 {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1091 {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1092 {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1093 {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1094 {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1095 {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1096 {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1097 {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
/* Entry 3: 9 ranges. */
1103 {1, 1, 1, 1, 1, 1, 1, 1, 1},
1106 {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1107 {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1108 {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1109 {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1110 {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1111 {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1112 {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1113 {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1114 {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1118 {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1119 {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1120 {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1121 {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1122 {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1123 {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1124 {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1125 {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1126 {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
/* Entry 4: 4 ranges. */
1135 {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1136 {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1137 {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1138 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
/* NOTE(review): the last range of the list below stores a NULL function pointer; whatever
 * selects from this table must not call it -- TODO confirm how the selector handles it. */
1142 {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1143 {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1144 {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1145 {524288, -1, NULL, -1}
/* Entry 5: 5 ranges. */
1154 {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1155 {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1156 {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1157 {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1158 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1162 {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1163 {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1164 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1165 {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1166 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
/* Entry 6: 5 ranges. */
1175 {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1176 {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1177 {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1178 {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1179 {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1183 {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1184 {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1185 {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1186 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1187 {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
/* Entry 7: 5 ranges. */
1196 {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1197 {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1198 {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1199 {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1200 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1204 {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1205 {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1206 {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1207 {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1208 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
/* Entry 8: 7 ranges. */
1214 {1, 1, 1, 1, 1, 1, 1},
1217 {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1218 {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1219 {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1220 {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1221 {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1222 {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1223 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1227 {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1228 {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1229 {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1230 {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1231 {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1232 {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1233 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
/* Copy the local table into the heap buffer allocated above. */
1238 memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1239 mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1243 /************ Reduce variables and initializers */
/* Function-pointer member of mv2_reduce_tuning_element: the reduce implementation attached to one range. */
1248 int (*MV2_pt_Reduce_function)(void *sendbuf,
1251 MPI_Datatype datatype,
1255 } mv2_reduce_tuning_element;
/* Tail of mv2_reduce_tuning_table: an int array with MV2_MAX_NB_THRESHOLDS slots, then two
 * range lists (inter_leader and intra_node), each preceded by its number of ranges in use. */
1261 int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS];
1262 int size_inter_table;
1263 mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1264 int size_intra_table;
1265 mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1266 } mv2_reduce_tuning_table;
/* Entry count and heap copy of the reduce tuning entries; 0 / NULL until
 * init_mv2_reduce_tables_stampede runs. */
1268 int mv2_size_reduce_tuning_table = 0;
1269 mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL;
/* Global reduce factors, both initialised to -1. */
1272 int mv2_reduce_intra_knomial_factor = -1;
1273 int mv2_reduce_inter_knomial_factor = -1;
/* Reduce function pointers, NULL until something assigns them (not in this part of the file). */
1275 int (*MV2_Reduce_function)( void *sendbuf,
1278 MPI_Datatype datatype,
1281 MPI_Comm comm_ptr)=NULL;
1283 int (*MV2_Reduce_intra_function)( void *sendbuf,
1286 MPI_Datatype datatype,
1289 MPI_Comm comm_ptr)=NULL;
/* Map the MVAPICH reduce names onto SMPI implementations; the inter- and intra-node
 * knomial wrappers both resolve to smpi_coll_tuned_reduce_mvapich2_knomial. */
1292 #define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1293 #define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1294 #define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_binomial
1295 #define MPIR_Reduce_redscat_gather_MV2 smpi_coll_tuned_reduce_scatter_gather
1296 #define MPIR_Reduce_shmem_MV2 smpi_coll_tuned_reduce_ompi_basic_linear
1297 #define MPIR_Reduce_two_level_helper_MV2 smpi_coll_tuned_reduce_mvapich2_two_level
1300 static void init_mv2_reduce_tables_stampede(){
1301 if(smpi_coll_cleanup_callback==NULL)
1302 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1304 mv2_size_reduce_tuning_table = 8;
1305 mv2_reduce_thresholds_table = xbt_malloc(mv2_size_reduce_tuning_table *
1306 sizeof (mv2_reduce_tuning_table));
1307 mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1315 {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1316 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1317 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1321 {0, 65536, &MPIR_Reduce_shmem_MV2},
1322 {65536,-1, &MPIR_Reduce_binomial_MV2},
1329 {1, 1, 1, 1, 0, 0, 0},
1332 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1333 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1334 {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1335 {32768, 65536, &MPIR_Reduce_binomial_MV2},
1336 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1337 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1338 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1342 {0, 8192, &MPIR_Reduce_shmem_MV2},
1343 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1344 {16384, 32768, &MPIR_Reduce_shmem_MV2},
1345 {32768, 65536, &MPIR_Reduce_shmem_MV2},
1346 {65536, 262144, &MPIR_Reduce_shmem_MV2},
1347 {262144,-1, &MPIR_Reduce_binomial_MV2},
1357 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1358 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1359 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1360 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1361 {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1365 {0, 8192, &MPIR_Reduce_shmem_MV2},
1366 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1367 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1368 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1369 {262144, -1, &MPIR_Reduce_binomial_MV2},
1379 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1380 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1381 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1382 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1383 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1384 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1388 {0, 8192, &MPIR_Reduce_shmem_MV2},
1389 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1390 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1391 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1392 {262144, -1, &MPIR_Reduce_binomial_MV2},
1399 {1, 1, 1, 0, 1, 1, 0},
1402 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1403 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1404 {16384, 32768, &MPIR_Reduce_binomial_MV2},
1405 {32768, 65536, &MPIR_Reduce_binomial_MV2},
1406 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1407 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1408 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1412 {0, 8192, &MPIR_Reduce_shmem_MV2},
1413 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1414 {16384, 32768, &MPIR_Reduce_shmem_MV2},
1415 {32768, 65536, &MPIR_Reduce_shmem_MV2},
1416 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1417 {262144, -1, &MPIR_Reduce_binomial_MV2},
1427 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1428 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1429 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1430 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1431 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1432 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1436 {0, 8192, &MPIR_Reduce_shmem_MV2},
1437 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1438 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1439 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1440 {262144, -1, &MPIR_Reduce_binomial_MV2},
1450 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1451 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1452 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1453 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1454 {262144, -1, &MPIR_Reduce_binomial_MV2},
1458 {0, 8192, &MPIR_Reduce_shmem_MV2},
1459 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1460 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1461 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1462 {262144, -1, &MPIR_Reduce_binomial_MV2},
1472 {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1473 {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1474 {4096, 16384, &MPIR_Reduce_binomial_MV2},
1475 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1476 {65536, 131072, &MPIR_Reduce_binomial_MV2},
1477 {131072, -1, &MPIR_Reduce_binomial_MV2},
1481 {0, 2048, &MPIR_Reduce_shmem_MV2},
1482 {2048, 4096, &MPIR_Reduce_shmem_MV2},
1483 {4096, 16384, &MPIR_Reduce_shmem_MV2},
1484 {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1485 {65536, 131072, &MPIR_Reduce_binomial_MV2},
1486 {131072, -1, &MPIR_Reduce_shmem_MV2},
1491 memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1492 mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1495 /************ Reduce scatter variables and initializers */
/* Reduce-scatter tuning element: the member below points to the
 * reduce-scatter routine associated with this element. */
1500 int (*MV2_pt_Red_scat_function)(void *sendbuf,
1503 MPI_Datatype datatype,
1506 } mv2_red_scat_tuning_element;
/* Reduce-scatter tuning table entry: holds up to MV2_MAX_NB_THRESHOLDS
 * inter-leader tuning elements.
 * NOTE(review): size_inter_table presumably counts how many inter_leader
 * slots are actually in use -- confirm against the selector code. */
1510 int size_inter_table;
1511 mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1512 } mv2_red_scat_tuning_table;
/* Number of entries in mv2_red_scat_thresholds_table; 0 until
 * init_mv2_reduce_scatter_tables_stampede() sets it. */
1514 int mv2_size_red_scat_tuning_table = 0;
/* Heap-allocated reduce-scatter tuning table; NULL until
 * init_mv2_reduce_scatter_tables_stampede() allocates and fills it. */
1515 mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL;
/* Global pointer to a reduce-scatter routine.
 * NOTE(review): presumably assigned by the selector to the algorithm picked
 * from mv2_red_scat_thresholds_table -- confirm against the caller. */
1518 int (*MV2_Red_scat_function)(void *sendbuf,
1521 MPI_Datatype datatype,
/* Adapter that lets smpi_mpi_reduce_scatter() be referenced from the tuning
 * tables under an MVAPICH2-style name: it forwards sendbuf, recvbuf,
 * recvcnts, datatype, op and comm unchanged. */
1527 static int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf,
1530 MPI_Datatype datatype,
1534 smpi_mpi_reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm);
/* Map the MVAPICH2 reduce-scatter algorithm names used in the tuning tables
 * onto the corresponding smpi_coll_tuned_reduce_scatter_* implementations. */
1537 #define MPIR_Reduce_scatter_non_comm_MV2 smpi_coll_tuned_reduce_scatter_mpich_noncomm
1538 #define MPIR_Reduce_scatter_Rec_Halving_MV2 smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving
1539 #define MPIR_Reduce_scatter_Pair_Wise_MV2 smpi_coll_tuned_reduce_scatter_mpich_pair
/* Build the reduce-scatter tuning table for the Stampede calibration.
 *
 * Sets mv2_size_red_scat_tuning_table, allocates
 * mv2_red_scat_thresholds_table with xbt_malloc() and fills it by copying a
 * function-local template table. */
1544 static void init_mv2_reduce_scatter_tables_stampede(){
/* Install the MVAPICH2 cleanup hook unless a cleanup callback is already set. */
1545 if(smpi_coll_cleanup_callback==NULL)
1546 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
/* Six table entries; the heap table is sized to hold exactly that many. */
1547 mv2_size_red_scat_tuning_table = 6;
1548 mv2_red_scat_thresholds_table = xbt_malloc(mv2_size_red_scat_tuning_table *
1549 sizeof (mv2_red_scat_tuning_table));
/* Function-local template table.
 * NOTE(review): each row looks like {lower bound, upper bound, algorithm},
 * with -1 presumably meaning "no upper bound" -- confirm against the
 * selector that reads these tables. */
1550 mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
1555 {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1556 {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1557 {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1564 {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1565 {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1566 {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1573 {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1574 {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1575 {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1582 {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1583 {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1590 {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1591 {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1598 {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1599 {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
/* The template is a local array, so its contents are copied into the heap
 * table that outlives this call. */
1604 memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1605 mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1608 /************ Scatter variables and initializers */
/* Scatter tuning element: the member below points to the scatter routine
 * associated with this element. */
1613 int (*MV2_pt_Scatter_function)(void *sendbuf,
1615 MPI_Datatype sendtype,
1618 MPI_Datatype recvtype,
1619 int root, MPI_Comm comm);
1620 } mv2_scatter_tuning_element;
/* Scatter tuning table entry: two arrays of up to MV2_MAX_NB_THRESHOLDS
 * tuning elements, one for inter-leader and one for intra-node.
 * NOTE(review): size_inter_table / size_intra_table presumably count the
 * slots actually in use in the array that follows each -- confirm against
 * the selector code. */
1624 int size_inter_table;
1625 mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1626 int size_intra_table;
1627 mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1628 } mv2_scatter_tuning_table;
/* Per-configuration ppn value (filled with 1, 2 and 16 by
 * init_mv2_scatter_tables_stampede()); NULL until initialized. */
1631 int *mv2_scatter_table_ppn_conf = NULL;
/* Number of ppn configurations; defaults to 1 and is set to 3 by
 * init_mv2_scatter_tables_stampede(). */
1632 int mv2_scatter_num_ppn_conf = 1;
/* Per-configuration number of tuning-table entries; NULL until initialized. */
1633 int *mv2_size_scatter_tuning_table = NULL;
/* Per-configuration pointer to that configuration's tuning-table entries;
 * NULL until initialized. */
1634 mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL;
/* Global pointer to a scatter routine, initially NULL.
 * NOTE(review): presumably set by the selector to the inter-leader algorithm
 * chosen from the tuning table -- confirm against the caller. */
1636 int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1637 void *recvbuf, int recvcount, MPI_Datatype recvtype,
1638 int root, MPI_Comm comm)=NULL;
/* Global pointer to a second scatter routine, initially NULL.
 * NOTE(review): by its name, presumably the intra-node counterpart of
 * MV2_Scatter_function -- confirm against the caller. */
1640 int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1641 void *recvbuf, int recvcount, MPI_Datatype recvtype,
1642 int root, MPI_Comm comm)=NULL;
/* Forward declaration of the scatter routine that the 16-ppn tuning table
 * references as the first row of several inter-leader tables. */
1643 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1645 MPI_Datatype sendtype,
1648 MPI_Datatype recvtype,
1649 int root, MPI_Comm comm_ptr);
/* Definition of MPIR_Scatter_mcst_wrap_MV2; the parameters visible here match
 * the forward declaration above. */
1651 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1653 MPI_Datatype sendtype,
1656 MPI_Datatype recvtype,
1657 int root, MPI_Comm comm_ptr)
/* Map the MVAPICH2 scatter algorithm names used in the tuning tables onto
 * the corresponding smpi_coll_tuned_scatter_* implementations. */
1662 #define MPIR_Scatter_MV2_Binomial smpi_coll_tuned_scatter_ompi_binomial
1663 #define MPIR_Scatter_MV2_Direct smpi_coll_tuned_scatter_ompi_basic_linear
1664 #define MPIR_Scatter_MV2_two_level_Binomial smpi_coll_tuned_scatter_mvapich2_two_level_binomial
1665 #define MPIR_Scatter_MV2_two_level_Direct smpi_coll_tuned_scatter_mvapich2_two_level_direct
/* Build the scatter tuning tables for the Stampede calibration.
 *
 * Three ppn configurations are set up (1, 2 and 16) with 6, 6 and 8 table
 * entries respectively. Each configuration is first written as a
 * function-local template array, then all templates are copied back to back
 * into one heap allocation that the per-configuration pointers in
 * mv2_scatter_thresholds_table index into. */
1670 static void init_mv2_scatter_tables_stampede(){
/* Install the MVAPICH2 cleanup hook unless a cleanup callback is already set. */
1671 if(smpi_coll_cleanup_callback==NULL)
1672 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
/* Running total of table entries over all ppn configurations. */
1674 int agg_table_sum = 0;
/* Scratch array of pointers to the local template tables; freed at the end. */
1676 mv2_scatter_tuning_table **table_ptrs = NULL;
1677 mv2_scatter_num_ppn_conf = 3;
/* One slot per ppn configuration in each of the four bookkeeping arrays. */
1678 mv2_scatter_thresholds_table
1679 = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1680 * mv2_scatter_num_ppn_conf);
1681 table_ptrs = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1682 * mv2_scatter_num_ppn_conf);
1683 mv2_size_scatter_tuning_table = xbt_malloc(sizeof(int) *
1684 mv2_scatter_num_ppn_conf);
1685 mv2_scatter_table_ppn_conf
1686 = xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int));
/* Configuration 0: ppn value 1, six entries.
 * NOTE(review): rows look like {lower bound, upper bound, algorithm}, with
 * -1 presumably meaning "no upper bound" -- confirm against the selector. */
1687 mv2_scatter_table_ppn_conf[0] = 1;
1688 mv2_size_scatter_tuning_table[0] = 6;
1689 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1693 {0, -1, &MPIR_Scatter_MV2_Binomial},
1697 {0, -1, &MPIR_Scatter_MV2_Binomial},
1704 {0, -1, &MPIR_Scatter_MV2_Direct},
1708 {0, -1, &MPIR_Scatter_MV2_Direct},
1715 {0, -1, &MPIR_Scatter_MV2_Direct},
1719 {0, -1, &MPIR_Scatter_MV2_Direct},
1726 {0, -1, &MPIR_Scatter_MV2_Direct},
1730 {0, -1, &MPIR_Scatter_MV2_Direct},
1737 {0, -1, &MPIR_Scatter_MV2_Direct},
1741 {0, -1, &MPIR_Scatter_MV2_Direct},
1748 {0, 32, &MPIR_Scatter_MV2_Binomial},
1749 {32, -1, &MPIR_Scatter_MV2_Direct},
1753 {0, -1, &MPIR_Scatter_MV2_Binomial},
1757 table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
/* Configuration 1: ppn value 2, six entries. */
1758 mv2_scatter_table_ppn_conf[1] = 2;
1759 mv2_size_scatter_tuning_table[1] = 6;
1760 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
1764 {0, 4096, &MPIR_Scatter_MV2_Binomial},
1765 {4096, -1, &MPIR_Scatter_MV2_Direct},
1769 {0, -1, &MPIR_Scatter_MV2_Direct},
1776 {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1777 {512, -1, &MPIR_Scatter_MV2_Direct},
1781 {0, -1, &MPIR_Scatter_MV2_Binomial},
1788 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1789 {2048, -1, &MPIR_Scatter_MV2_Direct},
1793 {0, -1, &MPIR_Scatter_MV2_Binomial},
1800 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1801 {2048, -1, &MPIR_Scatter_MV2_Direct},
1805 {0, -1, &MPIR_Scatter_MV2_Binomial},
1812 {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1813 {8192, -1, &MPIR_Scatter_MV2_Direct},
1817 {0, -1, &MPIR_Scatter_MV2_Binomial},
1824 {0, 16, &MPIR_Scatter_MV2_Binomial},
1825 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1826 {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1827 {16384, -1, &MPIR_Scatter_MV2_Direct},
1831 {0, 128, &MPIR_Scatter_MV2_Direct},
1832 {128, -1, &MPIR_Scatter_MV2_Binomial},
1836 table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
/* Configuration 2: ppn value 16, eight entries. */
1837 mv2_scatter_table_ppn_conf[2] = 16;
1838 mv2_size_scatter_tuning_table[2] = 8;
1839 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
1844 {0, 256, &MPIR_Scatter_MV2_Binomial},
1845 {256, -1, &MPIR_Scatter_MV2_Direct},
1849 { 0, -1, &MPIR_Scatter_MV2_Direct},
1857 {0, 512, &MPIR_Scatter_MV2_Binomial},
1858 {512, -1, &MPIR_Scatter_MV2_Direct},
1862 { 0, -1, &MPIR_Scatter_MV2_Direct},
1870 {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1871 {1024, -1, &MPIR_Scatter_MV2_Direct},
1875 { 0, -1, &MPIR_Scatter_MV2_Direct},
/* NOTE(review): from here on each inter-leader table starts with an
 * MPIR_Scatter_mcst_wrap_MV2 row followed by a second row with the same
 * upper bound; presumably the second row is the alternative used in place of
 * the mcst wrapper -- confirm against the selector. */
1883 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1884 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1885 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1886 {2048, -1, &MPIR_Scatter_MV2_Direct},
1890 { 0, -1, &MPIR_Scatter_MV2_Direct},
1898 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1899 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1900 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1901 {2048, -1, &MPIR_Scatter_MV2_Direct},
1905 { 0, -1, &MPIR_Scatter_MV2_Direct},
1913 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
/* NOTE(review): this row is {16, 16, ...} whereas the matching second row in
 * the neighbouring entries is {0, 16, ...}; the first value looks like a
 * typo for 0 -- confirm against the upstream MVAPICH2 table before changing. */
1914 {16, 16, &MPIR_Scatter_MV2_two_level_Direct},
1915 {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1916 {4096, -1, &MPIR_Scatter_MV2_Direct},
1920 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1927 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1928 {0, 16, &MPIR_Scatter_MV2_Binomial},
1929 {16, 32, &MPIR_Scatter_MV2_Binomial},
1930 {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1931 {4096, -1, &MPIR_Scatter_MV2_Direct},
1935 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1942 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1943 {0, 16, &MPIR_Scatter_MV2_two_level_Binomial},
1944 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1945 {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1946 {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1947 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1948 {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
1952 {0, 16, &MPIR_Scatter_MV2_Binomial},
1953 {16, 128, &MPIR_Scatter_MV2_Binomial},
1954 {128, 1024, &MPIR_Scatter_MV2_Binomial},
1955 {1024, 16384, &MPIR_Scatter_MV2_Direct},
1956 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1957 {65536, -1, &MPIR_Scatter_MV2_Direct},
1961 table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn;
/* Sum the entry counts so a single allocation can hold every configuration. */
1963 for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
1964 agg_table_sum += mv2_size_scatter_tuning_table[i];
/* Slot 0 receives the one allocation for all configurations, and the first
 * template is copied to its start. */
1966 mv2_scatter_thresholds_table[0] =
1967 xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table));
1968 memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
1969 (sizeof(mv2_scatter_tuning_table)
1970 * mv2_size_scatter_tuning_table[0]));
/* Every later slot points into that same allocation, immediately after the
 * previous configuration's entries, and receives its own template there. */
1971 for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
1972 mv2_scatter_thresholds_table[i] =
1973 mv2_scatter_thresholds_table[i - 1]
1974 + mv2_size_scatter_tuning_table[i - 1];
1975 memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
1976 (sizeof(mv2_scatter_tuning_table)
1977 * mv2_size_scatter_tuning_table[i]));
/* Only the scratch pointer array is released; the templates it pointed to
 * are local arrays. */
1979 xbt_free(table_ptrs);