1 /* Selector for collective algorithms based on the MVAPICH decision logic, with calibration from the Stampede cluster at TACC */
3 /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
4 * All rights reserved. */
6 /* This is the tuning used by MVAPICH for Stampede platform based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR) */
10 /************ Alltoall variables and initializers */
/* Capacity of every fixed-size threshold array declared in this file. */
12 #define MV2_MAX_NB_THRESHOLDS 32
/* One Alltoall tuning element. The member visible here is the pointer to the
 * Alltoall implementation to call. The initializers in this file also supply
 * two integers ahead of the pointer; the fields receiving them are declared on
 * lines not shown in this excerpt (presumably a message-size range, with -1
 * as an open upper bound -- TODO confirm). */
16 int (*MV2_pt_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
17 void *recvbuf, int recvcount, MPI_Datatype recvtype,
19 } mv2_alltoall_tuning_element;
/* One Alltoall tuning table entry: a list of elements for the regular case and
 * a second list for the in-place case. Any leading scalar fields are declared
 * on lines not shown in this excerpt. */
24 mv2_alltoall_tuning_element algo_table[MV2_MAX_NB_THRESHOLDS];
25 mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS];
26 } mv2_alltoall_tuning_table;
/* Pointer to an Alltoall implementation; starts out NULL. */
28 int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL;
30 /* Indicates number of processes per node */
31 int *mv2_alltoall_table_ppn_conf = NULL;
32 /* Indicates total number of configurations */
33 int mv2_alltoall_num_ppn_conf = 1;
/* Number of table entries held by each configuration (one int per configuration). */
34 int *mv2_size_alltoall_tuning_table = NULL;
/* One pointer per configuration to that configuration's table entries. */
35 mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL;
/* Map the MVAPICH algorithm names used in the tables to the SMPI
 * implementations. Note that the "inplace" name is bound to the ring
 * implementation. */
38 #define MPIR_Alltoall_bruck_MV2 smpi_coll_tuned_alltoall_bruck
39 #define MPIR_Alltoall_RD_MV2 smpi_coll_tuned_alltoall_rdb
40 #define MPIR_Alltoall_Scatter_dest_MV2 smpi_coll_tuned_alltoall_mvapich2_scatter_dest
41 #define MPIR_Alltoall_pairwise_MV2 smpi_coll_tuned_alltoall_pair
42 #define MPIR_Alltoall_inplace_MV2 smpi_coll_tuned_alltoall_ring
/* Builds the Alltoall tuning tables.
 *
 * Sets mv2_alltoall_num_ppn_conf to 3 and fills three configurations tagged
 * 1, 2 and 16 in mv2_alltoall_table_ppn_conf, holding 6, 6 and 7 entries
 * respectively. All entries are then copied into a single xbt_malloc'ed
 * buffer: mv2_alltoall_thresholds_table[0] points at its start and every
 * following slot points just past the previous configuration's entries, so
 * slot 0 is the only pointer that refers to the start of an allocation.
 * The temporary table_ptrs array is released at the end; the global arrays
 * stay allocated.
 *
 * The entry counts are hard-coded and must match the number of entries in the
 * corresponding local initializer, since they drive the memcpy sizes.
 *
 * NOTE(review): the loop index i and the leading scalar fields of each table
 * entry live on lines not shown in this excerpt. */
45 static void init_mv2_alltoall_tables_stampede(){
47 int agg_table_sum = 0;
48 mv2_alltoall_tuning_table **table_ptrs = NULL;
49 mv2_alltoall_num_ppn_conf = 3;
50 mv2_alltoall_thresholds_table = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
51 * mv2_alltoall_num_ppn_conf);
52 table_ptrs = xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
53 * mv2_alltoall_num_ppn_conf);
54 mv2_size_alltoall_tuning_table = xbt_malloc(sizeof(int) *
55 mv2_alltoall_num_ppn_conf);
56 mv2_alltoall_table_ppn_conf = xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int));
/* Configuration 0: tagged 1, 6 entries. */
57 mv2_alltoall_table_ppn_conf[0] = 1;
58 mv2_size_alltoall_tuning_table[0] = 6;
59 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
62 {{0, -1, &MPIR_Alltoall_pairwise_MV2},
65 {{0, -1, &MPIR_Alltoall_inplace_MV2},
71 {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
72 {262144, -1, &MPIR_Alltoall_pairwise_MV2},
75 {{0, -1, &MPIR_Alltoall_inplace_MV2},
81 {{0, 8, &MPIR_Alltoall_RD_MV2},
82 {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
85 {{0, -1, &MPIR_Alltoall_inplace_MV2},
91 {{0, 64, &MPIR_Alltoall_RD_MV2},
92 {64, 512, &MPIR_Alltoall_bruck_MV2},
93 {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
96 {{0,-1, &MPIR_Alltoall_inplace_MV2},
102 {{0, 32, &MPIR_Alltoall_RD_MV2},
103 {32, 2048, &MPIR_Alltoall_bruck_MV2},
104 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
107 {{0, -1, &MPIR_Alltoall_inplace_MV2},
113 {{0, 8, &MPIR_Alltoall_RD_MV2},
114 {8, 1024, &MPIR_Alltoall_bruck_MV2},
115 {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
118 {{0, -1, &MPIR_Alltoall_inplace_MV2},
122 table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
/* Configuration 1: tagged 2, 6 entries. */
123 mv2_alltoall_table_ppn_conf[1] = 2;
124 mv2_size_alltoall_tuning_table[1] = 6;
125 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
128 {{0, 32, &MPIR_Alltoall_RD_MV2},
129 {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
132 {{0, -1, &MPIR_Alltoall_inplace_MV2},
138 {{0, 64, &MPIR_Alltoall_RD_MV2},
139 {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
142 {{0, -1, &MPIR_Alltoall_inplace_MV2},
148 {{0, 64, &MPIR_Alltoall_RD_MV2},
149 {64, 2048, &MPIR_Alltoall_bruck_MV2},
150 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
153 {{0,-1, &MPIR_Alltoall_inplace_MV2},
159 {{0, 16, &MPIR_Alltoall_RD_MV2},
160 {16, 2048, &MPIR_Alltoall_bruck_MV2},
161 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
164 {{0, -1, &MPIR_Alltoall_inplace_MV2},
170 {{0, 8, &MPIR_Alltoall_RD_MV2},
171 {8, 1024, &MPIR_Alltoall_bruck_MV2},
172 {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
175 {{0, -1, &MPIR_Alltoall_inplace_MV2},
181 {{0, 4, &MPIR_Alltoall_RD_MV2},
182 {4, 2048, &MPIR_Alltoall_bruck_MV2},
183 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
186 {{0, -1, &MPIR_Alltoall_inplace_MV2},
190 table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
/* Configuration 2: tagged 16, 7 entries. Unlike the two tables above, the
 * inplace elements here do not start at 0. */
191 mv2_alltoall_table_ppn_conf[2] = 16;
192 mv2_size_alltoall_tuning_table[2] = 7;
193 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
196 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
197 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
200 {{32768, -1, &MPIR_Alltoall_inplace_MV2},
206 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
207 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
210 {{16384, -1, &MPIR_Alltoall_inplace_MV2},
216 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
217 {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
218 {16384, -1, &MPIR_Alltoall_pairwise_MV2},
221 {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
227 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
228 {2048, -1, &MPIR_Alltoall_pairwise_MV2},
231 {{16384,65536, &MPIR_Alltoall_inplace_MV2},
237 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
238 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
241 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
247 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
248 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
251 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
256 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
257 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
260 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
265 table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
/* Total number of entries across all configurations. */
267 for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
268 agg_table_sum += mv2_size_alltoall_tuning_table[i];
/* One allocation holds every configuration back to back; slot 0 points at it. */
270 mv2_alltoall_thresholds_table[0] =
271 xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table));
272 memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
273 (sizeof(mv2_alltoall_tuning_table)
274 * mv2_size_alltoall_tuning_table[0]));
/* Remaining slots point into that same buffer, right after the previous
 * configuration's entries. */
275 for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
276 mv2_alltoall_thresholds_table[i] =
277 mv2_alltoall_thresholds_table[i - 1]
278 + mv2_size_alltoall_tuning_table[i - 1];
279 memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
280 (sizeof(mv2_alltoall_tuning_table)
281 * mv2_size_alltoall_tuning_table[i]));
/* Only the array of temporary pointers is released here. */
283 xbt_free(table_ptrs);
289 /************ Allgather variables and initializers */
/* One Allgather tuning element. The member visible here is the pointer to the
 * Allgather implementation; the integer fields that the initializers fill
 * ahead of it are declared on lines not shown in this excerpt. */
294 int (*MV2_pt_Allgather_function)(void *sendbuf,
296 MPI_Datatype sendtype,
299 MPI_Datatype recvtype, MPI_Comm comm_ptr);
300 } mv2_allgather_tuning_element;
/* One Allgather tuning table entry: an int array named two_level, an element
 * count (size_inter_table) and the inter_leader element array. Any fields
 * ahead of two_level are declared on lines not shown in this excerpt. */
304 int two_level[MV2_MAX_NB_THRESHOLDS];
305 int size_inter_table;
306 mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
307 } mv2_allgather_tuning_table;
/* Pointer to an Allgather implementation; no initializer is given on the
 * lines shown. */
309 int (*MV2_Allgather_function)(void *sendbuf,
311 MPI_Datatype sendtype,
314 MPI_Datatype recvtype, MPI_Comm comm);
/* Per-configuration tags, number of configurations, per-configuration entry
 * counts and per-configuration table pointers, mirroring the Alltoall
 * variables declared earlier in this file. */
316 int *mv2_allgather_table_ppn_conf = NULL;
317 int mv2_allgather_num_ppn_conf = 1;
318 int *mv2_size_allgather_tuning_table = NULL;
319 mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL;
/* File-local Allgather variant referenced by the 16-ppn table below. Only its
 * signature is visible in this excerpt; the body is on lines not shown. */
321 static int MPIR_Allgather_RD_Allgather_Comm_MV2( void *sendbuf,
323 MPI_Datatype sendtype,
326 MPI_Datatype recvtype, MPI_Comm comm_ptr)
/* Map the MVAPICH algorithm names used in the tables to the SMPI
 * implementations. */
331 #define MPIR_Allgather_Bruck_MV2 smpi_coll_tuned_allgather_bruck
332 #define MPIR_Allgather_RD_MV2 smpi_coll_tuned_allgather_rdb
333 #define MPIR_Allgather_Ring_MV2 smpi_coll_tuned_allgather_ring
/* Builds the Allgather tuning tables.
 *
 * Sets mv2_allgather_num_ppn_conf to 3 and fills three configurations tagged
 * 1, 2 and 16 in mv2_allgather_table_ppn_conf, each holding 6 entries. All
 * entries are copied into a single xbt_malloc'ed buffer whose start is stored
 * in mv2_allgather_thresholds_table[0]; the following slots point into that
 * same buffer. The temporary table_ptrs array is released at the end.
 *
 * The entry counts are hard-coded and must match the number of entries in the
 * corresponding local initializer, since they drive the memcpy sizes.
 *
 * NOTE(review): the loop index i and the leading scalar fields of each table
 * entry live on lines not shown in this excerpt. */
336 static void init_mv2_allgather_tables_stampede(){
338 int agg_table_sum = 0;
339 mv2_allgather_tuning_table **table_ptrs = NULL;
340 mv2_allgather_num_ppn_conf = 3;
341 mv2_allgather_thresholds_table
342 = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
343 * mv2_allgather_num_ppn_conf);
344 table_ptrs = xbt_malloc(sizeof(mv2_allgather_tuning_table *)
345 * mv2_allgather_num_ppn_conf);
346 mv2_size_allgather_tuning_table = xbt_malloc(sizeof(int) *
347 mv2_allgather_num_ppn_conf);
348 mv2_allgather_table_ppn_conf
349 = xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int));
/* Configuration 0: tagged 1, 6 entries. */
350 mv2_allgather_table_ppn_conf[0] = 1;
351 mv2_size_allgather_tuning_table[0] = 6;
352 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
358 {0, -1, &MPIR_Allgather_Ring_MV2},
366 {0, 262144, &MPIR_Allgather_RD_MV2},
367 {262144, -1, &MPIR_Allgather_Ring_MV2},
375 {0, 131072, &MPIR_Allgather_RD_MV2},
376 {131072, -1, &MPIR_Allgather_Ring_MV2},
384 {0, 131072, &MPIR_Allgather_RD_MV2},
385 {131072, -1, &MPIR_Allgather_Ring_MV2},
393 {0, 65536, &MPIR_Allgather_RD_MV2},
394 {65536, -1, &MPIR_Allgather_Ring_MV2},
402 {0, 32768, &MPIR_Allgather_RD_MV2},
403 {32768, -1, &MPIR_Allgather_Ring_MV2},
407 table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
/* Configuration 1: tagged 2, 6 entries. Several entries list the ring
 * algorithm for two consecutive ranges. */
408 mv2_allgather_table_ppn_conf[1] = 2;
409 mv2_size_allgather_tuning_table[1] = 6;
410 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
416 {0, 524288, &MPIR_Allgather_RD_MV2},
417 {524288, -1, &MPIR_Allgather_Ring_MV2},
425 {0, 32768, &MPIR_Allgather_RD_MV2},
426 {32768, 524288, &MPIR_Allgather_Ring_MV2},
427 {524288, -1, &MPIR_Allgather_Ring_MV2},
435 {0, 16384, &MPIR_Allgather_RD_MV2},
436 {16384, 524288, &MPIR_Allgather_Ring_MV2},
437 {524288, -1, &MPIR_Allgather_Ring_MV2},
445 {0, 65536, &MPIR_Allgather_RD_MV2},
446 {65536, 524288, &MPIR_Allgather_Ring_MV2},
447 {524288, -1, &MPIR_Allgather_Ring_MV2},
455 {0, 32768, &MPIR_Allgather_RD_MV2},
456 {32768, 524288, &MPIR_Allgather_Ring_MV2},
457 {524288, -1, &MPIR_Allgather_Ring_MV2},
465 {0, 65536, &MPIR_Allgather_RD_MV2},
466 {65536, 524288, &MPIR_Allgather_Ring_MV2},
467 {524288, -1, &MPIR_Allgather_Ring_MV2},
471 table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
/* Configuration 2: tagged 16, 6 entries. */
472 mv2_allgather_table_ppn_conf[2] = 16;
473 mv2_size_allgather_tuning_table[2] = 6;
474 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
480 {0, 1024, &MPIR_Allgather_RD_MV2},
481 {1024, -1, &MPIR_Allgather_Ring_MV2},
489 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
490 {1024, -1, &MPIR_Allgather_Ring_MV2},
498 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
499 {1024, -1, &MPIR_Allgather_Ring_MV2},
507 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
508 {1024, -1, &MPIR_Allgather_Ring_MV2},
516 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
517 {1024, -1, &MPIR_Allgather_Ring_MV2},
525 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
526 {1024, -1, &MPIR_Allgather_Ring_MV2},
531 table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
/* Total number of entries across all configurations. */
533 for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
534 agg_table_sum += mv2_size_allgather_tuning_table[i];
/* One allocation holds every configuration back to back; slot 0 points at it. */
536 mv2_allgather_thresholds_table[0] =
537 xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table));
538 memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
539 (sizeof(mv2_allgather_tuning_table)
540 * mv2_size_allgather_tuning_table[0]));
/* Remaining slots point into that same buffer, right after the previous
 * configuration's entries. */
541 for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
542 mv2_allgather_thresholds_table[i] =
543 mv2_allgather_thresholds_table[i - 1]
544 + mv2_size_allgather_tuning_table[i - 1];
545 memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
546 (sizeof(mv2_allgather_tuning_table)
547 * mv2_size_allgather_tuning_table[i]));
/* Only the array of temporary pointers is released here. */
549 xbt_free(table_ptrs);
553 /************ Gather variables and initializers */
/* One Gather tuning element. The member visible here is the pointer to the
 * Gather implementation (which takes a root rank); the integer fields that
 * the initializers fill ahead of it are declared on lines not shown in this
 * excerpt. */
558 int (*MV2_pt_Gather_function)(void *sendbuf, int sendcnt,
559 MPI_Datatype sendtype, void *recvbuf, int recvcnt,
560 MPI_Datatype recvtype, int root, MPI_Comm comm_ptr);
561 } mv2_gather_tuning_element;
/* One Gather tuning table entry: a counted list of inter_leader elements
 * followed by a counted list of intra_node elements. Any fields ahead of
 * size_inter_table are declared on lines not shown in this excerpt. */
566 int size_inter_table;
567 mv2_gather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
568 int size_intra_table;
569 mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
570 } mv2_gather_tuning_table;
/* Number of entries in the Gather tuning table, and the table itself
 * (NULL until init_mv2_gather_tables_stampede allocates it). */
572 int mv2_size_gather_tuning_table=7;
573 mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL;
/* Signature shared by the Gather function pointers declared below. */
575 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
577 MPI_Datatype sendtype,
580 MPI_Datatype recvtype,
581 int root, MPI_Comm comm);
/* Gather function pointers for the inter-leader and intra-node parts;
 * both start out NULL. */
583 MV2_Gather_function_ptr MV2_Gather_inter_leader_function = NULL;
584 MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL;
/* Map the MVAPICH algorithm names used in the table to the SMPI
 * implementations. The Direct and two_level_Direct names are bound to the
 * same implementation. */
587 #define MPIR_Gather_MV2_Direct smpi_coll_tuned_gather_ompi_basic_linear
588 #define MPIR_Gather_MV2_two_level_Direct smpi_coll_tuned_gather_ompi_basic_linear
589 #define MPIR_Gather_intra smpi_coll_tuned_gather_mpich
/* Builds the Gather tuning table.
 *
 * Sets mv2_size_gather_tuning_table to 7, allocates that many entries with
 * xbt_malloc and fills them by memcpy from a local initializer. Each visible
 * entry gives a count followed by that many inter-leader elements, then a
 * count followed by the intra-node elements. The leading field of each entry
 * is on lines not shown in this excerpt.
 *
 * The hard-coded count of 7 must match the number of entries in the local
 * initializer, since it drives the memcpy size. */
592 static void init_mv2_gather_tables_stampede(){
594 mv2_size_gather_tuning_table=7;
595 mv2_gather_thresholds_table = xbt_malloc(mv2_size_gather_tuning_table*
596 sizeof (mv2_gather_tuning_table));
597 mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
599 2,{{0, 524288, &MPIR_Gather_MV2_Direct},
600 {524288, -1, &MPIR_Gather_intra}},
601 1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
603 3,{{0, 16384, &MPIR_Gather_MV2_Direct},
604 {16384, 131072, &MPIR_Gather_intra},
605 {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
606 1,{{0, -1, &MPIR_Gather_intra}}},
608 3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct},
609 {256, 16384, &MPIR_Gather_MV2_Direct},
/* NOTE(review): the first value below (256) repeats the start of the previous
 * range instead of continuing from 16384 as the following entries do --
 * confirm against the upstream tuning data. */
610 {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
611 1,{{0, -1, &MPIR_Gather_intra}}},
613 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
614 {512, 16384, &MPIR_Gather_MV2_Direct},
615 {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
616 1,{{0, -1, &MPIR_Gather_intra}}},
618 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
619 {512, 16384, &MPIR_Gather_MV2_Direct},
620 {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
621 1,{{0, -1, &MPIR_Gather_intra}}},
623 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
624 {512, 16384, &MPIR_Gather_MV2_Direct},
/* NOTE(review): 8196 does not continue from the previous value 16384 and is
 * not a power of two -- confirm against the upstream tuning data. */
625 {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
626 1,{{0, -1, &MPIR_Gather_intra}}},
628 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
629 {512, 16384, &MPIR_Gather_MV2_Direct},
/* NOTE(review): same 8196 value as in the previous entry -- confirm. */
630 {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
631 1,{{0, -1, &MPIR_Gather_intra}}},
/* Copy the local initializer into the heap-allocated global table. */
634 memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
635 mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
640 /************ Allgatherv variables and initializers */
/* One Allgatherv tuning element. The member visible here is the pointer to
 * the Allgatherv implementation; its remaining parameters and the integer
 * fields that the initializers fill ahead of it are on lines not shown in
 * this excerpt. */
645 int (*MV2_pt_Allgatherv_function)(void *sendbuf,
647 MPI_Datatype sendtype,
651 MPI_Datatype recvtype,
653 } mv2_allgatherv_tuning_element;
/* One Allgatherv tuning table entry: a counted list of inter_leader elements.
 * Any fields ahead of size_inter_table are declared on lines not shown in
 * this excerpt. */
657 int size_inter_table;
658 mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
659 } mv2_allgatherv_tuning_table;
/* Pointer to an Allgatherv implementation; the end of its declaration is on
 * lines not shown in this excerpt. */
661 int (*MV2_Allgatherv_function)(void *sendbuf,
663 MPI_Datatype sendtype,
667 MPI_Datatype recvtype,
/* Number of entries in the Allgatherv tuning table, and the table itself
 * (NULL until init_mv2_allgatherv_tables_stampede allocates it). */
670 int mv2_size_allgatherv_tuning_table = 0;
671 mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL;
/* Map the MVAPICH algorithm names to the SMPI implementations. */
673 #define MPIR_Allgatherv_Rec_Doubling_MV2 smpi_coll_tuned_allgatherv_mpich_rdb
674 #define MPIR_Allgatherv_Bruck_MV2 smpi_coll_tuned_allgatherv_ompi_bruck
675 #define MPIR_Allgatherv_Ring_MV2 smpi_coll_tuned_allgatherv_mpich_ring
/* Builds the Allgatherv tuning table.
 *
 * Sets mv2_size_allgatherv_tuning_table to 6, allocates that many entries
 * with xbt_malloc and fills them by memcpy from a local initializer. Every
 * visible entry pairs the recursive-doubling implementation with the ring
 * implementation, switching at 512 or 256. The leading fields of each entry
 * are on lines not shown in this excerpt.
 *
 * The hard-coded count of 6 must match the number of entries in the local
 * initializer, since it drives the memcpy size. */
678 static void init_mv2_allgatherv_tables_stampede(){
679 mv2_size_allgatherv_tuning_table = 6;
680 mv2_allgatherv_thresholds_table = xbt_malloc(mv2_size_allgatherv_tuning_table *
681 sizeof (mv2_allgatherv_tuning_table));
682 mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
687 {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
688 {512, -1, &MPIR_Allgatherv_Ring_MV2},
695 {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
696 {512, -1, &MPIR_Allgatherv_Ring_MV2},
703 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
704 {256, -1, &MPIR_Allgatherv_Ring_MV2},
711 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
712 {256, -1, &MPIR_Allgatherv_Ring_MV2},
719 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
720 {256, -1, &MPIR_Allgatherv_Ring_MV2},
727 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
728 {256, -1, &MPIR_Allgatherv_Ring_MV2},
/* Copy the local initializer into the heap-allocated global table. */
733 memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
734 mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
738 /************ Allreduce variables and initializers */
/* One Allreduce tuning element. The member visible here is the pointer to the
 * Allreduce implementation; the integer fields that the initializers fill
 * ahead of it are declared on lines not shown in this excerpt. */
743 int (*MV2_pt_Allreduce_function)(void *sendbuf,
746 MPI_Datatype datatype,
747 MPI_Op op, MPI_Comm comm);
748 } mv2_allreduce_tuning_element;
/* One Allreduce tuning table entry: an int array named is_two_level_allreduce,
 * a counted list of inter_leader elements and a counted list of intra_node
 * elements. Any fields ahead of the int array are declared on lines not shown
 * in this excerpt. */
753 int is_two_level_allreduce[MV2_MAX_NB_THRESHOLDS];
754 int size_inter_table;
755 mv2_allreduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
756 int size_intra_table;
757 mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
758 } mv2_allreduce_tuning_table;
/* Pointer to an Allreduce implementation; starts out NULL. */
761 int (*MV2_Allreduce_function)(void *sendbuf,
764 MPI_Datatype datatype,
765 MPI_Op op, MPI_Comm comm)=NULL;
/* Pointer to an intra Allreduce implementation; starts out NULL. */
768 int (*MV2_Allreduce_intra_function)( void *sendbuf,
771 MPI_Datatype datatype,
772 MPI_Op op, MPI_Comm comm)=NULL;
/* Number of entries in the Allreduce tuning table, and the table itself
 * (NULL until init_mv2_allreduce_tables_stampede allocates it). */
774 int mv2_size_allreduce_tuning_table = 0;
775 mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table = NULL;
/* File-local Allreduce variants referenced by the tuning table. Only their
 * signature lines and, for two of them, a single statement are visible in
 * this excerpt; the rest of each body is on lines not shown. */
781 static int MPIR_Allreduce_mcst_reduce_two_level_helper_MV2( void *sendbuf,
784 MPI_Datatype datatype,
785 MPI_Op op, MPI_Comm comm)
790 static int MPIR_Allreduce_mcst_reduce_redscat_gather_MV2( void *sendbuf,
793 MPI_Datatype datatype,
794 MPI_Op op, MPI_Comm comm)
799 static int MPIR_Allreduce_reduce_p2p_MV2( void *sendbuf,
802 MPI_Datatype datatype,
803 MPI_Op op, MPI_Comm comm)
/* Calls mpi_coll_reduce_fun with root 0; its result is not used on this line. */
805 mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
809 static int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
812 MPI_Datatype datatype,
813 MPI_Op op, MPI_Comm comm)
/* Same call as in MPIR_Allreduce_reduce_p2p_MV2: root 0, result unused here. */
815 mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
/* Map the remaining MVAPICH algorithm names to the SMPI implementations. */
819 #define MPIR_Allreduce_pt2pt_rd_MV2 smpi_coll_tuned_allreduce_rdb
820 #define MPIR_Allreduce_pt2pt_rs_MV2 smpi_coll_tuned_allreduce_mvapich2_rs
/* Builds the Allreduce tuning table.
 *
 * Sets mv2_size_allreduce_tuning_table to 8, allocates that many entries with
 * xbt_malloc and fills them by memcpy from a local initializer. Each entry
 * lists elements using the pt2pt implementations followed by elements using
 * the reduce_shmem/reduce_p2p helpers; the leading fields and the element
 * counts of each entry are on lines not shown in this excerpt.
 *
 * The hard-coded count of 8 must match the number of entries in the local
 * initializer, since it drives the memcpy size.
 *
 * NOTE(review): in several entries the last shmem/p2p element visible here
 * ends at 16384 rather than -1, while other entries end at -1. Whether further
 * elements follow on hidden lines cannot be told from this excerpt -- confirm
 * that the lookup code handles larger values. */
824 static void init_mv2_allreduce_tables_stampede(){
825 mv2_size_allreduce_tuning_table = 8;
826 mv2_allreduce_thresholds_table = xbt_malloc(mv2_size_allreduce_tuning_table *
827 sizeof (mv2_allreduce_tuning_table));
828 mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
835 {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
836 {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
840 {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
841 {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
850 {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
851 {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
852 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
856 {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
857 {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
866 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
867 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
868 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
872 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
873 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
882 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
883 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
884 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
888 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
889 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
898 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
899 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
900 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
904 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
905 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
914 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
915 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
916 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
920 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
921 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
930 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
931 {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
932 {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
933 {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
937 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
938 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
947 {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
948 {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
949 {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
950 {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
951 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
955 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
956 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
/* Copy the local initializer into the heap-allocated global table. */
961 memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
962 mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
967 Bcast selection is deactivated for now; it defaults to the MPICH one.
// NOTE(review): the Bcast section is marked as deactivated and may sit inside
// a block comment in the full file, so only line comments are added here.
//
// One Bcast tuning element: the members visible here are the pointer to the
// Bcast implementation and an integer zcpy_pipelined_knomial_factor. The
// initializers also supply two integers ahead of the pointer; those fields
// are declared on lines not shown in this excerpt.
971 int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
972 int root, MPI_Comm comm_ptr);
973 int zcpy_pipelined_knomial_factor;
974 } mv2_bcast_tuning_element;
// One Bcast tuning table entry: a segment size, intra-node and inter-node
// knomial factors, an int array named is_two_level_bcast, then counted
// inter_leader and intra_node element lists. Any fields ahead of
// bcast_segment_size are declared on lines not shown in this excerpt.
978 int bcast_segment_size;
979 int intra_node_knomial_factor;
980 int inter_node_knomial_factor;
981 int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS];
982 int size_inter_table;
983 mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
984 int size_intra_table;
985 mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
986 } mv2_bcast_tuning_table;
// Number of entries in the Bcast tuning table, and the table itself.
988 int mv2_size_bcast_tuning_table = 0;
989 mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
// Bcast function pointers (general and intra-node); both start out NULL.
992 int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
993 int root, MPI_Comm comm_ptr) = NULL;
995 int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
996 int root, MPI_Comm comm_ptr) = NULL;
// NOTE(review): the Bcast section is marked as deactivated and may sit inside
// a block comment in the full file, so only line comments are added here.
//
// Builds the Bcast tuning table: sets mv2_size_bcast_tuning_table to 8,
// allocates that many entries with xbt_malloc and fills them by memcpy from a
// local initializer. Each entry lists elements for one set of algorithms
// followed by a parallel list that uses MPIR_Shmem_Bcast_MV2; the fourth value
// of each element fills zcpy_pipelined_knomial_factor. The leading fields and
// element counts of each entry are on lines not shown in this excerpt.
//
// The MPIR_*Bcast* names used below have no definition in the visible part of
// this file.
1003 static void init_mv2_bcast_tables_stampede(){
1005 mv2_size_bcast_tuning_table=8;
1006 mv2_bcast_thresholds_table = xbt_malloc(mv2_size_bcast_tuning_table *
1007 sizeof (mv2_bcast_tuning_table));
1009 mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
1013 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
1016 {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1017 {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1018 {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1019 {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1020 {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1021 {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1022 {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1023 {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1024 {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1025 {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1026 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1030 {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1031 {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1032 {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1033 {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1034 {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1035 {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1036 {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1037 {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1038 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1039 {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1040 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1046 {1, 1, 1, 1, 1, 1, 1, 1},
1049 {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1050 {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1051 {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1052 {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1053 {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1054 {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1055 {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1056 {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1060 {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1061 {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1062 {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1063 {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1064 {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1065 {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1066 {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1067 {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
1073 {1, 1, 1, 1, 1, 1, 1, 1, 1},
1076 {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1077 {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1078 {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1079 {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1080 {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1081 {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1082 {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1083 {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1084 {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1088 {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1089 {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1090 {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1091 {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1092 {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1093 {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1094 {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1095 {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1096 {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
1105 {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1106 {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1107 {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1108 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1112 {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1113 {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1114 {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
// NOTE(review): this is the only element with a NULL function pointer;
// confirm that the code consuming the table checks for NULL before calling.
1115 {524288, -1, NULL, -1}
1124 {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1125 {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1126 {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1127 {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1128 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1132 {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1133 {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1134 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1135 {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1136 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1145 {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1146 {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1147 {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1148 {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1149 {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1153 {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1154 {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1155 {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1156 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1157 {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
1166 {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1167 {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1168 {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1169 {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1170 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1174 {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1175 {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1176 {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1177 {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1178 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1184 {1, 1, 1, 1, 1, 1, 1},
1187 {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1188 {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1189 {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1190 {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1191 {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1192 {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1193 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1197 {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1198 {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1199 {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1200 {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1201 {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1202 {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1203 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
// Copy the local initializer into the heap-allocated global table.
1208 memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1209 mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1213 /************ Reduce variables and initializers */
/* One Reduce tuning element. The member visible here is the pointer to the
 * Reduce implementation; its remaining parameters and the integer fields that
 * the initializers fill ahead of it are on lines not shown in this excerpt. */
1218 int (*MV2_pt_Reduce_function)(void *sendbuf,
1221 MPI_Datatype datatype,
1225 } mv2_reduce_tuning_element;
/* One Reduce tuning table entry: an int array named is_two_level_reduce, a
 * counted list of inter_leader elements and a counted list of intra_node
 * elements. Any fields ahead of the int array are declared on lines not shown
 * in this excerpt. */
1231 int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS];
1232 int size_inter_table;
1233 mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1234 int size_intra_table;
1235 mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1236 } mv2_reduce_tuning_table;
/* Number of entries in the Reduce tuning table, and the table itself
 * (NULL until init_mv2_reduce_tables_stampede allocates it). */
1238 int mv2_size_reduce_tuning_table = 0;
1239 mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL;
/* Knomial factors for the intra and inter parts; both default to 2. */
1242 int mv2_reduce_intra_knomial_factor = 2;
1243 int mv2_reduce_inter_knomial_factor = 2;
/* Reduce function pointers (general and intra); both start out NULL. */
1245 int (*MV2_Reduce_function)( void *sendbuf,
1248 MPI_Datatype datatype,
1251 MPI_Comm comm_ptr)=NULL;
1253 int (*MV2_Reduce_intra_function)( void *sendbuf,
1256 MPI_Datatype datatype,
1259 MPI_Comm comm_ptr)=NULL;
/* Map the MVAPICH algorithm names used in the table to the SMPI
 * implementations. The inter and intra knomial wrappers are bound to the same
 * implementation. */
1262 #define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1263 #define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1264 #define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_binomial
1265 #define MPIR_Reduce_redscat_gather_MV2 smpi_coll_tuned_reduce_scatter_gather
1266 #define MPIR_Reduce_shmem_MV2 smpi_coll_tuned_reduce_ompi_basic_linear
/* Builds the Reduce tuning table.
 *
 * Sets mv2_size_reduce_tuning_table to 8, allocates that many entries with
 * xbt_malloc and fills them by memcpy from a local initializer. Each entry
 * lists elements that mostly use the inter-knomial, binomial and
 * redscat_gather implementations, followed by elements that mostly use the
 * shmem and intra-knomial implementations. The leading fields, element counts
 * and most of the flag arrays are on lines not shown in this excerpt.
 *
 * The hard-coded count of 8 must match the number of entries in the local
 * initializer, since it drives the memcpy size. */
1270 static void init_mv2_reduce_tables_stampede(){
1272 mv2_size_reduce_tuning_table = 8;
1273 mv2_reduce_thresholds_table = xbt_malloc(mv2_size_reduce_tuning_table *
1274 sizeof (mv2_reduce_tuning_table));
1275 mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1283 {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1284 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1285 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1289 {0, 65536, &MPIR_Reduce_shmem_MV2},
1290 {65536,-1, &MPIR_Reduce_binomial_MV2},
1297 {1, 1, 1, 1, 0, 0, 0},
1300 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1301 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1302 {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1303 {32768, 65536, &MPIR_Reduce_binomial_MV2},
1304 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1305 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1306 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1310 {0, 8192, &MPIR_Reduce_shmem_MV2},
1311 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1312 {16384, 32768, &MPIR_Reduce_shmem_MV2},
1313 {32768, 65536, &MPIR_Reduce_shmem_MV2},
1314 {65536, 262144, &MPIR_Reduce_shmem_MV2},
1315 {262144,-1, &MPIR_Reduce_binomial_MV2},
1325 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1326 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1327 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1328 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1329 {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1333 {0, 8192, &MPIR_Reduce_shmem_MV2},
1334 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1335 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1336 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1337 {262144, -1, &MPIR_Reduce_binomial_MV2},
1347 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1348 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1349 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1350 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1351 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1352 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1356 {0, 8192, &MPIR_Reduce_shmem_MV2},
1357 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1358 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1359 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1360 {262144, -1, &MPIR_Reduce_binomial_MV2},
1367 {1, 1, 1, 0, 1, 1, 0},
1370 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1371 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1372 {16384, 32768, &MPIR_Reduce_binomial_MV2},
1373 {32768, 65536, &MPIR_Reduce_binomial_MV2},
1374 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1375 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1376 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1380 {0, 8192, &MPIR_Reduce_shmem_MV2},
1381 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1382 {16384, 32768, &MPIR_Reduce_shmem_MV2},
1383 {32768, 65536, &MPIR_Reduce_shmem_MV2},
1384 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1385 {262144, -1, &MPIR_Reduce_binomial_MV2},
1395 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1396 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1397 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1398 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1399 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1400 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1404 {0, 8192, &MPIR_Reduce_shmem_MV2},
1405 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1406 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1407 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1408 {262144, -1, &MPIR_Reduce_binomial_MV2},
1418 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1419 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1420 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1421 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1422 {262144, -1, &MPIR_Reduce_binomial_MV2},
1426 {0, 8192, &MPIR_Reduce_shmem_MV2},
1427 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1428 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1429 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1430 {262144, -1, &MPIR_Reduce_binomial_MV2},
1440 {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1441 {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1442 {4096, 16384, &MPIR_Reduce_binomial_MV2},
1443 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1444 {65536, 131072, &MPIR_Reduce_binomial_MV2},
1445 {131072, -1, &MPIR_Reduce_binomial_MV2},
1449 {0, 2048, &MPIR_Reduce_shmem_MV2},
1450 {2048, 4096, &MPIR_Reduce_shmem_MV2},
1451 {4096, 16384, &MPIR_Reduce_shmem_MV2},
1452 {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1453 {65536, 131072, &MPIR_Reduce_binomial_MV2},
1454 {131072, -1, &MPIR_Reduce_shmem_MV2},
/* Copy the local initializer into the heap-allocated global table. */
1459 memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1460 mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1463 /************ Reduce scatter variables and initializers */
/* One row of a reduce-scatter tuning table.  The member visible here is the
 * pointer to the reduce-scatter routine chosen by that row.  The rows built in
 * init_mv2_reduce_scatter_tables_stampede() are written as
 * {int, int, &function}.
 * NOTE(review): the two integers look like a [min, max) message-size range,
 * with -1 as "no upper limit" -- confirm against the code that walks the table. */
1468 int (*MV2_pt_Red_scat_function)(void *sendbuf,
1471 MPI_Datatype datatype,
1474 } mv2_red_scat_tuning_element;
/* One entry of the reduce-scatter tuning table: a counter plus a fixed-capacity
 * array of tuning rows. */
/* NOTE(review): presumably the number of rows of inter_leader[] that are
 * actually filled in -- confirm against the table initializers. */
1478 int size_inter_table;
/* Row storage; capacity is MV2_MAX_NB_THRESHOLDS elements. */
1479 mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1480 } mv2_red_scat_tuning_table;
/* Number of entries in mv2_red_scat_thresholds_table.  Starts at 0 and is set
 * to 6 by init_mv2_reduce_scatter_tables_stampede(), which also uses it as the
 * element count for the allocation and the memcpy. */
1482 int mv2_size_red_scat_tuning_table = 0;
/* Reduce-scatter tuning table.  NULL until
 * init_mv2_reduce_scatter_tables_stampede() allocates it with xbt_malloc and
 * copies the Stampede rows into it. */
1483 mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL;
/* File-scope pointer to a reduce-scatter routine.  Nothing in this section
 * assigns it.
 * NOTE(review): presumably set by the selector to the routine picked from
 * mv2_red_scat_thresholds_table -- confirm at the call site. */
1486 int (*MV2_Red_scat_function)(void *sendbuf,
1489 MPI_Datatype datatype,
/* File-local adapter that makes smpi_mpi_reduce_scatter usable as a tuning-table
 * row: it forwards sendbuf, recvbuf, recvcnts, datatype, op and comm unchanged.
 * Its address is stored in the first row of every group in the Stampede
 * reduce-scatter table. */
1495 static int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf,
1498 MPI_Datatype datatype,
/* Straight delegation; no argument is modified before the call. */
1502 smpi_mpi_reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm);
/* Aliases from the MVAPICH2 reduce-scatter algorithm names used in the tuning
 * rows to the SMPI implementations that stand in for them.  Only the
 * Rec_Halving and Pair_Wise aliases are referenced by the Stampede table in
 * this section; non_comm is defined but not used there. */
1505 #define MPIR_Reduce_scatter_non_comm_MV2 smpi_coll_tuned_reduce_scatter_mpich_noncomm
1506 #define MPIR_Reduce_scatter_Rec_Halving_MV2 smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving
1507 #define MPIR_Reduce_scatter_Pair_Wise_MV2 smpi_coll_tuned_reduce_scatter_mpich_pair
/* Builds the Stampede reduce-scatter tuning table.
 *
 * Steps: set the global entry count, allocate the global table with xbt_malloc,
 * describe the entries in a function-local array, then memcpy that array into
 * the heap table so the data outlives this call.
 *
 * The entry count (6) is a hand-written constant, not derived from the local
 * array.  Six groups of rows are visible below, so the constant and the
 * initializer have to be kept in sync manually when an entry is added or
 * removed. */
1512 static void init_mv2_reduce_scatter_tables_stampede(){
1513 mv2_size_red_scat_tuning_table = 6;
/* Room for mv2_size_red_scat_tuning_table entries. */
1514 mv2_red_scat_thresholds_table = xbt_malloc(mv2_size_red_scat_tuning_table *
1515 sizeof (mv2_red_scat_tuning_table));
/* Local staging copy of the table; its size is deduced from the initializer. */
1516 mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
/* Groups 1-3: Basic, then Rec_Halving, then Pair_Wise for the last row. */
1521 {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1522 {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1523 {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1530 {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1531 {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1532 {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1539 {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1540 {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1541 {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
/* Groups 4-6: only Basic and Rec_Halving; Pair_Wise is never selected. */
1548 {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1549 {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1556 {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1557 {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1564 {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1565 {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
/* Publish the staged entries: copies exactly the number of entries that was
 * allocated above. */
1570 memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1571 mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1574 /************ Scatter variables and initializers */
/* One row of a scatter tuning table.  The member visible here is the pointer to
 * the scatter routine chosen by that row; the routine takes a root rank and a
 * communicator as its last two parameters.  Rows in the Stampede scatter tables
 * are written as {int, int, &function}.
 * NOTE(review): the two integers look like a [min, max) message-size range,
 * with -1 as "no upper limit" -- confirm against the code that walks the table. */
1579 int (*MV2_pt_Scatter_function)(void *sendbuf,
1581 MPI_Datatype sendtype,
1584 MPI_Datatype recvtype,
1585 int root, MPI_Comm comm);
1586 } mv2_scatter_tuning_element;
/* One entry of a scatter tuning table: two independent row lists, each with its
 * own counter and a capacity of MV2_MAX_NB_THRESHOLDS rows.
 * NOTE(review): the names suggest inter_leader[] drives the choice between
 * nodes and intra_node[] the choice inside a node, with the size_* members
 * giving the number of rows in use -- confirm against the selector. */
1590 int size_inter_table;
1591 mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1592 int size_intra_table;
1593 mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1594 } mv2_scatter_tuning_table;
/* Scatter tuning state, filled in by init_mv2_scatter_tables_stampede(). */
/* One value per configuration; the Stampede initializer stores 1, 2 and 16
 * here (the configuration names end in 1ppn / 2ppn / 16ppn). */
1597 int *mv2_scatter_table_ppn_conf = NULL;
/* Number of configurations; defaults to 1 and is set to 3 by the Stampede
 * initializer. */
1598 int mv2_scatter_num_ppn_conf = 1;
/* Number of table entries for each configuration (6, 6 and 8 for Stampede). */
1599 int *mv2_size_scatter_tuning_table = NULL;
/* One pointer per configuration.  Element 0 owns a single allocation sized for
 * all configurations together; each later element points into that same
 * allocation, right after the previous configuration's entries. */
1600 mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL;
/* File-scope pointers to scatter routines, both initialised to NULL and not
 * assigned anywhere in this section.
 * NOTE(review): presumably the selector stores here the routines it picked from
 * the inter_leader[] and intra_node[] rows respectively -- confirm at the call
 * site. */
1602 int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1603 void *recvbuf, int recvcount, MPI_Datatype recvtype,
1604 int root, MPI_Comm comm)=NULL;
1606 int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1607 void *recvbuf, int recvcount, MPI_Datatype recvtype,
1608 int root, MPI_Comm comm)=NULL;
/* Prototype of MPIR_Scatter_mcst_wrap_MV2, immediately followed by its
 * definition.  The function has external linkage (no `static`) and the
 * scatter-routine signature ending in (root, comm_ptr).  Its address is used as
 * the first row of several 16ppn scatter tables.
 * NOTE(review): the name suggests a multicast-based scatter wrapper; the body is
 * not shown in this excerpt, so what it actually does must be checked in the
 * full file. */
1609 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1611 MPI_Datatype sendtype,
1614 MPI_Datatype recvtype,
1615 int root, MPI_Comm comm_ptr);
/* Definition; parameter list matches the prototype above. */
1617 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1619 MPI_Datatype sendtype,
1622 MPI_Datatype recvtype,
1623 int root, MPI_Comm comm_ptr)
/* Aliases from the MVAPICH2 scatter algorithm names used in the tuning rows to
 * SMPI implementations.  The two_level_* names expand to the same SMPI routines
 * as their plain counterparts, so a row naming two_level_Binomial or
 * two_level_Direct selects exactly the same code as Binomial or Direct. */
1628 #define MPIR_Scatter_MV2_Binomial smpi_coll_tuned_scatter_ompi_binomial
1629 #define MPIR_Scatter_MV2_Direct smpi_coll_tuned_scatter_ompi_basic_linear
1630 #define MPIR_Scatter_MV2_two_level_Binomial smpi_coll_tuned_scatter_ompi_binomial
1631 #define MPIR_Scatter_MV2_two_level_Direct smpi_coll_tuned_scatter_ompi_basic_linear
1636 static void init_mv2_scatter_tables_stampede(){
1638 int agg_table_sum = 0;
1640 mv2_scatter_tuning_table **table_ptrs = NULL;
1641 mv2_scatter_num_ppn_conf = 3;
1642 mv2_scatter_thresholds_table
1643 = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1644 * mv2_scatter_num_ppn_conf);
1645 table_ptrs = xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1646 * mv2_scatter_num_ppn_conf);
1647 mv2_size_scatter_tuning_table = xbt_malloc(sizeof(int) *
1648 mv2_scatter_num_ppn_conf);
1649 mv2_scatter_table_ppn_conf
1650 = xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int));
1651 mv2_scatter_table_ppn_conf[0] = 1;
1652 mv2_size_scatter_tuning_table[0] = 6;
1653 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1657 {0, -1, &MPIR_Scatter_MV2_Binomial},
1661 {0, -1, &MPIR_Scatter_MV2_Binomial},
1668 {0, -1, &MPIR_Scatter_MV2_Direct},
1672 {0, -1, &MPIR_Scatter_MV2_Direct},
1679 {0, -1, &MPIR_Scatter_MV2_Direct},
1683 {0, -1, &MPIR_Scatter_MV2_Direct},
1690 {0, -1, &MPIR_Scatter_MV2_Direct},
1694 {0, -1, &MPIR_Scatter_MV2_Direct},
1701 {0, -1, &MPIR_Scatter_MV2_Direct},
1705 {0, -1, &MPIR_Scatter_MV2_Direct},
1712 {0, 32, &MPIR_Scatter_MV2_Binomial},
1713 {32, -1, &MPIR_Scatter_MV2_Direct},
1717 {0, -1, &MPIR_Scatter_MV2_Binomial},
1721 table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
1722 mv2_scatter_table_ppn_conf[1] = 2;
1723 mv2_size_scatter_tuning_table[1] = 6;
1724 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
1728 {0, 4096, &MPIR_Scatter_MV2_Binomial},
1729 {4096, -1, &MPIR_Scatter_MV2_Direct},
1733 {0, -1, &MPIR_Scatter_MV2_Direct},
1740 {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1741 {512, -1, &MPIR_Scatter_MV2_Direct},
1745 {0, -1, &MPIR_Scatter_MV2_Binomial},
1752 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1753 {2048, -1, &MPIR_Scatter_MV2_Direct},
1757 {0, -1, &MPIR_Scatter_MV2_Binomial},
1764 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1765 {2048, -1, &MPIR_Scatter_MV2_Direct},
1769 {0, -1, &MPIR_Scatter_MV2_Binomial},
1776 {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1777 {8192, -1, &MPIR_Scatter_MV2_Direct},
1781 {0, -1, &MPIR_Scatter_MV2_Binomial},
1788 {0, 16, &MPIR_Scatter_MV2_Binomial},
1789 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1790 {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1791 {16384, -1, &MPIR_Scatter_MV2_Direct},
1795 {0, 128, &MPIR_Scatter_MV2_Direct},
1796 {128, -1, &MPIR_Scatter_MV2_Binomial},
1800 table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
1801 mv2_scatter_table_ppn_conf[2] = 16;
1802 mv2_size_scatter_tuning_table[2] = 8;
1803 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
1808 {0, 256, &MPIR_Scatter_MV2_Binomial},
1809 {256, -1, &MPIR_Scatter_MV2_Direct},
1813 { 0, -1, &MPIR_Scatter_MV2_Direct},
1821 {0, 512, &MPIR_Scatter_MV2_Binomial},
1822 {512, -1, &MPIR_Scatter_MV2_Direct},
1826 { 0, -1, &MPIR_Scatter_MV2_Direct},
1834 {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1835 {1024, -1, &MPIR_Scatter_MV2_Direct},
1839 { 0, -1, &MPIR_Scatter_MV2_Direct},
1847 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1848 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1849 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1850 {2048, -1, &MPIR_Scatter_MV2_Direct},
1854 { 0, -1, &MPIR_Scatter_MV2_Direct},
1862 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1863 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1864 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1865 {2048, -1, &MPIR_Scatter_MV2_Direct},
1869 { 0, -1, &MPIR_Scatter_MV2_Direct},
1877 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1878 {16, 16, &MPIR_Scatter_MV2_two_level_Direct},
1879 {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1880 {4096, -1, &MPIR_Scatter_MV2_Direct},
1884 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1891 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1892 {0, 16, &MPIR_Scatter_MV2_Binomial},
1893 {16, 32, &MPIR_Scatter_MV2_Binomial},
1894 {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1895 {4096, -1, &MPIR_Scatter_MV2_Direct},
1899 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1906 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1907 {0, 16, &MPIR_Scatter_MV2_two_level_Binomial},
1908 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1909 {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1910 {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1911 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1912 {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
1916 {0, 16, &MPIR_Scatter_MV2_Binomial},
1917 {16, 128, &MPIR_Scatter_MV2_Binomial},
1918 {128, 1024, &MPIR_Scatter_MV2_Binomial},
1919 {1024, 16384, &MPIR_Scatter_MV2_Direct},
1920 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1921 {65536, -1, &MPIR_Scatter_MV2_Direct},
1925 table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn;
1927 for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
1928 agg_table_sum += mv2_size_scatter_tuning_table[i];
1930 mv2_scatter_thresholds_table[0] =
1931 xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table));
1932 memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
1933 (sizeof(mv2_scatter_tuning_table)
1934 * mv2_size_scatter_tuning_table[0]));
1935 for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
1936 mv2_scatter_thresholds_table[i] =
1937 mv2_scatter_thresholds_table[i - 1]
1938 + mv2_size_scatter_tuning_table[i - 1];
1939 memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
1940 (sizeof(mv2_scatter_tuning_table)
1941 * mv2_size_scatter_tuning_table[i]));
1943 xbt_free(table_ptrs);