1 /* Selector for collective algorithms based on the MVAPICH decision logic, calibrated on the Stampede cluster at TACC. */
2 /* This is the tuning used by MVAPICH for the Stampede platform, based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR). */
4 /* Copyright (c) 2009-2017. The SimGrid Team. All rights reserved. */
6 /* This program is free software; you can redistribute it and/or modify it
7 * under the terms of the license (GNU LGPL) which comes with this package. */
9 /************ Alltoall variables and initializers */
/* Capacity of the fixed-size threshold arrays embedded in the tuning-table structs of this file. */
11 #define MV2_MAX_NB_THRESHOLDS 32
/* Alltoall implementation attached to this tuning element. The table initializers in this file pair it
 * with two leading integers, presumably message-size bounds with -1 meaning "no upper bound" -- TODO confirm. */
17 int (*MV2_pt_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
18 void *recvbuf, int recvcount, MPI_Datatype recvtype,
20 } mv2_alltoall_tuning_element;
/* Two parallel arrays of tuning elements, each holding at most MV2_MAX_NB_THRESHOLDS entries.
 * in_place_algo_table is presumably consulted for in-place calls (judging by its name) -- TODO confirm. */
25 mv2_alltoall_tuning_element algo_table[MV2_MAX_NB_THRESHOLDS];
26 mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS];
27 } mv2_alltoall_tuning_table;
/* Pointer to an alltoall implementation; initialised to NULL here. */
29 int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL;
31 /* Processes-per-node value of each configuration (filled by init_mv2_alltoall_tables_stampede) */
32 int *mv2_alltoall_table_ppn_conf = NULL;
33 /* Total number of configurations (set to 3 by init_mv2_alltoall_tables_stampede) */
34 int mv2_alltoall_num_ppn_conf = 1;
/* Number of tuning-table entries of each configuration. */
35 int *mv2_size_alltoall_tuning_table = NULL;
/* One pointer per configuration to the first tuning-table entry of that configuration. */
36 mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL;
/* Aliases mapping the MVAPICH2 alltoall algorithm names used in the tables onto the
 * smpi_coll_tuned_alltoall_* functions. Note that the "inplace" name is mapped to the ring variant. */
39 #define MPIR_Alltoall_bruck_MV2 smpi_coll_tuned_alltoall_bruck
40 #define MPIR_Alltoall_RD_MV2 smpi_coll_tuned_alltoall_rdb
41 #define MPIR_Alltoall_Scatter_dest_MV2 smpi_coll_tuned_alltoall_mvapich2_scatter_dest
42 #define MPIR_Alltoall_pairwise_MV2 smpi_coll_tuned_alltoall_pair
43 #define MPIR_Alltoall_inplace_MV2 smpi_coll_tuned_alltoall_ring
/* Builds the Stampede alltoall tuning tables for three configurations (1, 2 and 16 processes per node).
 * Registers smpi_coll_cleanup_mvapich2 as cleanup callback when none is set yet, allocates the bookkeeping
 * arrays with xbt_malloc, and copies the function-local tables below into one contiguous heap block whose
 * per-configuration start addresses are stored in mv2_alltoall_thresholds_table. */
46 static void init_mv2_alltoall_tables_stampede(){
48 int agg_table_sum = 0;
49 mv2_alltoall_tuning_table **table_ptrs = NULL;
50 mv2_alltoall_num_ppn_conf = 3;
51 if(smpi_coll_cleanup_callback==NULL)
52 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
/* One slot per configuration in each of the four arrays allocated below. */
53 mv2_alltoall_thresholds_table = static_cast<mv2_alltoall_tuning_table**>(xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
54 * mv2_alltoall_num_ppn_conf));
55 table_ptrs = static_cast<mv2_alltoall_tuning_table**>(xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
56 * mv2_alltoall_num_ppn_conf));
57 mv2_size_alltoall_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
58 mv2_alltoall_num_ppn_conf));
59 mv2_alltoall_table_ppn_conf = static_cast<int*>(xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int)));
/* Configuration 0: 1 process per node, 6 table entries. */
60 mv2_alltoall_table_ppn_conf[0] = 1;
61 mv2_size_alltoall_tuning_table[0] = 6;
62 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
65 {{0, -1, &MPIR_Alltoall_pairwise_MV2},
68 {{0, -1, &MPIR_Alltoall_inplace_MV2},
74 {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
75 {262144, -1, &MPIR_Alltoall_pairwise_MV2},
78 {{0, -1, &MPIR_Alltoall_inplace_MV2},
84 {{0, 8, &MPIR_Alltoall_RD_MV2},
85 {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
88 {{0, -1, &MPIR_Alltoall_inplace_MV2},
94 {{0, 64, &MPIR_Alltoall_RD_MV2},
95 {64, 512, &MPIR_Alltoall_bruck_MV2},
96 {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
99 {{0,-1, &MPIR_Alltoall_inplace_MV2},
105 {{0, 32, &MPIR_Alltoall_RD_MV2},
106 {32, 2048, &MPIR_Alltoall_bruck_MV2},
107 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
110 {{0, -1, &MPIR_Alltoall_inplace_MV2},
116 {{0, 8, &MPIR_Alltoall_RD_MV2},
117 {8, 1024, &MPIR_Alltoall_bruck_MV2},
118 {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
121 {{0, -1, &MPIR_Alltoall_inplace_MV2},
/* Only the address of the local table is recorded here; its contents are copied to the heap further down. */
125 table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
/* Configuration 1: 2 processes per node, 6 table entries. */
126 mv2_alltoall_table_ppn_conf[1] = 2;
127 mv2_size_alltoall_tuning_table[1] = 6;
128 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
131 {{0, 32, &MPIR_Alltoall_RD_MV2},
132 {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
135 {{0, -1, &MPIR_Alltoall_inplace_MV2},
141 {{0, 64, &MPIR_Alltoall_RD_MV2},
142 {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
145 {{0, -1, &MPIR_Alltoall_inplace_MV2},
151 {{0, 64, &MPIR_Alltoall_RD_MV2},
152 {64, 2048, &MPIR_Alltoall_bruck_MV2},
153 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
156 {{0,-1, &MPIR_Alltoall_inplace_MV2},
162 {{0, 16, &MPIR_Alltoall_RD_MV2},
163 {16, 2048, &MPIR_Alltoall_bruck_MV2},
164 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
167 {{0, -1, &MPIR_Alltoall_inplace_MV2},
173 {{0, 8, &MPIR_Alltoall_RD_MV2},
174 {8, 1024, &MPIR_Alltoall_bruck_MV2},
175 {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
178 {{0, -1, &MPIR_Alltoall_inplace_MV2},
184 {{0, 4, &MPIR_Alltoall_RD_MV2},
185 {4, 2048, &MPIR_Alltoall_bruck_MV2},
186 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
189 {{0, -1, &MPIR_Alltoall_inplace_MV2},
193 table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
/* Configuration 2: 16 processes per node, 7 table entries. */
194 mv2_alltoall_table_ppn_conf[2] = 16;
195 mv2_size_alltoall_tuning_table[2] = 7;
196 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
199 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
200 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
/* NOTE(review): unlike the 1 and 2 ppn tables, the inplace entries of this table start at a non-zero first value. */
203 {{32768, -1, &MPIR_Alltoall_inplace_MV2},
209 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
210 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
213 {{16384, -1, &MPIR_Alltoall_inplace_MV2},
219 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
220 {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
221 {16384, -1, &MPIR_Alltoall_pairwise_MV2},
224 {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
230 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
231 {2048, -1, &MPIR_Alltoall_pairwise_MV2},
234 {{16384,65536, &MPIR_Alltoall_inplace_MV2},
240 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
241 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
244 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
250 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
251 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
254 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
259 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
260 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
263 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
268 table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
/* Total number of table entries over all configurations. */
270 for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
271 agg_table_sum += mv2_size_alltoall_tuning_table[i];
/* A single heap allocation holds every configuration; slot 0 points at its beginning. */
273 mv2_alltoall_thresholds_table[0] =
274 static_cast<mv2_alltoall_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table)));
275 memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
276 (sizeof(mv2_alltoall_tuning_table)
277 * mv2_size_alltoall_tuning_table[0]));
/* Every following slot starts right behind the entries of the previous configuration. */
278 for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
279 mv2_alltoall_thresholds_table[i] =
280 mv2_alltoall_thresholds_table[i - 1]
281 + mv2_size_alltoall_tuning_table[i - 1];
282 memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
283 (sizeof(mv2_alltoall_tuning_table)
284 * mv2_size_alltoall_tuning_table[i]));
/* Only the temporary pointer array is released; the tables it referenced are locals of this function. */
286 xbt_free(table_ptrs);
292 /************ Allgather variables and initializers */
/* Allgather implementation attached to this tuning element. */
297 int (*MV2_pt_Allgather_function)(void *sendbuf,
299 MPI_Datatype sendtype,
302 MPI_Datatype recvtype, MPI_Comm comm_ptr);
303 } mv2_allgather_tuning_element;
/* Per-threshold flags; presumably non-zero selects a two-level algorithm (judging by the name) -- TODO confirm. */
307 int two_level[MV2_MAX_NB_THRESHOLDS];
/* size_inter_table presumably counts the used entries of inter_leader -- TODO confirm against the selector. */
308 int size_inter_table;
309 mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
310 } mv2_allgather_tuning_table;
/* Pointer to an allgather implementation. */
312 int (*MV2_Allgather_function)(void *sendbuf,
314 MPI_Datatype sendtype,
317 MPI_Datatype recvtype, MPI_Comm comm);
/* Processes-per-node value of each configuration (filled by init_mv2_allgather_tables_stampede). */
319 int *mv2_allgather_table_ppn_conf = NULL;
/* Number of configurations (set to 3 by init_mv2_allgather_tables_stampede). */
320 int mv2_allgather_num_ppn_conf = 1;
/* Number of tuning-table entries of each configuration. */
321 int *mv2_size_allgather_tuning_table = NULL;
/* One pointer per configuration to the first tuning-table entry of that configuration. */
322 mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL;
/* File-local allgather variant; it is referenced by the 16 processes-per-node allgather tuning table. */
324 static int MPIR_Allgather_RD_Allgather_Comm_MV2( void *sendbuf,
326 MPI_Datatype sendtype,
329 MPI_Datatype recvtype, MPI_Comm comm_ptr)
/* Aliases mapping the MVAPICH2 allgather algorithm names onto the smpi_coll_tuned_allgather_* functions. */
334 #define MPIR_Allgather_Bruck_MV2 smpi_coll_tuned_allgather_bruck
335 #define MPIR_Allgather_RD_MV2 smpi_coll_tuned_allgather_rdb
336 #define MPIR_Allgather_Ring_MV2 smpi_coll_tuned_allgather_ring
337 #define MPIR_2lvl_Allgather_MV2 smpi_coll_tuned_allgather_mvapich2_smp
/* Builds the Stampede allgather tuning tables for three configurations (1, 2 and 16 processes per node).
 * Registers smpi_coll_cleanup_mvapich2 as cleanup callback when none is set yet, allocates the bookkeeping
 * arrays with xbt_malloc, and copies the function-local tables below into one contiguous heap block whose
 * per-configuration start addresses are stored in mv2_allgather_thresholds_table. */
339 static void init_mv2_allgather_tables_stampede(){
341 int agg_table_sum = 0;
343 if(smpi_coll_cleanup_callback==NULL)
344 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
345 mv2_allgather_tuning_table **table_ptrs = NULL;
346 mv2_allgather_num_ppn_conf = 3;
/* One slot per configuration in each of the four arrays allocated below. */
347 mv2_allgather_thresholds_table
348 = static_cast<mv2_allgather_tuning_table**>(xbt_malloc(sizeof(mv2_allgather_tuning_table *)
349 * mv2_allgather_num_ppn_conf));
350 table_ptrs = static_cast<mv2_allgather_tuning_table**>(xbt_malloc(sizeof(mv2_allgather_tuning_table *)
351 * mv2_allgather_num_ppn_conf));
352 mv2_size_allgather_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
353 mv2_allgather_num_ppn_conf));
354 mv2_allgather_table_ppn_conf
355 = static_cast<int*>(xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int)));
/* Configuration 0: 1 process per node, 6 table entries. */
356 mv2_allgather_table_ppn_conf[0] = 1;
357 mv2_size_allgather_tuning_table[0] = 6;
358 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
364 {0, -1, &MPIR_Allgather_Ring_MV2},
372 {0, 262144, &MPIR_Allgather_RD_MV2},
373 {262144, -1, &MPIR_Allgather_Ring_MV2},
381 {0, 131072, &MPIR_Allgather_RD_MV2},
382 {131072, -1, &MPIR_Allgather_Ring_MV2},
390 {0, 131072, &MPIR_Allgather_RD_MV2},
391 {131072, -1, &MPIR_Allgather_Ring_MV2},
399 {0, 65536, &MPIR_Allgather_RD_MV2},
400 {65536, -1, &MPIR_Allgather_Ring_MV2},
408 {0, 32768, &MPIR_Allgather_RD_MV2},
409 {32768, -1, &MPIR_Allgather_Ring_MV2},
/* Only the address of the local table is recorded here; its contents are copied to the heap further down. */
413 table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
/* Configuration 1: 2 processes per node, 6 table entries. */
414 mv2_allgather_table_ppn_conf[1] = 2;
415 mv2_size_allgather_tuning_table[1] = 6;
416 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
422 {0, 524288, &MPIR_Allgather_RD_MV2},
423 {524288, -1, &MPIR_Allgather_Ring_MV2},
431 {0, 32768, &MPIR_Allgather_RD_MV2},
432 {32768, 524288, &MPIR_Allgather_Ring_MV2},
433 {524288, -1, &MPIR_Allgather_Ring_MV2},
441 {0, 16384, &MPIR_Allgather_RD_MV2},
442 {16384, 524288, &MPIR_Allgather_Ring_MV2},
443 {524288, -1, &MPIR_Allgather_Ring_MV2},
451 {0, 65536, &MPIR_Allgather_RD_MV2},
452 {65536, 524288, &MPIR_Allgather_Ring_MV2},
453 {524288, -1, &MPIR_Allgather_Ring_MV2},
461 {0, 32768, &MPIR_Allgather_RD_MV2},
462 {32768, 524288, &MPIR_Allgather_Ring_MV2},
463 {524288, -1, &MPIR_Allgather_Ring_MV2},
471 {0, 65536, &MPIR_Allgather_RD_MV2},
472 {65536, 524288, &MPIR_Allgather_Ring_MV2},
473 {524288, -1, &MPIR_Allgather_Ring_MV2},
477 table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
/* Configuration 2: 16 processes per node, 6 table entries. */
478 mv2_allgather_table_ppn_conf[2] = 16;
479 mv2_size_allgather_tuning_table[2] = 6;
480 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
486 {0, 1024, &MPIR_Allgather_RD_MV2},
487 {1024, -1, &MPIR_Allgather_Ring_MV2},
495 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
496 {1024, -1, &MPIR_Allgather_Ring_MV2},
504 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
505 {1024, -1, &MPIR_Allgather_Ring_MV2},
513 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
514 {1024, -1, &MPIR_Allgather_Ring_MV2},
522 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
523 {1024, -1, &MPIR_Allgather_Ring_MV2},
531 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
532 {1024, -1, &MPIR_Allgather_Ring_MV2},
537 table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
/* Total number of table entries over all configurations. */
539 for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
540 agg_table_sum += mv2_size_allgather_tuning_table[i];
/* A single heap allocation holds every configuration; slot 0 points at its beginning. */
542 mv2_allgather_thresholds_table[0] =
543 static_cast<mv2_allgather_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table)));
544 memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
545 (sizeof(mv2_allgather_tuning_table)
546 * mv2_size_allgather_tuning_table[0]));
/* Every following slot starts right behind the entries of the previous configuration. */
547 for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
548 mv2_allgather_thresholds_table[i] =
549 mv2_allgather_thresholds_table[i - 1]
550 + mv2_size_allgather_tuning_table[i - 1];
551 memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
552 (sizeof(mv2_allgather_tuning_table)
553 * mv2_size_allgather_tuning_table[i]));
/* Only the temporary pointer array is released; the tables it referenced are locals of this function. */
555 xbt_free(table_ptrs);
559 /************ Gather variables and initializers */
/* Gather implementation attached to this tuning element. */
564 int (*MV2_pt_Gather_function)(void *sendbuf, int sendcnt,
565 MPI_Datatype sendtype, void *recvbuf, int recvcnt,
566 MPI_Datatype recvtype, int root, MPI_Comm comm_ptr);
567 } mv2_gather_tuning_element;
/* Each size_* field is immediately followed by the array it presumably counts the used entries of
 * (the initializers in this file give 2 or 3 for inter_leader and 1 for intra_node) -- TODO confirm. */
572 int size_inter_table;
573 mv2_gather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
574 int size_intra_table;
575 mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
576 } mv2_gather_tuning_table;
/* Number of entries in mv2_gather_thresholds_table (also re-assigned to 7 by init_mv2_gather_tables_stampede). */
578 int mv2_size_gather_tuning_table=7;
/* Heap copy of the gather tuning table; NULL until init_mv2_gather_tables_stampede allocates it. */
579 mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL;
/* Signature shared by the gather implementations. */
581 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
583 MPI_Datatype sendtype,
586 MPI_Datatype recvtype,
587 int root, MPI_Comm comm);
/* Gather function pointers for the inter-leader and intra-node levels (per their names); initialised to NULL here. */
589 MV2_Gather_function_ptr MV2_Gather_inter_leader_function = NULL;
590 MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL;
/* Aliases mapping the MVAPICH2 gather algorithm names onto the smpi_coll_tuned_gather_* functions. */
593 #define MPIR_Gather_MV2_Direct smpi_coll_tuned_gather_ompi_basic_linear
594 #define MPIR_Gather_MV2_two_level_Direct smpi_coll_tuned_gather_mvapich2_two_level
595 #define MPIR_Gather_intra smpi_coll_tuned_gather_mpich
/* Builds the Stampede gather tuning table: registers smpi_coll_cleanup_mvapich2 as cleanup callback when none
 * is set yet, allocates room for 7 entries with xbt_malloc and copies the function-local table into it.
 * Each entry lists an inter_leader count with its thresholds, then an intra_node count with its thresholds,
 * following the field order of mv2_gather_tuning_table. */
598 static void init_mv2_gather_tables_stampede(){
600 if(smpi_coll_cleanup_callback==NULL)
601 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
602 mv2_size_gather_tuning_table=7;
603 mv2_gather_thresholds_table = static_cast<mv2_gather_tuning_table*>(xbt_malloc(mv2_size_gather_tuning_table*
604 sizeof (mv2_gather_tuning_table)));
605 mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
607 2,{{0, 524288, &MPIR_Gather_MV2_Direct},
608 {524288, -1, &MPIR_Gather_intra}},
609 1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
611 3,{{0, 16384, &MPIR_Gather_MV2_Direct},
612 {16384, 131072, &MPIR_Gather_intra},
613 {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
614 1,{{0, -1, &MPIR_Gather_intra}}},
616 3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct},
617 {256, 16384, &MPIR_Gather_MV2_Direct},
/* NOTE(review): the first value repeats 256 instead of continuing from the previous 16384 -- confirm
 * whether the selector reads this field before changing it. */
618 {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
619 1,{{0, -1, &MPIR_Gather_intra}}},
621 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
622 {512, 16384, &MPIR_Gather_MV2_Direct},
623 {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
624 1,{{0, -1, &MPIR_Gather_intra}}},
626 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
627 {512, 16384, &MPIR_Gather_MV2_Direct},
628 {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
629 1,{{0, -1, &MPIR_Gather_intra}}},
631 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
632 {512, 16384, &MPIR_Gather_MV2_Direct},
/* NOTE(review): 8196 does not continue from the previous 16384 and is not a power of two (8192?) -- confirm. */
633 {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
634 1,{{0, -1, &MPIR_Gather_intra}}},
636 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
637 {512, 16384, &MPIR_Gather_MV2_Direct},
/* NOTE(review): same 8196 value as in the previous entry -- confirm. */
638 {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
639 1,{{0, -1, &MPIR_Gather_intra}}},
/* Copy the local table into the heap block allocated above. */
642 memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
643 mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
648 /************ Allgatherv variables and initializers */
/* Allgatherv implementation attached to this tuning element. */
653 int (*MV2_pt_Allgatherv_function)(void *sendbuf,
655 MPI_Datatype sendtype,
659 MPI_Datatype recvtype,
661 } mv2_allgatherv_tuning_element;
/* size_inter_table presumably counts the used entries of inter_leader -- TODO confirm against the selector. */
665 int size_inter_table;
666 mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
667 } mv2_allgatherv_tuning_table;
/* Pointer to an allgatherv implementation. */
669 int (*MV2_Allgatherv_function)(void *sendbuf,
671 MPI_Datatype sendtype,
675 MPI_Datatype recvtype,
/* Entry count and heap copy of the allgatherv tuning table; both are set by init_mv2_allgatherv_tables_stampede. */
678 int mv2_size_allgatherv_tuning_table = 0;
679 mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL;
/* Aliases mapping the MVAPICH2 allgatherv algorithm names onto the smpi_coll_tuned_allgatherv_* functions. */
681 #define MPIR_Allgatherv_Rec_Doubling_MV2 smpi_coll_tuned_allgatherv_mpich_rdb
682 #define MPIR_Allgatherv_Bruck_MV2 smpi_coll_tuned_allgatherv_ompi_bruck
683 #define MPIR_Allgatherv_Ring_MV2 smpi_coll_tuned_allgatherv_mpich_ring
/* Builds the Stampede allgatherv tuning table: registers smpi_coll_cleanup_mvapich2 as cleanup callback when
 * none is set yet, allocates room for 6 entries with xbt_malloc and copies the function-local table into it.
 * Every visible entry uses recursive doubling up to a bound (512 or 256) and the ring algorithm above it. */
686 static void init_mv2_allgatherv_tables_stampede(){
687 if(smpi_coll_cleanup_callback==NULL)
688 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
689 mv2_size_allgatherv_tuning_table = 6;
690 mv2_allgatherv_thresholds_table = static_cast<mv2_allgatherv_tuning_table*>(xbt_malloc(mv2_size_allgatherv_tuning_table *
691 sizeof (mv2_allgatherv_tuning_table)));
692 mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
697 {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
698 {512, -1, &MPIR_Allgatherv_Ring_MV2},
705 {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
706 {512, -1, &MPIR_Allgatherv_Ring_MV2},
713 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
714 {256, -1, &MPIR_Allgatherv_Ring_MV2},
721 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
722 {256, -1, &MPIR_Allgatherv_Ring_MV2},
729 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
730 {256, -1, &MPIR_Allgatherv_Ring_MV2},
737 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
738 {256, -1, &MPIR_Allgatherv_Ring_MV2},
/* Copy the local table into the heap block allocated above. */
743 memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
744 mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
748 /************ Allreduce variables and initializers */
/* Allreduce implementation attached to this tuning element. */
753 int (*MV2_pt_Allreduce_function)(void *sendbuf,
756 MPI_Datatype datatype,
757 MPI_Op op, MPI_Comm comm);
758 } mv2_allreduce_tuning_element;
/* Per-threshold flags; presumably non-zero selects the two-level allreduce (judging by the name) -- TODO confirm. */
763 int is_two_level_allreduce[MV2_MAX_NB_THRESHOLDS];
/* Each size_* field is followed by the array it presumably counts the used entries of -- TODO confirm. */
764 int size_inter_table;
765 mv2_allreduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
766 int size_intra_table;
767 mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
768 } mv2_allreduce_tuning_table;
/* Pointer to an allreduce implementation; initialised to NULL here. */
771 int (*MV2_Allreduce_function)(void *sendbuf,
774 MPI_Datatype datatype,
775 MPI_Op op, MPI_Comm comm)=NULL;
/* Pointer to the intra-node allreduce step (per its name); initialised to NULL here. */
778 int (*MV2_Allreduce_intra_function)( void *sendbuf,
781 MPI_Datatype datatype,
782 MPI_Op op, MPI_Comm comm)=NULL;
/* Entry count and heap copy of the allreduce tuning table; both are set by init_mv2_allreduce_tables_stampede. */
784 int mv2_size_allreduce_tuning_table = 0;
785 mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table = NULL;
/* File-local allreduce variant; it is referenced by the last entry of the allreduce tuning table. */
791 static int MPIR_Allreduce_mcst_reduce_two_level_helper_MV2( void *sendbuf,
794 MPI_Datatype datatype,
795 MPI_Op op, MPI_Comm comm)
/* File-local allreduce variant with the same parameter tail (datatype, op, comm) as the other allreduce helpers. */
800 static int MPIR_Allreduce_mcst_reduce_redscat_gather_MV2( void *sendbuf,
803 MPI_Datatype datatype,
804 MPI_Op op, MPI_Comm comm)
/* File-local helper used in the intra_node part of the allreduce tables. It forwards its arguments to
 * mpi_coll_reduce_fun, passing 0 in the position before comm (presumably the root rank -- TODO confirm). */
809 static int MPIR_Allreduce_reduce_p2p_MV2( void *sendbuf,
812 MPI_Datatype datatype,
813 MPI_Op op, MPI_Comm comm)
815 mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
/* File-local helper used in the intra_node part of the allreduce tables. It forwards its arguments to
 * mpi_coll_reduce_fun, passing 0 in the position before comm (presumably the root rank -- TODO confirm). */
819 static int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
822 MPI_Datatype datatype,
823 MPI_Op op, MPI_Comm comm)
825 mpi_coll_reduce_fun(sendbuf,recvbuf,count,datatype,op,0,comm);
/* Aliases mapping the MVAPICH2 allreduce algorithm names onto the smpi_coll_tuned_allreduce_* functions. */
829 #define MPIR_Allreduce_pt2pt_rd_MV2 smpi_coll_tuned_allreduce_rdb
830 #define MPIR_Allreduce_pt2pt_rs_MV2 smpi_coll_tuned_allreduce_mvapich2_rs
831 #define MPIR_Allreduce_two_level_MV2 smpi_coll_tuned_allreduce_mvapich2_two_level
/* Builds the Stampede allreduce tuning table: registers smpi_coll_cleanup_mvapich2 as cleanup callback when
 * none is set yet, allocates room for 8 entries with xbt_malloc and copies the function-local table into it.
 * Within each entry the MPIR_Allreduce_pt2pt_* rows belong to one threshold list and the
 * MPIR_Allreduce_reduce_* rows to the next one (inter_leader then intra_node, going by the struct field order). */
834 static void init_mv2_allreduce_tables_stampede(){
835 if(smpi_coll_cleanup_callback==NULL)
836 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
837 mv2_size_allreduce_tuning_table = 8;
838 mv2_allreduce_thresholds_table = static_cast<mv2_allreduce_tuning_table*>(xbt_malloc(mv2_size_allreduce_tuning_table *
839 sizeof (mv2_allreduce_tuning_table)));
840 mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
847 {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
848 {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
852 {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
853 {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
862 {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
863 {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
864 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
868 {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
869 {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
878 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
879 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
880 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
884 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
885 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
894 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
895 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
896 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
900 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
901 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
910 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
911 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
912 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
916 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
917 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
926 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
927 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
928 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
932 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
933 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
942 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
943 {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
944 {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
945 {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
949 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
950 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
959 {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
960 {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
961 {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
962 {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
963 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
967 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
968 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
/* Copy the local table into the heap block allocated above. */
973 memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
974 mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
/* Bcast implementation attached to this tuning element. */
983 int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
984 int root, MPI_Comm comm_ptr);
/* Fourth value of every bcast table row; the tables in this file use 2, 4, 8 or -1 here. */
985 int zcpy_pipelined_knomial_factor;
986 } mv2_bcast_tuning_element;
/* Per-entry bcast parameters (segment size and knomial factors, going by the field names). */
990 int bcast_segment_size;
991 int intra_node_knomial_factor;
992 int inter_node_knomial_factor;
/* Per-threshold flags; presumably non-zero selects the two-level bcast (judging by the name) -- TODO confirm. */
993 int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS];
/* Each size_* field is followed by the array it presumably counts the used entries of -- TODO confirm. */
994 int size_inter_table;
995 mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
996 int size_intra_table;
997 mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
998 } mv2_bcast_tuning_table;
/* Entry count and heap copy of the bcast tuning table; both are set by init_mv2_bcast_tables_stampede. */
1000 int mv2_size_bcast_tuning_table = 0;
1001 mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
/* Pointer to a bcast implementation; initialised to NULL here. */
1004 int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
1005 int root, MPI_Comm comm_ptr) = NULL;
/* Pointer to the intra-node bcast step (per its name); initialised to NULL here. */
1007 int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
1008 int root, MPI_Comm comm_ptr) = NULL;
/* Default values of the bcast parameters; their consumers are outside this part of the file. */
1010 int zcpy_knomial_factor = 2;
1011 int mv2_pipelined_zcpy_knomial_factor = -1;
1012 int bcast_segment_size = 8192;
1013 int mv2_inter_node_knomial_factor = 4;
1014 int mv2_intra_node_knomial_factor = 4;
/* Named bcast constants; their consumers are outside this part of the file. */
1015 #define mv2_bcast_two_level_system_size 64
1016 #define mv2_bcast_short_msg 16384
/* Bcast "large message" constant: 512 * 1024. The expansion is parenthesized so the macro behaves as a
 * single operand; unparenthesized, `x / mv2_bcast_large_msg` would be evaluated as `(x / 512) * 1024`. */
#define mv2_bcast_large_msg (512*1024)
1019 #define INTRA_NODE_ROOT 0
/* Aliases mapping the MVAPICH2 bcast algorithm names onto the smpi_coll_tuned_bcast_* functions.
 * Several MVAPICH2 names share one implementation: the two pipelined variants and the shmem variant all map
 * to smpi_coll_tuned_bcast_mpich, and both scatter_ring_allgather names map to the scatter_LR_allgather one. */
1021 #define MPIR_Pipelined_Bcast_Zcpy_MV2 smpi_coll_tuned_bcast_mpich
1022 #define MPIR_Pipelined_Bcast_MV2 smpi_coll_tuned_bcast_mpich
1023 #define MPIR_Bcast_binomial_MV2 smpi_coll_tuned_bcast_binomial_tree
1024 #define MPIR_Bcast_scatter_ring_allgather_shm_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
1025 #define MPIR_Bcast_scatter_doubling_allgather_MV2 smpi_coll_tuned_bcast_scatter_rdb_allgather
1026 #define MPIR_Bcast_scatter_ring_allgather_MV2 smpi_coll_tuned_bcast_scatter_LR_allgather
1027 #define MPIR_Shmem_Bcast_MV2 smpi_coll_tuned_bcast_mpich
1028 #define MPIR_Bcast_tune_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
1029 #define MPIR_Bcast_inter_node_helper_MV2 smpi_coll_tuned_bcast_mvapich2_inter_node
1030 #define MPIR_Knomial_Bcast_intra_node_MV2 smpi_coll_tuned_bcast_mvapich2_knomial_intra_node
1031 #define MPIR_Bcast_intra_MV2 smpi_coll_tuned_bcast_mvapich2_intra_node
/* Builds the Stampede bcast tuning table: registers smpi_coll_cleanup_mvapich2 as cleanup callback when none
 * is set yet, allocates room for 8 entries with xbt_malloc and copies the function-local table into it.
 * Every threshold row carries four values: two integers, the bcast function and the
 * zcpy_pipelined_knomial_factor. Within each entry the rows using MPIR_Shmem_Bcast_MV2 form the second
 * threshold list (intra_node, going by the struct field order). */
1033 static void init_mv2_bcast_tables_stampede(){
1035 if(smpi_coll_cleanup_callback==NULL)
1036 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1037 mv2_size_bcast_tuning_table=8;
1038 mv2_bcast_thresholds_table = static_cast<mv2_bcast_tuning_table*>(xbt_malloc(mv2_size_bcast_tuning_table *
1039 sizeof (mv2_bcast_tuning_table)));
1041 mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
1045 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
1048 {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1049 {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1050 {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1051 {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1052 {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1053 {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1054 {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1055 {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1056 {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1057 {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1058 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1062 {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1063 {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1064 {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1065 {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1066 {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1067 {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1068 {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1069 {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1070 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1071 {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1072 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1078 {1, 1, 1, 1, 1, 1, 1, 1},
1081 {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1082 {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1083 {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1084 {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1085 {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1086 {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1087 {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1088 {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1092 {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1093 {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1094 {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1095 {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1096 {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1097 {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1098 {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1099 {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
1105 {1, 1, 1, 1, 1, 1, 1, 1, 1},
1108 {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1109 {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1110 {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1111 {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1112 {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1113 {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1114 {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1115 {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1116 {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1120 {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1121 {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1122 {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1123 {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1124 {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1125 {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1126 {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1127 {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1128 {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
1137 {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1138 {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1139 {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1140 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1144 {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1145 {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1146 {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
/* NOTE(review): the only row with a NULL function pointer -- verify that the selector checks for NULL
 * before calling through this entry. */
1147 {524288, -1, NULL, -1}
1156 {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1157 {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1158 {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1159 {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1160 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1164 {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1165 {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1166 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1167 {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1168 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1177 {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1178 {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1179 {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1180 {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1181 {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1185 {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1186 {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1187 {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1188 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1189 {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
1198 {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1199 {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1200 {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1201 {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1202 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1206 {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1207 {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1208 {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1209 {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1210 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1216 {1, 1, 1, 1, 1, 1, 1},
1219 {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1220 {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1221 {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1222 {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1223 {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1224 {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1225 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1229 {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1230 {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1231 {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1232 {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1233 {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1234 {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1235 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
/* Copy the local table into the heap block allocated above. */
1240 memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1241 mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1245 /************ Reduce variables and initializers */
/* Reduce implementation attached to this tuning element. */
1250 int (*MV2_pt_Reduce_function)(void *sendbuf,
1253 MPI_Datatype datatype,
1257 } mv2_reduce_tuning_element;
/* Per-threshold flags; presumably non-zero selects the two-level reduce (judging by the name) -- TODO confirm. */
1263 int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS];
/* Each size_* field is followed by the array it presumably counts the used entries of -- TODO confirm. */
1264 int size_inter_table;
1265 mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1266 int size_intra_table;
1267 mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1268 } mv2_reduce_tuning_table;
/* Entry count and heap copy of the reduce tuning table; both are set by init_mv2_reduce_tables_stampede. */
1270 int mv2_size_reduce_tuning_table = 0;
1271 mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL;
/* Knomial factors for the intra- and inter-node reduce (per their names); both default to -1. */
1274 int mv2_reduce_intra_knomial_factor = -1;
1275 int mv2_reduce_inter_knomial_factor = -1;
/* Pointer to a reduce implementation; initialised to NULL here. */
1277 int (*MV2_Reduce_function)( void *sendbuf,
1280 MPI_Datatype datatype,
1283 MPI_Comm comm_ptr)=NULL;
/* Pointer to the intra-node reduce step (per its name); initialised to NULL here. */
1285 int (*MV2_Reduce_intra_function)( void *sendbuf,
1288 MPI_Datatype datatype,
1291 MPI_Comm comm_ptr)=NULL;
/* Aliases mapping the MVAPICH2 reduce algorithm names onto the smpi_coll_tuned_reduce_* functions.
 * The inter- and intra-node knomial wrappers share the same implementation. */
1294 #define MPIR_Reduce_inter_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1295 #define MPIR_Reduce_intra_knomial_wrapper_MV2 smpi_coll_tuned_reduce_mvapich2_knomial
1296 #define MPIR_Reduce_binomial_MV2 smpi_coll_tuned_reduce_binomial
1297 #define MPIR_Reduce_redscat_gather_MV2 smpi_coll_tuned_reduce_scatter_gather
1298 #define MPIR_Reduce_shmem_MV2 smpi_coll_tuned_reduce_ompi_basic_linear
1299 #define MPIR_Reduce_two_level_helper_MV2 smpi_coll_tuned_reduce_mvapich2_two_level
/* Fills mv2_reduce_thresholds_table with the Stampede tuning data for reduce.
 *
 * Steps: install smpi_coll_cleanup_mvapich2 as the collective cleanup callback when none is
 * registered yet, set mv2_size_reduce_tuning_table to 8, allocate that many
 * mv2_reduce_tuning_table entries with xbt_malloc, and copy them from the local initializer
 * array with memcpy.
 *
 * The braced triples look like {min, max, algorithm}, presumably message-size bounds with -1
 * standing for "no upper bound". TODO confirm against the element type and the selector code. */
1302 static void init_mv2_reduce_tables_stampede(){
/* Do not override a cleanup callback that another initializer already registered. */
1303 if(smpi_coll_cleanup_callback==NULL)
1304 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1306 mv2_size_reduce_tuning_table = 8;
1307 mv2_reduce_thresholds_table = static_cast<mv2_reduce_tuning_table*>(xbt_malloc(mv2_size_reduce_tuning_table *
1308 sizeof (mv2_reduce_tuning_table)));
/* Local staging copy of the table; its contents are duplicated into the heap table at the end. */
1309 mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1317 {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1318 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1319 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1323 {0, 65536, &MPIR_Reduce_shmem_MV2},
1324 {65536,-1, &MPIR_Reduce_binomial_MV2},
1331 {1, 1, 1, 1, 0, 0, 0},
1334 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1335 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1336 {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1337 {32768, 65536, &MPIR_Reduce_binomial_MV2},
1338 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1339 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1340 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1344 {0, 8192, &MPIR_Reduce_shmem_MV2},
1345 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1346 {16384, 32768, &MPIR_Reduce_shmem_MV2},
1347 {32768, 65536, &MPIR_Reduce_shmem_MV2},
1348 {65536, 262144, &MPIR_Reduce_shmem_MV2},
1349 {262144,-1, &MPIR_Reduce_binomial_MV2},
1359 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1360 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1361 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1362 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1363 {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1367 {0, 8192, &MPIR_Reduce_shmem_MV2},
1368 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1369 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1370 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1371 {262144, -1, &MPIR_Reduce_binomial_MV2},
1381 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1382 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1383 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1384 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1385 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1386 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1390 {0, 8192, &MPIR_Reduce_shmem_MV2},
1391 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1392 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1393 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1394 {262144, -1, &MPIR_Reduce_binomial_MV2},
1401 {1, 1, 1, 0, 1, 1, 0},
1404 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1405 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1406 {16384, 32768, &MPIR_Reduce_binomial_MV2},
1407 {32768, 65536, &MPIR_Reduce_binomial_MV2},
1408 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1409 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1410 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1414 {0, 8192, &MPIR_Reduce_shmem_MV2},
1415 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1416 {16384, 32768, &MPIR_Reduce_shmem_MV2},
1417 {32768, 65536, &MPIR_Reduce_shmem_MV2},
1418 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1419 {262144, -1, &MPIR_Reduce_binomial_MV2},
1429 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1430 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1431 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1432 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1433 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1434 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1438 {0, 8192, &MPIR_Reduce_shmem_MV2},
1439 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1440 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1441 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1442 {262144, -1, &MPIR_Reduce_binomial_MV2},
1452 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1453 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1454 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1455 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1456 {262144, -1, &MPIR_Reduce_binomial_MV2},
1460 {0, 8192, &MPIR_Reduce_shmem_MV2},
1461 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1462 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1463 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1464 {262144, -1, &MPIR_Reduce_binomial_MV2},
1474 {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1475 {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1476 {4096, 16384, &MPIR_Reduce_binomial_MV2},
1477 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1478 {65536, 131072, &MPIR_Reduce_binomial_MV2},
1479 {131072, -1, &MPIR_Reduce_binomial_MV2},
1483 {0, 2048, &MPIR_Reduce_shmem_MV2},
1484 {2048, 4096, &MPIR_Reduce_shmem_MV2},
1485 {4096, 16384, &MPIR_Reduce_shmem_MV2},
1486 {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1487 {65536, 131072, &MPIR_Reduce_binomial_MV2},
1488 {131072, -1, &MPIR_Reduce_shmem_MV2},
/* memcpy reads mv2_size_reduce_tuning_table entries, so the initializer above has to provide
 * at least that many. */
1493 memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1494 mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1497 /************ Reduce scatter variables and initializers */
/* mv2_red_scat_tuning_element: carries the algorithm to run as a function pointer. */
1502 int (*MV2_pt_Red_scat_function)(void *sendbuf,
1505 MPI_Datatype datatype,
1508 } mv2_red_scat_tuning_element;
/* mv2_red_scat_tuning_table: size_inter_table plus room for MV2_MAX_NB_THRESHOLDS inter_leader
 * elements. Presumably size_inter_table is the number of inter_leader slots in use; confirm
 * at the use site. */
1512 int size_inter_table;
1513 mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1514 } mv2_red_scat_tuning_table;
/* Entry count of, and pointer to, the reduce-scatter thresholds table. Both start out empty
 * (0 / NULL) and are set by init_mv2_reduce_scatter_tables_stampede(). */
1516 int mv2_size_red_scat_tuning_table = 0;
1517 mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL;
/* Reduce-scatter algorithm function pointer; presumably holds the algorithm picked by the
 * selector. Confirm at the use site. */
1520 int (*MV2_Red_scat_function)(void *sendbuf,
1523 MPI_Datatype datatype,
/* Adapter that lets smpi_mpi_reduce_scatter be stored in the reduce-scatter tuning table:
 * it forwards sendbuf, recvbuf, recvcnts, datatype, op and comm unchanged. */
1529 static int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf,
1532 MPI_Datatype datatype,
1536 smpi_mpi_reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm);
/* Aliases binding the MVAPICH2 reduce-scatter algorithm names used by the tuning table below to
 * smpi_coll_tuned_reduce_scatter_* identifiers. */
1539 #define MPIR_Reduce_scatter_non_comm_MV2 smpi_coll_tuned_reduce_scatter_mpich_noncomm
1540 #define MPIR_Reduce_scatter_Rec_Halving_MV2 smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving
1541 #define MPIR_Reduce_scatter_Pair_Wise_MV2 smpi_coll_tuned_reduce_scatter_mpich_pair
/* Fills mv2_red_scat_thresholds_table with the Stampede tuning data for reduce-scatter.
 *
 * Steps: install smpi_coll_cleanup_mvapich2 as the collective cleanup callback when none is
 * registered yet, set mv2_size_red_scat_tuning_table to 6, allocate that many
 * mv2_red_scat_tuning_table entries with xbt_malloc, and copy them from the local initializer
 * array with memcpy.
 *
 * The braced triples look like {min, max, algorithm}, presumably message-size bounds with -1
 * standing for "no upper bound". TODO confirm against the element type and the selector code. */
1546 static void init_mv2_reduce_scatter_tables_stampede(){
/* Do not override a cleanup callback that another initializer already registered. */
1547 if(smpi_coll_cleanup_callback==NULL)
1548 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1549 mv2_size_red_scat_tuning_table = 6;
1550 mv2_red_scat_thresholds_table = static_cast<mv2_red_scat_tuning_table*>(xbt_malloc(mv2_size_red_scat_tuning_table *
1551 sizeof (mv2_red_scat_tuning_table)));
/* Local staging copy of the table; its contents are duplicated into the heap table at the end. */
1552 mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
1557 {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1558 {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1559 {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1566 {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1567 {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1568 {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1575 {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1576 {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1577 {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1584 {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1585 {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1592 {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1593 {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1600 {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1601 {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
/* memcpy reads mv2_size_red_scat_tuning_table entries, so the initializer above has to provide
 * at least that many. */
1606 memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1607 mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1610 /************ Scatter variables and initializers */
/* mv2_scatter_tuning_element: carries the scatter algorithm to run as a function pointer. */
1615 int (*MV2_pt_Scatter_function)(void *sendbuf,
1617 MPI_Datatype sendtype,
1620 MPI_Datatype recvtype,
1621 int root, MPI_Comm comm);
1622 } mv2_scatter_tuning_element;
/* mv2_scatter_tuning_table: two element lists (inter_leader and intra_node), each with room for
 * MV2_MAX_NB_THRESHOLDS elements and each preceded by its size field. Presumably the size fields
 * give the number of slots in use; confirm at the use site. */
1626 int size_inter_table;
1627 mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1628 int size_intra_table;
1629 mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1630 } mv2_scatter_tuning_table;
/* Scatter tuning state, filled in by init_mv2_scatter_tables_stampede():
 *  - mv2_scatter_table_ppn_conf: ppn value of each configuration (by analogy with the alltoall
 *    variables, ppn is the number of processes per node);
 *  - mv2_scatter_num_ppn_conf: number of configurations;
 *  - mv2_size_scatter_tuning_table: number of table entries of each configuration;
 *  - mv2_scatter_thresholds_table: one table pointer per configuration. */
1633 int *mv2_scatter_table_ppn_conf = NULL;
1634 int mv2_scatter_num_ppn_conf = 1;
1635 int *mv2_size_scatter_tuning_table = NULL;
1636 mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL;
/* Scatter algorithm function pointers, unset (NULL) at start-up. Presumably they hold the
 * algorithms picked by the selector; confirm at the use site. */
1638 int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1639 void *recvbuf, int recvcount, MPI_Datatype recvtype,
1640 int root, MPI_Comm comm)=NULL;
1642 int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1643 void *recvbuf, int recvcount, MPI_Datatype recvtype,
1644 int root, MPI_Comm comm)=NULL;
/* Prototype of the multicast scatter wrapper. The 16 ppn scatter tuning table uses it as the
 * algorithm of the leading {0, 16, ...} entry of several rows. */
1645 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1647 MPI_Datatype sendtype,
1650 MPI_Datatype recvtype,
1651 int root, MPI_Comm comm_ptr);
/* Definition matching the prototype above. */
1653 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1655 MPI_Datatype sendtype,
1658 MPI_Datatype recvtype,
1659 int root, MPI_Comm comm_ptr)
/* Aliases binding the MVAPICH2 scatter algorithm names used by the tuning tables below to
 * smpi_coll_tuned_scatter_* identifiers. */
1664 #define MPIR_Scatter_MV2_Binomial smpi_coll_tuned_scatter_ompi_binomial
1665 #define MPIR_Scatter_MV2_Direct smpi_coll_tuned_scatter_ompi_basic_linear
1666 #define MPIR_Scatter_MV2_two_level_Binomial smpi_coll_tuned_scatter_mvapich2_two_level_binomial
1667 #define MPIR_Scatter_MV2_two_level_Direct smpi_coll_tuned_scatter_mvapich2_two_level_direct
/* Builds the Stampede scatter tuning tables for three ppn configurations (1, 2 and 16).
 *
 * Steps: install smpi_coll_cleanup_mvapich2 as the collective cleanup callback when none is
 * registered yet, allocate the per-configuration bookkeeping arrays, describe each
 * configuration in a local initializer array, then copy all of them back to back into a single
 * heap allocation reachable through mv2_scatter_thresholds_table.
 *
 * The braced triples look like {min, max, algorithm}, presumably message-size bounds with -1
 * standing for "no upper bound". TODO confirm against the element type and the selector code. */
1672 static void init_mv2_scatter_tables_stampede(){
/* Do not override a cleanup callback that another initializer already registered. */
1673 if(smpi_coll_cleanup_callback==NULL)
1674 smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1676 int agg_table_sum = 0;
1678 mv2_scatter_tuning_table **table_ptrs = NULL;
1679 mv2_scatter_num_ppn_conf = 3;
/* Each of the four arrays below gets one slot per ppn configuration. */
1680 mv2_scatter_thresholds_table
1681 = static_cast<mv2_scatter_tuning_table**>(xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1682 * mv2_scatter_num_ppn_conf));
1683 table_ptrs = static_cast<mv2_scatter_tuning_table**>(xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1684 * mv2_scatter_num_ppn_conf));
1685 mv2_size_scatter_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
1686 mv2_scatter_num_ppn_conf));
1687 mv2_scatter_table_ppn_conf
1688 = static_cast<int*>(xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int)));
/* Configuration 0: ppn value 1, 6 table entries. */
1689 mv2_scatter_table_ppn_conf[0] = 1;
1690 mv2_size_scatter_tuning_table[0] = 6;
1691 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1695 {0, -1, &MPIR_Scatter_MV2_Binomial},
1699 {0, -1, &MPIR_Scatter_MV2_Binomial},
1706 {0, -1, &MPIR_Scatter_MV2_Direct},
1710 {0, -1, &MPIR_Scatter_MV2_Direct},
1717 {0, -1, &MPIR_Scatter_MV2_Direct},
1721 {0, -1, &MPIR_Scatter_MV2_Direct},
1728 {0, -1, &MPIR_Scatter_MV2_Direct},
1732 {0, -1, &MPIR_Scatter_MV2_Direct},
1739 {0, -1, &MPIR_Scatter_MV2_Direct},
1743 {0, -1, &MPIR_Scatter_MV2_Direct},
1750 {0, 32, &MPIR_Scatter_MV2_Binomial},
1751 {32, -1, &MPIR_Scatter_MV2_Direct},
1755 {0, -1, &MPIR_Scatter_MV2_Binomial},
1759 table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
/* Configuration 1: ppn value 2, 6 table entries. */
1760 mv2_scatter_table_ppn_conf[1] = 2;
1761 mv2_size_scatter_tuning_table[1] = 6;
1762 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
1766 {0, 4096, &MPIR_Scatter_MV2_Binomial},
1767 {4096, -1, &MPIR_Scatter_MV2_Direct},
1771 {0, -1, &MPIR_Scatter_MV2_Direct},
1778 {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1779 {512, -1, &MPIR_Scatter_MV2_Direct},
1783 {0, -1, &MPIR_Scatter_MV2_Binomial},
1790 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1791 {2048, -1, &MPIR_Scatter_MV2_Direct},
1795 {0, -1, &MPIR_Scatter_MV2_Binomial},
1802 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1803 {2048, -1, &MPIR_Scatter_MV2_Direct},
1807 {0, -1, &MPIR_Scatter_MV2_Binomial},
1814 {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1815 {8192, -1, &MPIR_Scatter_MV2_Direct},
1819 {0, -1, &MPIR_Scatter_MV2_Binomial},
1826 {0, 16, &MPIR_Scatter_MV2_Binomial},
1827 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1828 {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1829 {16384, -1, &MPIR_Scatter_MV2_Direct},
1833 {0, 128, &MPIR_Scatter_MV2_Direct},
1834 {128, -1, &MPIR_Scatter_MV2_Binomial},
1838 table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
/* Configuration 2: ppn value 16, 8 table entries. */
1839 mv2_scatter_table_ppn_conf[2] = 16;
1840 mv2_size_scatter_tuning_table[2] = 8;
1841 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
1846 {0, 256, &MPIR_Scatter_MV2_Binomial},
1847 {256, -1, &MPIR_Scatter_MV2_Direct},
1851 { 0, -1, &MPIR_Scatter_MV2_Direct},
1859 {0, 512, &MPIR_Scatter_MV2_Binomial},
1860 {512, -1, &MPIR_Scatter_MV2_Direct},
1864 { 0, -1, &MPIR_Scatter_MV2_Direct},
1872 {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1873 {1024, -1, &MPIR_Scatter_MV2_Direct},
1877 { 0, -1, &MPIR_Scatter_MV2_Direct},
1885 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1886 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1887 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1888 {2048, -1, &MPIR_Scatter_MV2_Direct},
1892 { 0, -1, &MPIR_Scatter_MV2_Direct},
1900 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1901 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1902 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1903 {2048, -1, &MPIR_Scatter_MV2_Direct},
1907 { 0, -1, &MPIR_Scatter_MV2_Direct},
1915 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
/* NOTE(review): the sibling rows follow the mcst entry with a {0, 16, ...} entry, whereas this
 * one starts at 16, which looks like an empty range if the fields are {min, max}. Confirm
 * whether {16, 16} is intended. */
1916 {16, 16, &MPIR_Scatter_MV2_two_level_Direct},
1917 {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1918 {4096, -1, &MPIR_Scatter_MV2_Direct},
1922 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1929 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1930 {0, 16, &MPIR_Scatter_MV2_Binomial},
1931 {16, 32, &MPIR_Scatter_MV2_Binomial},
1932 {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1933 {4096, -1, &MPIR_Scatter_MV2_Direct},
1937 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1944 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1945 {0, 16, &MPIR_Scatter_MV2_two_level_Binomial},
1946 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1947 {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1948 {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1949 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1950 {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
1954 {0, 16, &MPIR_Scatter_MV2_Binomial},
1955 {16, 128, &MPIR_Scatter_MV2_Binomial},
1956 {128, 1024, &MPIR_Scatter_MV2_Binomial},
1957 {1024, 16384, &MPIR_Scatter_MV2_Direct},
1958 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1959 {65536, -1, &MPIR_Scatter_MV2_Direct},
1963 table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn;
/* Total number of table entries across all configurations. */
1965 for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
1966 agg_table_sum += mv2_size_scatter_tuning_table[i];
/* A single allocation backs every configuration; slot 0 points at its start. */
1968 mv2_scatter_thresholds_table[0] =
1969 static_cast<mv2_scatter_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table)));
1970 memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
1971 (sizeof(mv2_scatter_tuning_table)
1972 * mv2_size_scatter_tuning_table[0]));
/* Each later slot starts right after the previous configuration's entries, inside that same
 * allocation. */
1973 for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
1974 mv2_scatter_thresholds_table[i] =
1975 mv2_scatter_thresholds_table[i - 1]
1976 + mv2_size_scatter_tuning_table[i - 1];
1977 memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
1978 (sizeof(mv2_scatter_tuning_table)
1979 * mv2_size_scatter_tuning_table[i]));
/* Only the temporary pointer array is released; the table contents were copied above. */
1981 xbt_free(table_ptrs);