1 /* Selector for collective algorithms based on the MVAPICH2 decision logic, calibrated on the Stampede cluster at TACC. */
2 /* This is the tuning used by MVAPICH for the Stampede platform, based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR). */
4 /* Copyright (c) 2009-2017. The SimGrid Team. All rights reserved. */
6 /* This program is free software; you can redistribute it and/or modify it
7 * under the terms of the license (GNU LGPL) which comes with this package. */
9 /************ Alltoall variables and initializers */
/* Fixed capacity of the element arrays embedded in the tuning tables declared in this file. */
11 #define MV2_MAX_NB_THRESHOLDS 32
13 using namespace simgrid::smpi;
/* Cleanup routine; the init_mv2_*_tables_stampede() functions in this file install it as
 * Colls::smpi_coll_cleanup_callback when no callback is registered yet. */
15 XBT_PUBLIC(void) smpi_coll_cleanup_mvapich2(void);
/* Algorithm entry point stored in one alltoall tuning element.
 * NOTE(review): the opening of this struct and any members before the pointer are not visible in this listing. */
20 int (*MV2_pt_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
21 void *recvbuf, int recvcount, MPI_Datatype recvtype,
23 } mv2_alltoall_tuning_element;
/* Alltoall tuning table: two arrays of MV2_MAX_NB_THRESHOLDS elements; judging by the names,
 * one for the regular case and one for the in-place case (TODO confirm against the selector). */
28 mv2_alltoall_tuning_element algo_table[MV2_MAX_NB_THRESHOLDS];
29 mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS];
30 } mv2_alltoall_tuning_table;
/* Global alltoall function pointer, initially NULL; nothing in the visible code assigns it. */
32 int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL;
34 /* Indicates number of processes per node */
35 int *mv2_alltoall_table_ppn_conf = NULL;
36 /* Indicates total number of configurations */
37 int mv2_alltoall_num_ppn_conf = 1;
/* Number of table entries per configuration (filled by init_mv2_alltoall_tables_stampede). */
38 int *mv2_size_alltoall_tuning_table = NULL;
/* One table pointer per configuration (filled by init_mv2_alltoall_tables_stampede). */
39 mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL;
/* Map the MVAPICH2 algorithm names used in the tables onto SimGrid alltoall implementations. */
42 #define MPIR_Alltoall_bruck_MV2 Coll_alltoall_bruck::alltoall
43 #define MPIR_Alltoall_RD_MV2 Coll_alltoall_rdb::alltoall
44 #define MPIR_Alltoall_Scatter_dest_MV2 Coll_alltoall_mvapich2_scatter_dest::alltoall
45 #define MPIR_Alltoall_pairwise_MV2 Coll_alltoall_pair::alltoall
46 #define MPIR_Alltoall_inplace_MV2 Coll_alltoall_ring::alltoall
/* Builds the alltoall tuning tables calibrated for Stampede.
 * Installs smpi_coll_cleanup_mvapich2 as the cleanup callback when none is set, declares three
 * processes-per-node configurations (1, 2 and 16), then copies the function-local tables into
 * heap storage reachable through mv2_alltoall_thresholds_table. */
49 static void init_mv2_alltoall_tables_stampede(){
51 int agg_table_sum = 0;
52 mv2_alltoall_tuning_table **table_ptrs = NULL;
53 mv2_alltoall_num_ppn_conf = 3;
/* Do not override a cleanup callback that is already registered. */
54 if(Colls::smpi_coll_cleanup_callback==NULL)
55 Colls::smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
/* Bookkeeping arrays, each with one slot per processes-per-node configuration. */
56 mv2_alltoall_thresholds_table = static_cast<mv2_alltoall_tuning_table**>(xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
57 * mv2_alltoall_num_ppn_conf));
58 table_ptrs = static_cast<mv2_alltoall_tuning_table**>(xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
59 * mv2_alltoall_num_ppn_conf));
60 mv2_size_alltoall_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
61 mv2_alltoall_num_ppn_conf));
62 mv2_alltoall_table_ppn_conf = static_cast<int*>(xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int)));
/* Configuration 0: ppn value 1, table of 6 entries. */
63 mv2_alltoall_table_ppn_conf[0] = 1;
64 mv2_size_alltoall_tuning_table[0] = 6;
65 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
/* Rows look like {lower bound, upper bound, algorithm}; -1 presumably means "no upper bound" and the
 * MPIR_Alltoall_inplace_MV2 rows presumably fill in_place_algo_table. TODO confirm against the struct
 * definitions, whose leading members are not visible in this listing. */
68 {{0, -1, &MPIR_Alltoall_pairwise_MV2},
71 {{0, -1, &MPIR_Alltoall_inplace_MV2},
77 {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
78 {262144, -1, &MPIR_Alltoall_pairwise_MV2},
81 {{0, -1, &MPIR_Alltoall_inplace_MV2},
87 {{0, 8, &MPIR_Alltoall_RD_MV2},
88 {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
91 {{0, -1, &MPIR_Alltoall_inplace_MV2},
97 {{0, 64, &MPIR_Alltoall_RD_MV2},
98 {64, 512, &MPIR_Alltoall_bruck_MV2},
99 {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
102 {{0,-1, &MPIR_Alltoall_inplace_MV2},
108 {{0, 32, &MPIR_Alltoall_RD_MV2},
109 {32, 2048, &MPIR_Alltoall_bruck_MV2},
110 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
113 {{0, -1, &MPIR_Alltoall_inplace_MV2},
119 {{0, 8, &MPIR_Alltoall_RD_MV2},
120 {8, 1024, &MPIR_Alltoall_bruck_MV2},
121 {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
124 {{0, -1, &MPIR_Alltoall_inplace_MV2},
128 table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
/* Configuration 1: ppn value 2, table of 6 entries. */
129 mv2_alltoall_table_ppn_conf[1] = 2;
130 mv2_size_alltoall_tuning_table[1] = 6;
131 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
134 {{0, 32, &MPIR_Alltoall_RD_MV2},
135 {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
138 {{0, -1, &MPIR_Alltoall_inplace_MV2},
144 {{0, 64, &MPIR_Alltoall_RD_MV2},
145 {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
148 {{0, -1, &MPIR_Alltoall_inplace_MV2},
154 {{0, 64, &MPIR_Alltoall_RD_MV2},
155 {64, 2048, &MPIR_Alltoall_bruck_MV2},
156 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
159 {{0,-1, &MPIR_Alltoall_inplace_MV2},
165 {{0, 16, &MPIR_Alltoall_RD_MV2},
166 {16, 2048, &MPIR_Alltoall_bruck_MV2},
167 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
170 {{0, -1, &MPIR_Alltoall_inplace_MV2},
176 {{0, 8, &MPIR_Alltoall_RD_MV2},
177 {8, 1024, &MPIR_Alltoall_bruck_MV2},
178 {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
181 {{0, -1, &MPIR_Alltoall_inplace_MV2},
187 {{0, 4, &MPIR_Alltoall_RD_MV2},
188 {4, 2048, &MPIR_Alltoall_bruck_MV2},
189 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
192 {{0, -1, &MPIR_Alltoall_inplace_MV2},
196 table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
/* Configuration 2: ppn value 16, table of 7 entries. Unlike the other two configurations,
 * the MPIR_Alltoall_inplace_MV2 rows here start at a non-zero lower bound. */
197 mv2_alltoall_table_ppn_conf[2] = 16;
198 mv2_size_alltoall_tuning_table[2] = 7;
199 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
202 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
203 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
206 {{32768, -1, &MPIR_Alltoall_inplace_MV2},
212 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
213 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
216 {{16384, -1, &MPIR_Alltoall_inplace_MV2},
222 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
223 {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
224 {16384, -1, &MPIR_Alltoall_pairwise_MV2},
227 {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
233 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
234 {2048, -1, &MPIR_Alltoall_pairwise_MV2},
237 {{16384,65536, &MPIR_Alltoall_inplace_MV2},
243 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
244 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
247 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
253 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
254 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
257 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
262 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
263 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
266 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
271 table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
/* Total number of table entries over all configurations. */
273 for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
274 agg_table_sum += mv2_size_alltoall_tuning_table[i];
/* One contiguous allocation holds every configuration's entries; slot 0 points at its start. */
276 mv2_alltoall_thresholds_table[0] =
277 static_cast<mv2_alltoall_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table)));
278 memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
279 (sizeof(mv2_alltoall_tuning_table)
280 * mv2_size_alltoall_tuning_table[0]));
/* Each further slot points just past the previous configuration's entries inside that same allocation. */
281 for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
282 mv2_alltoall_thresholds_table[i] =
283 mv2_alltoall_thresholds_table[i - 1]
284 + mv2_size_alltoall_tuning_table[i - 1];
285 memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
286 (sizeof(mv2_alltoall_tuning_table)
287 * mv2_size_alltoall_tuning_table[i]));
/* Only the temporary pointer array is released here; the copied tables stay allocated. */
289 xbt_free(table_ptrs);
295 /************ Allgather variables and initializers */
/* Algorithm entry point stored in one allgather tuning element.
 * NOTE(review): the opening of this struct and several parameter lines are not visible in this listing. */
300 int (*MV2_pt_Allgatherction)(void *sendbuf,
302 MPI_Datatype sendtype,
305 MPI_Datatype recvtype, MPI_Comm comm_ptr);
306 } mv2_allgather_tuning_element;
/* Allgather tuning table members visible here: a per-threshold two_level array, the count of
 * inter-leader entries, and the inter-leader element array (capacity MV2_MAX_NB_THRESHOLDS). */
310 int two_level[MV2_MAX_NB_THRESHOLDS];
311 int size_inter_table;
312 mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
313 } mv2_allgather_tuning_table;
/* Global allgather function pointer; declared without an initializer. */
315 int (*MV2_Allgatherction)(void *sendbuf,
317 MPI_Datatype sendtype,
320 MPI_Datatype recvtype, MPI_Comm comm);
/* Same bookkeeping layout as the alltoall globals: ppn values, number of configurations,
 * entries per configuration, and one table pointer per configuration. */
322 int *mv2_allgather_table_ppn_conf = NULL;
323 int mv2_allgather_num_ppn_conf = 1;
324 int *mv2_size_allgather_tuning_table = NULL;
325 mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL;
/* File-local allgather variant referenced by the 16-ppn table below.
 * NOTE(review): its body is not visible in this listing. */
327 static int MPIR_Allgather_RD_Allgather_Comm_MV2( void *sendbuf,
329 MPI_Datatype sendtype,
332 MPI_Datatype recvtype, MPI_Comm comm_ptr)
/* Map the MVAPICH2 algorithm names used in the tables onto SimGrid allgather implementations. */
337 #define MPIR_Allgather_Bruck_MV2 Coll_allgather_bruck::allgather
338 #define MPIR_Allgather_RD_MV2 Coll_allgather_rdb::allgather
339 #define MPIR_Allgather_Ring_MV2 Coll_allgather_ring::allgather
340 #define MPIR_2lvl_Allgather_MV2 Coll_allgather_mvapich2_smp::allgather
/* Builds the allgather tuning tables calibrated for Stampede.
 * Installs smpi_coll_cleanup_mvapich2 as the cleanup callback when none is set, declares three
 * processes-per-node configurations (1, 2 and 16) of 6 entries each, then copies the
 * function-local tables into heap storage reachable through mv2_allgather_thresholds_table. */
342 static void init_mv2_allgather_tables_stampede(){
344 int agg_table_sum = 0;
/* Do not override a cleanup callback that is already registered. */
346 if(Colls::smpi_coll_cleanup_callback==NULL)
347 Colls::smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
348 mv2_allgather_tuning_table **table_ptrs = NULL;
349 mv2_allgather_num_ppn_conf = 3;
/* Bookkeeping arrays, each with one slot per processes-per-node configuration. */
350 mv2_allgather_thresholds_table
351 = static_cast<mv2_allgather_tuning_table**>(xbt_malloc(sizeof(mv2_allgather_tuning_table *)
352 * mv2_allgather_num_ppn_conf));
353 table_ptrs = static_cast<mv2_allgather_tuning_table**>(xbt_malloc(sizeof(mv2_allgather_tuning_table *)
354 * mv2_allgather_num_ppn_conf));
355 mv2_size_allgather_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
356 mv2_allgather_num_ppn_conf));
357 mv2_allgather_table_ppn_conf
358 = static_cast<int*>(xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int)));
/* Configuration 0: ppn value 1, table of 6 entries. */
359 mv2_allgather_table_ppn_conf[0] = 1;
360 mv2_size_allgather_tuning_table[0] = 6;
361 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
/* Rows look like {lower bound, upper bound, algorithm}; -1 presumably means "no upper bound".
 * TODO confirm against mv2_allgather_tuning_element, whose leading members are not visible here. */
367 {0, -1, &MPIR_Allgather_Ring_MV2},
375 {0, 262144, &MPIR_Allgather_RD_MV2},
376 {262144, -1, &MPIR_Allgather_Ring_MV2},
384 {0, 131072, &MPIR_Allgather_RD_MV2},
385 {131072, -1, &MPIR_Allgather_Ring_MV2},
393 {0, 131072, &MPIR_Allgather_RD_MV2},
394 {131072, -1, &MPIR_Allgather_Ring_MV2},
402 {0, 65536, &MPIR_Allgather_RD_MV2},
403 {65536, -1, &MPIR_Allgather_Ring_MV2},
411 {0, 32768, &MPIR_Allgather_RD_MV2},
412 {32768, -1, &MPIR_Allgather_Ring_MV2},
416 table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
/* Configuration 1: ppn value 2, table of 6 entries. */
417 mv2_allgather_table_ppn_conf[1] = 2;
418 mv2_size_allgather_tuning_table[1] = 6;
419 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
425 {0, 524288, &MPIR_Allgather_RD_MV2},
426 {524288, -1, &MPIR_Allgather_Ring_MV2},
434 {0, 32768, &MPIR_Allgather_RD_MV2},
435 {32768, 524288, &MPIR_Allgather_Ring_MV2},
436 {524288, -1, &MPIR_Allgather_Ring_MV2},
444 {0, 16384, &MPIR_Allgather_RD_MV2},
445 {16384, 524288, &MPIR_Allgather_Ring_MV2},
446 {524288, -1, &MPIR_Allgather_Ring_MV2},
454 {0, 65536, &MPIR_Allgather_RD_MV2},
455 {65536, 524288, &MPIR_Allgather_Ring_MV2},
456 {524288, -1, &MPIR_Allgather_Ring_MV2},
464 {0, 32768, &MPIR_Allgather_RD_MV2},
465 {32768, 524288, &MPIR_Allgather_Ring_MV2},
466 {524288, -1, &MPIR_Allgather_Ring_MV2},
474 {0, 65536, &MPIR_Allgather_RD_MV2},
475 {65536, 524288, &MPIR_Allgather_Ring_MV2},
476 {524288, -1, &MPIR_Allgather_Ring_MV2},
480 table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
/* Configuration 2: ppn value 16, table of 6 entries; all but the first visible group use the
 * file-local MPIR_Allgather_RD_Allgather_Comm_MV2 below the 1024 bound. */
481 mv2_allgather_table_ppn_conf[2] = 16;
482 mv2_size_allgather_tuning_table[2] = 6;
483 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
489 {0, 1024, &MPIR_Allgather_RD_MV2},
490 {1024, -1, &MPIR_Allgather_Ring_MV2},
498 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
499 {1024, -1, &MPIR_Allgather_Ring_MV2},
507 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
508 {1024, -1, &MPIR_Allgather_Ring_MV2},
516 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
517 {1024, -1, &MPIR_Allgather_Ring_MV2},
525 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
526 {1024, -1, &MPIR_Allgather_Ring_MV2},
534 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
535 {1024, -1, &MPIR_Allgather_Ring_MV2},
540 table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
/* Total number of table entries over all configurations. */
542 for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
543 agg_table_sum += mv2_size_allgather_tuning_table[i];
/* One contiguous allocation holds every configuration's entries; slot 0 points at its start. */
545 mv2_allgather_thresholds_table[0] =
546 static_cast<mv2_allgather_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table)));
547 memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
548 (sizeof(mv2_allgather_tuning_table)
549 * mv2_size_allgather_tuning_table[0]));
/* Each further slot points just past the previous configuration's entries inside that same allocation. */
550 for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
551 mv2_allgather_thresholds_table[i] =
552 mv2_allgather_thresholds_table[i - 1]
553 + mv2_size_allgather_tuning_table[i - 1];
554 memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
555 (sizeof(mv2_allgather_tuning_table)
556 * mv2_size_allgather_tuning_table[i]));
/* Only the temporary pointer array is released here; the copied tables stay allocated. */
558 xbt_free(table_ptrs);
562 /************ Gather variables and initializers */
/* Algorithm entry point stored in one gather tuning element.
 * NOTE(review): the opening of this struct and any members before the pointer are not visible in this listing. */
567 int (*MV2_pt_Gather_function)(void *sendbuf, int sendcnt,
568 MPI_Datatype sendtype, void *recvbuf, int recvcnt,
569 MPI_Datatype recvtype, int root, MPI_Comm comm_ptr);
570 } mv2_gather_tuning_element;
/* Gather tuning table members visible here: an inter-leader element array and an intra-node
 * element array, each preceded by the count of entries actually used. */
575 int size_inter_table;
576 mv2_gather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
577 int size_intra_table;
578 mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
579 } mv2_gather_tuning_table;
/* Number of gather table entries and the table itself (allocated by init_mv2_gather_tables_stampede). */
581 int mv2_size_gather_tuning_table=7;
582 mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL;
/* Signature shared by the two gather function pointers declared just below. */
584 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
586 MPI_Datatype sendtype,
589 MPI_Datatype recvtype,
590 int root, MPI_Comm comm);
/* Global inter-leader and intra-node gather function pointers, initially NULL. */
592 MV2_Gather_function_ptr MV2_Gather_inter_leader_function = NULL;
593 MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL;
/* Map the MVAPICH2 algorithm names used in the table onto SimGrid gather implementations. */
597 #define MPIR_Gather_MV2_Direct Coll_gather_ompi_basic_linear::gather
598 #define MPIR_Gather_MV2_two_level_Direct Coll_gather_mvapich2_two_level::gather
599 #define MPIR_Gather_intra Coll_gather_mpich::gather
/* Builds the gather tuning table calibrated for Stampede.
 * Installs smpi_coll_cleanup_mvapich2 as the cleanup callback when none is set, allocates room
 * for 7 table entries, and copies the function-local table into that heap buffer. */
602 static void init_mv2_gather_tables_stampede(){
/* Do not override a cleanup callback that is already registered. */
604 if(Colls::smpi_coll_cleanup_callback==NULL)
605 Colls::smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
606 mv2_size_gather_tuning_table=7;
607 mv2_gather_thresholds_table = static_cast<mv2_gather_tuning_table*>(xbt_malloc(mv2_size_gather_tuning_table*
608 sizeof (mv2_gather_tuning_table)));
609 mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
/* Each visible entry is: inter-leader row count, inter-leader rows, intra-node row count, intra-node rows
 * (matching the size_inter_table / inter_leader / size_intra_table / intra_node member order).
 * Rows look like {lower bound, upper bound, algorithm}; -1 presumably means "no upper bound" (TODO confirm). */
611 2,{{0, 524288, &MPIR_Gather_MV2_Direct},
612 {524288, -1, &MPIR_Gather_intra}},
613 1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
615 3,{{0, 16384, &MPIR_Gather_MV2_Direct},
616 {16384, 131072, &MPIR_Gather_intra},
617 {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
618 1,{{0, -1, &MPIR_Gather_intra}}},
620 3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct},
621 {256, 16384, &MPIR_Gather_MV2_Direct},
/* NOTE(review): this row starts at 256 although the previous row already spans 256..16384;
 * the sibling entries continue at 16384. Confirm against the upstream MVAPICH2 table before changing. */
622 {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
623 1,{{0, -1, &MPIR_Gather_intra}}},
625 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
626 {512, 16384, &MPIR_Gather_MV2_Direct},
627 {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
628 1,{{0, -1, &MPIR_Gather_intra}}},
630 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
631 {512, 16384, &MPIR_Gather_MV2_Direct},
632 {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
633 1,{{0, -1, &MPIR_Gather_intra}}},
635 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
636 {512, 16384, &MPIR_Gather_MV2_Direct},
/* NOTE(review): lower bound 8196 does not continue from the previous row's upper bound 16384 and,
 * unlike the other bounds, is not a power of two. Confirm against the upstream MVAPICH2 table. */
637 {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
638 1,{{0, -1, &MPIR_Gather_intra}}},
640 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
641 {512, 16384, &MPIR_Gather_MV2_Direct},
/* NOTE(review): same 8196 lower bound as in the previous entry; confirm against upstream. */
642 {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
643 1,{{0, -1, &MPIR_Gather_intra}}},
/* Copy the local table into the heap buffer allocated above. */
646 memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
647 mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
652 /************ Allgatherv variables and initializers */
/* Algorithm entry point stored in one allgatherv tuning element.
 * NOTE(review): the opening of this struct and several parameter lines are not visible in this listing. */
657 int (*MV2_pt_Allgatherv_function)(void *sendbuf,
659 MPI_Datatype sendtype,
663 MPI_Datatype recvtype,
665 } mv2_allgatherv_tuning_element;
/* Allgatherv tuning table members visible here: the count of inter-leader entries and the
 * inter-leader element array (capacity MV2_MAX_NB_THRESHOLDS). */
669 int size_inter_table;
670 mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
671 } mv2_allgatherv_tuning_table;
/* Global allgatherv function pointer; the end of its declaration is not visible in this listing. */
673 int (*MV2_Allgatherv_function)(void *sendbuf,
675 MPI_Datatype sendtype,
679 MPI_Datatype recvtype,
/* Number of allgatherv table entries and the table itself (set by init_mv2_allgatherv_tables_stampede). */
682 int mv2_size_allgatherv_tuning_table = 0;
683 mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL;
/* Map the MVAPICH2 algorithm names used in the table onto SimGrid allgatherv implementations. */
685 #define MPIR_Allgatherv_Rec_Doubling_MV2 Coll_allgatherv_mpich_rdb::allgatherv
686 #define MPIR_Allgatherv_Bruck_MV2 Coll_allgatherv_ompi_bruck::allgatherv
687 #define MPIR_Allgatherv_Ring_MV2 Coll_allgatherv_mpich_ring::allgatherv
/* Builds the allgatherv tuning table calibrated for Stampede.
 * Installs smpi_coll_cleanup_mvapich2 as the cleanup callback when none is set, allocates room
 * for 6 table entries, and copies the function-local table into that heap buffer. */
690 static void init_mv2_allgatherv_tables_stampede(){
/* Do not override a cleanup callback that is already registered. */
691 if(Colls::smpi_coll_cleanup_callback==NULL)
692 Colls::smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
693 mv2_size_allgatherv_tuning_table = 6;
694 mv2_allgatherv_thresholds_table = static_cast<mv2_allgatherv_tuning_table*>(xbt_malloc(mv2_size_allgatherv_tuning_table *
695 sizeof (mv2_allgatherv_tuning_table)));
696 mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
/* Rows look like {lower bound, upper bound, algorithm}; -1 presumably means "no upper bound" (TODO confirm).
 * Every visible entry uses recursive doubling below its cutoff (512 or 256) and the ring algorithm above it. */
701 {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
702 {512, -1, &MPIR_Allgatherv_Ring_MV2},
709 {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
710 {512, -1, &MPIR_Allgatherv_Ring_MV2},
717 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
718 {256, -1, &MPIR_Allgatherv_Ring_MV2},
725 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
726 {256, -1, &MPIR_Allgatherv_Ring_MV2},
733 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
734 {256, -1, &MPIR_Allgatherv_Ring_MV2},
741 {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
742 {256, -1, &MPIR_Allgatherv_Ring_MV2},
/* Copy the local table into the heap buffer allocated above. */
747 memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
748 mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
752 /************ Allreduce variables and initializers */
/* Algorithm entry point stored in one allreduce tuning element.
 * NOTE(review): the opening of this struct and several parameter lines are not visible in this listing. */
757 int (*MV2_pt_Allreducection)(void *sendbuf,
760 MPI_Datatype datatype,
761 MPI_Op op, MPI_Comm comm);
762 } mv2_allreduce_tuning_element;
/* Allreduce tuning table members visible here: a per-threshold is_two_level_allreduce array,
 * then an inter-leader and an intra-node element array, each preceded by its used-entry count. */
767 int is_two_level_allreduce[MV2_MAX_NB_THRESHOLDS];
768 int size_inter_table;
769 mv2_allreduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
770 int size_intra_table;
771 mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
772 } mv2_allreduce_tuning_table;
/* Global allreduce function pointer, initially NULL. */
775 int (*MV2_Allreducection)(void *sendbuf,
778 MPI_Datatype datatype,
779 MPI_Op op, MPI_Comm comm)=NULL;
/* Global intra-node allreduce function pointer, initially NULL. */
782 int (*MV2_Allreduce_intra_function)( void *sendbuf,
785 MPI_Datatype datatype,
786 MPI_Op op, MPI_Comm comm)=NULL;
/* Number of allreduce table entries and the table itself (set by init_mv2_allreduce_tables_stampede). */
788 int mv2_size_allreduce_tuning_table = 0;
789 mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table = NULL;
/* File-local allreduce helpers referenced by the table below.
 * NOTE(review): the bodies of the first two are not visible in this listing. */
795 static int MPIR_Allreduce_mcst_reduce_two_level_helper_MV2( void *sendbuf,
798 MPI_Datatype datatype,
799 MPI_Op op, MPI_Comm comm)
804 static int MPIR_Allreduce_mcst_reduce_redscat_gather_MV2( void *sendbuf,
807 MPI_Datatype datatype,
808 MPI_Op op, MPI_Comm comm)
/* The only visible statement of this helper forwards to Colls::reduce with a literal 0 after op
 * (presumably the root rank; confirm against the Colls::reduce signature). */
813 static int MPIR_Allreduce_reduce_p2p_MV2( void *sendbuf,
816 MPI_Datatype datatype,
817 MPI_Op op, MPI_Comm comm)
819 Colls::reduce(sendbuf,recvbuf,count,datatype,op,0,comm);
/* Same visible statement as MPIR_Allreduce_reduce_p2p_MV2: a Colls::reduce call with a literal 0 after op. */
823 static int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
826 MPI_Datatype datatype,
827 MPI_Op op, MPI_Comm comm)
829 Colls::reduce(sendbuf,recvbuf,count,datatype,op,0,comm);
/* Map the MVAPICH2 algorithm names used in the table onto SimGrid allreduce implementations. */
833 #define MPIR_Allreduce_pt2pt_rd_MV2 Coll_allreduce_rdb::allreduce
834 #define MPIR_Allreduce_pt2pt_rs_MV2 Coll_allreduce_mvapich2_rs::allreduce
835 #define MPIR_Allreduce_two_level_MV2 Coll_allreduce_mvapich2_two_level::allreduce
/* Builds the allreduce tuning table calibrated for Stampede.
 * Installs smpi_coll_cleanup_mvapich2 as the cleanup callback when none is set, allocates room
 * for 8 table entries, and copies the function-local table into that heap buffer. */
838 static void init_mv2_allreduce_tables_stampede(){
/* Do not override a cleanup callback that is already registered. */
839 if(Colls::smpi_coll_cleanup_callback==NULL)
840 Colls::smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
841 mv2_size_allreduce_tuning_table = 8;
842 mv2_allreduce_thresholds_table = static_cast<mv2_allreduce_tuning_table*>(xbt_malloc(mv2_size_allreduce_tuning_table *
843 sizeof (mv2_allreduce_tuning_table)));
844 mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
/* Rows look like {lower bound, upper bound, algorithm}; -1 presumably means "no upper bound" (TODO confirm).
 * In each visible entry the pt2pt rows come first and the reduce_shmem / reduce_p2p rows second,
 * which matches the inter_leader-then-intra_node member order of mv2_allreduce_tuning_table. */
851 {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
852 {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
856 {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
857 {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
866 {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
867 {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
868 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
872 {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
873 {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
882 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
883 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
884 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
888 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
889 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
898 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
899 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
900 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
904 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
905 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
914 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
915 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
916 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
920 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
921 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
930 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
931 {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
932 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
936 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
937 {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
946 {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
947 {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
948 {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
949 {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
953 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
954 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
/* This last entry is the only visible one whose first group mixes in reduce_p2p and the
 * mcst two-level helper alongside the pt2pt algorithms. */
963 {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
964 {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
965 {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
966 {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
967 {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
971 {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
972 {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
/* Copy the local table into the heap buffer allocated above. */
977 memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
978 mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
/* One bcast tuning element ends with the algorithm entry point followed by a
 * zcpy_pipelined_knomial_factor value.
 * NOTE(review): the opening of this struct and any members before the pointer are not visible in this listing. */
987 int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
988 int root, MPI_Comm comm_ptr);
989 int zcpy_pipelined_knomial_factor;
990 } mv2_bcast_tuning_element;
/* Bcast tuning table members visible here: segment size, intra/inter-node knomial factors,
 * a per-threshold is_two_level_bcast array, then an inter-leader and an intra-node element
 * array, each preceded by its used-entry count. */
994 int bcast_segment_size;
995 int intra_node_knomial_factor;
996 int inter_node_knomial_factor;
997 int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS];
998 int size_inter_table;
999 mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1000 int size_intra_table;
1001 mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1002 } mv2_bcast_tuning_table;
/* Number of bcast table entries and the table itself (set by init_mv2_bcast_tables_stampede). */
1004 int mv2_size_bcast_tuning_table = 0;
1005 mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
/* Global bcast function pointer, initially NULL. */
1008 int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
1009 int root, MPI_Comm comm_ptr) = NULL;
/* Global intra-node bcast function pointer, initially NULL. */
1011 int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
1012 int root, MPI_Comm comm_ptr) = NULL;
/* Default values of the global bcast tuning parameters. */
1014 int zcpy_knomial_factor = 2;
1015 int mv2_pipelined_zcpy_knomial_factor = -1;
1016 int bcast_segment_size = 8192;
1017 int mv2_inter_node_knomial_factor = 4;
1018 int mv2_intra_node_knomial_factor = 4;
/* Fixed constants for the broadcast selector. The names suggest a communicator-size cutoff and
 * short/large message-size cutoffs; the units are not shown in this file (TODO confirm at the use sites). */
#define mv2_bcast_two_level_system_size 64
#define mv2_bcast_short_msg 16384
/* Parenthesized so the product stays a single operand wherever the macro expands
 * (for example in x / mv2_bcast_large_msg or x % mv2_bcast_large_msg). */
#define mv2_bcast_large_msg (512*1024)
#define INTRA_NODE_ROOT 0
/* Map the MVAPICH2 algorithm names used in the table onto SimGrid bcast implementations.
 * Several MVAPICH2 names share one SimGrid implementation: the two pipelined variants and the
 * shmem variant all resolve to Coll_bcast_mpich::bcast, both scatter_ring_allgather variants to
 * Coll_bcast_scatter_LR_allgather::bcast, and both inter-node helpers to
 * Coll_bcast_mvapich2_inter_node::bcast. */
1025 #define MPIR_Pipelined_Bcast_Zcpy_MV2 Coll_bcast_mpich::bcast
1026 #define MPIR_Pipelined_Bcast_MV2 Coll_bcast_mpich::bcast
1027 #define MPIR_Bcast_binomial_MV2 Coll_bcast_binomial_tree::bcast
1028 #define MPIR_Bcast_scatter_ring_allgather_shm_MV2 Coll_bcast_scatter_LR_allgather::bcast
1029 #define MPIR_Bcast_scatter_doubling_allgather_MV2 Coll_bcast_scatter_rdb_allgather::bcast
1030 #define MPIR_Bcast_scatter_ring_allgather_MV2 Coll_bcast_scatter_LR_allgather::bcast
1031 #define MPIR_Shmem_Bcast_MV2 Coll_bcast_mpich::bcast
1032 #define MPIR_Bcast_tune_inter_node_helper_MV2 Coll_bcast_mvapich2_inter_node::bcast
1033 #define MPIR_Bcast_inter_node_helper_MV2 Coll_bcast_mvapich2_inter_node::bcast
1034 #define MPIR_Knomial_Bcast_intra_node_MV2 Coll_bcast_mvapich2_knomial_intra_node::bcast
1035 #define MPIR_Bcast_intra_MV2 Coll_bcast_mvapich2_intra_node::bcast
/* Builds the bcast tuning table calibrated for Stampede.
 * Installs smpi_coll_cleanup_mvapich2 as the cleanup callback when none is set, allocates room
 * for 8 table entries, and copies the function-local table into that heap buffer. */
1037 static void init_mv2_bcast_tables_stampede(){
/* Do not override a cleanup callback that is already registered. */
1039 if(Colls::smpi_coll_cleanup_callback==NULL)
1040 Colls::smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1041 mv2_size_bcast_tuning_table=8;
1042 mv2_bcast_thresholds_table = static_cast<mv2_bcast_tuning_table*>(xbt_malloc(mv2_size_bcast_tuning_table *
1043 sizeof (mv2_bcast_tuning_table)));
1045 mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
/* Four-field rows end with {..., algorithm, zcpy_pipelined_knomial_factor}, following the member order
 * of mv2_bcast_tuning_element; the two leading values look like lower/upper bounds with -1 presumably
 * meaning "no upper bound" (TODO confirm). In each visible entry the first group of rows is followed by a
 * group using MPIR_Shmem_Bcast_MV2 with the same bounds, matching the inter_leader-then-intra_node order.
 * The rows of 1s presumably initialize is_two_level_bcast (TODO confirm; neighbouring lines are missing). */
1049 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
1052 {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1053 {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1054 {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1055 {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1056 {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1057 {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1058 {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1059 {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1060 {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1061 {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1062 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1066 {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1067 {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1068 {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1069 {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1070 {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1071 {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1072 {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1073 {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1074 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1075 {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1076 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1082 {1, 1, 1, 1, 1, 1, 1, 1},
1085 {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1086 {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1087 {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1088 {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1089 {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1090 {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1091 {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1092 {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1096 {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1097 {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1098 {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1099 {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1100 {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1101 {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1102 {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1103 {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
1109 {1, 1, 1, 1, 1, 1, 1, 1, 1},
1112 {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1113 {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1114 {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1115 {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1116 {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1117 {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1118 {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1119 {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1120 {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1124 {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1125 {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1126 {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1127 {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1128 {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1129 {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1130 {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1131 {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1132 {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
1141 {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1142 {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1143 {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1144 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1148 {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1149 {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1150 {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
/* NOTE(review): this is the only visible row with a NULL function pointer; every sibling row in the
 * same position uses MPIR_Shmem_Bcast_MV2. Confirm that the selector checks for NULL before calling. */
1151 {524288, -1, NULL, -1}
1160 {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1161 {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1162 {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1163 {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1164 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1168 {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1169 {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1170 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1171 {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1172 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1181 {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1182 {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1183 {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1184 {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1185 {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1189 {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1190 {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1191 {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1192 {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1193 {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
1202 {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1203 {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1204 {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1205 {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1206 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1210 {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1211 {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1212 {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1213 {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1214 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1220 {1, 1, 1, 1, 1, 1, 1},
1223 {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1224 {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1225 {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1226 {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1227 {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1228 {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1229 {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1233 {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1234 {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1235 {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1236 {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1237 {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1238 {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1239 {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
/* Copy the local table into the heap buffer allocated above. */
1244 memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1245 mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1249 /************ Reduce variables and initializers */
/* Algorithm entry point stored in one reduce tuning element.
 * NOTE(review): the opening of this struct and several parameter lines are not visible in this listing. */
1254 int (*MV2_pt_Reduce_function)(void *sendbuf,
1257 MPI_Datatype datatype,
1261 } mv2_reduce_tuning_element;
/* Reduce tuning table members visible here: a per-threshold is_two_level_reduce array, then an
 * inter-leader and an intra-node element array, each preceded by its used-entry count. */
1267 int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS];
1268 int size_inter_table;
1269 mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1270 int size_intra_table;
1271 mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1272 } mv2_reduce_tuning_table;
/* Number of reduce table entries and the table itself (set by init_mv2_reduce_tables_stampede). */
1274 int mv2_size_reduce_tuning_table = 0;
1275 mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL;
/* Knomial factors for reduce; both default to -1. */
1278 int mv2_reduce_intra_knomial_factor = -1;
1279 int mv2_reduce_inter_knomial_factor = -1;
/* Global reduce function pointer, initially NULL. */
1281 int (*MV2_Reduce_function)( void *sendbuf,
1284 MPI_Datatype datatype,
1287 MPI_Comm comm_ptr)=NULL;
/* Global intra-node reduce function pointer, initially NULL. */
1289 int (*MV2_Reduce_intra_function)( void *sendbuf,
1292 MPI_Datatype datatype,
1295 MPI_Comm comm_ptr)=NULL;
/* Map the MVAPICH2 algorithm names used in the table onto SimGrid reduce implementations.
 * The inter- and intra-node knomial wrappers resolve to the same Coll_reduce_mvapich2_knomial::reduce. */
1298 #define MPIR_Reduce_inter_knomial_wrapper_MV2 Coll_reduce_mvapich2_knomial::reduce
1299 #define MPIR_Reduce_intra_knomial_wrapper_MV2 Coll_reduce_mvapich2_knomial::reduce
1300 #define MPIR_Reduce_binomial_MV2 Coll_reduce_binomial::reduce
1301 #define MPIR_Reduce_redscat_gather_MV2 Coll_reduce_scatter_gather::reduce
1302 #define MPIR_Reduce_shmem_MV2 Coll_reduce_ompi_basic_linear::reduce
1303 #define MPIR_Reduce_two_level_helper_MV2 Coll_reduce_mvapich2_two_level::reduce
/* Builds the reduce tuning table for the Stampede calibration.
 *
 * Steps performed:
 *  - installs smpi_coll_cleanup_mvapich2 as the collective cleanup callback when
 *    no callback has been registered yet;
 *  - sets mv2_size_reduce_tuning_table and allocates that many
 *    mv2_reduce_tuning_table entries with xbt_malloc;
 *  - fills a local staging array with the calibrated rows and memcpy's it into
 *    the allocated table.
 *
 * Takes no arguments and returns nothing; its effect is on the globals
 * mv2_size_reduce_tuning_table and mv2_reduce_thresholds_table (and possibly
 * Colls::smpi_coll_cleanup_callback). */
1306 static void init_mv2_reduce_tables_stampede(){
/* Only claim the cleanup hook if nobody else did. */
1307 if(Colls::smpi_coll_cleanup_callback==NULL)
1308 Colls::smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
/* Entry count used both for the allocation and for the memcpy at the end;
 * it must match the number of entries in the staging array below. */
1310 mv2_size_reduce_tuning_table = 8;
1311 mv2_reduce_thresholds_table = static_cast<mv2_reduce_tuning_table*>(xbt_malloc(mv2_size_reduce_tuning_table *
1312 sizeof (mv2_reduce_tuning_table)));
/* Local staging array holding the calibration data.
 * NOTE(review): each three-field row looks like {lower bound, upper bound, algorithm},
 * with -1 presumably meaning "no upper bound" - confirm against the code that walks
 * this table. The rows made only of 0/1 values are presumably per-threshold flags -
 * confirm against the mv2_reduce_tuning_table declaration. */
1313 mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1321 {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1322 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1323 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1327 {0, 65536, &MPIR_Reduce_shmem_MV2},
1328 {65536,-1, &MPIR_Reduce_binomial_MV2},
1335 {1, 1, 1, 1, 0, 0, 0},
1338 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1339 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1340 {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1341 {32768, 65536, &MPIR_Reduce_binomial_MV2},
1342 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1343 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1344 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1348 {0, 8192, &MPIR_Reduce_shmem_MV2},
1349 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1350 {16384, 32768, &MPIR_Reduce_shmem_MV2},
1351 {32768, 65536, &MPIR_Reduce_shmem_MV2},
1352 {65536, 262144, &MPIR_Reduce_shmem_MV2},
1353 {262144,-1, &MPIR_Reduce_binomial_MV2},
1363 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1364 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1365 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1366 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1367 {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1371 {0, 8192, &MPIR_Reduce_shmem_MV2},
1372 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1373 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1374 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1375 {262144, -1, &MPIR_Reduce_binomial_MV2},
1385 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1386 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1387 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1388 {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1389 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1390 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1394 {0, 8192, &MPIR_Reduce_shmem_MV2},
1395 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1396 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1397 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1398 {262144, -1, &MPIR_Reduce_binomial_MV2},
1405 {1, 1, 1, 0, 1, 1, 0},
1408 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1409 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1410 {16384, 32768, &MPIR_Reduce_binomial_MV2},
1411 {32768, 65536, &MPIR_Reduce_binomial_MV2},
1412 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1413 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1414 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1418 {0, 8192, &MPIR_Reduce_shmem_MV2},
1419 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1420 {16384, 32768, &MPIR_Reduce_shmem_MV2},
1421 {32768, 65536, &MPIR_Reduce_shmem_MV2},
1422 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1423 {262144, -1, &MPIR_Reduce_binomial_MV2},
1433 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1434 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1435 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1436 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1437 {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1438 {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1442 {0, 8192, &MPIR_Reduce_shmem_MV2},
1443 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1444 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1445 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1446 {262144, -1, &MPIR_Reduce_binomial_MV2},
1456 {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1457 {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1458 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1459 {65536, 262144, &MPIR_Reduce_binomial_MV2},
1460 {262144, -1, &MPIR_Reduce_binomial_MV2},
1464 {0, 8192, &MPIR_Reduce_shmem_MV2},
1465 {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1466 {16384, 65536, &MPIR_Reduce_shmem_MV2},
1467 {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1468 {262144, -1, &MPIR_Reduce_binomial_MV2},
1478 {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1479 {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1480 {4096, 16384, &MPIR_Reduce_binomial_MV2},
1481 {16384, 65536, &MPIR_Reduce_binomial_MV2},
1482 {65536, 131072, &MPIR_Reduce_binomial_MV2},
1483 {131072, -1, &MPIR_Reduce_binomial_MV2},
1487 {0, 2048, &MPIR_Reduce_shmem_MV2},
1488 {2048, 4096, &MPIR_Reduce_shmem_MV2},
1489 {4096, 16384, &MPIR_Reduce_shmem_MV2},
1490 {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1491 {65536, 131072, &MPIR_Reduce_binomial_MV2},
1492 {131072, -1, &MPIR_Reduce_shmem_MV2},
/* Publish the staging data: copy exactly mv2_size_reduce_tuning_table entries
 * into the heap table, which outlives this function. */
1497 memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1498 mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1501 /************ Reduce scatter variables and initializers */
/* One reduce-scatter tuning element; it carries a pointer to the reduce-scatter
 * implementation to call (MV2_pt_Red_scat_function).
 * NOTE(review): the initializers in init_mv2_reduce_scatter_tables_stampede pass two
 * integers before the function pointer, presumably a message-size range - confirm. */
1506 int (*MV2_pt_Red_scat_function)(void *sendbuf,
1509 MPI_Datatype datatype,
1512 } mv2_red_scat_tuning_element;
/* One reduce-scatter tuning table entry: a fixed-capacity array of tuning elements
 * (at most MV2_MAX_NB_THRESHOLDS) plus an integer size.
 * NOTE(review): size_inter_table is presumably the number of inter_leader slots
 * actually in use - confirm against the selector. */
1516 int size_inter_table;
1517 mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1518 } mv2_red_scat_tuning_table;
/* Reduce-scatter tuning state. Both globals start empty (0 / NULL) and are filled
 * in by init_mv2_reduce_scatter_tables_stampede. */
1520 int mv2_size_red_scat_tuning_table = 0;
1521 mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL;
/* Global pointer to a reduce-scatter implementation.
 * NOTE(review): presumably set by the selector to the algorithm chosen from the
 * table above - confirm at the assignment site. */
1524 int (*MV2_Red_scat_function)(void *sendbuf,
1527 MPI_Datatype datatype,
/* Thin wrapper that forwards its arguments (sendbuf, recvbuf, recvcnts, datatype,
 * op, comm) to Coll_reduce_scatter_default::reduce_scatter. Being a real function,
 * its address can be stored in the reduce-scatter tuning table.
 * NOTE(review): the forwarded call's result is not captured on the call line -
 * confirm what this wrapper returns to its caller. */
1533 static int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf,
1536 MPI_Datatype datatype,
1540 Coll_reduce_scatter_default::reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm);
/* Aliases mapping MVAPICH2 reduce-scatter algorithm names onto SMPI implementations
 * (MPICH non-commutative, OMPI basic recursive halving, MPICH pairwise). */
1543 #define MPIR_Reduce_scatter_non_comm_MV2 Coll_reduce_scatter_mpich_noncomm::reduce_scatter
1544 #define MPIR_Reduce_scatter_Rec_Halving_MV2 Coll_reduce_scatter_ompi_basic_recursivehalving::reduce_scatter
1545 #define MPIR_Reduce_scatter_Pair_Wise_MV2 Coll_reduce_scatter_mpich_pair::reduce_scatter
/* Builds the reduce-scatter tuning table for the Stampede calibration.
 *
 * Installs smpi_coll_cleanup_mvapich2 as the cleanup callback when none is set,
 * sets mv2_size_red_scat_tuning_table, allocates that many entries with xbt_malloc,
 * and copies the calibrated rows from a local staging array into the allocation.
 * Takes no arguments and returns nothing. */
1550 static void init_mv2_reduce_scatter_tables_stampede(){
/* Only claim the cleanup hook if nobody else did. */
1551 if(Colls::smpi_coll_cleanup_callback==NULL)
1552 Colls::smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
/* Entry count shared by the allocation and the final memcpy; it must match the
 * number of entries in the staging array below. */
1553 mv2_size_red_scat_tuning_table = 6;
1554 mv2_red_scat_thresholds_table = static_cast<mv2_red_scat_tuning_table*>(xbt_malloc(mv2_size_red_scat_tuning_table *
1555 sizeof (mv2_red_scat_tuning_table)));
/* Local staging array with the calibration data.
 * NOTE(review): rows look like {lower bound, upper bound, algorithm}, with -1
 * presumably meaning "no upper bound" - confirm against the selector. */
1556 mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
1561 {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1562 {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1563 {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1570 {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1571 {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1572 {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1579 {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1580 {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1581 {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1588 {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1589 {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1596 {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1597 {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1604 {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1605 {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
/* Publish the staging data into the heap table, which outlives this function. */
1610 memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1611 mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1614 /************ Scatter variables and initializers */
/* One scatter tuning element; it carries a pointer to the scatter implementation
 * to call (MV2_pt_Scatter_function), which takes a root rank and a communicator
 * among its arguments.
 * NOTE(review): the initializers in init_mv2_scatter_tables_stampede pass two
 * integers before the function pointer, presumably a message-size range - confirm. */
1619 int (*MV2_pt_Scatter_function)(void *sendbuf,
1621 MPI_Datatype sendtype,
1624 MPI_Datatype recvtype,
1625 int root, MPI_Comm comm);
1626 } mv2_scatter_tuning_element;
/* One scatter tuning table entry: two fixed-capacity arrays of tuning elements
 * (inter_leader and intra_node, each at most MV2_MAX_NB_THRESHOLDS long), each
 * paired with an integer size.
 * NOTE(review): the size_* members are presumably the number of slots in use in
 * the matching array - confirm against the selector. */
1630 int size_inter_table;
1631 mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1632 int size_intra_table;
1633 mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1634 } mv2_scatter_tuning_table;
/* Scatter tuning state, filled in by init_mv2_scatter_tables_stampede.
 * Until then the pointers are NULL and the configuration count is 1. */
/* Per-configuration value; NOTE(review): presumably the processes-per-node count
 * each configuration was calibrated for - confirm. */
1637 int *mv2_scatter_table_ppn_conf = NULL;
/* Number of configurations, i.e. the length of the three arrays around it. */
1638 int mv2_scatter_num_ppn_conf = 1;
/* Per-configuration number of entries in the matching thresholds table. */
1639 int *mv2_size_scatter_tuning_table = NULL;
/* Per-configuration pointer to that configuration's first table entry. */
1640 mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL;
/* Two global pointers to scatter implementations sharing one signature; both start
 * out as NULL.
 * NOTE(review): presumably the selector stores the chosen inter-leader algorithm in
 * MV2_Scatter_function and the intra-node one in MV2_Scatter_intra_function -
 * confirm at the assignment sites. */
1642 int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1643 void *recvbuf, int recvcount, MPI_Datatype recvtype,
1644 int root, MPI_Comm comm)=NULL;
1646 int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1647 void *recvbuf, int recvcount, MPI_Datatype recvtype,
1648 int root, MPI_Comm comm)=NULL;
/* Prototype of MPIR_Scatter_mcst_wrap_MV2, immediately followed by its definition
 * with the same signature. Its address is used as the first inter-leader entry of
 * several groups in the 16-ppn scatter tuning table.
 * NOTE(review): "mcst" presumably stands for multicast - confirm. */
1649 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1651 MPI_Datatype sendtype,
1654 MPI_Datatype recvtype,
1655 int root, MPI_Comm comm_ptr);
/* Definition matching the prototype above. */
1657 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1659 MPI_Datatype sendtype,
1662 MPI_Datatype recvtype,
1663 int root, MPI_Comm comm_ptr)
/* Aliases mapping MVAPICH2 scatter algorithm names onto SMPI implementations:
 * the flat binomial and direct variants resolve to OMPI algorithms, the two-level
 * variants to the MVAPICH2 two-level implementations. */
1668 #define MPIR_Scatter_MV2_Binomial Coll_scatter_ompi_binomial::scatter
1669 #define MPIR_Scatter_MV2_Direct Coll_scatter_ompi_basic_linear::scatter
1670 #define MPIR_Scatter_MV2_two_level_Binomial Coll_scatter_mvapich2_two_level_binomial::scatter
1671 #define MPIR_Scatter_MV2_two_level_Direct Coll_scatter_mvapich2_two_level_direct::scatter
/* Builds the scatter tuning tables for the Stampede calibration.
 *
 * Steps performed:
 *  - installs smpi_coll_cleanup_mvapich2 as the collective cleanup callback when
 *    no callback has been registered yet;
 *  - sets mv2_scatter_num_ppn_conf to 3 and allocates the per-configuration arrays
 *    (table pointers, table sizes, ppn values);
 *  - fills one local staging table per configuration (ppn values 1, 2 and 16);
 *  - allocates a single contiguous block large enough for all configurations,
 *    copies the staging tables into it back to back, and points
 *    mv2_scatter_thresholds_table[i] at the start of configuration i's slice;
 *  - frees the temporary table_ptrs array.
 *
 * Takes no arguments and returns nothing. */
1676 static void init_mv2_scatter_tables_stampede(){
/* Only claim the cleanup hook if nobody else did. */
1677 if(Colls::smpi_coll_cleanup_callback==NULL)
1678 Colls::smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
/* Total number of table entries over all configurations (computed further down). */
1680 int agg_table_sum = 0;
/* Temporary per-configuration pointers to the local staging tables. */
1682 mv2_scatter_tuning_table **table_ptrs = NULL;
1683 mv2_scatter_num_ppn_conf = 3;
1684 mv2_scatter_thresholds_table
1685 = static_cast<mv2_scatter_tuning_table**>(xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1686 * mv2_scatter_num_ppn_conf));
1687 table_ptrs = static_cast<mv2_scatter_tuning_table**>(xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1688 * mv2_scatter_num_ppn_conf));
1689 mv2_size_scatter_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
1690 mv2_scatter_num_ppn_conf));
1691 mv2_scatter_table_ppn_conf
1692 = static_cast<int*>(xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int)));
/* Configuration 0: ppn value 1, 6 table entries.
 * NOTE(review): rows look like {lower bound, upper bound, algorithm}, with -1
 * presumably meaning "no upper bound" - confirm against the selector. */
1693 mv2_scatter_table_ppn_conf[0] = 1;
1694 mv2_size_scatter_tuning_table[0] = 6;
1695 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1699 {0, -1, &MPIR_Scatter_MV2_Binomial},
1703 {0, -1, &MPIR_Scatter_MV2_Binomial},
1710 {0, -1, &MPIR_Scatter_MV2_Direct},
1714 {0, -1, &MPIR_Scatter_MV2_Direct},
1721 {0, -1, &MPIR_Scatter_MV2_Direct},
1725 {0, -1, &MPIR_Scatter_MV2_Direct},
1732 {0, -1, &MPIR_Scatter_MV2_Direct},
1736 {0, -1, &MPIR_Scatter_MV2_Direct},
1743 {0, -1, &MPIR_Scatter_MV2_Direct},
1747 {0, -1, &MPIR_Scatter_MV2_Direct},
1754 {0, 32, &MPIR_Scatter_MV2_Binomial},
1755 {32, -1, &MPIR_Scatter_MV2_Direct},
1759 {0, -1, &MPIR_Scatter_MV2_Binomial},
1763 table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
/* Configuration 1: ppn value 2, 6 table entries. */
1764 mv2_scatter_table_ppn_conf[1] = 2;
1765 mv2_size_scatter_tuning_table[1] = 6;
1766 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = {
1770 {0, 4096, &MPIR_Scatter_MV2_Binomial},
1771 {4096, -1, &MPIR_Scatter_MV2_Direct},
1775 {0, -1, &MPIR_Scatter_MV2_Direct},
1782 {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1783 {512, -1, &MPIR_Scatter_MV2_Direct},
1787 {0, -1, &MPIR_Scatter_MV2_Binomial},
1794 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1795 {2048, -1, &MPIR_Scatter_MV2_Direct},
1799 {0, -1, &MPIR_Scatter_MV2_Binomial},
1806 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1807 {2048, -1, &MPIR_Scatter_MV2_Direct},
1811 {0, -1, &MPIR_Scatter_MV2_Binomial},
1818 {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1819 {8192, -1, &MPIR_Scatter_MV2_Direct},
1823 {0, -1, &MPIR_Scatter_MV2_Binomial},
1830 {0, 16, &MPIR_Scatter_MV2_Binomial},
1831 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1832 {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1833 {16384, -1, &MPIR_Scatter_MV2_Direct},
1837 {0, 128, &MPIR_Scatter_MV2_Direct},
1838 {128, -1, &MPIR_Scatter_MV2_Binomial},
1842 table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn;
/* Configuration 2: ppn value 16, 8 table entries.
 * In the groups that start with MPIR_Scatter_mcst_wrap_MV2, the next row covers the
 * same 0..16 range with a regular algorithm.
 * NOTE(review): presumably that second row is the fallback used when the mcst
 * wrapper cannot be used - confirm against the selector. */
1843 mv2_scatter_table_ppn_conf[2] = 16;
1844 mv2_size_scatter_tuning_table[2] = 8;
1845 mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = {
1850 {0, 256, &MPIR_Scatter_MV2_Binomial},
1851 {256, -1, &MPIR_Scatter_MV2_Direct},
1855 { 0, -1, &MPIR_Scatter_MV2_Direct},
1863 {0, 512, &MPIR_Scatter_MV2_Binomial},
1864 {512, -1, &MPIR_Scatter_MV2_Direct},
1868 { 0, -1, &MPIR_Scatter_MV2_Direct},
1876 {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1877 {1024, -1, &MPIR_Scatter_MV2_Direct},
1881 { 0, -1, &MPIR_Scatter_MV2_Direct},
1889 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1890 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1891 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1892 {2048, -1, &MPIR_Scatter_MV2_Direct},
1896 { 0, -1, &MPIR_Scatter_MV2_Direct},
1904 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1905 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1906 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1907 {2048, -1, &MPIR_Scatter_MV2_Direct},
1911 { 0, -1, &MPIR_Scatter_MV2_Direct},
1919 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
/* Same 0..16 range as the mcst row above, as in every other group of this table;
 * a lower bound of 16 would make the range degenerate and overlap the next row. */
1920 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1921 {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1922 {4096, -1, &MPIR_Scatter_MV2_Direct},
1926 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1933 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1934 {0, 16, &MPIR_Scatter_MV2_Binomial},
1935 {16, 32, &MPIR_Scatter_MV2_Binomial},
1936 {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1937 {4096, -1, &MPIR_Scatter_MV2_Direct},
1941 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1948 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1949 {0, 16, &MPIR_Scatter_MV2_two_level_Binomial},
1950 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1951 {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1952 {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1953 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1954 {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
1958 {0, 16, &MPIR_Scatter_MV2_Binomial},
1959 {16, 128, &MPIR_Scatter_MV2_Binomial},
1960 {128, 1024, &MPIR_Scatter_MV2_Binomial},
1961 {1024, 16384, &MPIR_Scatter_MV2_Direct},
1962 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1963 {65536, -1, &MPIR_Scatter_MV2_Direct},
1967 table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn;
/* Sum the per-configuration entry counts to size one contiguous allocation. */
1969 for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
1970 agg_table_sum += mv2_size_scatter_tuning_table[i];
/* Slot 0 owns the whole block; configuration 0 is copied to its start. */
1972 mv2_scatter_thresholds_table[0] =
1973 static_cast<mv2_scatter_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table)));
1974 memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0],
1975 (sizeof(mv2_scatter_tuning_table)
1976 * mv2_size_scatter_tuning_table[0]));
/* Every later configuration starts right after the previous one's entries
 * inside that same block. */
1977 for (i = 1; i < mv2_scatter_num_ppn_conf; i++) {
1978 mv2_scatter_thresholds_table[i] =
1979 mv2_scatter_thresholds_table[i - 1]
1980 + mv2_size_scatter_tuning_table[i - 1];
1981 memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i],
1982 (sizeof(mv2_scatter_tuning_table)
1983 * mv2_size_scatter_tuning_table[i]));
/* The staging tables were copied, so the temporary pointer array can go. */
1985 xbt_free(table_ptrs);