1 /* selector for collective algorithms based on mvapich decision logic */
3 /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
4 * All rights reserved. */
6 /* This program is free software; you can redistribute it and/or modify it
7 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include "colls_private.h"
11 #include "smpi_mvapich2_selector_stampede.h"
/*
 * Build the alltoall tuning tables used by the MVAPICH2-style selector.
 *
 * What the visible code does:
 *  - sets mv2_alltoall_num_ppn_conf to 3 and allocates the per-configuration
 *    arrays (table pointers, table sizes and ppn values);
 *  - declares one temporary table for each ppn value (1, 2 and 16), holding
 *    6, 6 and 7 entries respectively;
 *  - copies the three temporary tables back to back into one heap buffer
 *    whose start is stored in mv2_alltoall_thresholds_table[0]; the other
 *    table pointers are offsets into that same buffer.
 *
 * NOTE(review): ppn presumably stands for processes per node -- confirm.
 * NOTE(review): every range is written as {min, max, function}; the selector
 * loop treats max == -1 as a stop value, so it presumably means that the
 * range has no upper bound -- confirm.
 * NOTE(review): no release of table_ptrs is visible in this view, although it
 * is only used as scratch storage for the copies below -- confirm it is freed.
 * NOTE(review): none of the malloc results is checked in the visible lines.
 */
14 static void init_mv2_alltoall_tables_stampede(){
16 int agg_table_sum = 0;
17 mv2_alltoall_tuning_table **table_ptrs = NULL;
18 mv2_alltoall_num_ppn_conf = 3;
19 mv2_alltoall_thresholds_table
20 = malloc(sizeof(mv2_alltoall_tuning_table *)
21 * mv2_alltoall_num_ppn_conf);
22 table_ptrs = malloc(sizeof(mv2_alltoall_tuning_table *)
23 * mv2_alltoall_num_ppn_conf);
24 mv2_size_alltoall_tuning_table = malloc(sizeof(int) *
25 mv2_alltoall_num_ppn_conf);
26 mv2_alltoall_table_ppn_conf =malloc(mv2_alltoall_num_ppn_conf * sizeof(int));
/* Configuration 0: ppn value 1, table of 6 entries. */
27 mv2_alltoall_table_ppn_conf[0] = 1;
28 mv2_size_alltoall_tuning_table[0] = 6;
29 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
32 {{0, -1, &MPIR_Alltoall_pairwise_MV2},
35 {{0, -1, &MPIR_Alltoall_inplace_MV2},
41 {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
42 {262144, -1, &MPIR_Alltoall_pairwise_MV2},
45 {{0, -1, &MPIR_Alltoall_inplace_MV2},
51 {{0, 8, &MPIR_Alltoall_RD_MV2},
52 {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
55 {{0, -1, &MPIR_Alltoall_inplace_MV2},
61 {{0, 64, &MPIR_Alltoall_RD_MV2},
62 {64, 512, &MPIR_Alltoall_bruck_MV2},
63 {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
66 {{0,-1, &MPIR_Alltoall_inplace_MV2},
72 {{0, 32, &MPIR_Alltoall_RD_MV2},
73 {32, 2048, &MPIR_Alltoall_bruck_MV2},
74 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
77 {{0, -1, &MPIR_Alltoall_inplace_MV2},
83 {{0, 8, &MPIR_Alltoall_RD_MV2},
84 {8, 1024, &MPIR_Alltoall_bruck_MV2},
85 {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
88 {{0, -1, &MPIR_Alltoall_inplace_MV2},
92 table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
/* Configuration 1: ppn value 2, table of 6 entries. */
93 mv2_alltoall_table_ppn_conf[1] = 2;
94 mv2_size_alltoall_tuning_table[1] = 6;
95 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
98 {{0, 32, &MPIR_Alltoall_RD_MV2},
99 {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
102 {{0, -1, &MPIR_Alltoall_inplace_MV2},
108 {{0, 64, &MPIR_Alltoall_RD_MV2},
109 {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
112 {{0, -1, &MPIR_Alltoall_inplace_MV2},
118 {{0, 64, &MPIR_Alltoall_RD_MV2},
119 {64, 2048, &MPIR_Alltoall_bruck_MV2},
120 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
123 {{0,-1, &MPIR_Alltoall_inplace_MV2},
129 {{0, 16, &MPIR_Alltoall_RD_MV2},
130 {16, 2048, &MPIR_Alltoall_bruck_MV2},
131 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
134 {{0, -1, &MPIR_Alltoall_inplace_MV2},
140 {{0, 8, &MPIR_Alltoall_RD_MV2},
141 {8, 1024, &MPIR_Alltoall_bruck_MV2},
142 {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
145 {{0, -1, &MPIR_Alltoall_inplace_MV2},
151 {{0, 4, &MPIR_Alltoall_RD_MV2},
152 {4, 2048, &MPIR_Alltoall_bruck_MV2},
153 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
156 {{0, -1, &MPIR_Alltoall_inplace_MV2},
160 table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
/* Configuration 2: ppn value 16, table of 7 entries. */
161 mv2_alltoall_table_ppn_conf[2] = 16;
162 mv2_size_alltoall_tuning_table[2] = 7;
163 mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
166 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
167 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
170 {{32768, -1, &MPIR_Alltoall_inplace_MV2},
176 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
177 {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
180 {{16384, -1, &MPIR_Alltoall_inplace_MV2},
186 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
187 {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
188 {16384, -1, &MPIR_Alltoall_pairwise_MV2},
191 {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
197 {{0, 2048, &MPIR_Alltoall_bruck_MV2},
198 {2048, -1, &MPIR_Alltoall_pairwise_MV2},
201 {{16384,65536, &MPIR_Alltoall_inplace_MV2},
207 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
208 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
211 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
217 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
218 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
221 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
226 {{0, 1024, &MPIR_Alltoall_bruck_MV2},
227 {1024, -1, &MPIR_Alltoall_pairwise_MV2},
230 {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
235 table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
/* Total number of entries over all configurations. */
237 for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
238 agg_table_sum += mv2_size_alltoall_tuning_table[i];
/* One contiguous allocation holds every table; slot 0 points at its start. */
240 mv2_alltoall_thresholds_table[0] =
241 malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table));
242 memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
243 (sizeof(mv2_alltoall_tuning_table)
244 * mv2_size_alltoall_tuning_table[0]));
/* Each following table starts right after the previous one in that buffer. */
245 for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
246 mv2_alltoall_thresholds_table[i] =
247 mv2_alltoall_thresholds_table[i - 1]
248 + mv2_size_alltoall_tuning_table[i - 1];
249 memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
250 (sizeof(mv2_alltoall_tuning_table)
251 * mv2_size_alltoall_tuning_table[i]));
/*
 * Alltoall entry point that picks an algorithm from the tuning tables.
 *
 * Visible parameters: sendbuf/sendcount/sendtype describe the data to send,
 * recvbuf/recvcount/recvtype the data to receive. A communicator named comm
 * is used below; its declaration is not part of this view.
 *
 * Steps shown by the visible code:
 *  1. build the tuning tables on first use (mv2_alltoall_table_ppn_conf is
 *     still NULL);
 *  2. compute nbytes as sendtype_size * sendcount;
 *  3. walk the table of the current configuration to find the entry matching
 *     comm_size, then the size range inside that entry;
 *  4. take the function pointer stored in that range;
 *  5. if sendbuf is not MPI_IN_PLACE, call that function directly; otherwise
 *     either stage recvbuf in a temporary copy and call it, or call
 *     MPIR_Alltoall_inplace_MV2.
 *
 * NOTE(review): the result of the chosen call is stored in mpi_errno and is
 * presumably the return value -- the return statement is not visible here.
 * NOTE(review): no release of tmp_buf is visible in this view -- confirm the
 * temporary buffer is freed after use.
 * NOTE(review): the malloc result for tmp_buf is not checked in the visible
 * lines.
 */
258 int smpi_coll_tuned_alltoall_mvapich2( void *sendbuf, int sendcount,
259 MPI_Datatype sendtype,
260 void* recvbuf, int recvcount,
261 MPI_Datatype recvtype,
/* Build the tuning tables on first use. */
265 if(mv2_alltoall_table_ppn_conf==NULL)
266 init_mv2_alltoall_tables_stampede();
268 int sendtype_size, recvtype_size, nbytes, comm_size;
269 char * tmp_buf = NULL;
270 int mpi_errno=MPI_SUCCESS;
272 int range_threshold = 0;
274 comm_size = smpi_comm_size(comm);
276 sendtype_size=smpi_datatype_size(sendtype);
277 recvtype_size=smpi_datatype_size(recvtype);
/* Size in bytes of sendcount elements of sendtype; used for range lookup. */
278 nbytes = sendtype_size * sendcount;
280 /* check if safe to use partial subscription mode */
282 /* Search for the corresponding system size inside the tuning table */
283 while ((range < (mv2_size_alltoall_tuning_table[conf_index] - 1)) &&
284 (comm_size > mv2_alltoall_thresholds_table[conf_index][range].numproc)) {
287 /* Search for corresponding inter-leader function */
288 while ((range_threshold < (mv2_alltoall_thresholds_table[conf_index][range].size_table - 1))
290 mv2_alltoall_thresholds_table[conf_index][range].algo_table[range_threshold].max)
291 && (mv2_alltoall_thresholds_table[conf_index][range].algo_table[range_threshold].max != -1)) {
/* Function stored for the selected table entry and size range. */
294 MV2_Alltoall_function = mv2_alltoall_thresholds_table[conf_index][range].algo_table[range_threshold]
295 .MV2_pt_Alltoall_function;
/* Out-of-place call: run the selected algorithm on the caller buffers. */
297 if(sendbuf != MPI_IN_PLACE) {
298 mpi_errno = MV2_Alltoall_function(sendbuf, sendcount, sendtype,
299 recvbuf, recvcount, recvtype,
304 mv2_alltoall_thresholds_table[conf_index][range].in_place_algo_table[range_threshold].min
305 ||nbytes > mv2_alltoall_thresholds_table[conf_index][range].in_place_algo_table[range_threshold].max
/*
 * In-place request whose size is apparently outside the in-place range:
 * copy the receive buffer (comm_size * recvcount elements) to a heap
 * buffer and pass that copy as the send buffer of the selected algorithm.
 */
307 tmp_buf = (char *)malloc( comm_size * recvcount * recvtype_size );
308 mpi_errno = smpi_datatype_copy((char *)recvbuf,
309 comm_size*recvcount, recvtype,
311 comm_size*recvcount, recvtype);
313 mpi_errno = MV2_Alltoall_function(tmp_buf, recvcount, recvtype,
314 recvbuf, recvcount, recvtype,
/* Remaining in-place case: use the dedicated in-place algorithm. */
318 mpi_errno = MPIR_Alltoall_inplace_MV2(sendbuf, sendcount, sendtype,
319 recvbuf, recvcount, recvtype,
/*
 * Build the allgather tuning tables used by the MVAPICH2-style selector.
 *
 * What the visible code does:
 *  - sets mv2_allgather_num_ppn_conf to 3 and allocates the per-configuration
 *    arrays (table pointers, table sizes and ppn values);
 *  - declares one temporary table for each ppn value (1, 2 and 16), each
 *    holding 6 entries;
 *  - copies the three temporary tables back to back into one heap buffer
 *    whose start is stored in mv2_allgather_thresholds_table[0]; the other
 *    table pointers are offsets into that same buffer.
 *
 * NOTE(review): ppn presumably stands for processes per node -- confirm.
 * NOTE(review): every range is written as {min, max, function}; the selector
 * loop treats max == -1 as a stop value, so it presumably means that the
 * range has no upper bound -- confirm.
 * NOTE(review): no release of table_ptrs is visible in this view, although it
 * is only used as scratch storage for the copies below -- confirm it is freed.
 * NOTE(review): none of the malloc results is checked in the visible lines.
 */
329 static void init_mv2_allgather_tables_stampede(){
331 int agg_table_sum = 0;
332 mv2_allgather_tuning_table **table_ptrs = NULL;
333 mv2_allgather_num_ppn_conf = 3;
334 mv2_allgather_thresholds_table
335 = malloc(sizeof(mv2_allgather_tuning_table *)
336 * mv2_allgather_num_ppn_conf);
337 table_ptrs = malloc(sizeof(mv2_allgather_tuning_table *)
338 * mv2_allgather_num_ppn_conf);
339 mv2_size_allgather_tuning_table = malloc(sizeof(int) *
340 mv2_allgather_num_ppn_conf);
341 mv2_allgather_table_ppn_conf
342 = malloc(mv2_allgather_num_ppn_conf * sizeof(int));
/* Configuration 0: ppn value 1, table of 6 entries. */
343 mv2_allgather_table_ppn_conf[0] = 1;
344 mv2_size_allgather_tuning_table[0] = 6;
345 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
351 {0, -1, &MPIR_Allgather_Ring_MV2},
359 {0, 262144, &MPIR_Allgather_RD_MV2},
360 {262144, -1, &MPIR_Allgather_Ring_MV2},
368 {0, 131072, &MPIR_Allgather_RD_MV2},
369 {131072, -1, &MPIR_Allgather_Ring_MV2},
377 {0, 131072, &MPIR_Allgather_RD_MV2},
378 {131072, -1, &MPIR_Allgather_Ring_MV2},
386 {0, 65536, &MPIR_Allgather_RD_MV2},
387 {65536, -1, &MPIR_Allgather_Ring_MV2},
395 {0, 32768, &MPIR_Allgather_RD_MV2},
396 {32768, -1, &MPIR_Allgather_Ring_MV2},
400 table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
/* Configuration 1: ppn value 2, table of 6 entries. */
401 mv2_allgather_table_ppn_conf[1] = 2;
402 mv2_size_allgather_tuning_table[1] = 6;
403 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
409 {0, 524288, &MPIR_Allgather_RD_MV2},
410 {524288, -1, &MPIR_Allgather_Ring_MV2},
418 {0, 32768, &MPIR_Allgather_RD_MV2},
419 {32768, 524288, &MPIR_Allgather_Ring_MV2},
420 {524288, -1, &MPIR_Allgather_Ring_MV2},
428 {0, 16384, &MPIR_Allgather_RD_MV2},
429 {16384, 524288, &MPIR_Allgather_Ring_MV2},
430 {524288, -1, &MPIR_Allgather_Ring_MV2},
438 {0, 65536, &MPIR_Allgather_RD_MV2},
439 {65536, 524288, &MPIR_Allgather_Ring_MV2},
440 {524288, -1, &MPIR_Allgather_Ring_MV2},
448 {0, 32768, &MPIR_Allgather_RD_MV2},
449 {32768, 524288, &MPIR_Allgather_Ring_MV2},
450 {524288, -1, &MPIR_Allgather_Ring_MV2},
458 {0, 65536, &MPIR_Allgather_RD_MV2},
459 {65536, 524288, &MPIR_Allgather_Ring_MV2},
460 {524288, -1, &MPIR_Allgather_Ring_MV2},
464 table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
/* Configuration 2: ppn value 16, table of 6 entries. */
465 mv2_allgather_table_ppn_conf[2] = 16;
466 mv2_size_allgather_tuning_table[2] = 6;
467 mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
473 {0, 1024, &MPIR_Allgather_RD_MV2},
474 {1024, -1, &MPIR_Allgather_Ring_MV2},
482 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
483 {1024, -1, &MPIR_Allgather_Ring_MV2},
491 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
492 {1024, -1, &MPIR_Allgather_Ring_MV2},
500 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
501 {1024, -1, &MPIR_Allgather_Ring_MV2},
509 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
510 {1024, -1, &MPIR_Allgather_Ring_MV2},
518 {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
519 {1024, -1, &MPIR_Allgather_Ring_MV2},
524 table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
/* Total number of entries over all configurations. */
526 for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
527 agg_table_sum += mv2_size_allgather_tuning_table[i];
/* One contiguous allocation holds every table; slot 0 points at its start. */
529 mv2_allgather_thresholds_table[0] =
530 malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table));
531 memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
532 (sizeof(mv2_allgather_tuning_table)
533 * mv2_size_allgather_tuning_table[0]));
/* Each following table starts right after the previous one in that buffer. */
534 for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
535 mv2_allgather_thresholds_table[i] =
536 mv2_allgather_thresholds_table[i - 1]
537 + mv2_size_allgather_tuning_table[i - 1];
538 memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
539 (sizeof(mv2_allgather_tuning_table)
540 * mv2_size_allgather_tuning_table[i]));
/*
 * Allgather entry point that picks an algorithm from the tuning tables.
 *
 * Visible parameters: sendbuf/sendcount/sendtype describe the local
 * contribution, recvbuf/recvcount/recvtype the gathered result. A
 * communicator named comm is used below; its declaration is not part of
 * this view.
 *
 * Steps shown by the visible code:
 *  1. compute nbytes as recvtype_size * recvcount;
 *  2. build the tuning tables on first use (mv2_allgather_table_ppn_conf is
 *     still NULL);
 *  3. walk the table of the current configuration to find the entry matching
 *     the communicator size, then the size range inside that entry;
 *  4. read the function pointer and the two_level flag of that range;
 *  5. when the flag is 1, call MPIR_Allgather_RD_MV2 (the shared-memory aware
 *     variants are commented out); when the selected function is the Bruck,
 *     RD or Ring implementation, call it.
 *
 * NOTE(review): the statement returning MPI_ERR_OTHER is presumably the
 * fallback for any other selected function -- the enclosing else is not
 * visible here. If so, a table function outside those three only works when
 * its two_level flag is 1 -- confirm against the table data.
 * NOTE(review): the partial subscription detection is commented out, so the
 * configuration index is presumably fixed elsewhere in the function --
 * confirm.
 */
545 int smpi_coll_tuned_allgather_mvapich2(void *sendbuf, int sendcount, MPI_Datatype sendtype,
546 void *recvbuf, int recvcount, MPI_Datatype recvtype,
550 int mpi_errno = MPI_SUCCESS;
551 int nbytes = 0, comm_size, recvtype_size;
553 //int partial_sub_ok = 0;
555 int range_threshold = 0;
556 int is_two_level = 0;
557 //int local_size = -1;
558 //MPI_Comm shmem_comm;
559 //MPI_Comm *shmem_commptr=NULL;
560 /* Get the size of the communicator */
561 comm_size = smpi_comm_size(comm);
562 recvtype_size=smpi_datatype_size(recvtype);
/* Size in bytes of recvcount elements of recvtype; used for range lookup. */
563 nbytes = recvtype_size * recvcount;
/* Build the tuning tables on first use. */
565 if(mv2_allgather_table_ppn_conf==NULL)
566 init_mv2_allgather_tables_stampede();
569 /* check if safe to use partial subscription mode */
570 /* if (comm->ch.shmem_coll_ok == 1 && comm->ch.is_uniform) {
572 shmem_comm = comm->ch.shmem_comm;
573 MPID_Comm_get_ptr(shmem_comm, shmem_commptr);
574 local_size = shmem_commptr->local_size;
576 if (mv2_allgather_table_ppn_conf[0] == -1) {
577 // Indicating user defined tuning
582 if (local_size == mv2_allgather_table_ppn_conf[i]) {
588 } while(i < mv2_allgather_num_ppn_conf);
592 if (partial_sub_ok != 1) {
595 /* Search for the corresponding system size inside the tuning table */
596 while ((range < (mv2_size_allgather_tuning_table[conf_index] - 1)) &&
598 mv2_allgather_thresholds_table[conf_index][range].numproc)) {
601 /* Search for corresponding inter-leader function */
602 while ((range_threshold <
603 (mv2_allgather_thresholds_table[conf_index][range].size_inter_table - 1))
604 && (nbytes > mv2_allgather_thresholds_table[conf_index][range].inter_leader[range_threshold].max)
605 && (mv2_allgather_thresholds_table[conf_index][range].inter_leader[range_threshold].max !=
610 /* Set inter-leader pt */
611 MV2_Allgather_function =
612 mv2_allgather_thresholds_table[conf_index][range].inter_leader[range_threshold].
613 MV2_pt_Allgather_function;
/* Flag stored next to the selected range in the same table entry. */
615 is_two_level = mv2_allgather_thresholds_table[conf_index][range].two_level[range_threshold];
617 /* intracommunicator */
618 if(is_two_level ==1){
620 /* if(comm->ch.shmem_coll_ok == 1){
621 MPIR_T_PVAR_COUNTER_INC(MV2, mv2_num_shmem_coll_calls, 1);
622 if (1 == comm->ch.is_blocked) {
623 mpi_errno = MPIR_2lvl_Allgather_MV2(sendbuf, sendcount, sendtype,
624 recvbuf, recvcount, recvtype,
628 mpi_errno = MPIR_Allgather_intra(sendbuf, sendcount, sendtype,
629 recvbuf, recvcount, recvtype,
633 mpi_errno = MPIR_Allgather_RD_MV2(sendbuf, sendcount, sendtype,
634 recvbuf, recvcount, recvtype,
637 } else if(MV2_Allgather_function == &MPIR_Allgather_Bruck_MV2
638 || MV2_Allgather_function == &MPIR_Allgather_RD_MV2
639 || MV2_Allgather_function == &MPIR_Allgather_Ring_MV2) {
640 mpi_errno = MV2_Allgather_function(sendbuf, sendcount, sendtype,
641 recvbuf, recvcount, recvtype,
644 return MPI_ERR_OTHER;
/*
 * Build the gather tuning table used by the MVAPICH2-style selector.
 *
 * The visible code sets mv2_size_gather_tuning_table to 7, allocates
 * mv2_gather_thresholds_table for that many entries, declares a temporary
 * table with 7 entries and copies it into the allocated storage.
 *
 * Each visible entry holds a count followed by that many
 * {min, max, function} ranges, twice. NOTE(review): the two lists presumably
 * map to the inter_leader and intra_node members read by the gather
 * selector, in that order -- confirm against the struct declaration.
 *
 * NOTE(review): in some entries the min of the last range does not continue
 * from the max of the previous range (256 after 16384, 8196 after 16384).
 * The visible lookup loops in the gather selector only compare max, so this
 * may be harmless -- confirm against the upstream tuning data.
 * NOTE(review): the malloc result is not checked in the visible lines.
 */
650 static void init_mv2_gather_tables_stampede(){
652 mv2_size_gather_tuning_table=7;
653 mv2_gather_thresholds_table = malloc(mv2_size_gather_tuning_table*
654 sizeof (mv2_gather_tuning_table));
655 mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
657 2,{{0, 524288, &MPIR_Gather_MV2_Direct},
658 {524288, -1, &MPIR_Gather_intra}},
659 1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
661 3,{{0, 16384, &MPIR_Gather_MV2_Direct},
662 {16384, 131072, &MPIR_Gather_intra},
663 {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
664 1,{{0, -1, &MPIR_Gather_intra}}},
666 3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct},
667 {256, 16384, &MPIR_Gather_MV2_Direct},
668 {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
669 1,{{0, -1, &MPIR_Gather_intra}}},
671 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
672 {512, 16384, &MPIR_Gather_MV2_Direct},
673 {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
674 1,{{0, -1, &MPIR_Gather_intra}}},
676 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
677 {512, 16384, &MPIR_Gather_MV2_Direct},
678 {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
679 1,{{0, -1, &MPIR_Gather_intra}}},
681 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
682 {512, 16384, &MPIR_Gather_MV2_Direct},
683 {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
684 1,{{0, -1, &MPIR_Gather_intra}}},
686 3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
687 {512, 16384, &MPIR_Gather_MV2_Direct},
688 {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
689 1,{{0, -1, &MPIR_Gather_intra}}},
/* Publish the table: copy all 7 entries into the heap storage. */
692 memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
693 mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
698 int smpi_coll_tuned_gather_mvapich2(void *sendbuf,
700 MPI_Datatype sendtype,
703 MPI_Datatype recvtype,
704 int root, MPI_Comm comm)
706 if(mv2_alltoall_table_ppn_conf==NULL)
707 init_mv2_alltoall_tables_stampede();
709 int mpi_errno = MPI_SUCCESS;
711 int range_threshold = 0;
712 int range_intra_threshold = 0;
715 int recvtype_size, sendtype_size;
717 comm_size = smpi_comm_size(comm);
718 rank = smpi_comm_rank(comm);
721 recvtype_size=smpi_datatype_size(recvtype);
722 nbytes = recvcnt * recvtype_size;
724 sendtype_size=smpi_datatype_size(sendtype);
725 nbytes = sendcnt * sendtype_size;
728 /* Search for the corresponding system size inside the tuning table */
729 while ((range < (mv2_size_gather_tuning_table - 1)) &&
730 (comm_size > mv2_gather_thresholds_table[range].numproc)) {
733 /* Search for corresponding inter-leader function */
734 while ((range_threshold < (mv2_gather_thresholds_table[range].size_inter_table - 1))
736 mv2_gather_thresholds_table[range].inter_leader[range_threshold].max)
737 && (mv2_gather_thresholds_table[range].inter_leader[range_threshold].max !=
742 /* Search for corresponding intra node function */
743 while ((range_intra_threshold < (mv2_gather_thresholds_table[range].size_intra_table - 1))
745 mv2_gather_thresholds_table[range].intra_node[range_intra_threshold].max)
746 && (mv2_gather_thresholds_table[range].intra_node[range_intra_threshold].max !=
748 range_intra_threshold++;
751 if (comm->ch.is_global_block == 1 && mv2_use_direct_gather == 1 &&
752 mv2_use_two_level_gather == 1 && comm->ch.shmem_coll_ok == 1) {
753 // Set intra-node function pt for gather_two_level
754 MV2_Gather_intra_node_function =
755 mv2_gather_thresholds_table[range].intra_node[range_intra_threshold].
756 MV2_pt_Gather_function;
757 //Set inter-leader pt
758 MV2_Gather_inter_leader_function =
759 mv2_gather_thresholds_table[range].inter_leader[range_threshold].
760 MV2_pt_Gather_function;
761 // We call Gather function
763 MV2_Gather_inter_leader_function(sendbuf, sendcnt, sendtype, recvbuf, recvcnt,
764 recvtype, root, comm);
767 // Indeed, the direct (non SMP-aware) gather is the MPICH one
768 mpi_errno = smpi_coll_tuned_gather_mpich(sendbuf, sendcnt, sendtype,
769 recvbuf, recvcnt, recvtype,