1 /* selector for collective algorithms based on mvapich decision logic */
3 /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
4 * All rights reserved. */
6 /* This program is free software; you can redistribute it and/or modify it
7 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include "colls_private.h"
11 #include "smpi_mvapich2_selector_stampede.h"
/* NOTE(review): this excerpt is subsampled (embedded line numbers are
 * non-contiguous); statements and braces between visible lines are elided.
 * Comments below describe only what the visible lines establish. */
/* Selects an MVAPICH2 alltoall algorithm from the stampede tuning tables,
 * indexed first by communicator size (range) and then by message size in
 * bytes (range_threshold), and invokes the chosen implementation. */
15 int smpi_coll_tuned_alltoall_mvapich2( void *sendbuf, int sendcount,
16 MPI_Datatype sendtype,
17 void* recvbuf, int recvcount,
18 MPI_Datatype recvtype,
/* Lazily build the tuning tables on first call. */
22 if(mv2_alltoall_table_ppn_conf==NULL)
23 init_mv2_alltoall_tables_stampede();
25 int sendtype_size, recvtype_size, nbytes, comm_size;
26 char * tmp_buf = NULL;
27 int mpi_errno=MPI_SUCCESS;
29 int range_threshold = 0;
31 comm_size = smpi_comm_size(comm);
33 sendtype_size=smpi_datatype_size(sendtype);
34 recvtype_size=smpi_datatype_size(recvtype);
/* Per-rank message size used to index the threshold table. */
35 nbytes = sendtype_size * sendcount;
37 /* check if safe to use partial subscription mode */
39 /* Search for the corresponding system size inside the tuning table */
40 while ((range < (mv2_size_alltoall_tuning_table[conf_index] - 1)) &&
41 (comm_size > mv2_alltoall_thresholds_table[conf_index][range].numproc)) {
44 /* Search for corresponding inter-leader function */
45 while ((range_threshold < (mv2_alltoall_thresholds_table[conf_index][range].size_table - 1))
47 mv2_alltoall_thresholds_table[conf_index][range].algo_table[range_threshold].max)
48 && (mv2_alltoall_thresholds_table[conf_index][range].algo_table[range_threshold].max != -1)) {
/* Bind the algorithm implementation selected by the threshold search. */
51 MV2_Alltoall_function = mv2_alltoall_thresholds_table[conf_index][range].algo_table[range_threshold]
52 .MV2_pt_Alltoall_function;
/* Regular (not-in-place) path: call the selected algorithm directly. */
54 if(sendbuf != MPI_IN_PLACE) {
55 mpi_errno = MV2_Alltoall_function(sendbuf, sendcount, sendtype,
56 recvbuf, recvcount, recvtype,
/* In-place path: when nbytes falls outside the in-place algorithm's
 * [min, max] window (condition partially elided here), copy recvbuf into a
 * temporary buffer and run the regular algorithm on it instead. */
61 mv2_alltoall_thresholds_table[conf_index][range].in_place_algo_table[range_threshold].min
62 ||nbytes > mv2_alltoall_thresholds_table[conf_index][range].in_place_algo_table[range_threshold].max
64 tmp_buf = (char *)smpi_get_tmp_sendbuffer( comm_size * recvcount * recvtype_size );
65 mpi_errno = smpi_datatype_copy((char *)recvbuf,
66 comm_size*recvcount, recvtype,
68 comm_size*recvcount, recvtype);
70 mpi_errno = MV2_Alltoall_function(tmp_buf, recvcount, recvtype,
71 recvbuf, recvcount, recvtype,
/* Temporary buffer released after the algorithm completes. */
73 smpi_free_tmp_buffer(tmp_buf);
/* Otherwise use the dedicated in-place implementation. */
75 mpi_errno = MPIR_Alltoall_inplace_MV2(sendbuf, sendcount, sendtype,
76 recvbuf, recvcount, recvtype,
/* NOTE(review): subsampled excerpt — elided lines include loop bodies and
 * closing braces; comments describe only what is visible. */
/* Selects an MVAPICH2 allgather algorithm: checks per-node (ppn) uniformity
 * for partial-subscription tuning, then walks the tuning tables by
 * communicator size and message size, preferring a two-level (SMP-aware)
 * algorithm when the topology allows it. */
87 int smpi_coll_tuned_allgather_mvapich2(void *sendbuf, int sendcount, MPI_Datatype sendtype,
88 void *recvbuf, int recvcount, MPI_Datatype recvtype,
92 int mpi_errno = MPI_SUCCESS;
93 int nbytes = 0, comm_size, recvtype_size;
95 int partial_sub_ok = 0;
97 int range_threshold = 0;
101 //MPI_Comm *shmem_commptr=NULL;
102 /* Get the size of the communicator */
103 comm_size = smpi_comm_size(comm);
104 recvtype_size=smpi_datatype_size(recvtype);
105 nbytes = recvtype_size * recvcount;
/* Lazily build the tuning tables on first call. */
107 if(mv2_allgather_table_ppn_conf==NULL)
108 init_mv2_allgather_tables_stampede();
/* Make sure the SMP (intra/inter node) communicator structure exists. */
110 if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
111 smpi_comm_init_smp(comm);
/* Partial subscription is only considered when all nodes host the same
 * number of processes; match local_size against the ppn configurations. */
115 if (smpi_comm_is_uniform(comm)){
116 shmem_comm = smpi_comm_get_intra_comm(comm);
117 local_size = smpi_comm_size(shmem_comm);
119 if (mv2_allgather_table_ppn_conf[0] == -1) {
120 // Indicating user defined tuning
125 if (local_size == mv2_allgather_table_ppn_conf[i]) {
131 } while(i < mv2_allgather_num_ppn_conf);
134 if (partial_sub_ok != 1) {
138 /* Search for the corresponding system size inside the tuning table */
139 while ((range < (mv2_size_allgather_tuning_table[conf_index] - 1)) &&
141 mv2_allgather_thresholds_table[conf_index][range].numproc)) {
144 /* Search for corresponding inter-leader function */
145 while ((range_threshold <
146 (mv2_allgather_thresholds_table[conf_index][range].size_inter_table - 1))
147 && (nbytes > mv2_allgather_thresholds_table[conf_index][range].inter_leader[range_threshold].max)
148 && (mv2_allgather_thresholds_table[conf_index][range].inter_leader[range_threshold].max !=
153 /* Set inter-leader pt */
154 MV2_Allgather_function =
155 mv2_allgather_thresholds_table[conf_index][range].inter_leader[range_threshold].
156 MV2_pt_Allgather_function;
158 is_two_level = mv2_allgather_thresholds_table[conf_index][range].two_level[range_threshold];
160 /* intracommunicator */
/* Two-level path requires partial-subscription match AND a blocked
 * (node-contiguous) rank layout; otherwise fall back progressively. */
161 if(is_two_level ==1){
162 if(partial_sub_ok ==1){
163 if (smpi_comm_is_blocked(comm)){
164 mpi_errno = MPIR_2lvl_Allgather_MV2(sendbuf, sendcount, sendtype,
165 recvbuf, recvcount, recvtype,
/* Fallback for non-blocked layouts: generic MPICH allgather. */
168 mpi_errno = smpi_coll_tuned_allgather_mpich(sendbuf, sendcount, sendtype,
169 recvbuf, recvcount, recvtype,
171 mpi_errno = MPIR_Allgather_RD_MV2(sendbuf, sendcount, sendtype,
174 recvbuf, recvcount, recvtype,
/* Single-level path: only the three known-safe implementations are called
 * through the function pointer; anything else is an error. */
177 } else if(MV2_Allgather_function == &MPIR_Allgather_Bruck_MV2
178 || MV2_Allgather_function == &MPIR_Allgather_RD_MV2
179 || MV2_Allgather_function == &MPIR_Allgather_Ring_MV2) {
180 mpi_errno = MV2_Allgather_function(sendbuf, sendcount, sendtype,
181 recvbuf, recvcount, recvtype,
184 return MPI_ERR_OTHER;
/* NOTE(review): subsampled excerpt — several guards and braces are elided. */
/* Selects an MVAPICH2 gather algorithm from the tuning table (by communicator
 * size, then message size), using a two-level intra-node/inter-leader pair
 * when the rank layout is blocked, else the plain MPICH gather. */
191 int smpi_coll_tuned_gather_mvapich2(void *sendbuf,
193 MPI_Datatype sendtype,
196 MPI_Datatype recvtype,
197 int root, MPI_Comm comm)
199 if(mv2_gather_thresholds_table==NULL)
200 init_mv2_gather_tables_stampede();
202 int mpi_errno = MPI_SUCCESS;
204 int range_threshold = 0;
205 int range_intra_threshold = 0;
208 int recvtype_size, sendtype_size;
210 comm_size = smpi_comm_size(comm);
211 rank = smpi_comm_rank(comm);
/* NOTE(review): two nbytes computations are visible back to back; in the
 * elided lines these are presumably guarded by a rank==root / else split
 * (root sizes by recvcnt, non-root by sendcnt) — confirm against the full
 * source before relying on either. */
214 recvtype_size=smpi_datatype_size(recvtype);
215 nbytes = recvcnt * recvtype_size;
217 sendtype_size=smpi_datatype_size(sendtype);
218 nbytes = sendcnt * sendtype_size;
221 /* Search for the corresponding system size inside the tuning table */
222 while ((range < (mv2_size_gather_tuning_table - 1)) &&
223 (comm_size > mv2_gather_thresholds_table[range].numproc)) {
226 /* Search for corresponding inter-leader function */
227 while ((range_threshold < (mv2_gather_thresholds_table[range].size_inter_table - 1))
229 mv2_gather_thresholds_table[range].inter_leader[range_threshold].max)
230 && (mv2_gather_thresholds_table[range].inter_leader[range_threshold].max !=
235 /* Search for corresponding intra node function */
236 while ((range_intra_threshold < (mv2_gather_thresholds_table[range].size_intra_table - 1))
238 mv2_gather_thresholds_table[range].intra_node[range_intra_threshold].max)
239 && (mv2_gather_thresholds_table[range].intra_node[range_intra_threshold].max !=
241 range_intra_threshold++;
/* Two-level gather only when ranks are laid out contiguously per node. */
244 if (smpi_comm_is_blocked(comm) ) {
245 // Set intra-node function pt for gather_two_level
246 MV2_Gather_intra_node_function =
247 mv2_gather_thresholds_table[range].intra_node[range_intra_threshold].
248 MV2_pt_Gather_function;
249 //Set inter-leader pt
250 MV2_Gather_inter_leader_function =
251 mv2_gather_thresholds_table[range].inter_leader[range_threshold].
252 MV2_pt_Gather_function;
253 // We call Gather function
255 MV2_Gather_inter_leader_function(sendbuf, sendcnt, sendtype, recvbuf, recvcnt,
256 recvtype, root, comm);
259 // Indeed, direct (non SMP-aware)gather is MPICH one
260 mpi_errno = smpi_coll_tuned_gather_mpich(sendbuf, sendcnt, sendtype,
261 recvbuf, recvcnt, recvtype,
/* NOTE(review): subsampled excerpt — some statements between visible lines
 * are elided. */
/* Selects an MVAPICH2 allgatherv algorithm from the tuning table; the size
 * threshold is compared against the TOTAL payload (sum of recvcounts), and
 * recursive doubling is only used for power-of-two communicator sizes. */
269 int smpi_coll_tuned_allgatherv_mvapich2(void *sendbuf, int sendcount, MPI_Datatype sendtype,
270 void *recvbuf, int *recvcounts, int *displs,
271 MPI_Datatype recvtype, MPI_Comm comm )
273 int mpi_errno = MPI_SUCCESS;
274 int range = 0, comm_size, total_count, recvtype_size, i;
275 int range_threshold = 0;
278 if(mv2_allgatherv_thresholds_table==NULL)
279 init_mv2_allgatherv_tables_stampede();
281 comm_size = smpi_comm_size(comm);
/* NOTE(review): total_count is accumulated below but its zero-initialization
 * is not visible in this excerpt (declared uninitialized at the top) —
 * verify the elided line sets total_count = 0 before this loop. */
283 for (i = 0; i < comm_size; i++)
284 total_count += recvcounts[i];
286 recvtype_size=smpi_datatype_size(recvtype);
287 nbytes = total_count * recvtype_size;
289 /* Search for the corresponding system size inside the tuning table */
290 while ((range < (mv2_size_allgatherv_tuning_table - 1)) &&
291 (comm_size > mv2_allgatherv_thresholds_table[range].numproc)) {
294 /* Search for corresponding inter-leader function */
/* Threshold here scales with comm_size (per-rank max times ranks). */
295 while ((range_threshold < (mv2_allgatherv_thresholds_table[range].size_inter_table - 1))
297 comm_size * mv2_allgatherv_thresholds_table[range].inter_leader[range_threshold].max)
298 && (mv2_allgatherv_thresholds_table[range].inter_leader[range_threshold].max !=
302 /* Set inter-leader pt */
303 MV2_Allgatherv_function =
304 mv2_allgatherv_thresholds_table[range].inter_leader[range_threshold].
305 MV2_pt_Allgatherv_function;
/* Recursive doubling requires a power-of-two comm_size; the bit trick
 * (comm_size & (comm_size - 1)) == 0 tests exactly that. Otherwise use
 * Bruck's algorithm. */
307 if (MV2_Allgatherv_function == &MPIR_Allgatherv_Rec_Doubling_MV2)
309 if(!(comm_size & (comm_size - 1)))
312 MPIR_Allgatherv_Rec_Doubling_MV2(sendbuf, sendcount,
318 MPIR_Allgatherv_Bruck_MV2(sendbuf, sendcount,
325 MV2_Allgatherv_function(sendbuf, sendcount, sendtype,
326 recvbuf, recvcounts, displs,
/* NOTE(review): subsampled excerpt — loop bodies and braces are elided. */
/* Selects an MVAPICH2 allreduce algorithm: skips multicast-based entries when
 * mcast is unavailable, picks inter-leader and intra-node functions from the
 * tuning table, and dispatches either the two-level SMP-aware algorithm or a
 * point-to-point recursive-doubling fallback. */
335 int smpi_coll_tuned_allreduce_mvapich2(void *sendbuf,
338 MPI_Datatype datatype,
339 MPI_Op op, MPI_Comm comm)
342 int mpi_errno = MPI_SUCCESS;
346 comm_size = smpi_comm_size(comm);
347 //rank = smpi_comm_rank(comm);
353 if (mv2_allreduce_thresholds_table == NULL)
354 init_mv2_allreduce_tables_stampede();
356 /* check if multiple threads are calling this collective function */
358 MPI_Aint sendtype_size = 0;
360 int range = 0, range_threshold = 0, range_threshold_intra = 0;
361 int is_two_level = 0;
362 int is_commutative = 0;
363 MPI_Aint true_lb, true_extent;
365 sendtype_size=smpi_datatype_size(datatype);
366 nbytes = count * sendtype_size;
368 smpi_datatype_extent(datatype, &true_lb, &true_extent);
/* NOTE(review): is_commutative is initialized to 0 and the visible
 * assignment below is commented out — verify against the full source
 * whether an elided line sets it, since the two-level branch tests it. */
370 //is_commutative = smpi_op_is_commute(op);
373 /* Search for the corresponding system size inside the tuning table */
374 while ((range < (mv2_size_allreduce_tuning_table - 1)) &&
375 (comm_size > mv2_allreduce_thresholds_table[range].numproc)) {
378 /* Search for corresponding inter-leader function */
379 /* skip mcast pointers if mcast is not available */
380 if(mv2_allreduce_thresholds_table[range].mcast_enabled != 1){
381 while ((range_threshold < (mv2_allreduce_thresholds_table[range].size_inter_table - 1))
382 && ((mv2_allreduce_thresholds_table[range].
383 inter_leader[range_threshold].MV2_pt_Allreduce_function
384 == &MPIR_Allreduce_mcst_reduce_redscat_gather_MV2) ||
385 (mv2_allreduce_thresholds_table[range].
386 inter_leader[range_threshold].MV2_pt_Allreduce_function
387 == &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2)
/* Standard message-size threshold walk once mcast entries are skipped. */
392 while ((range_threshold < (mv2_allreduce_thresholds_table[range].size_inter_table - 1))
394 mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].max)
395 && (mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
398 if(mv2_allreduce_thresholds_table[range].is_two_level_allreduce[range_threshold] == 1){
401 /* Search for corresponding intra-node function */
402 while ((range_threshold_intra <
403 (mv2_allreduce_thresholds_table[range].size_intra_table - 1))
405 mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].max)
406 && (mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].max !=
408 range_threshold_intra++;
411 MV2_Allreduce_function = mv2_allreduce_thresholds_table[range].inter_leader[range_threshold]
412 .MV2_pt_Allreduce_function;
414 MV2_Allreduce_intra_function = mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra]
415 .MV2_pt_Allreduce_function;
417 /* check if mcast is ready, otherwise replace mcast with other algorithm */
418 if((MV2_Allreduce_function == &MPIR_Allreduce_mcst_reduce_redscat_gather_MV2)||
419 (MV2_Allreduce_function == &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2)){
421 MV2_Allreduce_function = &MPIR_Allreduce_pt2pt_rd_MV2;
423 if(is_two_level != 1) {
424 MV2_Allreduce_function = &MPIR_Allreduce_pt2pt_rd_MV2;
/* Two-level path needs a commutative op and an initialized SMP structure;
 * non-commutative ops fall back to point-to-point recursive doubling. */
428 if(is_two_level == 1){
429 // check if shm is ready, if not use other algorithm first
430 if (is_commutative) {
431 if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
432 smpi_comm_init_smp(comm);
434 mpi_errno = MPIR_Allreduce_two_level_MV2(sendbuf, recvbuf, count,
437 mpi_errno = MPIR_Allreduce_pt2pt_rd_MV2(sendbuf, recvbuf, count,
441 mpi_errno = MV2_Allreduce_function(sendbuf, recvbuf, count,
446 //comm->ch.intra_node_done=0;
/* Alltoallv selector: MVAPICH2 has no tuned table here, so in-place calls are
 * routed to the OMPI basic-linear implementation (which supports
 * MPI_IN_PLACE) and everything else to the ring algorithm.
 * NOTE(review): signature/argument lines are partially elided in this
 * excerpt (sdtype/rdtype/comm parameters not visible). */
454 int smpi_coll_tuned_alltoallv_mvapich2(void *sbuf, int *scounts, int *sdisps,
456 void *rbuf, int *rcounts, int *rdisps,
462 if (sbuf == MPI_IN_PLACE) {
463 return smpi_coll_tuned_alltoallv_ompi_basic_linear(sbuf, scounts, sdisps, sdtype,
464 rbuf, rcounts, rdisps,rdtype,
466 } else /* For starters, just keep the original algorithm. */
467 return smpi_coll_tuned_alltoallv_ring(sbuf, scounts, sdisps, sdtype,
468 rbuf, rcounts, rdisps,rdtype,
/* Barrier selector: unconditionally delegates to the pairwise-exchange
 * barrier implementation. (Body braces are elided in this excerpt.) */
473 int smpi_coll_tuned_barrier_mvapich2(MPI_Comm comm)
475 return smpi_coll_tuned_barrier_mvapich2_pair(comm);
/* NOTE(review): subsampled excerpt — many statements/braces are elided;
 * comments describe only what the visible lines establish. */
/* Selects an MVAPICH2 broadcast algorithm: walks the tuning table by
 * communicator size then message size, configures knomial factors and the
 * pipeline segment size, and runs either a two-level (inter-node phase
 * followed by intra-node phase over the shmem communicator) or a flat
 * broadcast.  Non-contiguous or heterogeneous data is staged through a
 * packed temporary buffer broadcast as MPI_BYTE. */
481 int smpi_coll_tuned_bcast_mvapich2(void *buffer,
483 MPI_Datatype datatype,
484 int root, MPI_Comm comm)
486 int mpi_errno = MPI_SUCCESS;
487 int comm_size/*, rank*/;
488 int two_level_bcast = 1;
491 int range_threshold = 0;
492 int range_threshold_intra = 0;
493 int is_homogeneous, is_contig;
496 void *tmp_buf = NULL;
498 //MPID_Datatype *dtp;
/* Ensure SMP communicator structure exists before any two-level logic. */
502 if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
503 smpi_comm_init_smp(comm);
505 if(!mv2_bcast_thresholds_table)
506 init_mv2_bcast_tables_stampede();
507 comm_size = smpi_comm_size(comm);
508 //rank = smpi_comm_rank(comm);
511 /* if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)*/
514 /* MPID_Datatype_get_ptr(datatype, dtp);*/
515 /* is_contig = dtp->is_contig;*/
520 /* MPI_Type_size() might not give the accurate size of the packed
521 * datatype for heterogeneous systems (because of padding, encoding,
522 * etc). On the other hand, MPI_Pack_size() can become very
523 * expensive, depending on the implementation, especially for
524 * heterogeneous systems. We want to use MPI_Type_size() wherever
525 * possible, and MPI_Pack_size() in other places.
527 //if (is_homogeneous) {
528 type_size=smpi_datatype_size(datatype);
531 MPIR_Pack_size_impl(1, datatype, &type_size);
533 nbytes = (size_t) (count) * (type_size);
535 /* Search for the corresponding system size inside the tuning table */
536 while ((range < (mv2_size_bcast_tuning_table - 1)) &&
537 (comm_size > mv2_bcast_thresholds_table[range].numproc)) {
540 /* Search for corresponding inter-leader function */
541 while ((range_threshold < (mv2_bcast_thresholds_table[range].size_inter_table - 1))
543 mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max)
544 && (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
548 /* Search for corresponding intra-node function */
549 while ((range_threshold_intra <
550 (mv2_bcast_thresholds_table[range].size_intra_table - 1))
552 mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max)
553 && (mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max !=
555 range_threshold_intra++;
/* Bind inter-leader and intra-node broadcast function pointers. */
559 mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
560 MV2_pt_Bcast_function;
562 MV2_Bcast_intra_node_function =
563 mv2_bcast_thresholds_table[range].
564 intra_node[range_threshold_intra].MV2_pt_Bcast_function;
566 /* if (mv2_user_bcast_intra == NULL && */
567 /* MV2_Bcast_intra_node_function == &MPIR_Knomial_Bcast_intra_node_MV2) {*/
568 /* MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;*/
/* Zero-copy pipelined knomial factor: table value wins unless the global
 * override mv2_pipelined_zcpy_knomial_factor is set. */
571 if (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
572 zcpy_pipelined_knomial_factor != -1) {
573 zcpy_knomial_factor =
574 mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
575 zcpy_pipelined_knomial_factor;
578 if (mv2_pipelined_zcpy_knomial_factor != -1) {
579 zcpy_knomial_factor = mv2_pipelined_zcpy_knomial_factor;
582 if(MV2_Bcast_intra_node_function == NULL) {
583 /* if tuning table do not have any intra selection, set func pointer to
584 ** default one for mcast intra node */
585 MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;
588 /* Set value of pipeline segment size */
589 bcast_segment_size = mv2_bcast_thresholds_table[range].bcast_segment_size;
591 /* Set value of inter node knomial factor */
592 mv2_inter_node_knomial_factor = mv2_bcast_thresholds_table[range].inter_node_knomial_factor;
594 /* Set value of intra node knomial factor */
595 mv2_intra_node_knomial_factor = mv2_bcast_thresholds_table[range].intra_node_knomial_factor;
597 /* Check if we will use a two level algorithm or not */
599 #if defined(_MCST_SUPPORT_)
600 mv2_bcast_thresholds_table[range].is_two_level_bcast[range_threshold]
601 || comm->ch.is_mcast_ok;
603 mv2_bcast_thresholds_table[range].is_two_level_bcast[range_threshold];
605 if (two_level_bcast == 1) {
/* Non-contiguous/heterogeneous data: stage through a byte buffer. */
606 if (!is_contig || !is_homogeneous) {
607 tmp_buf=(void *)smpi_get_tmp_sendbuffer(nbytes);
610 /* if (rank == root) {*/
612 /* MPIR_Pack_impl(buffer, count, datatype, tmp_buf, nbytes, &position);*/
614 /* MPIU_ERR_POP(mpi_errno);*/
617 #ifdef CHANNEL_MRAIL_GEN2
618 if ((mv2_enable_zcpy_bcast == 1) &&
619 (&MPIR_Pipelined_Bcast_Zcpy_MV2 == MV2_Bcast_function)) {
620 if (!is_contig || !is_homogeneous) {
621 mpi_errno = MPIR_Pipelined_Bcast_Zcpy_MV2(tmp_buf, nbytes, MPI_BYTE,
624 mpi_errno = MPIR_Pipelined_Bcast_Zcpy_MV2(buffer, count, datatype,
628 #endif /* defined(CHANNEL_MRAIL_GEN2) */
/* Phase 1: inter-node broadcast among node leaders. */
630 shmem_comm = smpi_comm_get_intra_comm(comm);
631 if (!is_contig || !is_homogeneous) {
633 MPIR_Bcast_tune_inter_node_helper_MV2(tmp_buf, nbytes, MPI_BYTE,
637 MPIR_Bcast_tune_inter_node_helper_MV2(buffer, count, datatype, root,
641 /* We are now done with the inter-node phase */
/* Phase 2: intra-node broadcast; root is rebased to the node-local root. */
644 root = INTRA_NODE_ROOT;
647 if (!is_contig || !is_homogeneous) {
648 mpi_errno = MV2_Bcast_intra_node_function(tmp_buf, nbytes,
649 MPI_BYTE, root, shmem_comm);
651 mpi_errno = MV2_Bcast_intra_node_function(buffer, count,
652 datatype, root, shmem_comm);
656 /* if (!is_contig || !is_homogeneous) {*/
657 /* if (rank != root) {*/
659 /* mpi_errno = MPIR_Unpack_impl(tmp_buf, nbytes, &position, buffer,*/
660 /* count, datatype);*/
/* Flat (single-level) path. */
664 /* We use Knomial for intra node */
665 MV2_Bcast_intra_node_function = &MPIR_Knomial_Bcast_intra_node_MV2;
666 /* if (mv2_enable_shmem_bcast == 0) {*/
667 /* Fall back to non-tuned version */
668 /* MPIR_Bcast_intra_MV2(buffer, count, datatype, root, comm);*/
670 mpi_errno = MV2_Bcast_function(buffer, count, datatype, root,
/* NOTE(review): subsampled excerpt — statements and braces between visible
 * lines are elided. */
/* Selects an MVAPICH2 reduce algorithm from the tuning table by communicator
 * size and message size, configures knomial degrees, and dispatches: the
 * two-level SMP-aware reduce for commutative ops, or binomial fallbacks when
 * the selected algorithm's preconditions (commutativity, count >= pof2) do
 * not hold. */
683 int smpi_coll_tuned_reduce_mvapich2( void *sendbuf,
686 MPI_Datatype datatype,
687 MPI_Op op, int root, MPI_Comm comm)
689 if(mv2_reduce_thresholds_table == NULL)
690 init_mv2_reduce_tables_stampede();
692 int mpi_errno = MPI_SUCCESS;
694 int range_threshold = 0;
695 int range_intra_threshold = 0;
696 int is_commutative, pof2;
700 int is_two_level = 0;
702 comm_size = smpi_comm_size(comm);
703 sendtype_size=smpi_datatype_size(datatype);
704 nbytes = count * sendtype_size;
709 is_commutative = smpi_op_is_commute(op);
711 /* find nearest power-of-two less than or equal to comm_size */
/* NOTE(review): this loop exits with pof2 > comm_size; the corrective
 * pof2 >>= 1 is on an elided line — confirm against the full source. */
712 for( pof2 = 1; pof2 <= comm_size; pof2 <<= 1 );
716 /* Search for the corresponding system size inside the tuning table */
717 while ((range < (mv2_size_reduce_tuning_table - 1)) &&
718 (comm_size > mv2_reduce_thresholds_table[range].numproc)) {
721 /* Search for corresponding inter-leader function */
722 while ((range_threshold < (mv2_reduce_thresholds_table[range].size_inter_table - 1))
724 mv2_reduce_thresholds_table[range].inter_leader[range_threshold].max)
725 && (mv2_reduce_thresholds_table[range].inter_leader[range_threshold].max !=
730 /* Search for corresponding intra node function */
731 while ((range_intra_threshold < (mv2_reduce_thresholds_table[range].size_intra_table - 1))
733 mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].max)
734 && (mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].max !=
736 range_intra_threshold++;
739 /* Set intra-node function pt for reduce_two_level */
740 MV2_Reduce_intra_function =
741 mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].
742 MV2_pt_Reduce_function;
743 /* Set inter-leader pt */
744 MV2_Reduce_function =
745 mv2_reduce_thresholds_table[range].inter_leader[range_threshold].
746 MV2_pt_Reduce_function;
/* Knomial degrees: only overwrite when the globals are unset (< 0). */
748 if(mv2_reduce_intra_knomial_factor<0)
750 mv2_reduce_intra_knomial_factor = mv2_reduce_thresholds_table[range].intra_k_degree;
752 if(mv2_reduce_inter_knomial_factor<0)
754 mv2_reduce_inter_knomial_factor = mv2_reduce_thresholds_table[range].inter_k_degree;
756 if(mv2_reduce_thresholds_table[range].is_two_level_reduce[range_threshold] == 1){
759 /* We call Reduce function */
/* Two-level path: requires commutative op and initialized SMP structure. */
760 if(is_two_level == 1)
762 if (is_commutative == 1) {
763 if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
764 smpi_comm_init_smp(comm);
766 mpi_errno = MPIR_Reduce_two_level_helper_MV2(sendbuf, recvbuf, count,
767 datatype, op, root, comm);
769 mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
770 datatype, op, root, comm);
/* Knomial wrapper also requires commutativity; else binomial. */
772 } else if(MV2_Reduce_function == &MPIR_Reduce_inter_knomial_wrapper_MV2 ){
773 if(is_commutative ==1)
775 mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
776 datatype, op, root, comm);
778 mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
779 datatype, op, root, comm);
/* Reduce-scatter+gather needs at least pof2 elements; else binomial. */
781 } else if(MV2_Reduce_function == &MPIR_Reduce_redscat_gather_MV2){
782 if (/*(HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) &&*/ (count >= pof2))
784 mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
785 datatype, op, root, comm);
787 mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
788 datatype, op, root, comm);
791 mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
792 datatype, op, root, comm);
/* NOTE(review): subsampled excerpt — statements and braces are elided. */
/* Selects an MVAPICH2 reduce_scatter algorithm.  Commutative ops go through
 * the tuning table; non-commutative ops use the pairwise-exchange algorithm
 * only for power-of-two, block-regular counts, and otherwise fall back to
 * the MPICH recursive-doubling implementation. */
801 int smpi_coll_tuned_reduce_scatter_mvapich2(void *sendbuf, void *recvbuf, int *recvcnts,
802 MPI_Datatype datatype, MPI_Op op,
805 int mpi_errno = MPI_SUCCESS;
806 int i = 0, comm_size = smpi_comm_size(comm), total_count = 0, type_size =
809 int range_threshold = 0;
810 int is_commutative = 0;
/* NOTE(review): disps is heap-allocated with xbt_malloc; the matching free
 * is not visible in this excerpt — verify it is released on all paths. */
811 int *disps = xbt_malloc(comm_size * sizeof (int));
813 if(mv2_red_scat_thresholds_table==NULL)
814 init_mv2_reduce_scatter_tables_stampede();
816 is_commutative=smpi_op_is_commute(op);
/* Build displacement table and total element count from recvcnts. */
817 for (i = 0; i < comm_size; i++) {
818 disps[i] = total_count;
819 total_count += recvcnts[i];
822 type_size=smpi_datatype_size(datatype);
823 nbytes = total_count * type_size;
825 if (is_commutative) {
827 /* Search for the corresponding system size inside the tuning table */
828 while ((range < (mv2_size_red_scat_tuning_table - 1)) &&
829 (comm_size > mv2_red_scat_thresholds_table[range].numproc)) {
832 /* Search for corresponding inter-leader function */
833 while ((range_threshold < (mv2_red_scat_thresholds_table[range].size_inter_table - 1))
835 mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].max)
836 && (mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].max !=
841 /* Set inter-leader pt */
842 MV2_Red_scat_function =
843 mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].
844 MV2_pt_Red_scat_function;
846 mpi_errno = MV2_Red_scat_function(sendbuf, recvbuf,
/* Non-commutative path: detect uniform block sizes across all ranks. */
850 int is_block_regular = 1;
851 for (i = 0; i < (comm_size - 1); ++i) {
852 if (recvcnts[i] != recvcnts[i+1]) {
853 is_block_regular = 0;
/* Round comm_size up to the next power of two for the pof2 test. */
858 while (pof2 < comm_size) pof2 <<= 1;
859 if (pof2 == comm_size && is_block_regular) {
860 /* noncommutative, pof2 size, and block regular */
861 mpi_errno = MPIR_Reduce_scatter_non_comm_MV2(sendbuf, recvbuf,
865 mpi_errno = smpi_coll_tuned_reduce_scatter_mpich_rdb(sendbuf, recvbuf,
/* NOTE(review): subsampled excerpt AND the function continues past the end
 * of this view — the tail (final dispatch/return) is not visible. */
/* Selects an MVAPICH2 scatter algorithm: checks ppn uniformity for partial
 * subscription, walks the tuning tables by communicator size and message
 * size, resolves the multicast wrapper to a concrete implementation, and
 * dispatches two-level or flat scatter accordingly. */
876 int smpi_coll_tuned_scatter_mvapich2(void *sendbuf,
878 MPI_Datatype sendtype,
881 MPI_Datatype recvtype,
882 int root, MPI_Comm comm)
884 int range = 0, range_threshold = 0, range_threshold_intra = 0;
885 int mpi_errno = MPI_SUCCESS;
886 // int mpi_errno_ret = MPI_SUCCESS;
887 int rank, nbytes, comm_size;
888 int recvtype_size, sendtype_size;
889 int partial_sub_ok = 0;
894 // MPID_Comm *shmem_commptr=NULL;
895 if(mv2_scatter_thresholds_table==NULL)
896 init_mv2_scatter_tables_stampede();
/* Ensure SMP communicator structure exists before topology queries. */
898 if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
899 smpi_comm_init_smp(comm);
902 comm_size = smpi_comm_size(comm);
904 rank = smpi_comm_rank(comm);
/* NOTE(review): two nbytes computations visible back to back; presumably
 * guarded by rank==root / else in the elided lines — confirm against the
 * full source. */
907 sendtype_size=smpi_datatype_size(sendtype);
908 nbytes = sendcnt * sendtype_size;
910 recvtype_size=smpi_datatype_size(recvtype);
911 nbytes = recvcnt * recvtype_size;
914 // check if safe to use partial subscription mode
915 if (smpi_comm_is_uniform(comm)) {
917 shmem_comm = smpi_comm_get_intra_comm(comm);
918 local_size = smpi_comm_size(shmem_comm);
920 if (mv2_scatter_table_ppn_conf[0] == -1) {
921 // Indicating user defined tuning
925 if (local_size == mv2_scatter_table_ppn_conf[i]) {
931 } while(i < mv2_scatter_num_ppn_conf);
935 if (partial_sub_ok != 1) {
939 /* Search for the corresponding system size inside the tuning table */
940 while ((range < (mv2_size_scatter_tuning_table[conf_index] - 1)) &&
941 (comm_size > mv2_scatter_thresholds_table[conf_index][range].numproc)) {
944 /* Search for corresponding inter-leader function */
945 while ((range_threshold < (mv2_scatter_thresholds_table[conf_index][range].size_inter_table - 1))
947 mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max)
948 && (mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max != -1)) {
952 /* Search for corresponding intra-node function */
953 while ((range_threshold_intra <
954 (mv2_scatter_thresholds_table[conf_index][range].size_intra_table - 1))
956 mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max)
957 && (mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max !=
959 range_threshold_intra++;
962 MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold]
963 .MV2_pt_Scatter_function;
/* The mcast wrapper is only a placeholder: resolve it to the real mcast
 * scatter when multicast is usable, to the next table entry when one
 * exists, or to binomial scatter as last resort. */
965 if(MV2_Scatter_function == &MPIR_Scatter_mcst_wrap_MV2) {
966 #if defined(_MCST_SUPPORT_)
967 if(comm->ch.is_mcast_ok == 1
968 && mv2_use_mcast_scatter == 1
969 && comm->ch.shmem_coll_ok == 1) {
970 MV2_Scatter_function = &MPIR_Scatter_mcst_MV2;
972 #endif /*#if defined(_MCST_SUPPORT_) */
974 if(mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1].
975 MV2_pt_Scatter_function != NULL) {
976 MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1]
977 .MV2_pt_Scatter_function;
980 MV2_Scatter_function = &MPIR_Scatter_MV2_Binomial;
/* Two-level scatter requires a blocked rank layout; otherwise fall back
 * to binomial. */
985 if( (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Direct) ||
986 (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Binomial)) {
987 if( smpi_comm_is_blocked(comm)) {
988 MV2_Scatter_intra_function = mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra]
989 .MV2_pt_Scatter_function;
992 MV2_Scatter_function(sendbuf, sendcnt, sendtype,
993 recvbuf, recvcnt, recvtype, root,
996 mpi_errno = MPIR_Scatter_MV2_Binomial(sendbuf, sendcnt, sendtype,
997 recvbuf, recvcnt, recvtype, root,
1002 mpi_errno = MV2_Scatter_function(sendbuf, sendcnt, sendtype,
1003 recvbuf, recvcnt, recvtype, root,