1 /* selector for collective algorithms based on mvapich decision logic */
3 /* Copyright (c) 2009-2010, 2013-2014. The SimGrid Team.
4 * All rights reserved. */
6 /* This program is free software; you can redistribute it and/or modify it
7 * under the terms of the license (GNU LGPL) which comes with this package. */
9 #include "colls_private.h"
11 #include "smpi_mvapich2_selector_stampede.h"
// Select and run an MPI_Alltoall implementation from the MVAPICH2 Stampede
// tuning tables, keyed on communicator size and per-peer message size.
// NOTE(review): several lines of the original decision logic are elided in
// this view; comments describe only the visible code.
18 int Coll_alltoall_mvapich2::alltoall( void *sendbuf, int sendcount,
19 MPI_Datatype sendtype,
20 void* recvbuf, int recvcount,
21 MPI_Datatype recvtype,
// Lazily build the tuning tables on first use.
25 if(mv2_alltoall_table_ppn_conf==NULL)
26 init_mv2_alltoall_tables_stampede();
28 int sendtype_size, recvtype_size, comm_size;
29 char * tmp_buf = NULL;
30 int mpi_errno=MPI_SUCCESS;
32 int range_threshold = 0;
34 comm_size = comm->size();
36 sendtype_size=sendtype->size();
37 recvtype_size=recvtype->size();
// Bytes sent to each peer; used to index the size-based threshold table.
38 long nbytes = sendtype_size * sendcount;
40 /* check if safe to use partial subscription mode */
42 /* Search for the corresponding system size inside the tuning table */
43 while ((range < (mv2_size_alltoall_tuning_table[conf_index] - 1)) &&
44 (comm_size > mv2_alltoall_thresholds_table[conf_index][range].numproc)) {
47 /* Search for corresponding inter-leader function */
48 while ((range_threshold < (mv2_alltoall_thresholds_table[conf_index][range].size_table - 1))
50 mv2_alltoall_thresholds_table[conf_index][range].algo_table[range_threshold].max)
51 && (mv2_alltoall_thresholds_table[conf_index][range].algo_table[range_threshold].max != -1)) {
// Algorithm chosen from the table for the selected size range.
54 MV2_Alltoall_function = mv2_alltoall_thresholds_table[conf_index][range].algo_table[range_threshold]
55 .MV2_pt_Alltoall_function;
57 if(sendbuf != MPI_IN_PLACE) {
58 mpi_errno = MV2_Alltoall_function(sendbuf, sendcount, sendtype,
59 recvbuf, recvcount, recvtype,
// MPI_IN_PLACE path: when outside the in-place algorithm's size window,
// stage recvbuf through a temporary buffer and reuse the selected function.
64 mv2_alltoall_thresholds_table[conf_index][range].in_place_algo_table[range_threshold].min
65 ||nbytes > mv2_alltoall_thresholds_table[conf_index][range].in_place_algo_table[range_threshold].max
67 tmp_buf = (char *)smpi_get_tmp_sendbuffer( comm_size * recvcount * recvtype_size );
68 mpi_errno = Datatype::copy((char *)recvbuf,
69 comm_size*recvcount, recvtype,
71 comm_size*recvcount, recvtype);
73 mpi_errno = MV2_Alltoall_function(tmp_buf, recvcount, recvtype,
74 recvbuf, recvcount, recvtype,
76 smpi_free_tmp_buffer(tmp_buf);
// Otherwise delegate to the dedicated in-place implementation.
78 mpi_errno = MPIR_Alltoall_inplace_MV2(sendbuf, sendcount, sendtype,
79 recvbuf, recvcount, recvtype,
// Select and run an MPI_Allgather implementation from the MVAPICH2 Stampede
// tuning tables. Chooses between two-level (SMP-aware) and flat algorithms
// based on communicator shape and message size.
// NOTE(review): interior lines are elided in this view; comments describe
// only the visible code.
88 int Coll_allgather_mvapich2::allgather(void *sendbuf, int sendcount, MPI_Datatype sendtype,
89 void *recvbuf, int recvcount, MPI_Datatype recvtype,
93 int mpi_errno = MPI_SUCCESS;
94 long nbytes = 0, comm_size, recvtype_size;
96 int partial_sub_ok = 0;
98 int range_threshold = 0;
102 //MPI_Comm *shmem_commptr=NULL;
103 /* Get the size of the communicator */
104 comm_size = comm->size();
105 recvtype_size=recvtype->size();
106 nbytes = recvtype_size * recvcount;
// Lazily build the tuning tables on first use.
108 if(mv2_allgather_table_ppn_conf==NULL)
109 init_mv2_allgather_tables_stampede();
111 if(comm->get_leaders_comm()==MPI_COMM_NULL){
// Partial-subscription check: match the node-local process count against
// the per-ppn tuning configurations.
116 if (comm->is_uniform()){
117 shmem_comm = comm->get_intra_comm();
118 local_size = shmem_comm->size();
120 if (mv2_allgather_table_ppn_conf[0] == -1) {
121 // Indicating user defined tuning
126 if (local_size == mv2_allgather_table_ppn_conf[i]) {
132 } while(i < mv2_allgather_num_ppn_conf);
135 if (partial_sub_ok != 1) {
139 /* Search for the corresponding system size inside the tuning table */
140 while ((range < (mv2_size_allgather_tuning_table[conf_index] - 1)) &&
142 mv2_allgather_thresholds_table[conf_index][range].numproc)) {
145 /* Search for corresponding inter-leader function */
146 while ((range_threshold <
147 (mv2_allgather_thresholds_table[conf_index][range].size_inter_table - 1))
148 && (nbytes > mv2_allgather_thresholds_table[conf_index][range].inter_leader[range_threshold].max)
149 && (mv2_allgather_thresholds_table[conf_index][range].inter_leader[range_threshold].max !=
154 /* Set inter-leader pt */
156 mv2_allgather_thresholds_table[conf_index][range].inter_leader[range_threshold].
157 MV2_pt_Allgatherction;
159 is_two_level = mv2_allgather_thresholds_table[conf_index][range].two_level[range_threshold];
161 /* intracommunicator */
// Two-level path requires partial subscription and a blocked process
// layout; otherwise fall back to the MPICH or recursive-doubling variants.
162 if(is_two_level ==1){
163 if(partial_sub_ok ==1){
164 if (comm->is_blocked()){
165 mpi_errno = MPIR_2lvl_Allgather_MV2(sendbuf, sendcount, sendtype,
166 recvbuf, recvcount, recvtype,
169 mpi_errno = Coll_allgather_mpich::allgather(sendbuf, sendcount, sendtype,
170 recvbuf, recvcount, recvtype,
174 mpi_errno = MPIR_Allgather_RD_MV2(sendbuf, sendcount, sendtype,
175 recvbuf, recvcount, recvtype,
// Only call through the pointer if it is one of the known flat algorithms.
178 } else if(MV2_Allgatherction == &MPIR_Allgather_Bruck_MV2
179 || MV2_Allgatherction == &MPIR_Allgather_RD_MV2
180 || MV2_Allgatherction == &MPIR_Allgather_Ring_MV2) {
181 mpi_errno = MV2_Allgatherction(sendbuf, sendcount, sendtype,
182 recvbuf, recvcount, recvtype,
185 return MPI_ERR_OTHER;
// Select and run an MPI_Gather implementation from the MVAPICH2 Stampede
// tuning tables. Uses a two-level (intra-node + inter-leader) scheme when
// the communicator layout is blocked, otherwise falls back to MPICH gather.
// NOTE(review): interior lines are elided in this view.
191 int Coll_gather_mvapich2::gather(void *sendbuf,
193 MPI_Datatype sendtype,
196 MPI_Datatype recvtype,
197 int root, MPI_Comm comm)
// Lazily build the tuning tables on first use.
199 if(mv2_gather_thresholds_table==NULL)
200 init_mv2_gather_tables_stampede();
202 int mpi_errno = MPI_SUCCESS;
204 int range_threshold = 0;
205 int range_intra_threshold = 0;
208 int recvtype_size, sendtype_size;
210 comm_size = comm->size();
// nbytes is computed from the receive side and from the send side below;
// NOTE(review): presumably the elided lines branch on rank == root so only
// one of the two assignments executes — confirm against the full file.
214 recvtype_size=recvtype->size();
215 nbytes = recvcnt * recvtype_size;
217 sendtype_size=sendtype->size();
218 nbytes = sendcnt * sendtype_size;
221 /* Search for the corresponding system size inside the tuning table */
222 while ((range < (mv2_size_gather_tuning_table - 1)) &&
223 (comm_size > mv2_gather_thresholds_table[range].numproc)) {
226 /* Search for corresponding inter-leader function */
227 while ((range_threshold < (mv2_gather_thresholds_table[range].size_inter_table - 1))
229 mv2_gather_thresholds_table[range].inter_leader[range_threshold].max)
230 && (mv2_gather_thresholds_table[range].inter_leader[range_threshold].max !=
235 /* Search for corresponding intra node function */
236 while ((range_intra_threshold < (mv2_gather_thresholds_table[range].size_intra_table - 1))
238 mv2_gather_thresholds_table[range].intra_node[range_intra_threshold].max)
239 && (mv2_gather_thresholds_table[range].intra_node[range_intra_threshold].max !=
241 range_intra_threshold++;
// Two-level gather only when processes are laid out in node-contiguous blocks.
244 if (comm->is_blocked() ) {
245 // Set intra-node function pt for gather_two_level
246 MV2_Gather_intra_node_function =
247 mv2_gather_thresholds_table[range].intra_node[range_intra_threshold].
248 MV2_pt_Gather_function;
249 //Set inter-leader pt
250 MV2_Gather_inter_leader_function =
251 mv2_gather_thresholds_table[range].inter_leader[range_threshold].
252 MV2_pt_Gather_function;
253 // We call Gather function
255 MV2_Gather_inter_leader_function(sendbuf, sendcnt, sendtype, recvbuf, recvcnt,
256 recvtype, root, comm);
259 // Indeed, the direct (non SMP-aware) gather is the MPICH one
260 mpi_errno = Coll_gather_mpich::gather(sendbuf, sendcnt, sendtype,
261 recvbuf, recvcnt, recvtype,
// Select and run an MPI_Allgatherv implementation from the MVAPICH2
// Stampede tuning tables, keyed on communicator size and total payload.
// Recursive doubling is only used for power-of-two communicator sizes;
// otherwise Bruck's algorithm is used.
// NOTE(review): interior lines are elided in this view.
268 int Coll_allgatherv_mvapich2::allgatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype,
269 void *recvbuf, int *recvcounts, int *displs,
270 MPI_Datatype recvtype, MPI_Comm comm )
272 int mpi_errno = MPI_SUCCESS;
273 int range = 0, comm_size, total_count, recvtype_size, i;
274 int range_threshold = 0;
// Lazily build the tuning tables on first use.
277 if(mv2_allgatherv_thresholds_table==NULL)
278 init_mv2_allgatherv_tables_stampede();
280 comm_size = comm->size();
// NOTE(review): total_count must be zero-initialized before this loop —
// presumably done on an elided line; confirm against the full file.
282 for (i = 0; i < comm_size; i++)
283 total_count += recvcounts[i];
285 recvtype_size=recvtype->size();
286 nbytes = total_count * recvtype_size;
288 /* Search for the corresponding system size inside the tuning table */
289 while ((range < (mv2_size_allgatherv_tuning_table - 1)) &&
290 (comm_size > mv2_allgatherv_thresholds_table[range].numproc)) {
293 /* Search for corresponding inter-leader function */
294 while ((range_threshold < (mv2_allgatherv_thresholds_table[range].size_inter_table - 1))
296 comm_size * mv2_allgatherv_thresholds_table[range].inter_leader[range_threshold].max)
297 && (mv2_allgatherv_thresholds_table[range].inter_leader[range_threshold].max !=
301 /* Set inter-leader pt */
302 MV2_Allgatherv_function =
303 mv2_allgatherv_thresholds_table[range].inter_leader[range_threshold].
304 MV2_pt_Allgatherv_function;
// Recursive doubling requires a power-of-two communicator size.
306 if (MV2_Allgatherv_function == &MPIR_Allgatherv_Rec_Doubling_MV2)
308 if(!(comm_size & (comm_size - 1)))
311 MPIR_Allgatherv_Rec_Doubling_MV2(sendbuf, sendcount,
317 MPIR_Allgatherv_Bruck_MV2(sendbuf, sendcount,
324 MV2_Allgatherv_function(sendbuf, sendcount, sendtype,
325 recvbuf, recvcounts, displs,
// Select and run an MPI_Allreduce implementation from the MVAPICH2 Stampede
// tuning tables. Multicast-based entries are skipped when mcast is not
// available, and two-level (SMP-aware) variants are only used for
// commutative operations.
// NOTE(review): interior lines are elided in this view.
334 int Coll_allreduce_mvapich2::allreduce(void *sendbuf,
337 MPI_Datatype datatype,
338 MPI_Op op, MPI_Comm comm)
341 int mpi_errno = MPI_SUCCESS;
345 comm_size = comm->size();
346 //rank = comm->rank();
// Lazily build the tuning tables on first use.
352 if (mv2_allreduce_thresholds_table == NULL)
353 init_mv2_allreduce_tables_stampede();
355 /* check if multiple threads are calling this collective function */
357 MPI_Aint sendtype_size = 0;
359 int range = 0, range_threshold = 0, range_threshold_intra = 0;
360 int is_two_level = 0;
// NOTE(review): is_commutative is initialized to 0 and the query below is
// commented out — presumably set on an elided line; confirm.
361 int is_commutative = 0;
362 MPI_Aint true_lb, true_extent;
364 sendtype_size=datatype->size();
365 nbytes = count * sendtype_size;
367 datatype->extent(&true_lb, &true_extent);
369 //is_commutative = op->is_commutative();
372 /* Search for the corresponding system size inside the tuning table */
373 while ((range < (mv2_size_allreduce_tuning_table - 1)) &&
374 (comm_size > mv2_allreduce_thresholds_table[range].numproc)) {
377 /* Search for corresponding inter-leader function */
378 /* skip mcast pointers if mcast is not available */
379 if(mv2_allreduce_thresholds_table[range].mcast_enabled != 1){
380 while ((range_threshold < (mv2_allreduce_thresholds_table[range].size_inter_table - 1))
381 && ((mv2_allreduce_thresholds_table[range].
382 inter_leader[range_threshold].MV2_pt_Allreducection
383 == &MPIR_Allreduce_mcst_reduce_redscat_gather_MV2) ||
384 (mv2_allreduce_thresholds_table[range].
385 inter_leader[range_threshold].MV2_pt_Allreducection
386 == &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2)
// Standard size-threshold scan once mcast entries have been skipped.
391 while ((range_threshold < (mv2_allreduce_thresholds_table[range].size_inter_table - 1))
393 mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].max)
394 && (mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
397 if(mv2_allreduce_thresholds_table[range].is_two_level_allreduce[range_threshold] == 1){
400 /* Search for corresponding intra-node function */
401 while ((range_threshold_intra <
402 (mv2_allreduce_thresholds_table[range].size_intra_table - 1))
404 mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].max)
405 && (mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].max !=
407 range_threshold_intra++;
410 MV2_Allreducection = mv2_allreduce_thresholds_table[range].inter_leader[range_threshold]
411 .MV2_pt_Allreducection;
413 MV2_Allreduce_intra_function = mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra]
414 .MV2_pt_Allreducection;
416 /* check if mcast is ready, otherwise replace mcast with other algorithm */
417 if((MV2_Allreducection == &MPIR_Allreduce_mcst_reduce_redscat_gather_MV2)||
418 (MV2_Allreducection == &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2)){
420 MV2_Allreducection = &MPIR_Allreduce_pt2pt_rd_MV2;
422 if(is_two_level != 1) {
423 MV2_Allreducection = &MPIR_Allreduce_pt2pt_rd_MV2;
// Two-level path: requires a commutative op and a valid leaders comm;
// otherwise fall back to point-to-point recursive doubling.
427 if(is_two_level == 1){
428 // check if shm is ready, if not use other algorithm first
429 if (is_commutative) {
430 if(comm->get_leaders_comm()==MPI_COMM_NULL){
433 mpi_errno = MPIR_Allreduce_two_level_MV2(sendbuf, recvbuf, count,
436 mpi_errno = MPIR_Allreduce_pt2pt_rd_MV2(sendbuf, recvbuf, count,
440 mpi_errno = MV2_Allreducection(sendbuf, recvbuf, count,
445 //comm->ch.intra_node_done=0;
// MPI_Alltoallv selector: MVAPICH2 has no dedicated tuned alltoallv here,
// so delegate to the OMPI basic-linear variant for MPI_IN_PLACE (which the
// ring variant does not handle) and to the ring algorithm otherwise.
453 int Coll_alltoallv_mvapich2::alltoallv(void *sbuf, int *scounts, int *sdisps,
455 void *rbuf, int *rcounts, int *rdisps,
461 if (sbuf == MPI_IN_PLACE) {
462 return Coll_alltoallv_ompi_basic_linear::alltoallv(sbuf, scounts, sdisps, sdtype,
463 rbuf, rcounts, rdisps,rdtype,
465 } else /* For starters, just keep the original algorithm. */
466 return Coll_alltoallv_ring::alltoallv(sbuf, scounts, sdisps, sdtype,
467 rbuf, rcounts, rdisps,rdtype,
// MPI_Barrier selector: unconditionally delegate to the pairwise-exchange
// barrier implementation.
472 int Coll_barrier_mvapich2::barrier(MPI_Comm comm)
474 return Coll_barrier_mvapich2_pair::barrier(comm);
// Select and run an MPI_Bcast implementation from the MVAPICH2 Stampede
// tuning tables. Picks inter-leader and intra-node function pointers,
// configures knomial factors and pipeline segment size from the table,
// then runs either a two-level (inter-node + intra-node) broadcast or a
// flat one. Non-contiguous or heterogeneous data is staged through a
// packed temporary buffer of MPI_BYTE.
// NOTE(review): interior lines are elided in this view; comments describe
// only the visible code.
480 int Coll_bcast_mvapich2::bcast(void *buffer,
482 MPI_Datatype datatype,
483 int root, MPI_Comm comm)
485 int mpi_errno = MPI_SUCCESS;
486 int comm_size/*, rank*/;
487 int two_level_bcast = 1;
490 int range_threshold = 0;
491 int range_threshold_intra = 0;
492 int is_homogeneous, is_contig;
495 void *tmp_buf = NULL;
497 //MPID_Datatype *dtp;
501 if(comm->get_leaders_comm()==MPI_COMM_NULL){
// Lazily build the tuning tables on first use.
504 if(!mv2_bcast_thresholds_table)
505 init_mv2_bcast_tables_stampede();
506 comm_size = comm->size();
507 //rank = comm->rank();
510 /* if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)*/
513 /* MPID_Datatype_get_ptr(datatype, dtp);*/
514 /* is_contig = dtp->is_contig;*/
519 /* MPI_Type_size() might not give the accurate size of the packed
520 * datatype for heterogeneous systems (because of padding, encoding,
521 * etc). On the other hand, MPI_Pack_size() can become very
522 * expensive, depending on the implementation, especially for
523 * heterogeneous systems. We want to use MPI_Type_size() wherever
524 * possible, and MPI_Pack_size() in other places.
526 //if (is_homogeneous) {
527 type_size=datatype->size();
530 MPIR_Pack_size_impl(1, datatype, &type_size);
532 nbytes = (count) * (type_size);
534 /* Search for the corresponding system size inside the tuning table */
535 while ((range < (mv2_size_bcast_tuning_table - 1)) &&
536 (comm_size > mv2_bcast_thresholds_table[range].numproc)) {
539 /* Search for corresponding inter-leader function */
540 while ((range_threshold < (mv2_bcast_thresholds_table[range].size_inter_table - 1))
542 mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max)
543 && (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
547 /* Search for corresponding intra-node function */
548 while ((range_threshold_intra <
549 (mv2_bcast_thresholds_table[range].size_intra_table - 1))
551 mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max)
552 && (mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max !=
554 range_threshold_intra++;
// Resolve function pointers from the selected table entries.
558 mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
559 MV2_pt_Bcast_function;
561 MV2_Bcast_intra_node_function =
562 mv2_bcast_thresholds_table[range].
563 intra_node[range_threshold_intra].MV2_pt_Bcast_function;
565 /* if (mv2_user_bcast_intra == NULL && */
566 /* MV2_Bcast_intra_node_function == &MPIR_Knomial_Bcast_intra_node_MV2) {*/
567 /* MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;*/
// Override the zero-copy pipelined knomial factor from the table, or from
// the global setting if one is configured.
570 if (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
571 zcpy_pipelined_knomial_factor != -1) {
572 zcpy_knomial_factor =
573 mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
574 zcpy_pipelined_knomial_factor;
577 if (mv2_pipelined_zcpy_knomial_factor != -1) {
578 zcpy_knomial_factor = mv2_pipelined_zcpy_knomial_factor;
581 if(MV2_Bcast_intra_node_function == NULL) {
582 /* if tuning table do not have any intra selection, set func pointer to
583 ** default one for mcast intra node */
584 MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;
587 /* Set value of pipeline segment size */
588 bcast_segment_size = mv2_bcast_thresholds_table[range].bcast_segment_size;
590 /* Set value of inter node knomial factor */
591 mv2_inter_node_knomial_factor = mv2_bcast_thresholds_table[range].inter_node_knomial_factor;
593 /* Set value of intra node knomial factor */
594 mv2_intra_node_knomial_factor = mv2_bcast_thresholds_table[range].intra_node_knomial_factor;
596 /* Check if we will use a two level algorithm or not */
598 #if defined(_MCST_SUPPORT_)
599 mv2_bcast_thresholds_table[range].is_two_level_bcast[range_threshold]
600 || comm->ch.is_mcast_ok;
602 mv2_bcast_thresholds_table[range].is_two_level_bcast[range_threshold];
604 if (two_level_bcast == 1) {
// Non-contiguous/heterogeneous data: broadcast a packed byte buffer.
605 if (!is_contig || !is_homogeneous) {
606 tmp_buf=(void *)smpi_get_tmp_sendbuffer(nbytes);
609 /* if (rank == root) {*/
611 /* MPIR_Pack_impl(buffer, count, datatype, tmp_buf, nbytes, &position);*/
613 /* MPIU_ERR_POP(mpi_errno);*/
616 #ifdef CHANNEL_MRAIL_GEN2
617 if ((mv2_enable_zcpy_bcast == 1) &&
618 (&MPIR_Pipelined_Bcast_Zcpy_MV2 == MV2_Bcast_function)) {
619 if (!is_contig || !is_homogeneous) {
620 mpi_errno = MPIR_Pipelined_Bcast_Zcpy_MV2(tmp_buf, nbytes, MPI_BYTE,
623 mpi_errno = MPIR_Pipelined_Bcast_Zcpy_MV2(buffer, count, datatype,
627 #endif /* defined(CHANNEL_MRAIL_GEN2) */
629 shmem_comm = comm->get_intra_comm();
630 if (!is_contig || !is_homogeneous) {
632 MPIR_Bcast_tune_inter_node_helper_MV2(tmp_buf, nbytes, MPI_BYTE,
636 MPIR_Bcast_tune_inter_node_helper_MV2(buffer, count, datatype, root,
640 /* We are now done with the inter-node phase */
// Intra-node phase: node leader becomes the local root.
643 root = INTRA_NODE_ROOT;
646 if (!is_contig || !is_homogeneous) {
647 mpi_errno = MV2_Bcast_intra_node_function(tmp_buf, nbytes,
648 MPI_BYTE, root, shmem_comm);
650 mpi_errno = MV2_Bcast_intra_node_function(buffer, count,
651 datatype, root, shmem_comm);
655 /* if (!is_contig || !is_homogeneous) {*/
656 /* if (rank != root) {*/
658 /* mpi_errno = MPIR_Unpack_impl(tmp_buf, nbytes, &position, buffer,*/
659 /* count, datatype);*/
663 /* We use Knomial for intra node */
664 MV2_Bcast_intra_node_function = &MPIR_Knomial_Bcast_intra_node_MV2;
665 /* if (mv2_enable_shmem_bcast == 0) {*/
666 /* Fall back to non-tuned version */
667 /* MPIR_Bcast_intra_MV2(buffer, count, datatype, root, comm);*/
669 mpi_errno = MV2_Bcast_function(buffer, count, datatype, root,
// Select and run an MPI_Reduce implementation from the MVAPICH2 Stampede
// tuning tables. Two-level and knomial variants require a commutative op;
// redscat-gather additionally requires count >= nearest power of two.
// Binomial reduce is the general fallback.
// NOTE(review): interior lines are elided in this view.
682 int Coll_reduce_mvapich2::reduce( void *sendbuf,
685 MPI_Datatype datatype,
686 MPI_Op op, int root, MPI_Comm comm)
// Lazily build the tuning tables on first use.
688 if(mv2_reduce_thresholds_table == NULL)
689 init_mv2_reduce_tables_stampede();
691 int mpi_errno = MPI_SUCCESS;
693 int range_threshold = 0;
694 int range_intra_threshold = 0;
695 int is_commutative, pof2;
699 int is_two_level = 0;
701 comm_size = comm->size();
702 sendtype_size=datatype->size();
703 nbytes = count * sendtype_size;
// MPI_OP_NULL is treated as commutative.
708 is_commutative = (op==MPI_OP_NULL || op->is_commutative());
710 /* find nearest power-of-two less than or equal to comm_size */
711 for( pof2 = 1; pof2 <= comm_size; pof2 <<= 1 );
715 /* Search for the corresponding system size inside the tuning table */
716 while ((range < (mv2_size_reduce_tuning_table - 1)) &&
717 (comm_size > mv2_reduce_thresholds_table[range].numproc)) {
720 /* Search for corresponding inter-leader function */
721 while ((range_threshold < (mv2_reduce_thresholds_table[range].size_inter_table - 1))
723 mv2_reduce_thresholds_table[range].inter_leader[range_threshold].max)
724 && (mv2_reduce_thresholds_table[range].inter_leader[range_threshold].max !=
729 /* Search for corresponding intra node function */
730 while ((range_intra_threshold < (mv2_reduce_thresholds_table[range].size_intra_table - 1))
732 mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].max)
733 && (mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].max !=
735 range_intra_threshold++;
738 /* Set intra-node function pt for reduce_two_level */
739 MV2_Reduce_intra_function =
740 mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].
741 MV2_pt_Reduce_function;
742 /* Set inter-leader pt */
743 MV2_Reduce_function =
744 mv2_reduce_thresholds_table[range].inter_leader[range_threshold].
745 MV2_pt_Reduce_function;
// Knomial degrees come from the table unless already configured globally.
747 if(mv2_reduce_intra_knomial_factor<0)
749 mv2_reduce_intra_knomial_factor = mv2_reduce_thresholds_table[range].intra_k_degree;
751 if(mv2_reduce_inter_knomial_factor<0)
753 mv2_reduce_inter_knomial_factor = mv2_reduce_thresholds_table[range].inter_k_degree;
755 if(mv2_reduce_thresholds_table[range].is_two_level_reduce[range_threshold] == 1){
758 /* We call Reduce function */
759 if(is_two_level == 1)
// Two-level path: commutative op with a valid leaders comm; otherwise
// fall back to binomial.
761 if (is_commutative == 1) {
762 if(comm->get_leaders_comm()==MPI_COMM_NULL){
765 mpi_errno = MPIR_Reduce_two_level_helper_MV2(sendbuf, recvbuf, count,
766 datatype, op, root, comm);
768 mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
769 datatype, op, root, comm);
771 } else if(MV2_Reduce_function == &MPIR_Reduce_inter_knomial_wrapper_MV2 ){
772 if(is_commutative ==1)
774 mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
775 datatype, op, root, comm);
777 mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
778 datatype, op, root, comm);
780 } else if(MV2_Reduce_function == &MPIR_Reduce_redscat_gather_MV2){
781 if (/*(HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) &&*/ (count >= pof2))
783 mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
784 datatype, op, root, comm);
786 mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
787 datatype, op, root, comm);
790 mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
791 datatype, op, root, comm);
// Select and run an MPI_Reduce_scatter implementation from the MVAPICH2
// Stampede tuning tables. Commutative ops use the table-selected
// algorithm; non-commutative ops use the special non-commutative variant
// (power-of-two size with equal block sizes) or the MPICH rdb fallback.
// NOTE(review): interior lines are elided in this view.
800 int Coll_reduce_scatter_mvapich2::reduce_scatter(void *sendbuf, void *recvbuf, int *recvcnts,
801 MPI_Datatype datatype, MPI_Op op,
804 int mpi_errno = MPI_SUCCESS;
805 int i = 0, comm_size = comm->size(), total_count = 0, type_size =
808 int range_threshold = 0;
809 int is_commutative = 0;
// Per-rank displacements derived from recvcnts (prefix sums).
// NOTE(review): disps is heap-allocated here — presumably freed on an
// elided line before return; confirm against the full file.
810 int *disps = static_cast<int*>(xbt_malloc(comm_size * sizeof (int)));
// Lazily build the tuning tables on first use.
812 if(mv2_red_scat_thresholds_table==NULL)
813 init_mv2_reduce_scatter_tables_stampede();
815 is_commutative=(op==MPI_OP_NULL || op->is_commutative());
816 for (i = 0; i < comm_size; i++) {
817 disps[i] = total_count;
818 total_count += recvcnts[i];
821 type_size=datatype->size();
822 nbytes = total_count * type_size;
824 if (is_commutative) {
826 /* Search for the corresponding system size inside the tuning table */
827 while ((range < (mv2_size_red_scat_tuning_table - 1)) &&
828 (comm_size > mv2_red_scat_thresholds_table[range].numproc)) {
831 /* Search for corresponding inter-leader function */
832 while ((range_threshold < (mv2_red_scat_thresholds_table[range].size_inter_table - 1))
834 mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].max)
835 && (mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].max !=
840 /* Set inter-leader pt */
841 MV2_Red_scat_function =
842 mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].
843 MV2_pt_Red_scat_function;
845 mpi_errno = MV2_Red_scat_function(sendbuf, recvbuf,
// Non-commutative op: detect whether all blocks are the same size.
849 int is_block_regular = 1;
850 for (i = 0; i < (comm_size - 1); ++i) {
851 if (recvcnts[i] != recvcnts[i+1]) {
852 is_block_regular = 0;
// Smallest power of two >= comm_size.
857 while (pof2 < comm_size) pof2 <<= 1;
858 if (pof2 == comm_size && is_block_regular) {
859 /* noncommutative, pof2 size, and block regular */
860 mpi_errno = MPIR_Reduce_scatter_non_comm_MV2(sendbuf, recvbuf,
864 mpi_errno = Coll_reduce_scatter_mpich_rdb::reduce_scatter(sendbuf, recvbuf,
// Select and run an MPI_Scatter implementation from the MVAPICH2 Stampede
// tuning tables. Handles per-ppn configuration matching, replaces the
// multicast entry when mcast is unavailable, and uses two-level variants
// only for blocked communicators.
// NOTE(review): interior lines are elided in this view; comments describe
// only the visible code.
875 int Coll_scatter_mvapich2::scatter(void *sendbuf,
877 MPI_Datatype sendtype,
880 MPI_Datatype recvtype,
881 int root, MPI_Comm comm)
883 int range = 0, range_threshold = 0, range_threshold_intra = 0;
884 int mpi_errno = MPI_SUCCESS;
885 // int mpi_errno_ret = MPI_SUCCESS;
886 int rank, nbytes, comm_size;
887 int recvtype_size, sendtype_size;
888 int partial_sub_ok = 0;
893 // MPID_Comm *shmem_commptr=NULL;
// Lazily build the tuning tables on first use.
894 if(mv2_scatter_thresholds_table==NULL)
895 init_mv2_scatter_tables_stampede();
897 if(comm->get_leaders_comm()==MPI_COMM_NULL){
901 comm_size = comm->size();
// nbytes is computed from the send side and from the receive side below;
// NOTE(review): presumably the elided lines branch on rank == root so only
// one of the two assignments executes — confirm against the full file.
906 sendtype_size=sendtype->size();
907 nbytes = sendcnt * sendtype_size;
909 recvtype_size=recvtype->size();
910 nbytes = recvcnt * recvtype_size;
913 // check if safe to use partial subscription mode
914 if (comm->is_uniform()) {
916 shmem_comm = comm->get_intra_comm();
917 local_size = shmem_comm->size();
919 if (mv2_scatter_table_ppn_conf[0] == -1) {
920 // Indicating user defined tuning
924 if (local_size == mv2_scatter_table_ppn_conf[i]) {
930 } while(i < mv2_scatter_num_ppn_conf);
934 if (partial_sub_ok != 1) {
938 /* Search for the corresponding system size inside the tuning table */
939 while ((range < (mv2_size_scatter_tuning_table[conf_index] - 1)) &&
940 (comm_size > mv2_scatter_thresholds_table[conf_index][range].numproc)) {
943 /* Search for corresponding inter-leader function */
944 while ((range_threshold < (mv2_scatter_thresholds_table[conf_index][range].size_inter_table - 1))
946 mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max)
947 && (mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max != -1)) {
951 /* Search for corresponding intra-node function */
952 while ((range_threshold_intra <
953 (mv2_scatter_thresholds_table[conf_index][range].size_intra_table - 1))
955 mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max)
956 && (mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max !=
958 range_threshold_intra++;
961 MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold]
962 .MV2_pt_Scatter_function;
// If the table selected the mcast wrapper but mcast is not usable, fall
// back to the next table entry or to binomial scatter.
964 if(MV2_Scatter_function == &MPIR_Scatter_mcst_wrap_MV2) {
965 #if defined(_MCST_SUPPORT_)
966 if(comm->ch.is_mcast_ok == 1
967 && mv2_use_mcast_scatter == 1
968 && comm->ch.shmem_coll_ok == 1) {
969 MV2_Scatter_function = &MPIR_Scatter_mcst_MV2;
971 #endif /*#if defined(_MCST_SUPPORT_) */
973 if(mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1].
974 MV2_pt_Scatter_function != NULL) {
975 MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1]
976 .MV2_pt_Scatter_function;
979 MV2_Scatter_function = &MPIR_Scatter_MV2_Binomial;
// Two-level scatter requires a blocked layout; otherwise use binomial.
984 if( (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Direct) ||
985 (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Binomial)) {
986 if( comm->is_blocked()) {
987 MV2_Scatter_intra_function = mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra]
988 .MV2_pt_Scatter_function;
991 MV2_Scatter_function(sendbuf, sendcnt, sendtype,
992 recvbuf, recvcnt, recvtype, root,
995 mpi_errno = MPIR_Scatter_MV2_Binomial(sendbuf, sendcnt, sendtype,
996 recvbuf, recvcnt, recvtype, root,
1001 mpi_errno = MV2_Scatter_function(sendbuf, sendcnt, sendtype,
1002 recvbuf, recvcnt, recvtype, root,
// Free all tuning tables lazily allocated by the init_mv2_*_tables_stampede()
// helpers. Per-conf sub-tables are released before their parent arrays.
// NOTE(review): the loop header providing `i` for the alltoall sub-tables is
// elided in this view — presumably it iterates over the ppn configurations;
// confirm against the full file. Presumably xbt_free(NULL) is a no-op, so
// freeing never-initialized tables is safe — verify.
1010 void smpi_coll_cleanup_mvapich2(void){
1012 if(mv2_alltoall_thresholds_table)
1013 xbt_free(mv2_alltoall_thresholds_table[i]);
1014 xbt_free(mv2_alltoall_thresholds_table);
1015 xbt_free(mv2_size_alltoall_tuning_table);
1016 xbt_free(mv2_alltoall_table_ppn_conf);
1018 xbt_free(mv2_gather_thresholds_table);
1019 if(mv2_allgather_thresholds_table)
1020 xbt_free(mv2_allgather_thresholds_table[0]);
1021 xbt_free(mv2_size_allgather_tuning_table);
1022 xbt_free(mv2_allgather_table_ppn_conf);
1023 xbt_free(mv2_allgather_thresholds_table);
1025 xbt_free(mv2_allgatherv_thresholds_table);
1026 xbt_free(mv2_reduce_thresholds_table);
1027 xbt_free(mv2_red_scat_thresholds_table);
1028 xbt_free(mv2_allreduce_thresholds_table);
1029 xbt_free(mv2_bcast_thresholds_table);
1030 if(mv2_scatter_thresholds_table)
1031 xbt_free(mv2_scatter_thresholds_table[0]);
1032 xbt_free(mv2_scatter_thresholds_table);
1033 xbt_free(mv2_size_scatter_tuning_table);
1034 xbt_free(mv2_scatter_table_ppn_conf);