src/smpi/colls/smpi_mpich_selector.cpp

   1 /* selector for collective algorithms based on mpich decision logic */
   2
   3 /* Copyright (c) 2009-2019. The SimGrid Team.
   4  * All rights reserved.                                                     */
   5
   6 /* This program is free software; you can redistribute it and/or modify it
   7  * under the terms of the license (GNU LGPL) which comes with this package. */
   8
   9 #include "colls_private.hpp"
  10
  11 /* This is the default implementation of allreduce. The algorithm is:
  12
  13    Algorithm: MPI_Allreduce
  14
  15    For the heterogeneous case, we call MPI_Reduce followed by MPI_Bcast
  16    in order to meet the requirement that all processes must have the
  17    same result. For the homogeneous case, we use the following algorithms.
  18
  19
  20    For long messages and for builtin ops and if count >= pof2 (where
  21    pof2 is the nearest power-of-two less than or equal to the number
  22    of processes), we use Rabenseifner's algorithm (see
  23    http://www.hlrs.de/mpi/myreduce.html).
  24    This algorithm implements the allreduce in two steps: first a
  25    reduce-scatter, followed by an allgather. A recursive-halving
  26    algorithm (beginning with processes that are distance 1 apart) is
  27    used for the reduce-scatter, and a recursive doubling
  28    algorithm is used for the allgather. The non-power-of-two case is
  29    handled by dropping to the nearest lower power-of-two: the first
  30    few even-numbered processes send their data to their right neighbors
  31    (rank+1), and the reduce-scatter and allgather happen among the remaining
  32    power-of-two processes. At the end, the first few even-numbered
  33    processes get the result from their right neighbors.
  34
  35    For the power-of-two case, the cost for the reduce-scatter is
  36    lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
  37    allgather lgp.alpha + n.((p-1)/p).beta. Therefore, the
  38    total cost is:
  39    Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma
  40
  41    For the non-power-of-two case,
  42    Cost = (2.floor(lgp)+2).alpha + (2.((p-1)/p) + 2).n.beta + n.(1+(p-1)/p).gamma
  43
  44
  45    For short messages, for user-defined ops, and for count < pof2
  46    we use a recursive doubling algorithm (similar to the one in
  47    MPI_Allgather). We use this algorithm in the case of user-defined ops
  48    because in this case derived datatypes are allowed, and the user
  49    could pass basic datatypes on one process and derived on another as
  50    long as the type maps are the same. Breaking up derived datatypes
  51    to do the reduce-scatter is tricky.
  52
  53    Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma
  54
  55    Possible improvements:
  56
  57    End Algorithm: MPI_Allreduce
  58 */
  59 namespace simgrid{
  60 namespace smpi{
  61 int Coll_allreduce_mpich::allreduce(const void *sbuf, void *rbuf, int count,
  62                         MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
  63 {
  64     size_t dsize, block_dsize;
  65     int comm_size = comm->size();
  66     const size_t large_message = 2048; //MPIR_PARAM_ALLREDUCE_SHORT_MSG_SIZE
  67
  68     dsize = dtype->size();
  69     block_dsize = dsize * count;
  70
  71     /*MPICH uses SMP algorithms for all commutative ops now*/
  72     if(!comm->is_smp_comm()){
  73       if(comm->get_leaders_comm()==MPI_COMM_NULL){
  74         comm->init_smp();
  75       }
  76       if(op->is_commutative())
  77         return Coll_allreduce_mvapich2_two_level::allreduce (sbuf, rbuf,count, dtype, op, comm);
  78     }
  79
  80     /* find nearest power-of-two less than or equal to comm_size */
  81     int pof2 = 1;
  82     while (pof2 <= comm_size) pof2 <<= 1;
  83     pof2 >>=1;
  84
  85     if (block_dsize > large_message && count >= pof2 && (op==MPI_OP_NULL || op->is_commutative())) {
  86       //for long messages
  87        return Coll_allreduce_rab_rdb::allreduce (sbuf, rbuf, count, dtype, op, comm);
  88     }else {
  89       //for short ones and count < pof2
  90       return Coll_allreduce_rdb::allreduce (sbuf, rbuf, count, dtype, op, comm);
  91     }
  92 }
  93
  94
  95 /* This is the default implementation of alltoall. The algorithm is:
  96
  97    Algorithm: MPI_Alltoall
  98
  99    We use four algorithms for alltoall. For short messages and
 100    (comm_size >= 8), we use the algorithm by Jehoshua Bruck et al,
 101    IEEE TPDS, Nov. 1997. It is a store-and-forward algorithm that
 102    takes lgp steps. Because of the extra communication, the bandwidth
 103    requirement is (n/2).lgp.beta.
 104
 105    Cost = lgp.alpha + (n/2).lgp.beta
 106
 107    where n is the total amount of data a process needs to send to all
 108    other processes.
 109
 110    For medium size messages and (short messages for comm_size < 8), we
 111    use an algorithm that posts all irecvs and isends and then does a
 112    waitall. We scatter the order of sources and destinations among the
 113    processes, so that all processes don't try to send/recv to/from the
 114    same process at the same time.
 115
 116    *** Modification: We post only a small number of isends and irecvs
 117    at a time and wait on them as suggested by Tony Ladd. ***
 118    *** See comments below about an additional modification that
 119    we may want to consider ***
 120
 121    For long messages and power-of-two number of processes, we use a
 122    pairwise exchange algorithm, which takes p-1 steps. We
 123    calculate the pairs by using an exclusive-or algorithm:
 124            for (i=1; i<comm_size; i++)
 125                dest = rank ^ i;
 126    This algorithm doesn't work if the number of processes is not a power of
 127    two. For a non-power-of-two number of processes, we use an
 128    algorithm in which, in step i, each process  receives from (rank-i)
 129    and sends to (rank+i).
 130
 131    Cost = (p-1).alpha + n.beta
 132
 133    where n is the total amount of data a process needs to send to all
 134    other processes.
 135
 136    Possible improvements:
 137
 138    End Algorithm: MPI_Alltoall
 139 */
 140
 141 int Coll_alltoall_mpich::alltoall(const void *sbuf, int scount,
 142                                              MPI_Datatype sdtype,
 143                                              void* rbuf, int rcount,
 144                                              MPI_Datatype rdtype,
 145                                              MPI_Comm comm)
 146 {
 147     int communicator_size;
 148     size_t dsize, block_dsize;
 149     communicator_size = comm->size();
 150
 151     unsigned int short_size=256;
 152     unsigned int medium_size=32768;
 153     //short size and comm_size >=8   -> bruck
 154
 155 //     medium size messages and (short messages for comm_size < 8), we
 156 //     use an algorithm that posts all irecvs and isends and then does a
 157 //     waitall.
 158
 159 //    For long messages and power-of-two number of processes, we use a
 160 //   pairwise exchange algorithm
 161
 162 //   For a non-power-of-two number of processes, we use an
 163 //   algorithm in which, in step i, each process  receives from (rank-i)
 164 //   and sends to (rank+i).
 165
 166
 167     dsize = sdtype->size();
 168     block_dsize = dsize * scount;
 169
 170     if ((block_dsize < short_size) && (communicator_size >= 8)) {
 171         return Coll_alltoall_bruck::alltoall(sbuf, scount, sdtype,
 172                                                     rbuf, rcount, rdtype,
 173                                                     comm);
 174
 175     } else if (block_dsize < medium_size) {
 176         return Coll_alltoall_mvapich2_scatter_dest::alltoall(sbuf, scount, sdtype,
 177                                                            rbuf, rcount, rdtype,
 178                                                            comm);
 179     }else if (communicator_size%2){
 180         return Coll_alltoall_pair::alltoall(sbuf, scount, sdtype,
 181                                                            rbuf, rcount, rdtype,
 182                                                            comm);
 183     }
 184
 185     return Coll_alltoall_ring::alltoall (sbuf, scount, sdtype,
 186                                                     rbuf, rcount, rdtype,
 187                                                     comm);
 188 }
 189
 190 int Coll_alltoallv_mpich::alltoallv(const void *sbuf, const int *scounts, const int *sdisps,
 191                                               MPI_Datatype sdtype,
 192                                               void *rbuf, const int *rcounts, const int *rdisps,
 193                                               MPI_Datatype rdtype,
 194                                               MPI_Comm  comm
 195                                               )
 196 {
 197     /* For starters, just keep the original algorithm. */
 198     return Coll_alltoallv_bruck::alltoallv(sbuf, scounts, sdisps, sdtype,
 199                                                         rbuf, rcounts, rdisps,rdtype,
 200                                                         comm);
 201 }
 202
 203
 204 int Coll_barrier_mpich::barrier(MPI_Comm  comm)
 205 {
 206     return Coll_barrier_ompi_bruck::barrier(comm);
 207 }
 208
 209 /* This is the default implementation of broadcast. The algorithm is:
 210
 211    Algorithm: MPI_Bcast
 212
 213    For short messages, we use a binomial tree algorithm.
 214    Cost = lgp.alpha + n.lgp.beta
 215
 216    For long messages, we do a scatter followed by an allgather.
 217    We first scatter the buffer using a binomial tree algorithm. This costs
 218    lgp.alpha + n.((p-1)/p).beta
 219    If the datatype is contiguous and the communicator is homogeneous,
 220    we treat the data as bytes and divide (scatter) it among processes
 221    by using ceiling division. For the noncontiguous or heterogeneous
 222    cases, we first pack the data into a temporary buffer by using
 223    MPI_Pack, scatter it as bytes, and unpack it after the allgather.
 224
 225    For the allgather, we use a recursive doubling algorithm for
 226    medium-size messages and power-of-two number of processes. This
 227    takes lgp steps. In each step pairs of processes exchange all the
 228    data they have (we take care of non-power-of-two situations). This
 229    costs approximately lgp.alpha + n.((p-1)/p).beta. (Approximately
 230    because it may be slightly more in the non-power-of-two case, but
 231    it's still a logarithmic algorithm.) Therefore, for long messages
 232    Total Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta
 233
 234    Note that this algorithm has twice the latency as the tree algorithm
 235    we use for short messages, but requires lower bandwidth: 2.n.beta
 236    versus n.lgp.beta. Therefore, for long messages and when lgp > 2,
 237    this algorithm will perform better.
 238
 239    For long messages and for medium-size messages and non-power-of-two
 240    processes, we use a ring algorithm for the allgather, which
 241    takes p-1 steps, because it performs better than recursive doubling.
 242    Total Cost = (lgp+p-1).alpha + 2.n.((p-1)/p).beta
 243
 244    Possible improvements:
 245    For clusters of SMPs, we may want to do something differently to
 246    take advantage of shared memory on each node.
 247
 248    End Algorithm: MPI_Bcast
 249 */
 250
 251
 252 int Coll_bcast_mpich::bcast(void *buff, int count,
 253                                           MPI_Datatype datatype, int root,
 254                                           MPI_Comm  comm
 255                                           )
 256 {
 257     /* Decision function based on MX results for
 258        messages up to 36MB and communicator sizes up to 64 nodes */
 259     const size_t small_message_size = 12288;
 260     const size_t intermediate_message_size = 524288;
 261
 262     int communicator_size;
 263     //int segsize = 0;
 264     size_t message_size, dsize;
 265
 266     if(!comm->is_smp_comm()){
 267       if(comm->get_leaders_comm()==MPI_COMM_NULL){
 268         comm->init_smp();
 269       }
 270       if(comm->is_uniform())
 271         return Coll_bcast_SMP_binomial::bcast(buff, count, datatype, root, comm);
 272     }
 273
 274     communicator_size = comm->size();
 275
 276     /* else we need data size for decision function */
 277     dsize = datatype->size();
 278     message_size = dsize * (unsigned long)count;   /* needed for decision */
 279
 280     /* Handle messages of small and intermediate size, and
 281        single-element broadcasts */
 282     if ((message_size < small_message_size) || (communicator_size <= 8)) {
 283         /* Binomial without segmentation */
 284         return  Coll_bcast_binomial_tree::bcast (buff, count, datatype,
 285                                                       root, comm);
 286
 287     } else if (message_size < intermediate_message_size && !(communicator_size%2)) {
 288         // SplittedBinary with 1KB segments
 289         return Coll_bcast_scatter_rdb_allgather::bcast(buff, count, datatype,
 290                                                          root, comm);
 291
 292     }
 293      //Handle large message sizes
 294      return Coll_bcast_scatter_LR_allgather::bcast (buff, count, datatype,
 295                                                      root, comm);
 296
 297 }
 298
 299
 300
 301 /* This is the default implementation of reduce. The algorithm is:
 302
 303    Algorithm: MPI_Reduce
 304
 305    For long messages and for builtin ops and if count >= pof2 (where
 306    pof2 is the nearest power-of-two less than or equal to the number
 307    of processes), we use Rabenseifner's algorithm (see
 308    http://www.hlrs.de/organization/par/services/models/mpi/myreduce.html ).
 309    This algorithm implements the reduce in two steps: first a
 310    reduce-scatter, followed by a gather to the root. A
 311    recursive-halving algorithm (beginning with processes that are
 312    distance 1 apart) is used for the reduce-scatter, and a binomial tree
 313    algorithm is used for the gather. The non-power-of-two case is
 314    handled by dropping to the nearest lower power-of-two: the first
 315    few odd-numbered processes send their data to their left neighbors
 316    (rank-1), and the reduce-scatter happens among the remaining
 317    power-of-two processes. If the root is one of the excluded
 318    processes, then after the reduce-scatter, rank 0 sends its result to
 319    the root and exits; the root now acts as rank 0 in the binomial tree
 320    algorithm for gather.
 321
 322    For the power-of-two case, the cost for the reduce-scatter is
 323    lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
 324    gather to root is lgp.alpha + n.((p-1)/p).beta. Therefore, the
 325    total cost is:
 326    Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma
 327
 328    For the non-power-of-two case, assuming the root is not one of the
 329    odd-numbered processes that get excluded in the reduce-scatter,
 330    Cost = (2.floor(lgp)+1).alpha + (2.((p-1)/p) + 1).n.beta +
 331            n.(1+(p-1)/p).gamma
 332
 333
 334    For short messages, user-defined ops, and count < pof2, we use a
 335    binomial tree algorithm for both short and long messages.
 336
 337    Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma
 338
 339
 340    We use the binomial tree algorithm in the case of user-defined ops
 341    because in this case derived datatypes are allowed, and the user
 342    could pass basic datatypes on one process and derived on another as
 343    long as the type maps are the same. Breaking up derived datatypes
 344    to do the reduce-scatter is tricky.
 345
 346    FIXME: Per the MPI-2.1 standard this case is not possible.  We
 347    should be able to use the reduce-scatter/gather approach as long as
 348    count >= pof2.  [goodell@ 2009-01-21]
 349
 350    Possible improvements:
 351
 352    End Algorithm: MPI_Reduce
 353 */
 354
 355
 356 int Coll_reduce_mpich::reduce(const void *sendbuf, void *recvbuf,
 357                                             int count, MPI_Datatype  datatype,
 358                                             MPI_Op   op, int root,
 359                                             MPI_Comm   comm
 360                                             )
 361 {
 362     int communicator_size=0;
 363     size_t message_size, dsize;
 364
 365     if(!comm->is_smp_comm()){
 366       if(comm->get_leaders_comm()==MPI_COMM_NULL){
 367         comm->init_smp();
 368       }
 369       if (op->is_commutative() == 1)
 370         return Coll_reduce_mvapich2_two_level::reduce(sendbuf, recvbuf, count, datatype, op, root, comm);
 371     }
 372
 373     communicator_size = comm->size();
 374
 375     /* need data size for decision function */
 376     dsize=datatype->size();
 377     message_size = dsize * count;   /* needed for decision */
 378
 379     int pof2 = 1;
 380     while (pof2 <= communicator_size) pof2 <<= 1;
 381     pof2 >>= 1;
 382
 383     if ((count < pof2) || (message_size < 2048) || (op != MPI_OP_NULL && not op->is_commutative())) {
 384       return Coll_reduce_binomial::reduce(sendbuf, recvbuf, count, datatype, op, root, comm);
 385     }
 386         return Coll_reduce_scatter_gather::reduce(sendbuf, recvbuf, count, datatype, op, root, comm);
 387 }
 388
 389
 390
 391 /* This is the default implementation of reduce_scatter. The algorithm is:
 392
 393    Algorithm: MPI_Reduce_scatter
 394
 395    If the operation is commutative, for short and medium-size
 396    messages, we use a recursive-halving
 397    algorithm in which the first p/2 processes send the second n/2 data
 398    to their counterparts in the other half and receive the first n/2
 399    data from them. This procedure continues recursively, halving the
 400    data communicated at each step, for a total of lgp steps. If the
 401    number of processes is not a power-of-two, we convert it to the
 402    nearest lower power-of-two by having the first few even-numbered
 403    processes send their data to the neighboring odd-numbered process
 404    at (rank+1). Those odd-numbered processes compute the result for
 405    their left neighbor as well in the recursive halving algorithm, and
 406    then at  the end send the result back to the processes that didn't
 407    participate.
 408    Therefore, if p is a power-of-two,
 409    Cost = lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma
 410    If p is not a power-of-two,
 411    Cost = (floor(lgp)+2).alpha + n.(1+(p-1+n)/p).beta + n.(1+(p-1)/p).gamma
 412    The above cost in the non power-of-two case is approximate because
 413    there is some imbalance in the amount of work each process does
 414    because some processes do the work of their neighbors as well.
 415
 416    For commutative operations and very long messages we use
 417    we use a pairwise exchange algorithm similar to
 418    the one used in MPI_Alltoall. At step i, each process sends n/p
 419    amount of data to (rank+i) and receives n/p amount of data from
 420    (rank-i).
 421    Cost = (p-1).alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma
 422
 423
 424    If the operation is not commutative, we do the following:
 425
 426    We use a recursive doubling algorithm, which
 427    takes lgp steps. At step 1, processes exchange (n-n/p) amount of
 428    data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
 429    amount of data, and so forth.
 430
 431    Cost = lgp.alpha + n.(lgp-(p-1)/p).beta + n.(lgp-(p-1)/p).gamma
 432
 433    Possible improvements:
 434
 435    End Algorithm: MPI_Reduce_scatter
 436 */
 437
 438
 439 int Coll_reduce_scatter_mpich::reduce_scatter(const void *sbuf, void *rbuf,
 440                                                     const int *rcounts,
 441                                                     MPI_Datatype dtype,
 442                                                     MPI_Op  op,
 443                                                     MPI_Comm  comm
 444                                                     )
 445 {
 446     int comm_size, i;
 447     size_t total_message_size;
 448
 449     if(sbuf==rbuf)sbuf=MPI_IN_PLACE; //restore MPI_IN_PLACE as these algorithms handle it
 450
 451     XBT_DEBUG("Coll_reduce_scatter_mpich::reduce");
 452
 453     comm_size = comm->size();
 454     // We need data size for decision function
 455     total_message_size = 0;
 456     for (i = 0; i < comm_size; i++) {
 457         total_message_size += rcounts[i];
 458     }
 459
 460     if( (op==MPI_OP_NULL || op->is_commutative()) &&  total_message_size > 524288) {
 461         return Coll_reduce_scatter_mpich_pair::reduce_scatter (sbuf, rbuf, rcounts,
 462                                                                     dtype, op,
 463                                                                     comm);
 464     } else if ((op != MPI_OP_NULL && not op->is_commutative())) {
 465       int is_block_regular = 1;
 466       for (i = 0; i < (comm_size - 1); ++i) {
 467         if (rcounts[i] != rcounts[i + 1]) {
 468           is_block_regular = 0;
 469           break;
 470         }
 471       }
 472
 473       /* slightly retask pof2 to mean pof2 equal or greater, not always greater as it is above */
 474       int pof2 = 1;
 475       while (pof2 < comm_size)
 476         pof2 <<= 1;
 477
 478       if (pof2 == comm_size && is_block_regular) {
 479         /* noncommutative, pof2 size, and block regular */
 480         return Coll_reduce_scatter_mpich_noncomm::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
 481       }
 482
 483       return Coll_reduce_scatter_mpich_rdb::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
 484     }else{
 485        return Coll_reduce_scatter_mpich_rdb::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
 486     }
 487 }
 488
 489
 490 /* This is the default implementation of allgather. The algorithm is:
 491
 492    Algorithm: MPI_Allgather
 493
 494    For short messages and non-power-of-two no. of processes, we use
 495    the algorithm from the Jehoshua Bruck et al IEEE TPDS Nov 97
 496    paper. It is a variant of the disemmination algorithm for
 497    barrier. It takes ceiling(lg p) steps.
 498
 499    Cost = lgp.alpha + n.((p-1)/p).beta
 500    where n is total size of data gathered on each process.
 501
 502    For short or medium-size messages and power-of-two no. of
 503    processes, we use the recursive doubling algorithm.
 504
 505    Cost = lgp.alpha + n.((p-1)/p).beta
 506
 507    TODO: On TCP, we may want to use recursive doubling instead of the Bruck
 508    algorithm in all cases because of the pairwise-exchange property of
 509    recursive doubling (see Benson et al paper in Euro PVM/MPI
 510    2003).
 511
 512    It is interesting to note that either of the above algorithms for
 513    MPI_Allgather has the same cost as the tree algorithm for MPI_Gather!
 514
 515    For long messages or medium-size messages and non-power-of-two
 516    no. of processes, we use a ring algorithm. In the first step, each
 517    process i sends its contribution to process i+1 and receives
 518    the contribution from process i-1 (with wrap-around). From the
 519    second step onwards, each process i forwards to process i+1 the
 520    data it received from process i-1 in the previous step. This takes
 521    a total of p-1 steps.
 522
 523    Cost = (p-1).alpha + n.((p-1)/p).beta
 524
 525    We use this algorithm instead of recursive doubling for long
 526    messages because we find that this communication pattern (nearest
 527    neighbor) performs twice as fast as recursive doubling for long
 528    messages (on Myrinet and IBM SP).
 529
 530    Possible improvements:
 531
 532    End Algorithm: MPI_Allgather
 533 */
 534
 535 int Coll_allgather_mpich::allgather(const void *sbuf, int scount,
 536                                               MPI_Datatype sdtype,
 537                                               void* rbuf, int rcount,
 538                                               MPI_Datatype rdtype,
 539                                               MPI_Comm  comm
 540                                               )
 541 {
 542     int communicator_size, pow2_size;
 543     size_t dsize, total_dsize;
 544
 545     communicator_size = comm->size();
 546
 547     /* Determine complete data size */
 548     dsize=sdtype->size();
 549     total_dsize = dsize * scount * communicator_size;
 550
 551     for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1);
 552
 553     /* Decision as in MPICH-2
 554        presented in Thakur et.al. "Optimization of Collective Communication
 555        Operations in MPICH", International Journal of High Performance Computing
 556        Applications, Vol. 19, No. 1, 49-66 (2005)
 557        - for power-of-two processes and small and medium size messages
 558        (up to 512KB) use recursive doubling
 559        - for non-power-of-two processes and small messages (80KB) use bruck,
 560        - for everything else use ring.
 561     */
 562     if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
 563         return Coll_allgather_rdb::allgather(sbuf, scount, sdtype,
 564                                                                  rbuf, rcount, rdtype,
 565                                                                  comm);
 566     } else if (total_dsize <= 81920) {
 567         return Coll_allgather_bruck::allgather(sbuf, scount, sdtype,
 568                                                      rbuf, rcount, rdtype,
 569                                                      comm);
 570     }
 571     return Coll_allgather_ring::allgather(sbuf, scount, sdtype,
 572                                                 rbuf, rcount, rdtype,
 573                                                 comm);
 574 }
 575
 576
 577 /* This is the default implementation of allgatherv. The algorithm is:
 578
 579    Algorithm: MPI_Allgatherv
 580
 581    For short messages and non-power-of-two no. of processes, we use
 582    the algorithm from the Jehoshua Bruck et al IEEE TPDS Nov 97
 583    paper. It is a variant of the disemmination algorithm for
 584    barrier. It takes ceiling(lg p) steps.
 585
 586    Cost = lgp.alpha + n.((p-1)/p).beta
 587    where n is total size of data gathered on each process.
 588
 589    For short or medium-size messages and power-of-two no. of
 590    processes, we use the recursive doubling algorithm.
 591
 592    Cost = lgp.alpha + n.((p-1)/p).beta
 593
 594    TODO: On TCP, we may want to use recursive doubling instead of the Bruck
 595    algorithm in all cases because of the pairwise-exchange property of
 596    recursive doubling (see Benson et al paper in Euro PVM/MPI
 597    2003).
 598
 599    For long messages or medium-size messages and non-power-of-two
 600    no. of processes, we use a ring algorithm. In the first step, each
 601    process i sends its contribution to process i+1 and receives
 602    the contribution from process i-1 (with wrap-around). From the
 603    second step onwards, each process i forwards to process i+1 the
 604    data it received from process i-1 in the previous step. This takes
 605    a total of p-1 steps.
 606
 607    Cost = (p-1).alpha + n.((p-1)/p).beta
 608
 609    Possible improvements:
 610
 611    End Algorithm: MPI_Allgatherv
 612 */
 613 int Coll_allgatherv_mpich::allgatherv(const void *sbuf, int scount,
 614                                                MPI_Datatype sdtype,
 615                                                void* rbuf, const int *rcounts,
 616                                                const int *rdispls,
 617                                                MPI_Datatype rdtype,
 618                                                MPI_Comm  comm
 619                                                )
 620 {
 621     int communicator_size, pow2_size,i;
 622     size_t total_dsize;
 623
 624     communicator_size = comm->size();
 625
 626     /* Determine complete data size */
 627     total_dsize = 0;
 628     for (i=0; i<communicator_size; i++)
 629         total_dsize += rcounts[i];
 630     if (total_dsize == 0)
 631       return MPI_SUCCESS;
 632
 633     for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1);
 634
 635     if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
 636         return Coll_allgatherv_mpich_rdb::allgatherv(sbuf, scount, sdtype,
 637                                                                  rbuf, rcounts, rdispls, rdtype,
 638                                                                  comm);
 639     } else if (total_dsize <= 81920) {
 640         return Coll_allgatherv_ompi_bruck::allgatherv(sbuf, scount, sdtype,
 641                                                      rbuf, rcounts, rdispls, rdtype,
 642                                                      comm);
 643     }
 644     return Coll_allgatherv_mpich_ring::allgatherv(sbuf, scount, sdtype,
 645                                                 rbuf, rcounts, rdispls, rdtype,
 646                                                 comm);
 647 }
 648
 649 /* This is the default implementation of gather. The algorithm is:
 650
 651    Algorithm: MPI_Gather
 652
 653    We use a binomial tree algorithm for both short and long
 654    messages. At nodes other than leaf nodes we need to allocate a
 655    temporary buffer to store the incoming message. If the root is not
 656    rank 0, for very small messages, we pack it into a temporary
 657    contiguous buffer and reorder it to be placed in the right
 658    order. For small (but not very small) messages, we use a derived
 659    datatype to unpack the incoming data into non-contiguous buffers in
 660    the right order. In the heterogeneous case we first pack the
 661    buffers by using MPI_Pack and then do the gather.
 662
 663    Cost = lgp.alpha + n.((p-1)/p).beta
 664    where n is the total size of the data gathered at the root.
 665
 666    Possible improvements:
 667
 668    End Algorithm: MPI_Gather
 669 */
 670
 671 int Coll_gather_mpich::gather(const void *sbuf, int scount,
 672                                            MPI_Datatype sdtype,
 673                                            void* rbuf, int rcount,
 674                                            MPI_Datatype rdtype,
 675                                            int root,
 676                                            MPI_Comm  comm
 677                                            )
 678 {
 679         return Coll_gather_ompi_binomial::gather (sbuf, scount, sdtype,
 680                                                       rbuf, rcount, rdtype,
 681                                                       root, comm);
 682 }
 683
 684 /* This is the default implementation of scatter. The algorithm is:
 685
 686    Algorithm: MPI_Scatter
 687
 688    We use a binomial tree algorithm for both short and
 689    long messages. At nodes other than leaf nodes we need to allocate
 690    a temporary buffer to store the incoming message. If the root is
 691    not rank 0, we reorder the sendbuf in order of relative ranks by
 692    copying it into a temporary buffer, so that all the sends from the
 693    root are contiguous and in the right order. In the heterogeneous
 694    case, we first pack the buffer by using MPI_Pack and then do the
 695    scatter.
 696
 697    Cost = lgp.alpha + n.((p-1)/p).beta
 698    where n is the total size of the data to be scattered from the root.
 699
 700    Possible improvements:
 701
 702    End Algorithm: MPI_Scatter
 703 */
 704
 705
 706 int Coll_scatter_mpich::scatter(const void *sbuf, int scount,
 707                                             MPI_Datatype sdtype,
 708                                             void* rbuf, int rcount,
 709                                             MPI_Datatype rdtype,
 710                                             int root, MPI_Comm  comm
 711                                             )
 712 {
 713   std::unique_ptr<unsigned char[]> tmp_buf;
 714   if(comm->rank()!=root){
 715     tmp_buf.reset(new unsigned char[rcount * rdtype->get_extent()]);
 716     sbuf   = tmp_buf.get();
 717     scount = rcount;
 718     sdtype = rdtype;
 719   }
 720   return Coll_scatter_ompi_binomial::scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm);
 721 }
 722 }
 723 }
 724