src/smpi/colls/smpi_mpich_selector.cpp

   1 /* selector for collective algorithms based on mpich decision logic */
   2
   3 /* Copyright (c) 2009-2010, 2013-2017. The SimGrid Team.
   4  * All rights reserved.                                                     */
   5
   6 /* This program is free software; you can redistribute it and/or modify it
   7  * under the terms of the license (GNU LGPL) which comes with this package. */
   8
   9 #include "colls_private.h"
  10
  11 /* This is the default implementation of allreduce. The algorithm is:
  12
  13    Algorithm: MPI_Allreduce
  14
  15    For the heterogeneous case, we call MPI_Reduce followed by MPI_Bcast
  16    in order to meet the requirement that all processes must have the
  17    same result. For the homogeneous case, we use the following algorithms.
  18
  19
  20    For long messages and for builtin ops and if count >= pof2 (where
  21    pof2 is the nearest power-of-two less than or equal to the number
  22    of processes), we use Rabenseifner's algorithm (see
  23    http://www.hlrs.de/mpi/myreduce.html).
  24    This algorithm implements the allreduce in two steps: first a
  25    reduce-scatter, followed by an allgather. A recursive-halving
  26    algorithm (beginning with processes that are distance 1 apart) is
  27    used for the reduce-scatter, and a recursive doubling
  28    algorithm is used for the allgather. The non-power-of-two case is
  29    handled by dropping to the nearest lower power-of-two: the first
  30    few even-numbered processes send their data to their right neighbors
  31    (rank+1), and the reduce-scatter and allgather happen among the remaining
  32    power-of-two processes. At the end, the first few even-numbered
  33    processes get the result from their right neighbors.
  34
  35    For the power-of-two case, the cost for the reduce-scatter is
  36    lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
  37    allgather lgp.alpha + n.((p-1)/p).beta. Therefore, the
  38    total cost is:
  39    Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma
  40
  41    For the non-power-of-two case,
  42    Cost = (2.floor(lgp)+2).alpha + (2.((p-1)/p) + 2).n.beta + n.(1+(p-1)/p).gamma
  43
  44
  45    For short messages, for user-defined ops, and for count < pof2
  46    we use a recursive doubling algorithm (similar to the one in
  47    MPI_Allgather). We use this algorithm in the case of user-defined ops
  48    because in this case derived datatypes are allowed, and the user
  49    could pass basic datatypes on one process and derived on another as
  50    long as the type maps are the same. Breaking up derived datatypes
  51    to do the reduce-scatter is tricky.
  52
  53    Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma
  54
  55    Possible improvements:
  56
  57    End Algorithm: MPI_Allreduce
  58 */
  59 namespace simgrid{
  60 namespace smpi{
  61 int Coll_allreduce_mpich::allreduce(void *sbuf, void *rbuf, int count,
  62                         MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
  63 {
  64     size_t dsize, block_dsize;
  65     int comm_size = comm->size();
  66     const size_t large_message = 2048; //MPIR_PARAM_ALLREDUCE_SHORT_MSG_SIZE
  67
  68     dsize = dtype->size();
  69     block_dsize = dsize * count;
  70
  71
  72     /* find nearest power-of-two less than or equal to comm_size */
  73     int pof2 = 1;
  74     while (pof2 <= comm_size) pof2 <<= 1;
  75     pof2 >>=1;
  76
  77     if (block_dsize > large_message && count >= pof2 && (op==MPI_OP_NULL || op->is_commutative())) {
  78       //for long messages
  79        return (Coll_allreduce_rab_rdb::allreduce (sbuf, rbuf,
  80                                                                    count, dtype,
  81                                                                    op, comm));
  82     }else {
  83       //for short ones and count < pof2
  84       return (Coll_allreduce_rdb::allreduce (sbuf, rbuf,
  85                                                                    count, dtype,
  86                                                                    op, comm));
  87     }
  88 }
  89
  90
  91 /* This is the default implementation of alltoall. The algorithm is:
  92
  93    Algorithm: MPI_Alltoall
  94
  95    We use four algorithms for alltoall. For short messages and
  96    (comm_size >= 8), we use the algorithm by Jehoshua Bruck et al,
  97    IEEE TPDS, Nov. 1997. It is a store-and-forward algorithm that
  98    takes lgp steps. Because of the extra communication, the bandwidth
  99    requirement is (n/2).lgp.beta.
 100
 101    Cost = lgp.alpha + (n/2).lgp.beta
 102
 103    where n is the total amount of data a process needs to send to all
 104    other processes.
 105
 106    For medium size messages and (short messages for comm_size < 8), we
 107    use an algorithm that posts all irecvs and isends and then does a
 108    waitall. We scatter the order of sources and destinations among the
 109    processes, so that all processes don't try to send/recv to/from the
 110    same process at the same time.
 111
 112    *** Modification: We post only a small number of isends and irecvs
 113    at a time and wait on them as suggested by Tony Ladd. ***
 114    *** See comments below about an additional modification that
 115    we may want to consider ***
 116
 117    For long messages and power-of-two number of processes, we use a
 118    pairwise exchange algorithm, which takes p-1 steps. We
 119    calculate the pairs by using an exclusive-or algorithm:
 120            for (i=1; i<comm_size; i++)
 121                dest = rank ^ i;
 122    This algorithm doesn't work if the number of processes is not a power of
 123    two. For a non-power-of-two number of processes, we use an
 124    algorithm in which, in step i, each process  receives from (rank-i)
 125    and sends to (rank+i).
 126
 127    Cost = (p-1).alpha + n.beta
 128
 129    where n is the total amount of data a process needs to send to all
 130    other processes.
 131
 132    Possible improvements:
 133
 134    End Algorithm: MPI_Alltoall
 135 */
 136
 137 int Coll_alltoall_mpich::alltoall( void *sbuf, int scount,
 138                                              MPI_Datatype sdtype,
 139                                              void* rbuf, int rcount,
 140                                              MPI_Datatype rdtype,
 141                                              MPI_Comm comm)
 142 {
 143     int communicator_size;
 144     size_t dsize, block_dsize;
 145     communicator_size = comm->size();
 146
 147     unsigned int short_size=256;
 148     unsigned int medium_size=32768;
 149     //short size and comm_size >=8   -> bruck
 150
 151 //     medium size messages and (short messages for comm_size < 8), we
 152 //     use an algorithm that posts all irecvs and isends and then does a
 153 //     waitall.
 154
 155 //    For long messages and power-of-two number of processes, we use a
 156 //   pairwise exchange algorithm
 157
 158 //   For a non-power-of-two number of processes, we use an
 159 //   algorithm in which, in step i, each process  receives from (rank-i)
 160 //   and sends to (rank+i).
 161
 162
 163     dsize = sdtype->size();
 164     block_dsize = dsize * scount;
 165
 166     if ((block_dsize < short_size) && (communicator_size >= 8)) {
 167         return Coll_alltoall_bruck::alltoall(sbuf, scount, sdtype,
 168                                                     rbuf, rcount, rdtype,
 169                                                     comm);
 170
 171     } else if (block_dsize < medium_size) {
 172         return Coll_alltoall_basic_linear::alltoall(sbuf, scount, sdtype,
 173                                                            rbuf, rcount, rdtype,
 174                                                            comm);
 175     }else if (communicator_size%2){
 176         return Coll_alltoall_ring::alltoall(sbuf, scount, sdtype,
 177                                                            rbuf, rcount, rdtype,
 178                                                            comm);
 179     }
 180
 181     return Coll_alltoall_ring::alltoall (sbuf, scount, sdtype,
 182                                                     rbuf, rcount, rdtype,
 183                                                     comm);
 184 }
 185
 186 int Coll_alltoallv_mpich::alltoallv(void *sbuf, int *scounts, int *sdisps,
 187                                               MPI_Datatype sdtype,
 188                                               void *rbuf, int *rcounts, int *rdisps,
 189                                               MPI_Datatype rdtype,
 190                                               MPI_Comm  comm
 191                                               )
 192 {
 193     /* For starters, just keep the original algorithm. */
 194     return Coll_alltoallv_bruck::alltoallv(sbuf, scounts, sdisps, sdtype,
 195                                                         rbuf, rcounts, rdisps,rdtype,
 196                                                         comm);
 197 }
 198
 199
 200 int Coll_barrier_mpich::barrier(MPI_Comm  comm)
 201 {
 202     return Coll_barrier_ompi_bruck::barrier(comm);
 203 }
 204
 205 /* This is the default implementation of broadcast. The algorithm is:
 206
 207    Algorithm: MPI_Bcast
 208
 209    For short messages, we use a binomial tree algorithm.
 210    Cost = lgp.alpha + n.lgp.beta
 211
 212    For long messages, we do a scatter followed by an allgather.
 213    We first scatter the buffer using a binomial tree algorithm. This costs
 214    lgp.alpha + n.((p-1)/p).beta
 215    If the datatype is contiguous and the communicator is homogeneous,
 216    we treat the data as bytes and divide (scatter) it among processes
 217    by using ceiling division. For the noncontiguous or heterogeneous
 218    cases, we first pack the data into a temporary buffer by using
 219    MPI_Pack, scatter it as bytes, and unpack it after the allgather.
 220
 221    For the allgather, we use a recursive doubling algorithm for
 222    medium-size messages and power-of-two number of processes. This
 223    takes lgp steps. In each step pairs of processes exchange all the
 224    data they have (we take care of non-power-of-two situations). This
 225    costs approximately lgp.alpha + n.((p-1)/p).beta. (Approximately
 226    because it may be slightly more in the non-power-of-two case, but
 227    it's still a logarithmic algorithm.) Therefore, for long messages
 228    Total Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta
 229
 230    Note that this algorithm has twice the latency as the tree algorithm
 231    we use for short messages, but requires lower bandwidth: 2.n.beta
 232    versus n.lgp.beta. Therefore, for long messages and when lgp > 2,
 233    this algorithm will perform better.
 234
 235    For long messages and for medium-size messages and non-power-of-two
 236    processes, we use a ring algorithm for the allgather, which
 237    takes p-1 steps, because it performs better than recursive doubling.
 238    Total Cost = (lgp+p-1).alpha + 2.n.((p-1)/p).beta
 239
 240    Possible improvements:
 241    For clusters of SMPs, we may want to do something differently to
 242    take advantage of shared memory on each node.
 243
 244    End Algorithm: MPI_Bcast
 245 */
 246
 247
 248 int Coll_bcast_mpich::bcast(void *buff, int count,
 249                                           MPI_Datatype datatype, int root,
 250                                           MPI_Comm  comm
 251                                           )
 252 {
 253     /* Decision function based on MX results for
 254        messages up to 36MB and communicator sizes up to 64 nodes */
 255     const size_t small_message_size = 12288;
 256     const size_t intermediate_message_size = 524288;
 257
 258     int communicator_size;
 259     //int segsize = 0;
 260     size_t message_size, dsize;
 261
 262     communicator_size = comm->size();
 263
 264     /* else we need data size for decision function */
 265     dsize = datatype->size();
 266     message_size = dsize * (unsigned long)count;   /* needed for decision */
 267
 268     /* Handle messages of small and intermediate size, and
 269        single-element broadcasts */
 270     if ((message_size < small_message_size) || (communicator_size <= 8)) {
 271         /* Binomial without segmentation */
 272         return  Coll_bcast_binomial_tree::bcast (buff, count, datatype,
 273                                                       root, comm);
 274
 275     } else if (message_size < intermediate_message_size && !(communicator_size%2)) {
 276         // SplittedBinary with 1KB segments
 277         return Coll_bcast_scatter_rdb_allgather::bcast(buff, count, datatype,
 278                                                          root, comm);
 279
 280     }
 281      //Handle large message sizes
 282      return Coll_bcast_scatter_LR_allgather::bcast (buff, count, datatype,
 283                                                      root, comm);
 284
 285 }
 286
 287
 288
 289 /* This is the default implementation of reduce. The algorithm is:
 290
 291    Algorithm: MPI_Reduce
 292
 293    For long messages and for builtin ops and if count >= pof2 (where
 294    pof2 is the nearest power-of-two less than or equal to the number
 295    of processes), we use Rabenseifner's algorithm (see
 296    http://www.hlrs.de/organization/par/services/models/mpi/myreduce.html ).
 297    This algorithm implements the reduce in two steps: first a
 298    reduce-scatter, followed by a gather to the root. A
 299    recursive-halving algorithm (beginning with processes that are
 300    distance 1 apart) is used for the reduce-scatter, and a binomial tree
 301    algorithm is used for the gather. The non-power-of-two case is
 302    handled by dropping to the nearest lower power-of-two: the first
 303    few odd-numbered processes send their data to their left neighbors
 304    (rank-1), and the reduce-scatter happens among the remaining
 305    power-of-two processes. If the root is one of the excluded
 306    processes, then after the reduce-scatter, rank 0 sends its result to
 307    the root and exits; the root now acts as rank 0 in the binomial tree
 308    algorithm for gather.
 309
 310    For the power-of-two case, the cost for the reduce-scatter is
 311    lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
 312    gather to root is lgp.alpha + n.((p-1)/p).beta. Therefore, the
 313    total cost is:
 314    Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma
 315
 316    For the non-power-of-two case, assuming the root is not one of the
 317    odd-numbered processes that get excluded in the reduce-scatter,
 318    Cost = (2.floor(lgp)+1).alpha + (2.((p-1)/p) + 1).n.beta +
 319            n.(1+(p-1)/p).gamma
 320
 321
 322    For short messages, user-defined ops, and count < pof2, we use a
 323    binomial tree algorithm for both short and long messages.
 324
 325    Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma
 326
 327
 328    We use the binomial tree algorithm in the case of user-defined ops
 329    because in this case derived datatypes are allowed, and the user
 330    could pass basic datatypes on one process and derived on another as
 331    long as the type maps are the same. Breaking up derived datatypes
 332    to do the reduce-scatter is tricky.
 333
 334    FIXME: Per the MPI-2.1 standard this case is not possible.  We
 335    should be able to use the reduce-scatter/gather approach as long as
 336    count >= pof2.  [goodell@ 2009-01-21]
 337
 338    Possible improvements:
 339
 340    End Algorithm: MPI_Reduce
 341 */
 342
 343
 344 int Coll_reduce_mpich::reduce( void *sendbuf, void *recvbuf,
 345                                             int count, MPI_Datatype  datatype,
 346                                             MPI_Op   op, int root,
 347                                             MPI_Comm   comm
 348                                             )
 349 {
 350     int communicator_size=0;
 351     //int segsize = 0;
 352     size_t message_size, dsize;
 353     communicator_size = comm->size();
 354
 355     /* need data size for decision function */
 356     dsize=datatype->size();
 357     message_size = dsize * count;   /* needed for decision */
 358
 359     int pof2 = 1;
 360     while (pof2 <= communicator_size) pof2 <<= 1;
 361     pof2 >>= 1;
 362
 363     if ((count < pof2) || (message_size < 2048) || (op != MPI_OP_NULL && not op->is_commutative())) {
 364       return Coll_reduce_binomial::reduce(sendbuf, recvbuf, count, datatype, op, root, comm);
 365     }
 366         return Coll_reduce_scatter_gather::reduce(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
 367                                                      segsize, max_requests*/);
 368 }
 369
 370
 371
 372 /* This is the default implementation of reduce_scatter. The algorithm is:
 373
 374    Algorithm: MPI_Reduce_scatter
 375
 376    If the operation is commutative, for short and medium-size
 377    messages, we use a recursive-halving
 378    algorithm in which the first p/2 processes send the second n/2 data
 379    to their counterparts in the other half and receive the first n/2
 380    data from them. This procedure continues recursively, halving the
 381    data communicated at each step, for a total of lgp steps. If the
 382    number of processes is not a power-of-two, we convert it to the
 383    nearest lower power-of-two by having the first few even-numbered
 384    processes send their data to the neighboring odd-numbered process
 385    at (rank+1). Those odd-numbered processes compute the result for
 386    their left neighbor as well in the recursive halving algorithm, and
 387    then at  the end send the result back to the processes that didn't
 388    participate.
 389    Therefore, if p is a power-of-two,
 390    Cost = lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma
 391    If p is not a power-of-two,
 392    Cost = (floor(lgp)+2).alpha + n.(1+(p-1+n)/p).beta + n.(1+(p-1)/p).gamma
 393    The above cost in the non power-of-two case is approximate because
 394    there is some imbalance in the amount of work each process does
 395    because some processes do the work of their neighbors as well.
 396
 397    For commutative operations and very long messages we use
 398    we use a pairwise exchange algorithm similar to
 399    the one used in MPI_Alltoall. At step i, each process sends n/p
 400    amount of data to (rank+i) and receives n/p amount of data from
 401    (rank-i).
 402    Cost = (p-1).alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma
 403
 404
 405    If the operation is not commutative, we do the following:
 406
 407    We use a recursive doubling algorithm, which
 408    takes lgp steps. At step 1, processes exchange (n-n/p) amount of
 409    data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
 410    amount of data, and so forth.
 411
 412    Cost = lgp.alpha + n.(lgp-(p-1)/p).beta + n.(lgp-(p-1)/p).gamma
 413
 414    Possible improvements:
 415
 416    End Algorithm: MPI_Reduce_scatter
 417 */
 418
 419
 420 int Coll_reduce_scatter_mpich::reduce_scatter( void *sbuf, void *rbuf,
 421                                                     int *rcounts,
 422                                                     MPI_Datatype dtype,
 423                                                     MPI_Op  op,
 424                                                     MPI_Comm  comm
 425                                                     )
 426 {
 427     int comm_size, i;
 428     size_t total_message_size;
 429
 430     if(sbuf==rbuf)sbuf=MPI_IN_PLACE; //restore MPI_IN_PLACE as these algorithms handle it
 431
 432     XBT_DEBUG("Coll_reduce_scatter_mpich::reduce");
 433
 434     comm_size = comm->size();
 435     // We need data size for decision function
 436     total_message_size = 0;
 437     for (i = 0; i < comm_size; i++) {
 438         total_message_size += rcounts[i];
 439     }
 440
 441     if( (op==MPI_OP_NULL || op->is_commutative()) &&  total_message_size > 524288) {
 442         return Coll_reduce_scatter_mpich_pair::reduce_scatter (sbuf, rbuf, rcounts,
 443                                                                     dtype, op,
 444                                                                     comm);
 445     } else if ((op != MPI_OP_NULL && not op->is_commutative())) {
 446       int is_block_regular = 1;
 447       for (i = 0; i < (comm_size - 1); ++i) {
 448         if (rcounts[i] != rcounts[i + 1]) {
 449           is_block_regular = 0;
 450           break;
 451         }
 452       }
 453
 454       /* slightly retask pof2 to mean pof2 equal or greater, not always greater as it is above */
 455       int pof2 = 1;
 456       while (pof2 < comm_size)
 457         pof2 <<= 1;
 458
 459       if (pof2 == comm_size && is_block_regular) {
 460         /* noncommutative, pof2 size, and block regular */
 461         return Coll_reduce_scatter_mpich_noncomm::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
 462       }
 463
 464       return Coll_reduce_scatter_mpich_rdb::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
 465     }else{
 466        return Coll_reduce_scatter_mpich_rdb::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
 467     }
 468 }
 469
 470
 471 /* This is the default implementation of allgather. The algorithm is:
 472
 473    Algorithm: MPI_Allgather
 474
 475    For short messages and non-power-of-two no. of processes, we use
 476    the algorithm from the Jehoshua Bruck et al IEEE TPDS Nov 97
 477    paper. It is a variant of the disemmination algorithm for
 478    barrier. It takes ceiling(lg p) steps.
 479
 480    Cost = lgp.alpha + n.((p-1)/p).beta
 481    where n is total size of data gathered on each process.
 482
 483    For short or medium-size messages and power-of-two no. of
 484    processes, we use the recursive doubling algorithm.
 485
 486    Cost = lgp.alpha + n.((p-1)/p).beta
 487
 488    TODO: On TCP, we may want to use recursive doubling instead of the Bruck
 489    algorithm in all cases because of the pairwise-exchange property of
 490    recursive doubling (see Benson et al paper in Euro PVM/MPI
 491    2003).
 492
 493    It is interesting to note that either of the above algorithms for
 494    MPI_Allgather has the same cost as the tree algorithm for MPI_Gather!
 495
 496    For long messages or medium-size messages and non-power-of-two
 497    no. of processes, we use a ring algorithm. In the first step, each
 498    process i sends its contribution to process i+1 and receives
 499    the contribution from process i-1 (with wrap-around). From the
 500    second step onwards, each process i forwards to process i+1 the
 501    data it received from process i-1 in the previous step. This takes
 502    a total of p-1 steps.
 503
 504    Cost = (p-1).alpha + n.((p-1)/p).beta
 505
 506    We use this algorithm instead of recursive doubling for long
 507    messages because we find that this communication pattern (nearest
 508    neighbor) performs twice as fast as recursive doubling for long
 509    messages (on Myrinet and IBM SP).
 510
 511    Possible improvements:
 512
 513    End Algorithm: MPI_Allgather
 514 */
 515
 516 int Coll_allgather_mpich::allgather(void *sbuf, int scount,
 517                                               MPI_Datatype sdtype,
 518                                               void* rbuf, int rcount,
 519                                               MPI_Datatype rdtype,
 520                                               MPI_Comm  comm
 521                                               )
 522 {
 523     int communicator_size, pow2_size;
 524     size_t dsize, total_dsize;
 525
 526     communicator_size = comm->size();
 527
 528     /* Determine complete data size */
 529     dsize=sdtype->size();
 530     total_dsize = dsize * scount * communicator_size;
 531
 532     for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1);
 533
 534     /* Decision as in MPICH-2
 535        presented in Thakur et.al. "Optimization of Collective Communication
 536        Operations in MPICH", International Journal of High Performance Computing
 537        Applications, Vol. 19, No. 1, 49-66 (2005)
 538        - for power-of-two processes and small and medium size messages
 539        (up to 512KB) use recursive doubling
 540        - for non-power-of-two processes and small messages (80KB) use bruck,
 541        - for everything else use ring.
 542     */
 543     if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
 544         return Coll_allgather_rdb::allgather(sbuf, scount, sdtype,
 545                                                                  rbuf, rcount, rdtype,
 546                                                                  comm);
 547     } else if (total_dsize <= 81920) {
 548         return Coll_allgather_bruck::allgather(sbuf, scount, sdtype,
 549                                                      rbuf, rcount, rdtype,
 550                                                      comm);
 551     }
 552     return Coll_allgather_ring::allgather(sbuf, scount, sdtype,
 553                                                 rbuf, rcount, rdtype,
 554                                                 comm);
 555 }
 556
 557
 558 /* This is the default implementation of allgatherv. The algorithm is:
 559
 560    Algorithm: MPI_Allgatherv
 561
 562    For short messages and non-power-of-two no. of processes, we use
 563    the algorithm from the Jehoshua Bruck et al IEEE TPDS Nov 97
 564    paper. It is a variant of the disemmination algorithm for
 565    barrier. It takes ceiling(lg p) steps.
 566
 567    Cost = lgp.alpha + n.((p-1)/p).beta
 568    where n is total size of data gathered on each process.
 569
 570    For short or medium-size messages and power-of-two no. of
 571    processes, we use the recursive doubling algorithm.
 572
 573    Cost = lgp.alpha + n.((p-1)/p).beta
 574
 575    TODO: On TCP, we may want to use recursive doubling instead of the Bruck
 576    algorithm in all cases because of the pairwise-exchange property of
 577    recursive doubling (see Benson et al paper in Euro PVM/MPI
 578    2003).
 579
 580    For long messages or medium-size messages and non-power-of-two
 581    no. of processes, we use a ring algorithm. In the first step, each
 582    process i sends its contribution to process i+1 and receives
 583    the contribution from process i-1 (with wrap-around). From the
 584    second step onwards, each process i forwards to process i+1 the
 585    data it received from process i-1 in the previous step. This takes
 586    a total of p-1 steps.
 587
 588    Cost = (p-1).alpha + n.((p-1)/p).beta
 589
 590    Possible improvements:
 591
 592    End Algorithm: MPI_Allgatherv
 593 */
 594 int Coll_allgatherv_mpich::allgatherv(void *sbuf, int scount,
 595                                                MPI_Datatype sdtype,
 596                                                void* rbuf, int *rcounts,
 597                                                int *rdispls,
 598                                                MPI_Datatype rdtype,
 599                                                MPI_Comm  comm
 600                                                )
 601 {
 602     int communicator_size, pow2_size,i;
 603     size_t total_dsize;
 604
 605     communicator_size = comm->size();
 606
 607     /* Determine complete data size */
 608     total_dsize = 0;
 609     for (i=0; i<communicator_size; i++)
 610         total_dsize += rcounts[i];
 611     if (total_dsize == 0)
 612       return MPI_SUCCESS;
 613
 614     for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1);
 615
 616     if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
 617         return Coll_allgatherv_mpich_rdb::allgatherv(sbuf, scount, sdtype,
 618                                                                  rbuf, rcounts, rdispls, rdtype,
 619                                                                  comm);
 620     } else if (total_dsize <= 81920) {
 621         return Coll_allgatherv_ompi_bruck::allgatherv(sbuf, scount, sdtype,
 622                                                      rbuf, rcounts, rdispls, rdtype,
 623                                                      comm);
 624     }
 625     return Coll_allgatherv_mpich_ring::allgatherv(sbuf, scount, sdtype,
 626                                                 rbuf, rcounts, rdispls, rdtype,
 627                                                 comm);
 628 }
 629
 630 /* This is the default implementation of gather. The algorithm is:
 631
 632    Algorithm: MPI_Gather
 633
 634    We use a binomial tree algorithm for both short and long
 635    messages. At nodes other than leaf nodes we need to allocate a
 636    temporary buffer to store the incoming message. If the root is not
 637    rank 0, for very small messages, we pack it into a temporary
 638    contiguous buffer and reorder it to be placed in the right
 639    order. For small (but not very small) messages, we use a derived
 640    datatype to unpack the incoming data into non-contiguous buffers in
 641    the right order. In the heterogeneous case we first pack the
 642    buffers by using MPI_Pack and then do the gather.
 643
 644    Cost = lgp.alpha + n.((p-1)/p).beta
 645    where n is the total size of the data gathered at the root.
 646
 647    Possible improvements:
 648
 649    End Algorithm: MPI_Gather
 650 */
 651
 652 int Coll_gather_mpich::gather(void *sbuf, int scount,
 653                                            MPI_Datatype sdtype,
 654                                            void* rbuf, int rcount,
 655                                            MPI_Datatype rdtype,
 656                                            int root,
 657                                            MPI_Comm  comm
 658                                            )
 659 {
 660         return Coll_gather_ompi_binomial::gather (sbuf, scount, sdtype,
 661                                                       rbuf, rcount, rdtype,
 662                                                       root, comm);
 663 }
 664
 665 /* This is the default implementation of scatter. The algorithm is:
 666
 667    Algorithm: MPI_Scatter
 668
 669    We use a binomial tree algorithm for both short and
 670    long messages. At nodes other than leaf nodes we need to allocate
 671    a temporary buffer to store the incoming message. If the root is
 672    not rank 0, we reorder the sendbuf in order of relative ranks by
 673    copying it into a temporary buffer, so that all the sends from the
 674    root are contiguous and in the right order. In the heterogeneous
 675    case, we first pack the buffer by using MPI_Pack and then do the
 676    scatter.
 677
 678    Cost = lgp.alpha + n.((p-1)/p).beta
 679    where n is the total size of the data to be scattered from the root.
 680
 681    Possible improvements:
 682
 683    End Algorithm: MPI_Scatter
 684 */
 685
 686
 687 int Coll_scatter_mpich::scatter(void *sbuf, int scount,
 688                                             MPI_Datatype sdtype,
 689                                             void* rbuf, int rcount,
 690                                             MPI_Datatype rdtype,
 691                                             int root, MPI_Comm  comm
 692                                             )
 693 {
 694   if(comm->rank()!=root){
 695       sbuf=xbt_malloc(rcount*rdtype->get_extent());
 696       scount=rcount;
 697       sdtype=rdtype;
 698   }
 699   int ret= Coll_scatter_ompi_binomial::scatter (sbuf, scount, sdtype,
 700                                                        rbuf, rcount, rdtype,
 701                                                        root, comm);
 702   if(comm->rank()!=root){
 703       xbt_free(sbuf);
 704   }
 705   return ret;
 706 }
 707 }
 708 }
 709