src/smpi/colls/smpi_mpich_selector.c

   1 /* selector for collective algorithms based on mpich decision logic */
   2
   3 /* Copyright (c) 2009-2010, 2013. The SimGrid Team.
   4  * All rights reserved.                                                     */
   5
   6 /* This program is free software; you can redistribute it and/or modify it
   7  * under the terms of the license (GNU LGPL) which comes with this package. */
   8
   9 #include "colls_private.h"
  10
  11 /* This is the default implementation of allreduce. The algorithm is:
  12
  13    Algorithm: MPI_Allreduce
  14
  15    For the heterogeneous case, we call MPI_Reduce followed by MPI_Bcast
  16    in order to meet the requirement that all processes must have the
  17    same result. For the homogeneous case, we use the following algorithms.
  18
  19
  20    For long messages and for builtin ops and if count >= pof2 (where
  21    pof2 is the nearest power-of-two less than or equal to the number
  22    of processes), we use Rabenseifner's algorithm (see
  23    http://www.hlrs.de/mpi/myreduce.html).
  24    This algorithm implements the allreduce in two steps: first a
  25    reduce-scatter, followed by an allgather. A recursive-halving
  26    algorithm (beginning with processes that are distance 1 apart) is
  27    used for the reduce-scatter, and a recursive doubling
  28    algorithm is used for the allgather. The non-power-of-two case is
  29    handled by dropping to the nearest lower power-of-two: the first
  30    few even-numbered processes send their data to their right neighbors
  31    (rank+1), and the reduce-scatter and allgather happen among the remaining
  32    power-of-two processes. At the end, the first few even-numbered
  33    processes get the result from their right neighbors.
  34
  35    For the power-of-two case, the cost for the reduce-scatter is
  36    lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
  37    allgather lgp.alpha + n.((p-1)/p).beta. Therefore, the
  38    total cost is:
  39    Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma
  40
  41    For the non-power-of-two case,
  42    Cost = (2.floor(lgp)+2).alpha + (2.((p-1)/p) + 2).n.beta + n.(1+(p-1)/p).gamma
  43
  44
  45    For short messages, for user-defined ops, and for count < pof2
  46    we use a recursive doubling algorithm (similar to the one in
  47    MPI_Allgather). We use this algorithm in the case of user-defined ops
  48    because in this case derived datatypes are allowed, and the user
  49    could pass basic datatypes on one process and derived on another as
  50    long as the type maps are the same. Breaking up derived datatypes
  51    to do the reduce-scatter is tricky.
  52
  53    Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma
  54
  55    Possible improvements:
  56
  57    End Algorithm: MPI_Allreduce
  58 */
  59 int smpi_coll_tuned_allreduce_mpich(void *sbuf, void *rbuf, int count,
  60                         MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
  61 {
  62     size_t dsize, block_dsize;
  63     int comm_size = smpi_comm_size(comm);
  64     const size_t large_message = 2048; //MPIR_PARAM_ALLREDUCE_SHORT_MSG_SIZE
  65
  66     dsize = smpi_datatype_size(dtype);
  67     block_dsize = dsize * count;
  68
  69
  70     /* find nearest power-of-two less than or equal to comm_size */
  71     int pof2 = 1;
  72     while (pof2 <= comm_size) pof2 <<= 1;
  73     pof2 >>=1;
  74
  75     if (block_dsize > large_message && count >= pof2 && smpi_op_is_commute(op)) {
  76       //for long messages
  77        return (smpi_coll_tuned_allreduce_rab_rdb (sbuf, rbuf,
  78                                                                    count, dtype,
  79                                                                    op, comm));
  80     }else {
  81       //for short ones and count < pof2
  82       return (smpi_coll_tuned_allreduce_rdb (sbuf, rbuf,
  83                                                                    count, dtype,
  84                                                                    op, comm));
  85     }
  86 }
  87
  88
  89 /* This is the default implementation of alltoall. The algorithm is:
  90
  91    Algorithm: MPI_Alltoall
  92
  93    We use four algorithms for alltoall. For short messages and
  94    (comm_size >= 8), we use the algorithm by Jehoshua Bruck et al,
  95    IEEE TPDS, Nov. 1997. It is a store-and-forward algorithm that
  96    takes lgp steps. Because of the extra communication, the bandwidth
  97    requirement is (n/2).lgp.beta.
  98
  99    Cost = lgp.alpha + (n/2).lgp.beta
 100
 101    where n is the total amount of data a process needs to send to all
 102    other processes.
 103
 104    For medium size messages and (short messages for comm_size < 8), we
 105    use an algorithm that posts all irecvs and isends and then does a
 106    waitall. We scatter the order of sources and destinations among the
 107    processes, so that all processes don't try to send/recv to/from the
 108    same process at the same time.
 109
 110    *** Modification: We post only a small number of isends and irecvs
 111    at a time and wait on them as suggested by Tony Ladd. ***
 112    *** See comments below about an additional modification that
 113    we may want to consider ***
 114
 115    For long messages and power-of-two number of processes, we use a
 116    pairwise exchange algorithm, which takes p-1 steps. We
 117    calculate the pairs by using an exclusive-or algorithm:
 118            for (i=1; i<comm_size; i++)
 119                dest = rank ^ i;
 120    This algorithm doesn't work if the number of processes is not a power of
 121    two. For a non-power-of-two number of processes, we use an
 122    algorithm in which, in step i, each process  receives from (rank-i)
 123    and sends to (rank+i).
 124
 125    Cost = (p-1).alpha + n.beta
 126
 127    where n is the total amount of data a process needs to send to all
 128    other processes.
 129
 130    Possible improvements:
 131
 132    End Algorithm: MPI_Alltoall
 133 */
 134
 135 int smpi_coll_tuned_alltoall_mpich( void *sbuf, int scount,
 136                                              MPI_Datatype sdtype,
 137                                              void* rbuf, int rcount,
 138                                              MPI_Datatype rdtype,
 139                                              MPI_Comm comm)
 140 {
 141     int communicator_size;
 142     size_t dsize, block_dsize;
 143     communicator_size = smpi_comm_size(comm);
 144
 145     int short_size=256;
 146     int medium_size=32768;
 147     //short size and comm_size >=8   -> bruck
 148
 149 //     medium size messages and (short messages for comm_size < 8), we
 150 //     use an algorithm that posts all irecvs and isends and then does a
 151 //     waitall.
 152
 153 //    For long messages and power-of-two number of processes, we use a
 154 //   pairwise exchange algorithm
 155
 156 //   For a non-power-of-two number of processes, we use an
 157 //   algorithm in which, in step i, each process  receives from (rank-i)
 158 //   and sends to (rank+i).
 159
 160
 161     dsize = smpi_datatype_size(sdtype);
 162     block_dsize = dsize * scount;
 163
 164     if ((block_dsize < short_size) && (communicator_size >= 8)) {
 165         return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype,
 166                                                     rbuf, rcount, rdtype,
 167                                                     comm);
 168
 169     } else if (block_dsize < medium_size) {
 170         return smpi_coll_tuned_alltoall_basic_linear(sbuf, scount, sdtype,
 171                                                            rbuf, rcount, rdtype,
 172                                                            comm);
 173     }else if (communicator_size%2){
 174         return smpi_coll_tuned_alltoall_ring(sbuf, scount, sdtype,
 175                                                            rbuf, rcount, rdtype,
 176                                                            comm);
 177     }
 178
 179     return smpi_coll_tuned_alltoall_ring (sbuf, scount, sdtype,
 180                                                     rbuf, rcount, rdtype,
 181                                                     comm);
 182 }
 183
 184 int smpi_coll_tuned_alltoallv_mpich(void *sbuf, int *scounts, int *sdisps,
 185                                               MPI_Datatype sdtype,
 186                                               void *rbuf, int *rcounts, int *rdisps,
 187                                               MPI_Datatype rdtype,
 188                                               MPI_Comm  comm
 189                                               )
 190 {
 191     /* For starters, just keep the original algorithm. */
 192     return smpi_coll_tuned_alltoallv_bruck(sbuf, scounts, sdisps, sdtype,
 193                                                         rbuf, rcounts, rdisps,rdtype,
 194                                                         comm);
 195 }
 196
 197
 198 int smpi_coll_tuned_barrier_mpich(MPI_Comm  comm)
 199 {
 200     return smpi_coll_tuned_barrier_ompi_bruck(comm);
 201 }
 202
 203 /* This is the default implementation of broadcast. The algorithm is:
 204
 205    Algorithm: MPI_Bcast
 206
 207    For short messages, we use a binomial tree algorithm.
 208    Cost = lgp.alpha + n.lgp.beta
 209
 210    For long messages, we do a scatter followed by an allgather.
 211    We first scatter the buffer using a binomial tree algorithm. This costs
 212    lgp.alpha + n.((p-1)/p).beta
 213    If the datatype is contiguous and the communicator is homogeneous,
 214    we treat the data as bytes and divide (scatter) it among processes
 215    by using ceiling division. For the noncontiguous or heterogeneous
 216    cases, we first pack the data into a temporary buffer by using
 217    MPI_Pack, scatter it as bytes, and unpack it after the allgather.
 218
 219    For the allgather, we use a recursive doubling algorithm for
 220    medium-size messages and power-of-two number of processes. This
 221    takes lgp steps. In each step pairs of processes exchange all the
 222    data they have (we take care of non-power-of-two situations). This
 223    costs approximately lgp.alpha + n.((p-1)/p).beta. (Approximately
 224    because it may be slightly more in the non-power-of-two case, but
 225    it's still a logarithmic algorithm.) Therefore, for long messages
 226    Total Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta
 227
 228    Note that this algorithm has twice the latency as the tree algorithm
 229    we use for short messages, but requires lower bandwidth: 2.n.beta
 230    versus n.lgp.beta. Therefore, for long messages and when lgp > 2,
 231    this algorithm will perform better.
 232
 233    For long messages and for medium-size messages and non-power-of-two
 234    processes, we use a ring algorithm for the allgather, which
 235    takes p-1 steps, because it performs better than recursive doubling.
 236    Total Cost = (lgp+p-1).alpha + 2.n.((p-1)/p).beta
 237
 238    Possible improvements:
 239    For clusters of SMPs, we may want to do something differently to
 240    take advantage of shared memory on each node.
 241
 242    End Algorithm: MPI_Bcast
 243 */
 244
 245
 246 int smpi_coll_tuned_bcast_mpich(void *buff, int count,
 247                                           MPI_Datatype datatype, int root,
 248                                           MPI_Comm  comm
 249                                           )
 250 {
 251     /* Decision function based on MX results for
 252        messages up to 36MB and communicator sizes up to 64 nodes */
 253     const size_t small_message_size = 12288;
 254     const size_t intermediate_message_size = 524288;
 255
 256     int communicator_size;
 257     //int segsize = 0;
 258     size_t message_size, dsize;
 259
 260     communicator_size = smpi_comm_size(comm);
 261
 262     /* else we need data size for decision function */
 263     dsize = smpi_datatype_size(datatype);
 264     message_size = dsize * (unsigned long)count;   /* needed for decision */
 265
 266     /* Handle messages of small and intermediate size, and
 267        single-element broadcasts */
 268     if ((message_size < small_message_size) || (communicator_size <= 8)) {
 269         /* Binomial without segmentation */
 270         return  smpi_coll_tuned_bcast_binomial_tree (buff, count, datatype,
 271                                                       root, comm);
 272
 273     } else if (message_size < intermediate_message_size && !(communicator_size%2)) {
 274         // SplittedBinary with 1KB segments
 275         return smpi_coll_tuned_bcast_scatter_rdb_allgather(buff, count, datatype,
 276                                                          root, comm);
 277
 278     }
 279      //Handle large message sizes
 280      return smpi_coll_tuned_bcast_scatter_LR_allgather (buff, count, datatype,
 281                                                      root, comm);
 282
 283 }
 284
 285
 286
 287 /* This is the default implementation of reduce. The algorithm is:
 288
 289    Algorithm: MPI_Reduce
 290
 291    For long messages and for builtin ops and if count >= pof2 (where
 292    pof2 is the nearest power-of-two less than or equal to the number
 293    of processes), we use Rabenseifner's algorithm (see
 294    http://www.hlrs.de/organization/par/services/models/mpi/myreduce.html ).
 295    This algorithm implements the reduce in two steps: first a
 296    reduce-scatter, followed by a gather to the root. A
 297    recursive-halving algorithm (beginning with processes that are
 298    distance 1 apart) is used for the reduce-scatter, and a binomial tree
 299    algorithm is used for the gather. The non-power-of-two case is
 300    handled by dropping to the nearest lower power-of-two: the first
 301    few odd-numbered processes send their data to their left neighbors
 302    (rank-1), and the reduce-scatter happens among the remaining
 303    power-of-two processes. If the root is one of the excluded
 304    processes, then after the reduce-scatter, rank 0 sends its result to
 305    the root and exits; the root now acts as rank 0 in the binomial tree
 306    algorithm for gather.
 307
 308    For the power-of-two case, the cost for the reduce-scatter is
 309    lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma. The cost for the
 310    gather to root is lgp.alpha + n.((p-1)/p).beta. Therefore, the
 311    total cost is:
 312    Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta + n.((p-1)/p).gamma
 313
 314    For the non-power-of-two case, assuming the root is not one of the
 315    odd-numbered processes that get excluded in the reduce-scatter,
 316    Cost = (2.floor(lgp)+1).alpha + (2.((p-1)/p) + 1).n.beta +
 317            n.(1+(p-1)/p).gamma
 318
 319
 320    For short messages, user-defined ops, and count < pof2, we use a
 321    binomial tree algorithm for both short and long messages.
 322
 323    Cost = lgp.alpha + n.lgp.beta + n.lgp.gamma
 324
 325
 326    We use the binomial tree algorithm in the case of user-defined ops
 327    because in this case derived datatypes are allowed, and the user
 328    could pass basic datatypes on one process and derived on another as
 329    long as the type maps are the same. Breaking up derived datatypes
 330    to do the reduce-scatter is tricky.
 331
 332    FIXME: Per the MPI-2.1 standard this case is not possible.  We
 333    should be able to use the reduce-scatter/gather approach as long as
 334    count >= pof2.  [goodell@ 2009-01-21]
 335
 336    Possible improvements:
 337
 338    End Algorithm: MPI_Reduce
 339 */
 340
 341
 342 int smpi_coll_tuned_reduce_mpich( void *sendbuf, void *recvbuf,
 343                                             int count, MPI_Datatype  datatype,
 344                                             MPI_Op   op, int root,
 345                                             MPI_Comm   comm
 346                                             )
 347 {
 348     int communicator_size=0;
 349     //int segsize = 0;
 350     size_t message_size, dsize;
 351     communicator_size = smpi_comm_size(comm);
 352
 353     /* need data size for decision function */
 354     dsize=smpi_datatype_size(datatype);
 355     message_size = dsize * count;   /* needed for decision */
 356
 357     int pof2 = 1;
 358     while (pof2 <= communicator_size) pof2 <<= 1;
 359     pof2 >>= 1;
 360
 361
 362     if ((count < pof2) || (message_size < 2048) || !smpi_op_is_commute(op)) {
 363         return smpi_coll_tuned_reduce_binomial (sendbuf, recvbuf, count, datatype, op, root, comm);
 364     }
 365         return smpi_coll_tuned_reduce_scatter_gather(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
 366                                                      segsize, max_requests*/);
 367 }
 368
 369
 370
 371 /* This is the default implementation of reduce_scatter. The algorithm is:
 372
 373    Algorithm: MPI_Reduce_scatter
 374
 375    If the operation is commutative, for short and medium-size
 376    messages, we use a recursive-halving
 377    algorithm in which the first p/2 processes send the second n/2 data
 378    to their counterparts in the other half and receive the first n/2
 379    data from them. This procedure continues recursively, halving the
 380    data communicated at each step, for a total of lgp steps. If the
 381    number of processes is not a power-of-two, we convert it to the
 382    nearest lower power-of-two by having the first few even-numbered
 383    processes send their data to the neighboring odd-numbered process
 384    at (rank+1). Those odd-numbered processes compute the result for
 385    their left neighbor as well in the recursive halving algorithm, and
 386    then at  the end send the result back to the processes that didn't
 387    participate.
 388    Therefore, if p is a power-of-two,
 389    Cost = lgp.alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma
 390    If p is not a power-of-two,
 391    Cost = (floor(lgp)+2).alpha + n.(1+(p-1+n)/p).beta + n.(1+(p-1)/p).gamma
 392    The above cost in the non power-of-two case is approximate because
 393    there is some imbalance in the amount of work each process does
 394    because some processes do the work of their neighbors as well.
 395
 396    For commutative operations and very long messages we use
 397    we use a pairwise exchange algorithm similar to
 398    the one used in MPI_Alltoall. At step i, each process sends n/p
 399    amount of data to (rank+i) and receives n/p amount of data from
 400    (rank-i).
 401    Cost = (p-1).alpha + n.((p-1)/p).beta + n.((p-1)/p).gamma
 402
 403
 404    If the operation is not commutative, we do the following:
 405
 406    We use a recursive doubling algorithm, which
 407    takes lgp steps. At step 1, processes exchange (n-n/p) amount of
 408    data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
 409    amount of data, and so forth.
 410
 411    Cost = lgp.alpha + n.(lgp-(p-1)/p).beta + n.(lgp-(p-1)/p).gamma
 412
 413    Possible improvements:
 414
 415    End Algorithm: MPI_Reduce_scatter
 416 */
 417
 418
 419 int smpi_coll_tuned_reduce_scatter_mpich( void *sbuf, void *rbuf,
 420                                                     int *rcounts,
 421                                                     MPI_Datatype dtype,
 422                                                     MPI_Op  op,
 423                                                     MPI_Comm  comm
 424                                                     )
 425 {
 426     int comm_size, i;
 427     size_t total_message_size;
 428
 429     if(sbuf==rbuf)sbuf=MPI_IN_PLACE; //restore MPI_IN_PLACE as these algorithms handle it
 430
 431     XBT_DEBUG("smpi_coll_tuned_reduce_scatter_mpich");
 432
 433     comm_size = smpi_comm_size(comm);
 434     // We need data size for decision function
 435     total_message_size = 0;
 436     for (i = 0; i < comm_size; i++) {
 437         total_message_size += rcounts[i];
 438     }
 439
 440     if( smpi_op_is_commute(op) &&  total_message_size > 524288) {
 441         return smpi_coll_tuned_reduce_scatter_mpich_pair (sbuf, rbuf, rcounts,
 442                                                                     dtype, op,
 443                                                                     comm);
 444     }else if (!smpi_op_is_commute(op)) {
 445         int is_block_regular = 1;
 446         for (i = 0; i < (comm_size - 1); ++i) {
 447             if (rcounts[i] != rcounts[i+1]) {
 448                 is_block_regular = 0;
 449                 break;
 450             }
 451         }
 452
 453         /* slightly retask pof2 to mean pof2 equal or greater, not always greater as it is above */
 454         int pof2 = 1;
 455         while (pof2 < comm_size) pof2 <<= 1;
 456
 457         if (pof2 == comm_size && is_block_regular) {
 458             /* noncommutative, pof2 size, and block regular */
 459             return smpi_coll_tuned_reduce_scatter_mpich_noncomm(sbuf, rbuf, rcounts, dtype, op, comm);
 460         }
 461
 462        return smpi_coll_tuned_reduce_scatter_mpich_rdb(sbuf, rbuf, rcounts, dtype, op, comm);
 463     }else{
 464        return smpi_coll_tuned_reduce_scatter_mpich_rdb(sbuf, rbuf, rcounts, dtype, op, comm);
 465     }
 466 }
 467
 468
 469 /* This is the default implementation of allgather. The algorithm is:
 470
 471    Algorithm: MPI_Allgather
 472
 473    For short messages and non-power-of-two no. of processes, we use
 474    the algorithm from the Jehoshua Bruck et al IEEE TPDS Nov 97
 475    paper. It is a variant of the disemmination algorithm for
 476    barrier. It takes ceiling(lg p) steps.
 477
 478    Cost = lgp.alpha + n.((p-1)/p).beta
 479    where n is total size of data gathered on each process.
 480
 481    For short or medium-size messages and power-of-two no. of
 482    processes, we use the recursive doubling algorithm.
 483
 484    Cost = lgp.alpha + n.((p-1)/p).beta
 485
 486    TODO: On TCP, we may want to use recursive doubling instead of the Bruck
 487    algorithm in all cases because of the pairwise-exchange property of
 488    recursive doubling (see Benson et al paper in Euro PVM/MPI
 489    2003).
 490
 491    It is interesting to note that either of the above algorithms for
 492    MPI_Allgather has the same cost as the tree algorithm for MPI_Gather!
 493
 494    For long messages or medium-size messages and non-power-of-two
 495    no. of processes, we use a ring algorithm. In the first step, each
 496    process i sends its contribution to process i+1 and receives
 497    the contribution from process i-1 (with wrap-around). From the
 498    second step onwards, each process i forwards to process i+1 the
 499    data it received from process i-1 in the previous step. This takes
 500    a total of p-1 steps.
 501
 502    Cost = (p-1).alpha + n.((p-1)/p).beta
 503
 504    We use this algorithm instead of recursive doubling for long
 505    messages because we find that this communication pattern (nearest
 506    neighbor) performs twice as fast as recursive doubling for long
 507    messages (on Myrinet and IBM SP).
 508
 509    Possible improvements:
 510
 511    End Algorithm: MPI_Allgather
 512 */
 513
 514 int smpi_coll_tuned_allgather_mpich(void *sbuf, int scount,
 515                                               MPI_Datatype sdtype,
 516                                               void* rbuf, int rcount,
 517                                               MPI_Datatype rdtype,
 518                                               MPI_Comm  comm
 519                                               )
 520 {
 521     int communicator_size, pow2_size;
 522     size_t dsize, total_dsize;
 523
 524     communicator_size = smpi_comm_size(comm);
 525
 526     /* Determine complete data size */
 527     dsize=smpi_datatype_size(sdtype);
 528     total_dsize = dsize * scount * communicator_size;
 529
 530     for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1);
 531
 532     /* Decision as in MPICH-2
 533        presented in Thakur et.al. "Optimization of Collective Communication
 534        Operations in MPICH", International Journal of High Performance Computing
 535        Applications, Vol. 19, No. 1, 49-66 (2005)
 536        - for power-of-two processes and small and medium size messages
 537        (up to 512KB) use recursive doubling
 538        - for non-power-of-two processes and small messages (80KB) use bruck,
 539        - for everything else use ring.
 540     */
 541     if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
 542         return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
 543                                                                  rbuf, rcount, rdtype,
 544                                                                  comm);
 545     } else if (total_dsize <= 81920) {
 546         return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
 547                                                      rbuf, rcount, rdtype,
 548                                                      comm);
 549     }
 550     return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
 551                                                 rbuf, rcount, rdtype,
 552                                                 comm);
 553 }
 554
 555
 556 /* This is the default implementation of allgatherv. The algorithm is:
 557
 558    Algorithm: MPI_Allgatherv
 559
 560    For short messages and non-power-of-two no. of processes, we use
 561    the algorithm from the Jehoshua Bruck et al IEEE TPDS Nov 97
 562    paper. It is a variant of the disemmination algorithm for
 563    barrier. It takes ceiling(lg p) steps.
 564
 565    Cost = lgp.alpha + n.((p-1)/p).beta
 566    where n is total size of data gathered on each process.
 567
 568    For short or medium-size messages and power-of-two no. of
 569    processes, we use the recursive doubling algorithm.
 570
 571    Cost = lgp.alpha + n.((p-1)/p).beta
 572
 573    TODO: On TCP, we may want to use recursive doubling instead of the Bruck
 574    algorithm in all cases because of the pairwise-exchange property of
 575    recursive doubling (see Benson et al paper in Euro PVM/MPI
 576    2003).
 577
 578    For long messages or medium-size messages and non-power-of-two
 579    no. of processes, we use a ring algorithm. In the first step, each
 580    process i sends its contribution to process i+1 and receives
 581    the contribution from process i-1 (with wrap-around). From the
 582    second step onwards, each process i forwards to process i+1 the
 583    data it received from process i-1 in the previous step. This takes
 584    a total of p-1 steps.
 585
 586    Cost = (p-1).alpha + n.((p-1)/p).beta
 587
 588    Possible improvements:
 589
 590    End Algorithm: MPI_Allgatherv
 591 */
 592 int smpi_coll_tuned_allgatherv_mpich(void *sbuf, int scount,
 593                                                MPI_Datatype sdtype,
 594                                                void* rbuf, int *rcounts,
 595                                                int *rdispls,
 596                                                MPI_Datatype rdtype,
 597                                                MPI_Comm  comm
 598                                                )
 599 {
 600     int communicator_size, pow2_size,i;
 601     size_t dsize, total_dsize;
 602
 603     communicator_size = smpi_comm_size(comm);
 604
 605     /* Determine complete data size */
 606     dsize=smpi_datatype_size(sdtype);
 607     total_dsize = dsize * scount * communicator_size;
 608
 609     total_dsize = 0;
 610     for (i=0; i<communicator_size; i++)
 611         total_dsize += rcounts[i];
 612     if (total_dsize == 0) return MPI_SUCCESS;
 613
 614     for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1);
 615
 616     if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
 617         return smpi_coll_tuned_allgatherv_mpich_rdb(sbuf, scount, sdtype,
 618                                                                  rbuf, rcounts, rdispls, rdtype,
 619                                                                  comm);
 620     } else if (total_dsize <= 81920) {
 621         return smpi_coll_tuned_allgatherv_ompi_bruck(sbuf, scount, sdtype,
 622                                                      rbuf, rcounts, rdispls, rdtype,
 623                                                      comm);
 624     }
 625     return smpi_coll_tuned_allgatherv_mpich_ring(sbuf, scount, sdtype,
 626                                                 rbuf, rcounts, rdispls, rdtype,
 627                                                 comm);
 628 }
 629
 630 /* This is the default implementation of gather. The algorithm is:
 631
 632    Algorithm: MPI_Gather
 633
 634    We use a binomial tree algorithm for both short and long
 635    messages. At nodes other than leaf nodes we need to allocate a
 636    temporary buffer to store the incoming message. If the root is not
 637    rank 0, for very small messages, we pack it into a temporary
 638    contiguous buffer and reorder it to be placed in the right
 639    order. For small (but not very small) messages, we use a derived
 640    datatype to unpack the incoming data into non-contiguous buffers in
 641    the right order. In the heterogeneous case we first pack the
 642    buffers by using MPI_Pack and then do the gather.
 643
 644    Cost = lgp.alpha + n.((p-1)/p).beta
 645    where n is the total size of the data gathered at the root.
 646
 647    Possible improvements:
 648
 649    End Algorithm: MPI_Gather
 650 */
 651
 652 int smpi_coll_tuned_gather_mpich(void *sbuf, int scount,
 653                                            MPI_Datatype sdtype,
 654                                            void* rbuf, int rcount,
 655                                            MPI_Datatype rdtype,
 656                                            int root,
 657                                            MPI_Comm  comm
 658                                            )
 659 {
 660         return smpi_coll_tuned_gather_ompi_binomial (sbuf, scount, sdtype,
 661                                                       rbuf, rcount, rdtype,
 662                                                       root, comm);
 663 }
 664
 665 /* This is the default implementation of scatter. The algorithm is:
 666
 667    Algorithm: MPI_Scatter
 668
 669    We use a binomial tree algorithm for both short and
 670    long messages. At nodes other than leaf nodes we need to allocate
 671    a temporary buffer to store the incoming message. If the root is
 672    not rank 0, we reorder the sendbuf in order of relative ranks by
 673    copying it into a temporary buffer, so that all the sends from the
 674    root are contiguous and in the right order. In the heterogeneous
 675    case, we first pack the buffer by using MPI_Pack and then do the
 676    scatter.
 677
 678    Cost = lgp.alpha + n.((p-1)/p).beta
 679    where n is the total size of the data to be scattered from the root.
 680
 681    Possible improvements:
 682
 683    End Algorithm: MPI_Scatter
 684 */
 685
 686
 687 int smpi_coll_tuned_scatter_mpich(void *sbuf, int scount,
 688                                             MPI_Datatype sdtype,
 689                                             void* rbuf, int rcount,
 690                                             MPI_Datatype rdtype,
 691                                             int root, MPI_Comm  comm
 692                                             )
 693 {
 694   if(smpi_comm_rank(comm)!=root){
 695       sbuf=xbt_malloc(rcount*smpi_datatype_get_extent(rdtype));
 696       scount=rcount;
 697       sdtype=rdtype;
 698   }
 699   int ret= smpi_coll_tuned_scatter_ompi_binomial (sbuf, scount, sdtype,
 700                                                        rbuf, rcount, rdtype,
 701                                                        root, comm);
 702   if(smpi_comm_rank(comm)!=root){
 703       xbt_free(sbuf);
 704   }
 705   return ret;
 706 }
 707